Example #1
/* write select job credential to a string
 * IN jobinfo - a select job credential
 * OUT buf    - location to write job credential contents
 * IN size    - byte size of buf
 * IN mode    - print mode, see enum select_print_mode
 * RET        - the string, same as buf
 */
extern char *select_g_select_jobinfo_sprint(dynamic_plugin_data_t *jobinfo,
					    char *buf, size_t size, int mode)
{
	void *data = NULL;
	uint32_t plugin_id;

	if (slurm_select_init(0) < 0)
		return NULL;
	if (jobinfo) {
		data = jobinfo->data;
		plugin_id = jobinfo->plugin_id;
	} else
		plugin_id = select_context_default;

	return (*(ops[plugin_id].jobinfo_sprint))(data, buf, size, mode);
}
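
A minimal usage sketch (not part of the example above): format a job's select credential into a caller-supplied buffer. job_ptr is an assumed job record whose select_jobinfo field holds the credential; SELECT_PRINT_MIXED is one of the select_print_mode values.

	/* Sketch: buf receives the formatted credential; on success the
	 * same pointer is returned, so the call can be used inline. */
	char buf[256];
	if (select_g_select_jobinfo_sprint(job_ptr->select_jobinfo, buf,
					   sizeof(buf), SELECT_PRINT_MIXED))
		info("select jobinfo: %s", buf);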
Example #2
/* get data from a select job credential
 * IN jobinfo   - select job credential to query
 * IN data_type - type of data to retrieve from the job credential
 * OUT data     - location to store the retrieved data
 * RET          - slurm error code
 */
extern int select_g_select_jobinfo_get(dynamic_plugin_data_t *jobinfo,
				       enum select_jobdata_type data_type,
				       void *data)
{
	void *jobdata = NULL;
	uint32_t plugin_id;

	if (slurm_select_init(0) < 0)
		return SLURM_ERROR;

	if (jobinfo) {
		jobdata = jobinfo->data;
		plugin_id = jobinfo->plugin_id;
	} else
		plugin_id = select_context_default;

	return (*(ops[plugin_id].jobinfo_get))
		(jobdata, data_type, data);
}
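
A hedged usage sketch: retrieve one field from a credential. SELECT_JOBDATA_NODE_CNT is one of the select_jobdata_type values; whether a given select plugin actually fills it in is plugin-dependent, and jobinfo here is an assumed, previously obtained credential.

	uint32_t node_cnt = 0;
	if (select_g_select_jobinfo_get(jobinfo, SELECT_JOBDATA_NODE_CNT,
					&node_cnt) == SLURM_SUCCESS)
		info("node count: %u", node_cnt);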
Example #3
/* Return 1 if the slurmctld is running a linear-based select plugin,
 * otherwise return 0. */
extern int select_running_linear_based(void)
{
	int rc = 0;

	if (slurm_select_init(0) < 0)
		return 0;

	switch (*(ops[select_context_default].plugin_id)) {
	case 102: // select/linear
	case 104: // select/alps -> linear
	case 107: // select/cray -> linear
		rc = 1;
		break;
	default:
		rc = 0;
		break;
	}
	return rc;
}
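
The numeric cases correspond to values of Slurm's enum select_plugin_id. SELECT_PLUGIN_CRAY_LINEAR (107) appears by name in Example #12 below; the names assumed here for 102 and 104 are SELECT_PLUGIN_LINEAR and SELECT_PLUGIN_ALPS. With named constants the switch would read:

	switch (*(ops[select_context_default].plugin_id)) {
	case SELECT_PLUGIN_LINEAR:	/* 102: select/linear */
	case SELECT_PLUGIN_ALPS:	/* 104: select/alps -> linear (name assumed) */
	case SELECT_PLUGIN_CRAY_LINEAR:	/* 107: select/cray -> linear */
		rc = 1;
		break;
	default:
		rc = 0;
		break;
	}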
Example #4
extern int select_g_select_nodeinfo_get(dynamic_plugin_data_t *nodeinfo,
					enum select_nodedata_type dinfo,
					enum node_states state,
					void *data)
{
	void *nodedata = NULL;
	uint32_t plugin_id;

	if (slurm_select_init(0) < 0)
		return SLURM_ERROR;

	if (nodeinfo) {
		nodedata = nodeinfo->data;
		plugin_id = nodeinfo->plugin_id;
	} else
		plugin_id = select_context_default;

	return (*(ops[plugin_id].nodeinfo_get))
		(nodedata, dinfo, state, data);
}
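
A hedged usage sketch in the style of sinfo-like clients: query how many CPUs on a node are in the allocated sub-state. node_ptr is an assumed node record carrying a select_nodeinfo field, e.g. as returned by slurm_load_node().

	/* SELECT_NODEDATA_SUBCNT with NODE_STATE_ALLOCATED yields the
	 * allocated CPU count on the node (written as a uint16_t). */
	uint16_t alloc_cpus = 0;
	if (select_g_select_nodeinfo_get(node_ptr->select_nodeinfo,
					 SELECT_NODEDATA_SUBCNT,
					 NODE_STATE_ALLOCATED,
					 &alloc_cpus) == SLURM_SUCCESS)
		info("%u CPUs allocated", alloc_cpus);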
Example #5
/* unpack a select job credential from a buffer
 * OUT jobinfo - the select job credential read
 * IN  buffer  - buffer with select credential read from current pointer loc
 * RET         - slurm error code
 * NOTE: returned value must be freed using select_g_select_jobinfo_free
 */
extern int select_g_select_jobinfo_unpack(dynamic_plugin_data_t **jobinfo,
					  Buf buffer,
					  uint16_t protocol_version)
{
	dynamic_plugin_data_t *jobinfo_ptr = NULL;

	if (slurm_select_init(0) < 0)
		return SLURM_ERROR;

	jobinfo_ptr = xmalloc(sizeof(dynamic_plugin_data_t));
	*jobinfo = jobinfo_ptr;

	if (protocol_version >= SLURM_2_2_PROTOCOL_VERSION) {
		int i;
		uint32_t plugin_id;
		safe_unpack32(&plugin_id, buffer);
		for (i = 0; i < select_context_cnt; i++) {
			if (*(select_context[i].ops.plugin_id) == plugin_id) {
				jobinfo_ptr->plugin_id = i;
				break;
			}
		}
		if (i >= select_context_cnt) {
			error("we don't have this plugin type %u", plugin_id);
			goto unpack_error;
		}
	} else
		jobinfo_ptr->plugin_id = select_context_default;

	if ((*(select_context[jobinfo_ptr->plugin_id].ops.jobinfo_unpack))
		((select_jobinfo_t **)&jobinfo_ptr->data, buffer,
		 protocol_version) != SLURM_SUCCESS)
		goto unpack_error;

	return SLURM_SUCCESS;

unpack_error:
	select_g_select_jobinfo_free(jobinfo_ptr);
	*jobinfo = NULL;
	error("select_g_select_jobinfo_unpack: unpack error");
	return SLURM_ERROR;
}
Example #6
/* pack a select job credential into a buffer in machine independent form
 * IN jobinfo  - the select job credential to be saved
 * OUT buffer  - buffer with select credential appended
 * RET         - slurm error code
 */
extern int select_g_select_jobinfo_pack(dynamic_plugin_data_t *jobinfo,
					Buf buffer,
					uint16_t protocol_version)
{
	void *data = NULL;
	uint32_t plugin_id;

	if (slurm_select_init(0) < 0)
		return SLURM_ERROR;

	if (jobinfo) {
		data = jobinfo->data;
		plugin_id = jobinfo->plugin_id;
	} else
		plugin_id = select_context_default;

	if (protocol_version >= SLURM_2_2_PROTOCOL_VERSION)
		pack32(*(select_context[plugin_id].ops.plugin_id),
		       buffer);
	return (*(select_context[plugin_id].ops.jobinfo_pack))
		(data, buffer, protocol_version);
}
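
A hedged round-trip sketch pairing this pack routine with the unpack routine of Example #5. init_buf(), set_buf_offset(), free_buf() and SLURM_PROTOCOL_VERSION come from Slurm's pack/protocol headers and are assumed to be in scope; jobinfo is an assumed existing credential.

	Buf buffer = init_buf(BUF_SIZE);
	dynamic_plugin_data_t *copy = NULL;

	if (select_g_select_jobinfo_pack(jobinfo, buffer,
					 SLURM_PROTOCOL_VERSION)
	    == SLURM_SUCCESS) {
		set_buf_offset(buffer, 0);	/* rewind before reading back */
		if (select_g_select_jobinfo_unpack(&copy, buffer,
						   SLURM_PROTOCOL_VERSION)
		    == SLURM_SUCCESS)
			select_g_select_jobinfo_free(copy);
	}
	free_buf(buffer);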
Example #7
int srun(int ac, char **av)
{
	int debug_level;
	log_options_t logopt = LOG_OPTS_STDERR_ONLY;
	bool got_alloc = false;
	List srun_job_list = NULL;

	slurm_conf_init(NULL);
	debug_level = _slurm_debug_env_val();
	logopt.stderr_level += debug_level;
	log_init(xbasename(av[0]), logopt, 0, NULL);
	_set_exit_code();

	if (slurm_select_init(1) != SLURM_SUCCESS)
		fatal("failed to initialize node selection plugin");

	if (switch_init(0) != SLURM_SUCCESS)
		fatal("failed to initialize switch plugins");

	_setup_env_working_cluster();

	init_srun(ac, av, &logopt, debug_level, 1);
	if (opt_list) {
		if (!_enable_pack_steps())
			fatal("Job steps that span multiple components of a heterogeneous job are not currently supported");
		create_srun_job((void **) &srun_job_list, &got_alloc, 0, 1);
	} else
		create_srun_job((void **) &job, &got_alloc, 0, 1);

	_setup_job_env(job, srun_job_list, got_alloc);
	_set_node_alias();
	_launch_app(job, srun_job_list, got_alloc);

	if ((global_rc & 0xff) == SIG_OOM)
		global_rc = 1;	/* Exit code 1 */

	return (int)global_rc;
}
Example #8
static int
_slurmd_init(void)
{
	struct rlimit rlim;
	slurm_ctl_conf_t *cf;
	struct stat stat_buf;
	uint32_t cpu_cnt;

	/*
	 * Process commandline arguments first, since one option may be
	 * an alternate location for the slurm config file.
	 */
	_process_cmdline(*conf->argc, *conf->argv);

	/*
	 * Build nodes table like in slurmctld
	 * This is required by the topology stack
	 * Node tables setup must precede _read_config() so that the
	 * proper hostname is set.
	 */
	slurm_conf_init(conf->conffile);
	init_node_conf();
	/* slurm_select_init() must be called before
	 * build_all_nodeline_info() so the latter is invoked with the
	 * proper argument. */
	if (slurm_select_init(1) != SLURM_SUCCESS)
		return SLURM_FAILURE;
	build_all_nodeline_info(true);
	build_all_frontend_info(true);

	/*
	 * Read global slurm config file, override necessary values from
	 * defaults and command line.
	 */
	_read_config();

	cpu_cnt = MAX(conf->conf_cpus, conf->block_map_size);

	if ((gres_plugin_init() != SLURM_SUCCESS) ||
	    (gres_plugin_node_config_load(cpu_cnt) != SLURM_SUCCESS))
		return SLURM_FAILURE;
	if (slurm_topo_init() != SLURM_SUCCESS)
		return SLURM_FAILURE;

	/*
	 * Get and set slurmd topology information
	 * Build node hash table first to speed up the topo build
	 */
	rehash_node();
	slurm_topo_build_config();
	_set_topo_info();

	/*
	 * Check for cpu frequency set capabilities on this node
	 */
	cpu_freq_init(conf);

	_print_conf();

	if (slurm_proctrack_init() != SLURM_SUCCESS)
		return SLURM_FAILURE;
	if (slurmd_task_init() != SLURM_SUCCESS)
		return SLURM_FAILURE;
	if (slurm_auth_init(NULL) != SLURM_SUCCESS)
		return SLURM_FAILURE;
	if (spank_slurmd_init() < 0)
		return SLURM_FAILURE;

	if (getrlimit(RLIMIT_CPU, &rlim) == 0) {
		rlim.rlim_cur = rlim.rlim_max;
		setrlimit(RLIMIT_CPU, &rlim);
		if (rlim.rlim_max != RLIM_INFINITY) {
			error("Slurmd process CPU time limit is %d seconds",
			      (int) rlim.rlim_max);
		}
	}

	if (getrlimit(RLIMIT_NOFILE, &rlim) == 0) {
		rlim.rlim_cur = rlim.rlim_max;
		setrlimit(RLIMIT_NOFILE, &rlim);
	}
#ifndef NDEBUG
	if (getrlimit(RLIMIT_CORE, &rlim) == 0) {
		rlim.rlim_cur = rlim.rlim_max;
		setrlimit(RLIMIT_CORE, &rlim);
	}
#endif /* !NDEBUG */

	/*
	 * Create a context for verifying slurm job credentials
	 */
	if (!(conf->vctx = slurm_cred_verifier_ctx_create(conf->pubkey)))
		return SLURM_FAILURE;
	if (!strcmp(conf->select_type, "select/serial")) {
		/* Only cache credential for 5 seconds with select/serial
		 * for shorter cache searches and higher throughput */
		slurm_cred_ctx_set(conf->vctx, SLURM_CRED_OPT_EXPIRY_WINDOW, 5);
	}

	/*
	 * Create slurmd spool directory if necessary.
	 */
	if (_set_slurmd_spooldir() < 0) {
		error("Unable to initialize slurmd spooldir");
		return SLURM_FAILURE;
	}

	if (conf->cleanstart) {
		/*
		 * Need to kill any running slurmd's here
		 */
		_kill_old_slurmd();

		stepd_cleanup_sockets(conf->spooldir, conf->node_name);
		_stepd_cleanup_batch_dirs(conf->spooldir, conf->node_name);
	}

	if (conf->daemonize) {
		bool success = false;

		if (conf->logfile && (conf->logfile[0] == '/')) {
			char *slash_ptr, *work_dir;
			work_dir = xstrdup(conf->logfile);
			slash_ptr = strrchr(work_dir, '/');
			if (slash_ptr == work_dir)
				work_dir[1] = '\0';
			else
				slash_ptr[0] = '\0';
			if ((access(work_dir, W_OK) != 0) ||
			    (chdir(work_dir) < 0)) {
				error("Unable to chdir to %s", work_dir);
			} else
				success = true;
			xfree(work_dir);
		}

		if (!success) {
			if ((access(conf->spooldir, W_OK) != 0) ||
			    (chdir(conf->spooldir) < 0)) {
				error("Unable to chdir to %s", conf->spooldir);
			} else
				success = true;
		}

		if (!success) {
			if ((access("/var/tmp", W_OK) != 0) ||
			    (chdir("/var/tmp") < 0)) {
				error("chdir(/var/tmp): %m");
				return SLURM_FAILURE;
			} else
				info("chdir to /var/tmp");
		}
	}

	/*
	 * Cache the group access list
	 */
	cf = slurm_conf_lock();
	if (cf->group_info & GROUP_CACHE)
		init_gids_cache(1);
	else
		init_gids_cache(0);
	slurm_conf_unlock();

	if ((devnull = open_cloexec("/dev/null", O_RDWR)) < 0) {
		error("Unable to open /dev/null: %m");
		return SLURM_FAILURE;
	}

	/* make sure we have slurmstepd installed */
	if (stat(conf->stepd_loc, &stat_buf))
		fatal("Unable to find slurmstepd file at %s", conf->stepd_loc);
	if (!S_ISREG(stat_buf.st_mode))
		fatal("slurmstepd not a file at %s", conf->stepd_loc);

	return SLURM_SUCCESS;
}
Example #9
File: srun.c Project: edsw/slurm
int srun(int ac, char **av)
{
	int debug_level;
	env_t *env = xmalloc(sizeof(env_t));
	log_options_t logopt = LOG_OPTS_STDERR_ONLY;
	bool got_alloc = false;
	slurm_step_io_fds_t cio_fds = SLURM_STEP_IO_FDS_INITIALIZER;
	slurm_step_launch_callbacks_t step_callbacks;

	env->stepid = -1;
	env->procid = -1;
	env->localid = -1;
	env->nodeid = -1;
	env->cli = NULL;
	env->env = NULL;
	env->ckpt_dir = NULL;

	slurm_conf_init(NULL);
	debug_level = _slurm_debug_env_val();
	logopt.stderr_level += debug_level;
	log_init(xbasename(av[0]), logopt, 0, NULL);
	_set_exit_code();

	if (slurm_select_init(1) != SLURM_SUCCESS)
		fatal("failed to initialize node selection plugin");

	if (switch_init() != SLURM_SUCCESS)
		fatal("failed to initialize switch plugin");

	init_srun(ac, av, &logopt, debug_level, 1);
	create_srun_job(&job, &got_alloc, 0, 1);

	/*
	 *  Enhance environment for job
	 */
	if (opt.bcast_flag)
		_file_bcast();
	if (opt.cpus_set)
		env->cpus_per_task = opt.cpus_per_task;
	if (opt.ntasks_per_node != NO_VAL)
		env->ntasks_per_node = opt.ntasks_per_node;
	if (opt.ntasks_per_socket != NO_VAL)
		env->ntasks_per_socket = opt.ntasks_per_socket;
	if (opt.ntasks_per_core != NO_VAL)
		env->ntasks_per_core = opt.ntasks_per_core;
	env->distribution = opt.distribution;
	if (opt.plane_size != NO_VAL)
		env->plane_size = opt.plane_size;
	env->cpu_bind_type = opt.cpu_bind_type;
	env->cpu_bind = opt.cpu_bind;

	env->cpu_freq_min = opt.cpu_freq_min;
	env->cpu_freq_max = opt.cpu_freq_max;
	env->cpu_freq_gov = opt.cpu_freq_gov;
	env->mem_bind_type = opt.mem_bind_type;
	env->mem_bind = opt.mem_bind;
	env->overcommit = opt.overcommit;
	env->slurmd_debug = opt.slurmd_debug;
	env->labelio = opt.labelio;
	env->comm_port = slurmctld_comm_addr.port;
	env->batch_flag = 0;
	if (opt.job_name)
		env->job_name = opt.job_name;
	if (job) {
		uint16_t *tasks = NULL;
		slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_TASKS,
				   &tasks);

		env->select_jobinfo = job->select_jobinfo;
		env->nodelist = job->nodelist;
		env->partition = job->partition;
		/* If we didn't get the allocation don't overwrite the
		 * previous info.
		 */
		if (got_alloc)
			env->nhosts = job->nhosts;
		env->ntasks = job->ntasks;
		env->task_count = _uint16_array_to_str(job->nhosts, tasks);
		env->jobid = job->jobid;
		env->stepid = job->stepid;
		env->account = job->account;
		env->qos = job->qos;
		env->resv_name = job->resv_name;
	}
	if (opt.pty && (set_winsize(job) < 0)) {
		error("Not using a pseudo-terminal, disregarding --pty option");
		opt.pty = false;
	}
	if (opt.pty) {
		struct termios term;
		int fd = STDIN_FILENO;

		/* Save terminal settings for restore */
		tcgetattr(fd, &termdefaults);
		tcgetattr(fd, &term);
		/* Set raw mode on local tty */
		cfmakeraw(&term);
		/* Re-enable output processing so that debug() and
		 * error() work properly. */
		term.c_oflag |= OPOST;
		tcsetattr(fd, TCSANOW, &term);
		atexit(&_pty_restore);

		block_sigwinch();
		pty_thread_create(job);
		env->pty_port = job->pty_port;
		env->ws_col   = job->ws_col;
		env->ws_row   = job->ws_row;
	}
	setup_env(env, opt.preserve_env);
	xfree(env->task_count);
	xfree(env);
	_set_node_alias();

	memset(&step_callbacks, 0, sizeof(step_callbacks));
	step_callbacks.step_signal   = launch_g_fwd_signal;

	/* re_launch: */
relaunch:
	pre_launch_srun_job(job, 0, 1);

	launch_common_set_stdio_fds(job, &cio_fds);

	if (!launch_g_step_launch(job, &cio_fds, &global_rc, &step_callbacks)) {
		if (launch_g_step_wait(job, got_alloc) == -1)
			goto relaunch;
	}

	fini_srun(job, got_alloc, &global_rc, 0);

	return (int)global_rc;
}
Example #10
int
main (int argc, char *argv[])
{
	slurm_addr_t *cli;
	slurm_addr_t *self;
	slurm_msg_t *msg;
	slurmd_job_t *job;
	int ngids;
	gid_t *gids;
	int rc = 0;

	if (process_cmdline (argc, argv) < 0)
		fatal ("Error in slurmstepd command line");

	xsignal_block(slurmstepd_blocked_signals);
	conf = xmalloc(sizeof(*conf));
	conf->argv = &argv;
	conf->argc = &argc;
	init_setproctitle(argc, argv);
	if (slurm_select_init(1) != SLURM_SUCCESS)
		fatal("failed to initialize node selection plugin");

	/* Receive job parameters from the slurmd */
	_init_from_slurmd(STDIN_FILENO, argv, &cli, &self, &msg,
			  &ngids, &gids);

	/* Fancy way of closing stdin that keeps STDIN_FILENO from being
	 * allocated to any random file.  The slurmd already opened /dev/null
	 * on STDERR_FILENO for us. */
	dup2(STDERR_FILENO, STDIN_FILENO);

	/* Create the slurmd_job_t, mostly from info in a
	   launch_tasks_request_msg_t or a batch_job_launch_msg_t */
	if (!(job = _step_setup(cli, self, msg))) {
		_send_fail_to_slurmd(STDOUT_FILENO);
		rc = SLURM_FAILURE;
		goto ending;
	}
	job->ngids = ngids;
	job->gids = gids;

	/* fork handlers cause mutexes on some global data structures
	   to be re-initialized after the fork. */
	list_install_fork_handlers();
	slurm_conf_install_fork_handlers();

	/* sets job->msg_handle and job->msgid */
	if (msg_thr_create(job) == SLURM_ERROR) {
		_send_fail_to_slurmd(STDOUT_FILENO);
		rc = SLURM_FAILURE;
		goto ending;
	}

	_send_ok_to_slurmd(STDOUT_FILENO);

	/* Fancy way of closing stdout that keeps STDOUT_FILENO from being
	 * allocated to any random file.  The slurmd already opened /dev/null
	 * on STDERR_FILENO for us. */
	dup2(STDERR_FILENO, STDOUT_FILENO);

	/* This does most of the stdio setup, then launches all the tasks,
	   and blocks until the step is complete */
	rc = job_manager(job);

	/* signal the message thread to shutdown, and wait for it */
	eio_signal_shutdown(job->msg_handle);
	pthread_join(job->msgid, NULL);

	if (job->batch)
		batch_finish(job, rc); /* sends batch complete message */

ending:
#ifdef MEMORY_LEAK_DEBUG
	_step_cleanup(job, msg, rc);

	xfree(cli);
	xfree(self);
	xfree(conf->hostname);
	xfree(conf->block_map);
	xfree(conf->block_map_inv);
	xfree(conf->spooldir);
	xfree(conf->node_name);
	xfree(conf->node_topo_addr);
	xfree(conf->node_topo_pattern);
	xfree(conf->logfile);
	xfree(conf);
#endif
	info("done with job");
	return rc;
}
Example #11
extern int select_g_select_nodeinfo_unpack(dynamic_plugin_data_t **nodeinfo,
					   Buf buffer,
					   uint16_t protocol_version)
{
	dynamic_plugin_data_t *nodeinfo_ptr = NULL;

	if (slurm_select_init(0) < 0)
		return SLURM_ERROR;

	nodeinfo_ptr = xmalloc(sizeof(dynamic_plugin_data_t));
	*nodeinfo = nodeinfo_ptr;

	if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
		int i;
		uint32_t plugin_id;
		safe_unpack32(&plugin_id, buffer);
		for (i = 0; i < select_context_cnt; i++) {
			if (*(ops[i].plugin_id) == plugin_id) {
				nodeinfo_ptr->plugin_id = i;
				break;
			}
		}
		if (i >= select_context_cnt) {
			/*
			 * cons_res and cons_tres have equivalent pack logic,
			 * so we can switch without a cold-start
			 */
			bool retry = false;
			if (plugin_id == SELECT_PLUGIN_CONS_RES) {
				plugin_id = SELECT_PLUGIN_CONS_TRES;
				retry = true;
			} else if (plugin_id == SELECT_PLUGIN_CONS_TRES) {
				plugin_id = SELECT_PLUGIN_CONS_RES;
				retry = true;
			} else if (plugin_id == SELECT_PLUGIN_CRAY_CONS_RES) {
				plugin_id = SELECT_PLUGIN_CRAY_CONS_TRES;
				retry = true;
			} else if (plugin_id == SELECT_PLUGIN_CRAY_CONS_TRES) {
				plugin_id = SELECT_PLUGIN_CRAY_CONS_RES;
				retry = true;
			}
			if (retry) {
				for (i = 0; i < select_context_cnt; i++) {
					if (*(ops[i].plugin_id) == plugin_id) {
						nodeinfo_ptr->plugin_id = i;
						break;
					}
				}
			}
		}
		if (i >= select_context_cnt) {
			error("%s: select plugin %s not found", __func__,
			      _plugin_id2name(plugin_id));
			goto unpack_error;
		}
	} else {
		nodeinfo_ptr->plugin_id = select_context_default;
		error("%s: protocol_version %hu not supported", __func__,
		      protocol_version);
		goto unpack_error;
	}

	if ((*(ops[nodeinfo_ptr->plugin_id].nodeinfo_unpack))
	   ((select_nodeinfo_t **)&nodeinfo_ptr->data, buffer,
	    protocol_version) != SLURM_SUCCESS)
		goto unpack_error;

	return SLURM_SUCCESS;

unpack_error:
	select_g_select_nodeinfo_free(nodeinfo_ptr);
	*nodeinfo = NULL;
	error("%s: unpack error", __func__);
	return SLURM_ERROR;
}
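
The cons_res/cons_tres fallback above can be restated as a small helper. This is a sketch of the mapping only, not an API that exists in Slurm:

static uint32_t _equiv_plugin_id(uint32_t plugin_id)
{
	/* Plugins with equivalent pack logic, per the comment above */
	switch (plugin_id) {
	case SELECT_PLUGIN_CONS_RES:
		return SELECT_PLUGIN_CONS_TRES;
	case SELECT_PLUGIN_CONS_TRES:
		return SELECT_PLUGIN_CONS_RES;
	case SELECT_PLUGIN_CRAY_CONS_RES:
		return SELECT_PLUGIN_CRAY_CONS_TRES;
	case SELECT_PLUGIN_CRAY_CONS_TRES:
		return SELECT_PLUGIN_CRAY_CONS_RES;
	default:
		return plugin_id;	/* no equivalent plugin */
	}
}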
Example #12
/* Get this plugin's sequence number in Slurm's internal tables */
extern int select_get_plugin_id_pos(uint32_t plugin_id)
{
	int i;
	static bool cray_other_cons_res = false;

	if (slurm_select_init(0) < 0)
		return SLURM_ERROR;
again:
	for (i = 0; i < select_context_cnt; i++) {
		if (*(ops[i].plugin_id) == plugin_id)
			break;
	}
	if (i >= select_context_cnt) {
		/*
		 * Put on the extra Cray select plugins that do not get
		 * generated automatically.
		 */
		if (!cray_other_cons_res &&
		    ((plugin_id == SELECT_PLUGIN_CRAY_CONS_RES)  ||
		     (plugin_id == SELECT_PLUGIN_CRAY_CONS_TRES) ||
		     (plugin_id == SELECT_PLUGIN_CRAY_LINEAR))) {
			char *type = "select", *name = "select/cray";
			uint16_t save_params = slurm_get_select_type_param();
			uint16_t params[2];
			int cray_plugin_id[2], cray_offset;

			cray_other_cons_res = true;

			if (plugin_id == SELECT_PLUGIN_CRAY_LINEAR) {
				params[0] = save_params & ~CR_OTHER_CONS_RES;
				cray_plugin_id[0] = SELECT_PLUGIN_CRAY_CONS_RES;
				params[1] = save_params & ~CR_OTHER_CONS_TRES;
				cray_plugin_id[1] = SELECT_PLUGIN_CRAY_CONS_TRES;
			} else if (plugin_id == SELECT_PLUGIN_CRAY_CONS_RES) {
				params[0] = save_params | CR_OTHER_CONS_RES;
				cray_plugin_id[0] = SELECT_PLUGIN_CRAY_LINEAR;
				params[1] = save_params & ~CR_OTHER_CONS_RES;
				cray_plugin_id[1] = SELECT_PLUGIN_CRAY_CONS_TRES;
			} else {	/* SELECT_PLUGIN_CRAY_CONS_TRES */
				params[0] = save_params | CR_OTHER_CONS_TRES;
				cray_plugin_id[0] = SELECT_PLUGIN_CRAY_LINEAR;
				params[1] = save_params & ~CR_OTHER_CONS_RES;
				cray_plugin_id[1] = SELECT_PLUGIN_CRAY_CONS_RES;
			}

			for (cray_offset = 0; cray_offset < 2; cray_offset++) {
				for (i = 0; i < select_context_cnt; i++) {
					if (*(ops[i].plugin_id) ==
					    cray_plugin_id[cray_offset])
						break;
				}
				if (i < select_context_cnt)
					break;	/* Found match */
			}
			if (i >= select_context_cnt)
				goto end_it;	/* No match */

			slurm_mutex_lock(&select_context_lock);
			slurm_set_select_type_param(params[cray_offset]);
			plugin_context_destroy(select_context[i]);
			select_context[i] =
				plugin_context_create(type, name,
						      (void **)&ops[i],
						      node_select_syms,
						      sizeof(node_select_syms));
			slurm_set_select_type_param(save_params);
			slurm_mutex_unlock(&select_context_lock);
			goto again;
		}

	end_it:
		return SLURM_ERROR;
	}
	return i;
}
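
A hedged usage sketch: callers translate a wire-format plugin_id into a table index before dispatching through ops[]. Note that the return value is overloaded: SLURM_ERROR (-1) on failure, or an index >= 0 on success.

	/* Sketch: plugin_id was read off the wire, e.g. via safe_unpack32() */
	int pos = select_get_plugin_id_pos(plugin_id);
	if (pos == SLURM_ERROR)
		return SLURM_ERROR;	/* unknown plugin_id */
	/* ops[pos] is now a valid dispatch table entry */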
Example #13
int
main (int argc, char **argv)
{
	slurm_addr_t *cli;
	slurm_addr_t *self;
	slurm_msg_t *msg;
	stepd_step_rec_t *job;
	int ngids;
	gid_t *gids;
	int rc = 0;
	char *launch_params;

	if (_process_cmdline (argc, argv) < 0)
		fatal ("Error in slurmstepd command line");

	xsignal_block(slurmstepd_blocked_signals);
	conf = xmalloc(sizeof(*conf));
	conf->argv = &argv;
	conf->argc = &argc;
	init_setproctitle(argc, argv);
	if (slurm_select_init(1) != SLURM_SUCCESS)
		fatal("failed to initialize node selection plugin");
	if (slurm_auth_init(NULL) != SLURM_SUCCESS)
		fatal("failed to initialize authentication plugin");

	/* Receive job parameters from the slurmd */
	_init_from_slurmd(STDIN_FILENO, argv, &cli, &self, &msg,
			  &ngids, &gids);

	/* Create the stepd_step_rec_t, mostly from info in a
	 * launch_tasks_request_msg_t or a batch_job_launch_msg_t */
	if (!(job = _step_setup(cli, self, msg))) {
		_send_fail_to_slurmd(STDOUT_FILENO);
		rc = SLURM_FAILURE;
		goto ending;
	}
	job->ngids = ngids;
	job->gids = gids;

	/* fork handlers cause mutexes on some global data structures
	 * to be re-initialized after the fork. */
	list_install_fork_handlers();
	slurm_conf_install_fork_handlers();

	/* sets job->msg_handle and job->msgid */
	if (msg_thr_create(job) == SLURM_ERROR) {
		_send_fail_to_slurmd(STDOUT_FILENO);
		rc = SLURM_FAILURE;
		goto ending;
	}

	_send_ok_to_slurmd(STDOUT_FILENO);
	_got_ack_from_slurmd(STDIN_FILENO);

	/* Fancy way of closing stdin that keeps STDIN_FILENO from being
	 * allocated to any random file.  The slurmd already opened /dev/null
	 * on STDERR_FILENO for us. */
	dup2(STDERR_FILENO, STDIN_FILENO);

	/* Fancy way of closing stdout that keeps STDOUT_FILENO from being
	 * allocated to any random file.  The slurmd already opened /dev/null
	 * on STDERR_FILENO for us. */
	dup2(STDERR_FILENO, STDOUT_FILENO);

	/* slurmstepd is the only daemon that should survive upgrade. If it
	 * had been swapped out before upgrade happened it could easily lead
	 * to SIGBUS at any time after upgrade. Avoid that by locking it
	 * in-memory. */
	launch_params = slurm_get_launch_params();
	if (launch_params && strstr(launch_params, "slurmstepd_memlock")) {
#ifdef _POSIX_MEMLOCK
		int flags = MCL_CURRENT;
		if (strstr(launch_params, "slurmstepd_memlock_all"))
			flags |= MCL_FUTURE;
		if (mlockall(flags) < 0)
			info("failed to mlock() slurmstepd pages: %m");
		else
			debug("slurmstepd locked in memory");
#else
		info("mlockall() system call does not appear to be available");
#endif
	}
	xfree(launch_params);

	/* This does most of the stdio setup, then launches all the tasks,
	 * and blocks until the step is complete */
	rc = job_manager(job);

	return stepd_cleanup(msg, job, cli, self, rc, 0);
ending:
	return stepd_cleanup(msg, job, cli, self, rc, 1);
}