Example #1
0
static void *_pty_thread(void *arg)
{
	int fd = -1;
	srun_job_t *job = (srun_job_t *) arg;
	slurm_addr_t client_addr;

	xsignal_unblock(pty_sigarray);
	xsignal(SIGWINCH, _handle_sigwinch);

	if ((fd = slurm_accept_msg_conn(job->pty_fd, &client_addr)) < 0) {
		error("pty: accept failure: %m");
		return NULL;
	}

	while (job->state <= SRUN_JOB_RUNNING) {
		debug2("waiting for SIGWINCH");
		poll(NULL, 0, -1);
		if (winch) {
			set_winsize(job);
			_notify_winsize_change(fd, job);
		}
		winch = 0;
	}
	return NULL;
}
Example #2
0
extern void slurm_persist_conn_recv_server_init(void)
{
	int sigarray[] = {SIGUSR1, 0};

	shutdown_time = 0;

	(void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
	(void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);

	/* Prepare to catch SIGUSR1 to interrupt accept().
	 * This signal is generated by the slurmdbd signal
	 * handler thread upon receipt of SIGABRT, SIGINT,
	 * or SIGTERM. That thread does all processing of
	 * all signals. */
	xsignal(SIGUSR1, _sig_handler);
	xsignal_unblock(sigarray);
}
Example #3
0
extern int launch_common_create_job_step(srun_job_t *job, bool use_all_cpus,
					 void (*signal_function)(int),
					 sig_atomic_t *destroy_job)
{
	int i, rc;
	unsigned long step_wait = 0, my_sleep = 0;
	time_t begin_time;
	uint16_t base_dist;

	if (!job) {
		error("launch_common_create_job_step: no job given");
		return SLURM_ERROR;
	}

	slurm_step_ctx_params_t_init(&job->ctx_params);
	job->ctx_params.job_id = job->jobid;
	job->ctx_params.uid = opt.uid;

	/* Validate minimum and maximum node counts */
	if (opt.min_nodes && opt.max_nodes &&
	    (opt.min_nodes > opt.max_nodes)) {
		error ("Minimum node count > maximum node count (%d > %d)",
		       opt.min_nodes, opt.max_nodes);
		return SLURM_ERROR;
	}
#if !defined HAVE_FRONT_END || (defined HAVE_BGQ)
//#if !defined HAVE_FRONT_END || (defined HAVE_BGQ && defined HAVE_BG_FILES)
	if (opt.min_nodes && (opt.min_nodes > job->nhosts)) {
		error ("Minimum node count > allocated node count (%d > %d)",
		       opt.min_nodes, job->nhosts);
		return SLURM_ERROR;
	}
#endif
	job->ctx_params.min_nodes = job->nhosts;
	if (opt.min_nodes && (opt.min_nodes < job->ctx_params.min_nodes))
		job->ctx_params.min_nodes = opt.min_nodes;
	job->ctx_params.max_nodes = job->nhosts;
	if (opt.max_nodes && (opt.max_nodes < job->ctx_params.max_nodes))
		job->ctx_params.max_nodes = opt.max_nodes;

	if (!opt.ntasks_set && (opt.ntasks_per_node != NO_VAL))
		job->ntasks = opt.ntasks = job->nhosts * opt.ntasks_per_node;
	job->ctx_params.task_count = opt.ntasks;

	if (opt.mem_per_cpu != NO_VAL)
		job->ctx_params.pn_min_memory = opt.mem_per_cpu | MEM_PER_CPU;
	else if (opt.pn_min_memory != NO_VAL)
		job->ctx_params.pn_min_memory = opt.pn_min_memory;
	if (opt.gres)
		job->ctx_params.gres = opt.gres;
	else
		job->ctx_params.gres = getenv("SLURM_STEP_GRES");

	if (opt.overcommit) {
		if (use_all_cpus)	/* job allocation created by srun */
			job->ctx_params.cpu_count = job->cpu_count;
		else
			job->ctx_params.cpu_count = job->ctx_params.min_nodes;
	} else if (opt.cpus_set) {
		job->ctx_params.cpu_count = opt.ntasks * opt.cpus_per_task;
	} else if (opt.ntasks_set) {
		job->ctx_params.cpu_count = opt.ntasks;
	} else if (use_all_cpus) {	/* job allocation created by srun */
		job->ctx_params.cpu_count = job->cpu_count;
	} else {
		job->ctx_params.cpu_count = opt.ntasks;
	}

	job->ctx_params.cpu_freq_min = opt.cpu_freq_min;
	job->ctx_params.cpu_freq_max = opt.cpu_freq_max;
	job->ctx_params.cpu_freq_gov = opt.cpu_freq_gov;
	job->ctx_params.relative = (uint16_t)opt.relative;
	job->ctx_params.ckpt_interval = (uint16_t)opt.ckpt_interval;
	job->ctx_params.ckpt_dir = opt.ckpt_dir;
	job->ctx_params.exclusive = (uint16_t)opt.exclusive;
	if (opt.immediate == 1)
		job->ctx_params.immediate = (uint16_t)opt.immediate;
	if (opt.time_limit != NO_VAL)
		job->ctx_params.time_limit = (uint32_t)opt.time_limit;
	job->ctx_params.verbose_level = (uint16_t)_verbose;
	if (opt.resv_port_cnt != NO_VAL)
		job->ctx_params.resv_port_cnt = (uint16_t) opt.resv_port_cnt;
	else {
#if defined(HAVE_NATIVE_CRAY)
		/*
		 * On Cray systems default to reserving one port, or one
		 * more than the number of multi prog commands, for Cray PMI
		 */
		job->ctx_params.resv_port_cnt = (opt.multi_prog ?
						 opt.multi_prog_cmds + 1 : 1);
#endif
	}

	switch (opt.distribution & SLURM_DIST_STATE_BASE) {
	case SLURM_DIST_BLOCK:
	case SLURM_DIST_ARBITRARY:
	case SLURM_DIST_CYCLIC:
	case SLURM_DIST_CYCLIC_CYCLIC:
	case SLURM_DIST_CYCLIC_BLOCK:
	case SLURM_DIST_BLOCK_CYCLIC:
	case SLURM_DIST_BLOCK_BLOCK:
	case SLURM_DIST_CYCLIC_CFULL:
	case SLURM_DIST_BLOCK_CFULL:
		job->ctx_params.task_dist = opt.distribution;
		if (opt.ntasks_per_node != NO_VAL)
			job->ctx_params.plane_size = opt.ntasks_per_node;
		break;
	case SLURM_DIST_PLANE:
		job->ctx_params.task_dist = SLURM_DIST_PLANE;
		job->ctx_params.plane_size = opt.plane_size;
		break;
	default:
		base_dist = (job->ctx_params.task_count <=
			     job->ctx_params.min_nodes)
			     ? SLURM_DIST_CYCLIC : SLURM_DIST_BLOCK;
		opt.distribution &= SLURM_DIST_STATE_FLAGS;
		opt.distribution |= base_dist;
		job->ctx_params.task_dist = opt.distribution;
		if (opt.ntasks_per_node != NO_VAL)
			job->ctx_params.plane_size = opt.ntasks_per_node;
		break;

	}
	job->ctx_params.overcommit = opt.overcommit ? 1 : 0;

	job->ctx_params.node_list = opt.nodelist;

	job->ctx_params.network = opt.network;
	job->ctx_params.no_kill = opt.no_kill;
	if (opt.job_name_set_cmd && opt.job_name)
		job->ctx_params.name = opt.job_name;
	else
		job->ctx_params.name = opt.cmd_name;
	job->ctx_params.features = opt.constraints;

	debug("requesting job %u, user %u, nodes %u including (%s)",
	      job->ctx_params.job_id, job->ctx_params.uid,
	      job->ctx_params.min_nodes, job->ctx_params.node_list);
	debug("cpus %u, tasks %u, name %s, relative %u",
	      job->ctx_params.cpu_count, job->ctx_params.task_count,
	      job->ctx_params.name, job->ctx_params.relative);
	begin_time = time(NULL);

	for (i=0; (!(*destroy_job)); i++) {
		bool blocking_step_create = true;
		if (opt.no_alloc) {
			job->step_ctx = slurm_step_ctx_create_no_alloc(
				&job->ctx_params, job->stepid);
		} else if (opt.immediate) {
			job->step_ctx = slurm_step_ctx_create(
				&job->ctx_params);
		} else {
			/* Wait 60 to 70 seconds for response */
			step_wait = (getpid() % 10) * 1000 + 60000;
			job->step_ctx = slurm_step_ctx_create_timeout(
						&job->ctx_params, step_wait);
		}
		if (job->step_ctx != NULL) {
			if (i > 0)
				info("Job step created");

			break;
		}
		rc = slurm_get_errno();

		if (((opt.immediate != 0) &&
		     ((opt.immediate == 1) ||
		      (difftime(time(NULL), begin_time) > opt.immediate))) ||
		    ((rc != ESLURM_NODES_BUSY) && (rc != ESLURM_PORTS_BUSY) &&
		     (rc != ESLURM_PROLOG_RUNNING) &&
		     (rc != SLURM_PROTOCOL_SOCKET_IMPL_TIMEOUT) &&
		     (rc != ESLURM_INTERCONNECT_BUSY) &&
		     (rc != ESLURM_DISABLED))) {
			error ("Unable to create job step: %m");
			return SLURM_ERROR;
		}
		if (rc == ESLURM_DISABLED)	/* job suspended */
			blocking_step_create = false;

		if (i == 0) {
			if (rc == ESLURM_PROLOG_RUNNING) {
				verbose("Resources allocated for job %u and "
					"being configured, please wait",
					job->ctx_params.job_id);
			} else {
				info("Job step creation temporarily disabled, "
				     "retrying");
			}
			xsignal_unblock(sig_array);
			for (i = 0; sig_array[i]; i++)
				xsignal(sig_array[i], signal_function);
			if (!blocking_step_create)
				my_sleep = (getpid() % 1000) * 100 + 100000;
		} else {
			verbose("Job step creation still disabled, retrying");
			if (!blocking_step_create)
				my_sleep *= 2;
		}
		if (!blocking_step_create) {
			/* sleep 0.1 to 29 secs with exponential back-off */
			my_sleep = MIN(my_sleep, 29000000);
			usleep(my_sleep);
		}
		if (*destroy_job) {
			/* cancelled by signal */
			break;
		}
	}
	if (i > 0) {
		xsignal_block(sig_array);
		if (*destroy_job) {
			info("Cancelled pending job step");
			return SLURM_ERROR;
		}
	}

	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_STEPID, &job->stepid);
	/*  Number of hosts in job may not have been initialized yet if
	 *    --jobid was used or only SLURM_JOB_ID was set in user env.
	 *    Reset the value here just in case.
	 */
	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_NUM_HOSTS,
			   &job->nhosts);

	/*
	 * Recreate filenames which may depend upon step id
	 */
	job_update_io_fnames(job);

	return SLURM_SUCCESS;
}
Example #4
0
/* _background_rpc_mgr - Read and process incoming RPCs to the background
 *	controller (that's us) */
static void *_background_rpc_mgr(void *no_data)
{
	slurm_fd_t newsockfd;
	slurm_fd_t sockfd;
	slurm_addr_t cli_addr;
	slurm_msg_t *msg = NULL;
	int error_code;
	char* node_addr = NULL;

	/* Read configuration only */
	slurmctld_lock_t config_read_lock = {
		READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
	int sigarray[] = {SIGUSR1, 0};

	(void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
	(void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);
	debug3("_background_rpc_mgr pid = %lu", (unsigned long) getpid());

	/* initialize port for RPCs */
	lock_slurmctld(config_read_lock);

	/* set node_addr to bind to (NULL means any) */
	if ((strcmp(slurmctld_conf.backup_controller,
		    slurmctld_conf.backup_addr) != 0)) {
		node_addr = slurmctld_conf.backup_addr ;
	}

	if ((sockfd =
	     slurm_init_msg_engine_addrname_port(node_addr,
						 slurmctld_conf.
						 slurmctld_port))
	    == SLURM_SOCKET_ERROR)
		fatal("slurm_init_msg_engine_addrname_port error %m");
	unlock_slurmctld(config_read_lock);

	/* Prepare to catch SIGUSR1 to interrupt accept().
	 * This signal is generated by the slurmctld signal
	 * handler thread upon receipt of SIGABRT, SIGINT,
	 * or SIGTERM. That thread does all processing of
	 * all signals. */
	xsignal(SIGUSR1, _sig_handler);
	xsignal_unblock(sigarray);

	/*
	 * Process incoming RPCs indefinitely
	 */
	while (slurmctld_config.shutdown_time == 0) {
		/* accept needed for stream implementation
		 * is a no-op in message implementation that just passes
		 * sockfd to newsockfd */
		if ((newsockfd = slurm_accept_msg_conn(sockfd, &cli_addr))
		    == SLURM_SOCKET_ERROR) {
			if (errno != EINTR)
				error("slurm_accept_msg_conn: %m");
			continue;
		}

		msg = xmalloc(sizeof(slurm_msg_t));
		slurm_msg_t_init(msg);
		if (slurm_receive_msg(newsockfd, msg, 0) != 0)
			error("slurm_receive_msg: %m");

		error_code = _background_process_msg(msg);
		if ((error_code == SLURM_SUCCESS)			&&
		    (msg->msg_type == REQUEST_SHUTDOWN_IMMEDIATE)	&&
		    (slurmctld_config.shutdown_time == 0))
			slurmctld_config.shutdown_time = time(NULL);

		slurm_free_msg_data(msg->msg_type, msg->data);
		slurm_free_msg(msg);

		slurm_close(newsockfd);	/* close new socket */
	}

	debug3("_background_rpc_mgr shutting down");
	slurm_close(sockfd);	/* close the main socket */
	pthread_exit((void *) 0);
	return NULL;
}
Example #5
0
extern int
create_job_step(srun_job_t *job, bool use_all_cpus)
{
	int i, rc;
	unsigned long my_sleep = 0;
	time_t begin_time;

	slurm_step_ctx_params_t_init(&job->ctx_params);
	job->ctx_params.job_id = job->jobid;
	job->ctx_params.uid = opt.uid;

	/* set the jobid for totalview */
	totalview_jobid = NULL;
	xstrfmtcat(totalview_jobid, "%u", job->ctx_params.job_id);

	/* Validate minimum and maximum node counts */
	if (opt.min_nodes && opt.max_nodes &&
	    (opt.min_nodes > opt.max_nodes)) {
		error ("Minimum node count > maximum node count (%d > %d)",
		       opt.min_nodes, opt.max_nodes);
		return -1;
	}
#if !defined HAVE_FRONT_END || (defined HAVE_BGQ)
//#if !defined HAVE_FRONT_END || (defined HAVE_BGQ && defined HAVE_BG_FILES)
	if (opt.min_nodes && (opt.min_nodes > job->nhosts)) {
		error ("Minimum node count > allocated node count (%d > %d)",
		       opt.min_nodes, job->nhosts);
		return -1;
	}
#endif
	job->ctx_params.min_nodes = job->nhosts;
	if (opt.min_nodes && (opt.min_nodes < job->ctx_params.min_nodes))
		job->ctx_params.min_nodes = opt.min_nodes;
	job->ctx_params.max_nodes = job->nhosts;
	if (opt.max_nodes && (opt.max_nodes < job->ctx_params.max_nodes))
		job->ctx_params.max_nodes = opt.max_nodes;

	if (!opt.ntasks_set && (opt.ntasks_per_node != NO_VAL))
		job->ntasks = opt.ntasks = job->nhosts * opt.ntasks_per_node;
	job->ctx_params.task_count = opt.ntasks;

	if (opt.mem_per_cpu != NO_VAL)
		job->ctx_params.mem_per_cpu = opt.mem_per_cpu;
	job->ctx_params.gres = opt.gres;

	if (use_all_cpus)
		job->ctx_params.cpu_count = job->cpu_count;
	else if (opt.overcommit)
		job->ctx_params.cpu_count = job->ctx_params.min_nodes;
	else
		job->ctx_params.cpu_count = opt.ntasks*opt.cpus_per_task;

	job->ctx_params.relative = (uint16_t)opt.relative;
	job->ctx_params.ckpt_interval = (uint16_t)opt.ckpt_interval;
	job->ctx_params.ckpt_dir = opt.ckpt_dir;
	job->ctx_params.gres = opt.gres;
	job->ctx_params.exclusive = (uint16_t)opt.exclusive;
	if (opt.immediate == 1)
		job->ctx_params.immediate = (uint16_t)opt.immediate;
	if (opt.time_limit != NO_VAL)
		job->ctx_params.time_limit = (uint32_t)opt.time_limit;
	job->ctx_params.verbose_level = (uint16_t)_verbose;
	if (opt.resv_port_cnt != NO_VAL)
		job->ctx_params.resv_port_cnt = (uint16_t) opt.resv_port_cnt;

	switch (opt.distribution) {
	case SLURM_DIST_BLOCK:
	case SLURM_DIST_ARBITRARY:
	case SLURM_DIST_CYCLIC:
	case SLURM_DIST_CYCLIC_CYCLIC:
	case SLURM_DIST_CYCLIC_BLOCK:
	case SLURM_DIST_BLOCK_CYCLIC:
	case SLURM_DIST_BLOCK_BLOCK:
		job->ctx_params.task_dist = opt.distribution;
		break;
	case SLURM_DIST_PLANE:
		job->ctx_params.task_dist = SLURM_DIST_PLANE;
		job->ctx_params.plane_size = opt.plane_size;
		break;
	default:
		job->ctx_params.task_dist = (job->ctx_params.task_count <=
					     job->ctx_params.min_nodes)
			? SLURM_DIST_CYCLIC : SLURM_DIST_BLOCK;
		opt.distribution = job->ctx_params.task_dist;
		break;

	}
	job->ctx_params.overcommit = opt.overcommit ? 1 : 0;

	job->ctx_params.node_list = opt.nodelist;

	job->ctx_params.network = opt.network;
	job->ctx_params.no_kill = opt.no_kill;
	if (opt.job_name_set_cmd && opt.job_name)
		job->ctx_params.name = opt.job_name;
	else
		job->ctx_params.name = opt.cmd_name;

	debug("requesting job %u, user %u, nodes %u including (%s)",
	      job->ctx_params.job_id, job->ctx_params.uid,
	      job->ctx_params.min_nodes, job->ctx_params.node_list);
	debug("cpus %u, tasks %u, name %s, relative %u",
	      job->ctx_params.cpu_count, job->ctx_params.task_count,
	      job->ctx_params.name, job->ctx_params.relative);
	begin_time = time(NULL);

	for (i=0; (!destroy_job); i++) {
		if (opt.no_alloc) {
			job->step_ctx = slurm_step_ctx_create_no_alloc(
				&job->ctx_params, job->stepid);
		} else
			job->step_ctx = slurm_step_ctx_create(
				&job->ctx_params);
		if (job->step_ctx != NULL) {
			if (i > 0)
				info("Job step created");

			break;
		}
		rc = slurm_get_errno();

		if (((opt.immediate != 0) &&
		     ((opt.immediate == 1) ||
		      (difftime(time(NULL), begin_time) > opt.immediate))) ||
		    ((rc != ESLURM_NODES_BUSY) && (rc != ESLURM_PORTS_BUSY) &&
		     (rc != ESLURM_PROLOG_RUNNING) &&
		     (rc != SLURM_PROTOCOL_SOCKET_IMPL_TIMEOUT) &&
		     (rc != ESLURM_DISABLED))) {
			error ("Unable to create job step: %m");
			return -1;
		}

		if (i == 0) {
			if (rc == ESLURM_PROLOG_RUNNING) {
				verbose("Resources allocated for job %u and "
					"being configured, please wait",
					job->ctx_params.job_id);
			} else {
				info("Job step creation temporarily disabled, "
				     "retrying");
			}
			xsignal_unblock(sig_array);
			for (i = 0; sig_array[i]; i++)
				xsignal(sig_array[i], _signal_while_allocating);

			my_sleep = (getpid() % 1000) * 100 + 100000;
		} else {
			verbose("Job step creation still disabled, retrying");
			my_sleep = MIN((my_sleep * 2), 29000000);
		}
		/* sleep 0.1 to 29 secs with exponential back-off */
		usleep(my_sleep);
		if (destroy_job) {
			/* cancelled by signal */
			break;
		}
	}
	if (i > 0) {
		xsignal_block(sig_array);
		if (destroy_job) {
			info("Cancelled pending job step");
			return -1;
		}
	}

	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_STEPID, &job->stepid);
	/*  Number of hosts in job may not have been initialized yet if
	 *    --jobid was used or only SLURM_JOB_ID was set in user env.
	 *    Reset the value here just in case.
	 */
	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_NUM_HOSTS,
			   &job->nhosts);

	/*
	 * Recreate filenames which may depend upon step id
	 */
	job_update_io_fnames(job);

	return 0;
}
Example #6
0
resource_allocation_response_msg_t *
allocate_nodes(void)
{
	resource_allocation_response_msg_t *resp = NULL;
	job_desc_msg_t *j = job_desc_msg_create_from_opts();
	slurm_allocation_callbacks_t callbacks;
	int i;

	if (!j)
		return NULL;

	/* Do not re-use existing job id when submitting new job
	 * from within a running job */
	if ((j->job_id != NO_VAL) && !opt.jobid_set) {
		info("WARNING: Creating SLURM job allocation from within "
		     "another allocation");
		info("WARNING: You are attempting to initiate a second job");
		if (!opt.jobid_set)	/* Let slurmctld set jobid */
			j->job_id = NO_VAL;
	}
	callbacks.ping = _ping_handler;
	callbacks.timeout = _timeout_handler;
	callbacks.job_complete = _job_complete_handler;
	callbacks.user_msg = _user_msg_handler;
	callbacks.node_fail = _node_fail_handler;

	/* create message thread to handle pings and such from slurmctld */
	msg_thr = slurm_allocation_msg_thr_create(&j->other_port, &callbacks);

	/* NOTE: Do not process signals in separate pthread. The signal will
	 * cause slurm_allocate_resources_blocking() to exit immediately. */
	xsignal_unblock(sig_array);
	for (i = 0; sig_array[i]; i++)
		xsignal(sig_array[i], _signal_while_allocating);

	while (!resp) {
		resp = slurm_allocate_resources_blocking(j, opt.immediate,
							 _set_pending_job_id);
		if (destroy_job) {
			/* cancelled by signal */
			break;
		} else if (!resp && !_retry()) {
			break;
		}
	}

	if (resp && !destroy_job) {
		/*
		 * Allocation granted!
		 */
		pending_job_id = resp->job_id;
#ifdef HAVE_BG
		if (!_wait_bluegene_block_ready(resp)) {
			if(!destroy_job)
				error("Something is wrong with the "
				      "boot of the block.");
			goto relinquish;
		}
#else
		if (!_wait_nodes_ready(resp)) {
			if(!destroy_job)
				error("Something is wrong with the "
				      "boot of the nodes.");
			goto relinquish;
		}
#endif
	} else if (destroy_job) {
		goto relinquish;
	}

	xsignal_block(sig_array);

	job_desc_msg_destroy(j);

	return resp;

relinquish:

	slurm_free_resource_allocation_response_msg(resp);
	if (!destroy_job)
		slurm_complete_job(resp->job_id, 1);
	exit(error_exit);
	return NULL;
}
Example #7
0
resource_allocation_response_msg_t *
allocate_nodes(bool handle_signals)
{
	resource_allocation_response_msg_t *resp = NULL;
	job_desc_msg_t *j = job_desc_msg_create_from_opts();
	slurm_allocation_callbacks_t callbacks;
	int i;

	if (!j)
		return NULL;

	/* Do not re-use existing job id when submitting new job
	 * from within a running job */
	if ((j->job_id != NO_VAL) && !opt.jobid_set) {
		info("WARNING: Creating SLURM job allocation from within "
		     "another allocation");
		info("WARNING: You are attempting to initiate a second job");
		if (!opt.jobid_set)	/* Let slurmctld set jobid */
			j->job_id = NO_VAL;
	}
	callbacks.ping = _ping_handler;
	callbacks.timeout = _timeout_handler;
	callbacks.job_complete = _job_complete_handler;
	callbacks.job_suspend = NULL;
	callbacks.user_msg = _user_msg_handler;
	callbacks.node_fail = _node_fail_handler;

	/* create message thread to handle pings and such from slurmctld */
	msg_thr = slurm_allocation_msg_thr_create(&j->other_port, &callbacks);

	/* NOTE: Do not process signals in separate pthread. The signal will
	 * cause slurm_allocate_resources_blocking() to exit immediately. */
	if (handle_signals) {
		xsignal_unblock(sig_array);
		for (i = 0; sig_array[i]; i++)
			xsignal(sig_array[i], _signal_while_allocating);
	}

	while (!resp) {
		resp = slurm_allocate_resources_blocking(j, opt.immediate,
							 _set_pending_job_id);
		if (destroy_job) {
			/* cancelled by signal */
			break;
		} else if (!resp && !_retry()) {
			break;
		}
	}

	if (resp && !destroy_job) {
		/*
		 * Allocation granted!
		 */
		pending_job_id = resp->job_id;

		/*
		 * These values could be changed while the job was
		 * pending so overwrite the request with what was
		 * allocated so we don't have issues when we use them
		 * in the step creation.
		 */
		if (opt.pn_min_memory != NO_VAL)
			opt.pn_min_memory = (resp->pn_min_memory &
					     (~MEM_PER_CPU));
		else if (opt.mem_per_cpu != NO_VAL)
			opt.mem_per_cpu = (resp->pn_min_memory &
					   (~MEM_PER_CPU));
		/*
		 * FIXME: timelimit should probably also be updated
		 * here since it could also change.
		 */

#ifdef HAVE_BG
		uint32_t node_cnt = 0;
		select_g_select_jobinfo_get(resp->select_jobinfo,
					    SELECT_JOBDATA_NODE_CNT,
					    &node_cnt);
		if ((node_cnt == 0) || (node_cnt == NO_VAL)) {
			opt.min_nodes = node_cnt;
			opt.max_nodes = node_cnt;
		} /* else we just use the original request */

		if (!_wait_bluegene_block_ready(resp)) {
			if (!destroy_job)
				error("Something is wrong with the "
				      "boot of the block.");
			goto relinquish;
		}
#else
		opt.min_nodes = resp->node_cnt;
		opt.max_nodes = resp->node_cnt;

		if (!_wait_nodes_ready(resp)) {
			if (!destroy_job)
				error("Something is wrong with the "
				      "boot of the nodes.");
			goto relinquish;
		}
#endif
	} else if (destroy_job) {
		goto relinquish;
	}

	if (handle_signals)
		xsignal_block(sig_array);

	job_desc_msg_destroy(j);

	return resp;

relinquish:
	if (resp) {
		if (!destroy_job)
			slurm_complete_job(resp->job_id, 1);
		slurm_free_resource_allocation_response_msg(resp);
	}
	exit(error_exit);
	return NULL;
}
Example #8
0
/* Process incoming RPCs. Meant to execute as a pthread */
extern void *rpc_mgr(void *no_data)
{
	pthread_attr_t thread_attr_rpc_req;
	slurm_fd_t sockfd, newsockfd;
	int i, retry_cnt, sigarray[] = {SIGUSR1, 0};
	slurm_addr_t cli_addr;
	slurmdbd_conn_t *conn_arg = NULL;

	slurm_mutex_lock(&thread_count_lock);
	master_thread_id = pthread_self();
	slurm_mutex_unlock(&thread_count_lock);

	(void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
	(void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);

	/* threads to process individual RPC's are detached */
	slurm_attr_init(&thread_attr_rpc_req);
	if (pthread_attr_setdetachstate
	    (&thread_attr_rpc_req, PTHREAD_CREATE_DETACHED))
		fatal("pthread_attr_setdetachstate %m");

	/* initialize port for RPCs */
	if ((sockfd = slurm_init_msg_engine_port(get_dbd_port()))
	    == SLURM_SOCKET_ERROR)
		fatal("slurm_init_msg_engine_port error %m");

	/* Prepare to catch SIGUSR1 to interrupt accept().
	 * This signal is generated by the slurmdbd signal
	 * handler thread upon receipt of SIGABRT, SIGINT,
	 * or SIGTERM. That thread does all processing of
	 * all signals. */
	xsignal(SIGUSR1, _sig_handler);
	xsignal_unblock(sigarray);

	/*
	 * Process incoming RPCs until told to shutdown
	 */
	while ((i = _wait_for_server_thread()) >= 0) {
		/*
		 * accept needed for stream implementation is a no-op in
		 * message implementation that just passes sockfd to newsockfd
		 */
		if ((newsockfd = slurm_accept_msg_conn(sockfd,
						       &cli_addr)) ==
		    SLURM_SOCKET_ERROR) {
			_free_server_thread((pthread_t) 0);
			if (errno != EINTR)
				error("slurm_accept_msg_conn: %m");
			continue;
		}
		fd_set_nonblocking(newsockfd);

		conn_arg = xmalloc(sizeof(slurmdbd_conn_t));
		conn_arg->newsockfd = newsockfd;
		slurm_get_ip_str(&cli_addr, &conn_arg->orig_port,
				 conn_arg->ip, sizeof(conn_arg->ip));
		retry_cnt = 0;
		while (pthread_create(&slave_thread_id[i],
				      &thread_attr_rpc_req,
				      _service_connection,
				      (void *) conn_arg)) {
			if (retry_cnt > 0) {
				error("pthread_create failure, "
				      "aborting RPC: %m");
				close(newsockfd);
				break;
			}
			error("pthread_create failure: %m");
			retry_cnt++;
			usleep(1000);	/* retry in 1 msec */
		}
	}

	debug3("rpc_mgr shutting down");
	slurm_attr_destroy(&thread_attr_rpc_req);
	(void) slurm_shutdown_msg_engine(sockfd);
	_wait_for_thread_fini();
	pthread_exit((void *) 0);
	return NULL;
}
Example #9
0
/*
 * _thread_per_group_rpc - thread to issue an RPC for a group of nodes
 *                         sending message out to one and forwarding it to
 *                         others if necessary.
 * IN/OUT args - pointer to task_info_t, xfree'd on completion
 */
static void *_thread_per_group_rpc(void *args)
{
	int rc = SLURM_SUCCESS;
	slurm_msg_t msg;
	task_info_t *task_ptr = (task_info_t *) args;
	/* we cache some pointers from task_info_t because we need
	 * to xfree args before being finished with their use. xfree
	 * is required for timely termination of this pthread because
	 * xfree could lock it at the end, preventing a timely
	 * thread_exit */
	pthread_mutex_t *thread_mutex_ptr   = task_ptr->thread_mutex_ptr;
	pthread_cond_t  *thread_cond_ptr    = task_ptr->thread_cond_ptr;
	uint32_t        *threads_active_ptr = task_ptr->threads_active_ptr;
	thd_t           *thread_ptr         = task_ptr->thread_struct_ptr;
	state_t thread_state = DSH_NO_RESP;
	slurm_msg_type_t msg_type = task_ptr->msg_type;
	bool is_kill_msg, srun_agent;
	List ret_list = NULL;
	ListIterator itr;
	ret_data_info_t *ret_data_info = NULL;
	int found = 0;
	int sig_array[2] = {SIGUSR1, 0};
	/* Locks: Write job, write node */
	slurmctld_lock_t job_write_lock = {
		NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK };

	xassert(args != NULL);
	xsignal(SIGUSR1, _sig_handler);
	xsignal_unblock(sig_array);
	is_kill_msg = (	(msg_type == REQUEST_KILL_TIMELIMIT)	||
			(msg_type == REQUEST_TERMINATE_JOB) );
	srun_agent = (	(msg_type == SRUN_PING)			||
			(msg_type == SRUN_EXEC)			||
			(msg_type == SRUN_JOB_COMPLETE)		||
			(msg_type == SRUN_STEP_MISSING)		||
			(msg_type == SRUN_TIMEOUT)		||
			(msg_type == SRUN_USER_MSG)		||
			(msg_type == RESPONSE_RESOURCE_ALLOCATION) ||
			(msg_type == SRUN_NODE_FAIL) );

	thread_ptr->start_time = time(NULL);

	slurm_mutex_lock(thread_mutex_ptr);
	thread_ptr->state = DSH_ACTIVE;
	thread_ptr->end_time = thread_ptr->start_time + COMMAND_TIMEOUT;
	slurm_mutex_unlock(thread_mutex_ptr);

	/* send request message */
	slurm_msg_t_init(&msg);
	msg.msg_type = msg_type;
	msg.data     = task_ptr->msg_args_ptr;
#if 0
 	info("sending message type %u to %s", msg_type, thread_ptr->nodelist);
#endif
	if (task_ptr->get_reply) {
		if(thread_ptr->addr) {
			msg.address = *thread_ptr->addr;

			if(!(ret_list = slurm_send_addr_recv_msgs(
				     &msg, thread_ptr->nodelist, 0))) {
				error("_thread_per_group_rpc: "
				      "no ret_list given");
				goto cleanup;
			}


		} else {
			if(!(ret_list = slurm_send_recv_msgs(
				     thread_ptr->nodelist,
				     &msg, 0, true))) {
				error("_thread_per_group_rpc: "
				      "no ret_list given");
				goto cleanup;
			}
		}
	} else {
		if(thread_ptr->addr) {
			//info("got the address");
			msg.address = *thread_ptr->addr;
		} else {
			//info("no address given");
			if(slurm_conf_get_addr(thread_ptr->nodelist,
					       &msg.address) == SLURM_ERROR) {
				error("_thread_per_group_rpc: "
				      "can't find address for host %s, "
				      "check slurm.conf",
				      thread_ptr->nodelist);
				goto cleanup;
			}
		}
		//info("sending %u to %s", msg_type, thread_ptr->nodelist);
		if (slurm_send_only_node_msg(&msg) == SLURM_SUCCESS) {
			thread_state = DSH_DONE;
		} else {
			if (!srun_agent)
				_comm_err(thread_ptr->nodelist, msg_type);
		}
		goto cleanup;
	}

	//info("got %d messages back", list_count(ret_list));
	found = 0;
	itr = list_iterator_create(ret_list);
	while ((ret_data_info = list_next(itr)) != NULL) {
		rc = slurm_get_return_code(ret_data_info->type,
					   ret_data_info->data);
		/* SPECIAL CASE: Mark node as IDLE if job already
		   complete */
		if (is_kill_msg &&
		    (rc == ESLURMD_KILL_JOB_ALREADY_COMPLETE)) {
			kill_job_msg_t *kill_job;
			kill_job = (kill_job_msg_t *)
				task_ptr->msg_args_ptr;
			rc = SLURM_SUCCESS;
			lock_slurmctld(job_write_lock);
			if (job_epilog_complete(kill_job->job_id,
						ret_data_info->
						node_name,
						rc))
				run_scheduler = true;
			unlock_slurmctld(job_write_lock);
		}
		/* SPECIAL CASE: Kill non-startable batch job,
		 * Requeue the job on ESLURMD_PROLOG_FAILED */
		if ((msg_type == REQUEST_BATCH_JOB_LAUNCH) &&
		    (rc != SLURM_SUCCESS) && (rc != ESLURMD_PROLOG_FAILED) &&
		    (ret_data_info->type != RESPONSE_FORWARD_FAILED)) {
			batch_job_launch_msg_t *launch_msg_ptr =
				task_ptr->msg_args_ptr;
			uint32_t job_id = launch_msg_ptr->job_id;
			info("Killing non-startable batch job %u: %s",
			     job_id, slurm_strerror(rc));
			thread_state = DSH_DONE;
			ret_data_info->err = thread_state;
			lock_slurmctld(job_write_lock);
			job_complete(job_id, 0, false, false, _wif_status());
			unlock_slurmctld(job_write_lock);
			continue;
		}

		if (((msg_type == REQUEST_SIGNAL_TASKS) ||
		     (msg_type == REQUEST_TERMINATE_TASKS)) &&
		     (rc == ESRCH)) {
			/* process is already dead, not a real error */
			rc = SLURM_SUCCESS;
		}

		switch (rc) {
		case SLURM_SUCCESS:
			/* debug("agent processed RPC to node %s", */
			/*       ret_data_info->node_name); */
			thread_state = DSH_DONE;
			break;
		case SLURM_UNKNOWN_FORWARD_ADDR:
			error("We were unable to forward message to '%s'.  "
			      "Make sure the slurm.conf for each slurmd "
			      "contain all other nodes in your system.",
			      ret_data_info->node_name);
			thread_state = DSH_NO_RESP;
			break;
		case ESLURMD_EPILOG_FAILED:
			error("Epilog failure on host %s, "
			      "setting DOWN",
			      ret_data_info->node_name);

			thread_state = DSH_FAILED;
			break;
		case ESLURMD_PROLOG_FAILED:
			thread_state = DSH_FAILED;
			break;
		case ESLURM_INVALID_JOB_ID:
			/* Not indicative of a real error */
		case ESLURMD_JOB_NOTRUNNING:
			/* Not indicative of a real error */
			debug2("agent processed RPC to node %s: %s",
			       ret_data_info->node_name,
			       slurm_strerror(rc));

			thread_state = DSH_DONE;
			break;
		default:
			if (!srun_agent) {
				if (ret_data_info->err)
					errno = ret_data_info->err;
				else
					errno = rc;
				rc = _comm_err(ret_data_info->node_name,
					       msg_type);
			}
			if (srun_agent)
				thread_state = DSH_FAILED;
			else if(ret_data_info->type == RESPONSE_FORWARD_FAILED)
				/* check if a forward failed */
				thread_state = DSH_NO_RESP;
			else {	/* some will fail that don't mean anything went
				 * bad like a job term request on a job that is
				 * already finished, we will just exit on those
				 * cases */
				thread_state = DSH_DONE;
			}
		}
		ret_data_info->err = thread_state;
	}
	list_iterator_destroy(itr);

cleanup:
	xfree(args);

	/* handled at end of thread just in case resend is needed */
	destroy_forward(&msg.forward);
	slurm_mutex_lock(thread_mutex_ptr);
	thread_ptr->ret_list = ret_list;
	thread_ptr->state = thread_state;
	thread_ptr->end_time = (time_t) difftime(time(NULL),
						 thread_ptr->start_time);
	/* Signal completion so another thread can replace us */
	(*threads_active_ptr)--;
	pthread_cond_signal(thread_cond_ptr);
	slurm_mutex_unlock(thread_mutex_ptr);
	return (void *) NULL;
}
Example #10
0
static void *_agent(void *x)
{
	int cnt, rc;
	Buf buffer;
	struct timespec abs_time;
	static time_t fail_time = 0;
	int sigarray[] = {SIGUSR1, 0};
	slurmdbd_msg_t list_req;
	dbd_list_msg_t list_msg;

	list_req.msg_type = DBD_SEND_MULT_MSG;
	list_req.data = &list_msg;
	memset(&list_msg, 0, sizeof(dbd_list_msg_t));
	/* DEF_TIMERS; */

	/* Prepare to catch SIGUSR1 to interrupt pending
	 * I/O and terminate in a timely fashion. */
	xsignal(SIGUSR1, _sig_handler);
	xsignal_unblock(sigarray);

	while (*slurmdbd_conn->shutdown == 0) {
		/* START_TIMER; */
		slurm_mutex_lock(&slurmdbd_lock);
		if (halt_agent)
			slurm_cond_wait(&slurmdbd_cond, &slurmdbd_lock);

		if ((slurmdbd_conn->fd < 0) &&
		    (difftime(time(NULL), fail_time) >= 10)) {
			/* The connection to Slurm DBD is not open */
			_open_slurmdbd_conn(1);
			if (slurmdbd_conn->fd < 0)
				fail_time = time(NULL);
		}

		slurm_mutex_lock(&agent_lock);
		if (agent_list && slurmdbd_conn->fd)
			cnt = list_count(agent_list);
		else
			cnt = 0;
		if ((cnt == 0) || (slurmdbd_conn->fd < 0) ||
		    (fail_time && (difftime(time(NULL), fail_time) < 10))) {
			slurm_mutex_unlock(&slurmdbd_lock);
			abs_time.tv_sec  = time(NULL) + 10;
			abs_time.tv_nsec = 0;
			slurm_cond_timedwait(&agent_cond, &agent_lock,
					     &abs_time);
			slurm_mutex_unlock(&agent_lock);
			continue;
		} else if ((cnt > 0) && ((cnt % 100) == 0))
			info("slurmdbd: agent queue size %u", cnt);
		/* Leave item on the queue until processing complete */
		if (agent_list) {
			int handle_agent_count = 1000;
			if (cnt > handle_agent_count) {
				int agent_count = 0;
				ListIterator agent_itr =
					list_iterator_create(agent_list);
				list_msg.my_list = list_create(NULL);
				while ((buffer = list_next(agent_itr))) {
					list_enqueue(list_msg.my_list, buffer);
					agent_count++;
					if (agent_count > handle_agent_count)
						break;
				}
				list_iterator_destroy(agent_itr);
				buffer = pack_slurmdbd_msg(
					&list_req, SLURM_PROTOCOL_VERSION);
			} else if (cnt > 1) {
				list_msg.my_list = agent_list;
				buffer = pack_slurmdbd_msg(
					&list_req, SLURM_PROTOCOL_VERSION);
			} else
				buffer = (Buf) list_peek(agent_list);
		} else
			buffer = NULL;
		slurm_mutex_unlock(&agent_lock);
		if (buffer == NULL) {
			slurm_mutex_unlock(&slurmdbd_lock);

			slurm_mutex_lock(&assoc_cache_mutex);
			if (slurmdbd_conn->fd >= 0 && running_cache)
				slurm_cond_signal(&assoc_cache_cond);
			slurm_mutex_unlock(&assoc_cache_mutex);

			continue;
		}

		/* NOTE: agent_lock is clear here, so we can add more
		 * requests to the queue while waiting for this RPC to
		 * complete. */
		rc = slurm_persist_send_msg(slurmdbd_conn, buffer);
		if (rc != SLURM_SUCCESS) {
			if (*slurmdbd_conn->shutdown) {
				slurm_mutex_unlock(&slurmdbd_lock);
				break;
			}
			error("slurmdbd: Failure sending message: %d: %m", rc);
		} else if (list_msg.my_list) {
			rc = _handle_mult_rc_ret();
		} else {
			rc = _get_return_code();
			if (rc == EAGAIN) {
				if (*slurmdbd_conn->shutdown) {
					slurm_mutex_unlock(&slurmdbd_lock);
					break;
				}
				error("slurmdbd: Failure with "
				      "message need to resend: %d: %m", rc);
			}
		}
		slurm_mutex_unlock(&slurmdbd_lock);
		slurm_mutex_lock(&assoc_cache_mutex);
		if (slurmdbd_conn->fd >= 0 && running_cache)
			slurm_cond_signal(&assoc_cache_cond);
		slurm_mutex_unlock(&assoc_cache_mutex);

		slurm_mutex_lock(&agent_lock);
		if (agent_list && (rc == SLURM_SUCCESS)) {
			/*
			 * If we sent a mult_msg we just need to free buffer,
			 * we don't need to requeue, just mark list_msg.my_list
			 * as NULL as that is the sign we sent a mult_msg.
			 */
			if (list_msg.my_list) {
				if (list_msg.my_list != agent_list)
					FREE_NULL_LIST(list_msg.my_list);
				list_msg.my_list = NULL;
			} else
				buffer = (Buf) list_dequeue(agent_list);

			free_buf(buffer);
			fail_time = 0;
		} else {
			/* We need to free a mult_msg even on failure */
			if (list_msg.my_list) {
				if (list_msg.my_list != agent_list)
					FREE_NULL_LIST(list_msg.my_list);
				list_msg.my_list = NULL;
				free_buf(buffer);
			}

			fail_time = time(NULL);
		}
		slurm_mutex_unlock(&agent_lock);
		/* END_TIMER; */
		/* info("at the end with %s", TIME_STR); */
	}

	slurm_mutex_lock(&agent_lock);
	_save_dbd_state();
	FREE_NULL_LIST(agent_list);
	slurm_mutex_unlock(&agent_lock);
	return NULL;
}
Example #11
0
extern int launch_common_create_job_step(srun_job_t *job, bool use_all_cpus,
					 void (*signal_function)(int),
					 sig_atomic_t *destroy_job,
					 slurm_opt_t *opt_local)
{
	srun_opt_t *srun_opt = opt_local->srun_opt;
	int i, j, rc;
	unsigned long step_wait = 0;
	uint16_t base_dist, slurmctld_timeout;
	char *add_tres;
	xassert(srun_opt);

	if (!job) {
		error("launch_common_create_job_step: no job given");
		return SLURM_ERROR;
	}

	slurm_step_ctx_params_t_init(&job->ctx_params);
	job->ctx_params.job_id = job->jobid;
	job->ctx_params.step_id = job->stepid;
	job->ctx_params.uid = opt_local->uid;

	/* Validate minimum and maximum node counts */
	if (opt_local->min_nodes && opt_local->max_nodes &&
	    (opt_local->min_nodes > opt_local->max_nodes)) {
		error ("Minimum node count > maximum node count (%d > %d)",
		       opt_local->min_nodes, opt_local->max_nodes);
		return SLURM_ERROR;
	}
#if !defined HAVE_FRONT_END
	if (opt_local->min_nodes && (opt_local->min_nodes > job->nhosts)) {
		error ("Minimum node count > allocated node count (%d > %d)",
		       opt_local->min_nodes, job->nhosts);
		return SLURM_ERROR;
	}
#endif
	job->ctx_params.min_nodes = job->nhosts;
	if (opt_local->min_nodes &&
	    (opt_local->min_nodes < job->ctx_params.min_nodes))
		job->ctx_params.min_nodes = opt_local->min_nodes;
	job->ctx_params.max_nodes = job->nhosts;
	if (opt_local->max_nodes &&
	    (opt_local->max_nodes < job->ctx_params.max_nodes))
		job->ctx_params.max_nodes = opt_local->max_nodes;

	if (!opt_local->ntasks_set && (opt_local->ntasks_per_node != NO_VAL))
		job->ntasks = opt_local->ntasks = job->nhosts *
						  opt_local->ntasks_per_node;
	job->ctx_params.task_count = opt_local->ntasks;

	if (opt_local->mem_per_cpu != NO_VAL64)
		job->ctx_params.pn_min_memory = opt_local->mem_per_cpu |
						MEM_PER_CPU;
	else if (opt_local->pn_min_memory != NO_VAL64)
		job->ctx_params.pn_min_memory = opt_local->pn_min_memory;

	if (opt_local->overcommit) {
		if (use_all_cpus)	/* job allocation created by srun */
			job->ctx_params.cpu_count = job->cpu_count;
		else
			job->ctx_params.cpu_count = job->ctx_params.min_nodes;
	} else if (opt_local->cpus_set) {
		job->ctx_params.cpu_count = opt_local->ntasks *
					    opt_local->cpus_per_task;
	} else if (opt_local->ntasks_set) {
		job->ctx_params.cpu_count = opt_local->ntasks;
	} else if (use_all_cpus) {	/* job allocation created by srun */
		job->ctx_params.cpu_count = job->cpu_count;
	} else {
		job->ctx_params.cpu_count = opt_local->ntasks;
	}

	job->ctx_params.cpu_freq_min = opt_local->cpu_freq_min;
	job->ctx_params.cpu_freq_max = opt_local->cpu_freq_max;
	job->ctx_params.cpu_freq_gov = opt_local->cpu_freq_gov;
	job->ctx_params.relative = (uint16_t)srun_opt->relative;
	job->ctx_params.ckpt_interval = (uint16_t)srun_opt->ckpt_interval;
	job->ctx_params.ckpt_dir = srun_opt->ckpt_dir;
	job->ctx_params.exclusive = (uint16_t)srun_opt->exclusive;
	if (opt_local->immediate == 1)
		job->ctx_params.immediate = (uint16_t)opt_local->immediate;
	if (opt_local->time_limit != NO_VAL)
		job->ctx_params.time_limit = (uint32_t)opt_local->time_limit;
	job->ctx_params.verbose_level = (uint16_t)_verbose;
	if (srun_opt->resv_port_cnt != NO_VAL) {
		job->ctx_params.resv_port_cnt = (uint16_t)srun_opt->resv_port_cnt;
	} else {
#if defined(HAVE_NATIVE_CRAY)
		/*
		 * On Cray systems default to reserving one port, or one
		 * more than the number of multi prog commands, for Cray PMI
		 */
		job->ctx_params.resv_port_cnt = (srun_opt->multi_prog ?
					srun_opt->multi_prog_cmds + 1 : 1);
#endif
	}

	switch (opt_local->distribution & SLURM_DIST_NODESOCKMASK) {
	case SLURM_DIST_BLOCK:
	case SLURM_DIST_ARBITRARY:
	case SLURM_DIST_CYCLIC:
	case SLURM_DIST_CYCLIC_CYCLIC:
	case SLURM_DIST_CYCLIC_BLOCK:
	case SLURM_DIST_BLOCK_CYCLIC:
	case SLURM_DIST_BLOCK_BLOCK:
	case SLURM_DIST_CYCLIC_CFULL:
	case SLURM_DIST_BLOCK_CFULL:
		job->ctx_params.task_dist = opt_local->distribution;
		if (opt_local->ntasks_per_node != NO_VAL)
			job->ctx_params.plane_size = opt_local->ntasks_per_node;
		break;
	case SLURM_DIST_PLANE:
		job->ctx_params.task_dist = SLURM_DIST_PLANE;
		job->ctx_params.plane_size = opt_local->plane_size;
		break;
	default:
		/* Leave distribution set to unknown if taskcount <= nodes and
		 * memory is set to 0. step_mgr will handle the 0mem case.
		 * ex. SallocDefaultCommand=srun -n1 -N1 --mem=0 ... */
		if (!opt_local->mem_per_cpu || !opt_local->pn_min_memory)
			base_dist = SLURM_DIST_UNKNOWN;
		else
			base_dist = (job->ctx_params.task_count <=
				     job->ctx_params.min_nodes)
				     ? SLURM_DIST_CYCLIC : SLURM_DIST_BLOCK;
		opt_local->distribution &= SLURM_DIST_STATE_FLAGS;
		opt_local->distribution |= base_dist;
		job->ctx_params.task_dist = opt_local->distribution;
		if (opt_local->ntasks_per_node != NO_VAL)
			job->ctx_params.plane_size = opt_local->ntasks_per_node;
		break;

	}
	job->ctx_params.overcommit = opt_local->overcommit ? 1 : 0;
	job->ctx_params.node_list = opt_local->nodelist;
	job->ctx_params.network = opt_local->network;
	job->ctx_params.no_kill = opt_local->no_kill;
	if (srun_opt->job_name_set_cmd && opt_local->job_name)
		job->ctx_params.name = opt_local->job_name;
	else
		job->ctx_params.name = srun_opt->cmd_name;
	job->ctx_params.features = opt_local->constraints;

	if (opt_local->cpus_per_gpu) {
		xstrfmtcat(job->ctx_params.cpus_per_tres, "gpu:%d",
			   opt_local->cpus_per_gpu);
	}
	xfree(opt_local->tres_bind);	/* Vestigial value from job allocate */
	if (opt_local->gpu_bind)
		xstrfmtcat(opt_local->tres_bind, "gpu:%s", opt_local->gpu_bind);
	if (tres_bind_verify_cmdline(opt_local->tres_bind)) {
		if (tres_bind_err_log) {	/* Log once */
			error("Invalid --tres-bind argument: %s. Ignored",
			      opt_local->tres_bind);
			tres_bind_err_log = false;
		}
		xfree(opt_local->tres_bind);
	}
	job->ctx_params.tres_bind = xstrdup(opt_local->tres_bind);
	xfree(opt_local->tres_freq);	/* Vestigial value from job allocate */
	xfmt_tres_freq(&opt_local->tres_freq, "gpu", opt_local->gpu_freq);
	if (tres_freq_verify_cmdline(opt_local->tres_freq)) {
		if (tres_freq_err_log) {	/* Log once */
			error("Invalid --tres-freq argument: %s. Ignored",
			      opt_local->tres_freq);
			tres_freq_err_log = false;
		}
		xfree(opt_local->tres_freq);
	}
	job->ctx_params.tres_freq = xstrdup(opt_local->tres_freq);
	job->ctx_params.tres_per_step = xstrdup(opt_local->tres_per_job);
	xfmt_tres(&job->ctx_params.tres_per_step, "gpu", opt_local->gpus);
	xfmt_tres(&job->ctx_params.tres_per_node, "gpu",
		  opt_local->gpus_per_node);
	if (opt_local->gres)
		add_tres = opt_local->gres;
	else
		add_tres = getenv("SLURM_STEP_GRES");
	if (add_tres) {
		if (job->ctx_params.tres_per_node) {
			xstrfmtcat(job->ctx_params.tres_per_node, ",%s",
				   add_tres);
		} else
			job->ctx_params.tres_per_node = xstrdup(add_tres);
	}
	xfmt_tres(&job->ctx_params.tres_per_socket, "gpu",
		  opt_local->gpus_per_socket);
	xfmt_tres(&job->ctx_params.tres_per_task, "gpu",
		  opt_local->gpus_per_task);
	if (opt_local->mem_per_gpu) {
		xstrfmtcat(job->ctx_params.mem_per_tres, "gpu:%"PRIi64,
			   opt.mem_per_gpu);
	}

	debug("requesting job %u, user %u, nodes %u including (%s)",
	      job->ctx_params.job_id, job->ctx_params.uid,
	      job->ctx_params.min_nodes, job->ctx_params.node_list);
	debug("cpus %u, tasks %u, name %s, relative %u",
	      job->ctx_params.cpu_count, job->ctx_params.task_count,
	      job->ctx_params.name, job->ctx_params.relative);

	for (i = 0; (!(*destroy_job)); i++) {
		if (srun_opt->no_alloc) {
			job->step_ctx = slurm_step_ctx_create_no_alloc(
				&job->ctx_params, job->stepid);
		} else {
			if (opt_local->immediate) {
				step_wait = MAX(1, opt_local->immediate -
						   difftime(time(NULL),
							    srun_begin_time)) *
					    1000;
			} else {
				slurmctld_timeout = MIN(300, MAX(60,
					slurm_get_slurmctld_timeout()));
				step_wait = ((getpid() % 10) +
					     slurmctld_timeout) * 1000;
			}
			job->step_ctx = slurm_step_ctx_create_timeout(
						&job->ctx_params, step_wait);
		}
		if (job->step_ctx != NULL) {
			if (i > 0) {
				info("Step created for job %u",
				     job->ctx_params.job_id);
			}
			break;
		}
		rc = slurm_get_errno();

		if (((opt_local->immediate != 0) &&
		     ((opt_local->immediate == 1) ||
		      (difftime(time(NULL), srun_begin_time) >=
		       opt_local->immediate))) ||
		    ((rc != ESLURM_PROLOG_RUNNING) &&
		     !slurm_step_retry_errno(rc))) {
			error("Unable to create step for job %u: %m",
			      job->ctx_params.job_id);
			return SLURM_ERROR;
		}

		if (i == 0) {
			if (rc == ESLURM_PROLOG_RUNNING) {
				verbose("Resources allocated for job %u and "
					"being configured, please wait",
					job->ctx_params.job_id);
			} else {
				info("Job %u step creation temporarily disabled, retrying",
				     job->ctx_params.job_id);
			}
			xsignal_unblock(sig_array);
			for (j = 0; sig_array[j]; j++)
				xsignal(sig_array[j], signal_function);
		} else {
			verbose("Job %u step creation still disabled, retrying",
				job->ctx_params.job_id);
		}

		if (*destroy_job) {
			/* cancelled by signal */
			break;
		}
	}
	if (i > 0) {
		xsignal_block(sig_array);
		if (*destroy_job) {
			info("Cancelled pending step for job %u",
			     job->ctx_params.job_id);
			return SLURM_ERROR;
		}
	}

	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_STEPID, &job->stepid);
	/*
	 *  Number of hosts in job may not have been initialized yet if
	 *    --jobid was used or only SLURM_JOB_ID was set in user env.
	 *    Reset the value here just in case.
	 */
	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_NUM_HOSTS,
			   &job->nhosts);

	/*
	 * Recreate filenames which may depend upon step id
	 */
	job_update_io_fnames(job, opt_local);

	return SLURM_SUCCESS;
}
Example #12
0
/*
 * Allocate nodes for heterogeneous/pack job from the slurm controller -- 
 * retrying the attempt if the controller appears to be down, and optionally
 * waiting for resources if none are currently available (see opt.immediate)
 *
 * Returns a pointer to a resource_allocation_response_msg which must
 * be freed with slurm_free_resource_allocation_response_msg()
 */
List allocate_pack_nodes(bool handle_signals)
{
	resource_allocation_response_msg_t *resp = NULL;
	bool jobid_log = true;
	job_desc_msg_t *j, *first_job = NULL;
	slurm_allocation_callbacks_t callbacks;
	ListIterator opt_iter, resp_iter;
	slurm_opt_t *opt_local, *first_opt = NULL;
	List job_req_list = NULL, job_resp_list = NULL;
	uint32_t my_job_id = 0;
	int i, k;

	job_req_list = list_create(NULL);
	opt_iter = list_iterator_create(opt_list);
	while ((opt_local = list_next(opt_iter))) {
		srun_opt_t *srun_opt = opt_local->srun_opt;
		xassert(srun_opt);
		if (!first_opt)
			first_opt = opt_local;
		if (srun_opt->relative_set && srun_opt->relative)
			fatal("--relative option invalid for job allocation request");

		if ((j = _job_desc_msg_create_from_opts(opt_local)) == NULL)
			return NULL;
		if (!first_job)
			first_job = j;

		j->origin_cluster = xstrdup(slurmctld_conf.cluster_name);

		/* Do not re-use existing job id when submitting new job
		 * from within a running job */
		if ((j->job_id != NO_VAL) && !opt_local->jobid_set) {
			if (jobid_log) {
				jobid_log = false;	/* log once */
				info("WARNING: Creating SLURM job allocation from within "
				     "another allocation");
				info("WARNING: You are attempting to initiate a second job");
			}
			if (!opt_local->jobid_set) /* Let slurmctld set jobid */
				j->job_id = NO_VAL;
		}

		list_append(job_req_list, j);
	}
	list_iterator_destroy(opt_iter);

	if (!first_job) {
		error("%s: No job requests found", __func__);
		return NULL;
	}

	if (first_opt && first_opt->clusters &&
	    (slurmdb_get_first_pack_cluster(job_req_list, first_opt->clusters,
					    &working_cluster_rec)
	     != SLURM_SUCCESS)) {
		print_db_notok(first_opt->clusters, 0);
		return NULL;
	}

	callbacks.ping = _ping_handler;
	callbacks.timeout = _timeout_handler;
	callbacks.job_complete = _job_complete_handler;
	callbacks.job_suspend = NULL;
	callbacks.user_msg = _user_msg_handler;
	callbacks.node_fail = _node_fail_handler;

	/* create message thread to handle pings and such from slurmctld */
	msg_thr = slurm_allocation_msg_thr_create(&first_job->other_port,
						  &callbacks);

	/* NOTE: Do not process signals in separate pthread. The signal will
	 * cause slurm_allocate_resources_blocking() to exit immediately. */
	if (handle_signals) {
		xsignal_unblock(sig_array);
		for (i = 0; sig_array[i]; i++)
			xsignal(sig_array[i], _signal_while_allocating);
	}

	while (first_opt && !job_resp_list) {
		job_resp_list = slurm_allocate_pack_job_blocking(job_req_list,
				 first_opt->immediate, _set_pending_job_id);
		if (destroy_job) {
			/* cancelled by signal */
			break;
		} else if (!job_resp_list && !_retry()) {
			break;
		}
	}

	if (job_resp_list && !destroy_job) {
		/*
		 * Allocation granted!
		 */

		opt_iter  = list_iterator_create(opt_list);
		resp_iter = list_iterator_create(job_resp_list);
		while ((opt_local = list_next(opt_iter))) {
			resp = (resource_allocation_response_msg_t *)
			       list_next(resp_iter);
			if (!resp)
				break;

			if (pending_job_id == 0)
				pending_job_id = resp->job_id;
			if (my_job_id == 0) {
				my_job_id = resp->job_id;
				i = list_count(opt_list);
				k = list_count(job_resp_list);
				if (i != k) {
					error("%s: request count != response count (%d != %d)",
					      __func__, i, k);
					goto relinquish;
				}
			}

			/*
			 * These values could be changed while the job was
			 * pending so overwrite the request with what was
			 * allocated so we don't have issues when we use them
			 * in the step creation.
			 *
			 * NOTE: pn_min_memory here is an int64, not uint64.
			 * These operations may have some bizarre side effects
			 */
			if (opt_local->pn_min_memory != NO_VAL64)
				opt_local->pn_min_memory =
					(resp->pn_min_memory & (~MEM_PER_CPU));
			else if (opt_local->mem_per_cpu != NO_VAL64)
				opt_local->mem_per_cpu =
					(resp->pn_min_memory & (~MEM_PER_CPU));

#ifdef HAVE_BG
			uint32_t node_cnt = 0;
			select_g_select_jobinfo_get(resp->select_jobinfo,
						    SELECT_JOBDATA_NODE_CNT,
						    &node_cnt);
			if ((node_cnt == 0) || (node_cnt == NO_VAL)) {
				opt_local->min_nodes = node_cnt;
				opt_local->max_nodes = node_cnt;
			} /* else we just use the original request */

			if (!_wait_bluegene_block_ready(resp)) {
				if (!destroy_job)
					error("Something is wrong with the "
					      "boot of the block.");
				goto relinquish;
			}
#else
			opt_local->min_nodes = resp->node_cnt;
			opt_local->max_nodes = resp->node_cnt;

			if (resp->working_cluster_rec)
				slurm_setup_remote_working_cluster(resp);

			if (!_wait_nodes_ready(resp)) {
				if (!destroy_job)
					error("Something is wrong with the "
					      "boot of the nodes.");
				goto relinquish;
			}
#endif
		}
		list_iterator_destroy(resp_iter);
		list_iterator_destroy(opt_iter);
	} else if (destroy_job) {
		goto relinquish;
	}

	if (handle_signals)
		xsignal_block(sig_array);

	return job_resp_list;

relinquish:
	if (job_resp_list) {
		if (!destroy_job && my_job_id)
			slurm_complete_job(my_job_id, 1);
		list_destroy(job_resp_list);
	}
	exit(error_exit);
	return NULL;
}
Example #13
0
/*
 * Allocate nodes from the slurm controller -- retrying the attempt
 * if the controller appears to be down, and optionally waiting for
 * resources if none are currently available (see opt.immediate)
 *
 * Returns a pointer to a resource_allocation_response_msg which must
 * be freed with slurm_free_resource_allocation_response_msg()
 */
extern resource_allocation_response_msg_t *
	allocate_nodes(bool handle_signals, slurm_opt_t *opt_local)

{
	srun_opt_t *srun_opt = opt_local->srun_opt;
	resource_allocation_response_msg_t *resp = NULL;
	job_desc_msg_t *j;
	slurm_allocation_callbacks_t callbacks;
	int i;

	xassert(srun_opt);

	if (srun_opt->relative_set && srun_opt->relative)
		fatal("--relative option invalid for job allocation request");

	if ((j = _job_desc_msg_create_from_opts(&opt)) == NULL)
		return NULL;

	if (opt_local->clusters &&
	    (slurmdb_get_first_avail_cluster(j, opt_local->clusters,
					     &working_cluster_rec)
	     != SLURM_SUCCESS)) {
		print_db_notok(opt_local->clusters, 0);
		return NULL;
	}

	j->origin_cluster = xstrdup(slurmctld_conf.cluster_name);

	/* Do not re-use existing job id when submitting new job
	 * from within a running job */
	if ((j->job_id != NO_VAL) && !opt_local->jobid_set) {
		info("WARNING: Creating SLURM job allocation from within "
		     "another allocation");
		info("WARNING: You are attempting to initiate a second job");
		if (!opt_local->jobid_set)	/* Let slurmctld set jobid */
			j->job_id = NO_VAL;
	}
	callbacks.ping = _ping_handler;
	callbacks.timeout = _timeout_handler;
	callbacks.job_complete = _job_complete_handler;
	callbacks.job_suspend = NULL;
	callbacks.user_msg = _user_msg_handler;
	callbacks.node_fail = _node_fail_handler;

	/* create message thread to handle pings and such from slurmctld */
	msg_thr = slurm_allocation_msg_thr_create(&j->other_port, &callbacks);

	/* NOTE: Do not process signals in separate pthread. The signal will
	 * cause slurm_allocate_resources_blocking() to exit immediately. */
	if (handle_signals) {
		xsignal_unblock(sig_array);
		for (i = 0; sig_array[i]; i++)
			xsignal(sig_array[i], _signal_while_allocating);
	}

	while (!resp) {
		resp = slurm_allocate_resources_blocking(j,
							 opt_local->immediate,
							 _set_pending_job_id);
		if (destroy_job) {
			/* cancelled by signal */
			break;
		} else if (!resp && !_retry()) {
			break;
		}
	}

	if (resp)
		print_multi_line_string(resp->job_submit_user_msg, -1);

	if (resp && !destroy_job) {
		/*
		 * Allocation granted!
		 */
		pending_job_id = resp->job_id;

		/*
		 * These values could be changed while the job was
		 * pending so overwrite the request with what was
		 * allocated so we don't have issues when we use them
		 * in the step creation.
		 */
		opt_local->pn_min_memory = NO_VAL64;
		opt_local->mem_per_cpu   = NO_VAL64;
		if (resp->pn_min_memory != NO_VAL64) {
			if (resp->pn_min_memory & MEM_PER_CPU) {
				opt_local->mem_per_cpu = (resp->pn_min_memory &
							 (~MEM_PER_CPU));
			} else {
				opt_local->pn_min_memory = resp->pn_min_memory;
			}
		}

#ifdef HAVE_BG
		uint32_t node_cnt = 0;
		select_g_select_jobinfo_get(resp->select_jobinfo,
					    SELECT_JOBDATA_NODE_CNT,
					    &node_cnt);
		if ((node_cnt == 0) || (node_cnt == NO_VAL)) {
			opt_local->min_nodes = node_cnt;
			opt_local->max_nodes = node_cnt;
		} /* else we just use the original request */

		if (!_wait_bluegene_block_ready(resp)) {
			if (!destroy_job)
				error("Something is wrong with the "
				      "boot of the block.");
			goto relinquish;
		}
#else
		opt_local->min_nodes = resp->node_cnt;
		opt_local->max_nodes = resp->node_cnt;

		if (resp->working_cluster_rec)
			slurm_setup_remote_working_cluster(resp);

		if (!_wait_nodes_ready(resp)) {
			if (!destroy_job)
				error("Something is wrong with the boot of the nodes.");
			goto relinquish;
		}
#endif
	} else if (destroy_job) {
		goto relinquish;
	}

	if (handle_signals)
		xsignal_block(sig_array);

	job_desc_msg_destroy(j);

	return resp;

relinquish:
	if (resp) {
		if (!destroy_job)
			slurm_complete_job(resp->job_id, 1);
		slurm_free_resource_allocation_response_msg(resp);
	}
	exit(error_exit);
	return NULL;
}