Code example #1
File: allocate.c  Project: HPCNow/slurm
/*
 * slurm_job_will_run - determine if a job would execute immediately if
 *	submitted now
 * IN job_desc_msg - description of resource allocation request
 * RET 0 on success, otherwise return -1 and set errno to indicate the error
 */
int slurm_job_will_run(job_desc_msg_t *req)
{
	will_run_response_msg_t *will_run_resp = NULL;
	char buf[64], local_hostname[64];
	int rc;
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();
	char *type = "processors";
	char *cluster_name = NULL;
	void *ptr = NULL;

	if ((req->alloc_node == NULL) &&
	    (gethostname_short(local_hostname, sizeof(local_hostname)) == 0)) {
		req->alloc_node = local_hostname;
	}

	if (working_cluster_rec)
		cluster_name = working_cluster_rec->name;
	else
		cluster_name = slurmctld_conf.cluster_name;
	if (!slurm_load_federation(&ptr) &&
	    cluster_in_federation(ptr, cluster_name))
		rc = _fed_job_will_run(req, &will_run_resp, ptr);
	else
		rc = slurm_job_will_run2(req, &will_run_resp);

	if ((rc == 0) && will_run_resp) {
		if (cluster_flags & CLUSTER_FLAG_BG)
			type = "cnodes";
		slurm_make_time_str(&will_run_resp->start_time,
				    buf, sizeof(buf));
		info("Job %u to start at %s using %u %s on %s",
		     will_run_resp->job_id, buf,
		     will_run_resp->proc_cnt, type,
		     will_run_resp->node_list);
		if (will_run_resp->preemptee_job_id) {
			ListIterator itr;
			uint32_t *job_id_ptr;
			char *job_list = NULL, *sep = "";
			itr = list_iterator_create(will_run_resp->
						   preemptee_job_id);
			while ((job_id_ptr = list_next(itr))) {
				if (job_list)
					sep = ",";
				xstrfmtcat(job_list, "%s%u", sep, *job_id_ptr);
			}
			list_iterator_destroy(itr);
			info("  Preempts: %s", job_list);
			xfree(job_list);
		}

		slurm_free_will_run_response_msg(will_run_resp);
	}

	if (req->alloc_node == local_hostname)
		req->alloc_node = NULL;
	if (ptr)
		slurm_destroy_federation_rec(ptr);

	return rc;
}
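
The listings above are the library side of Slurm's "will run" test (the path exercised by options such as sbatch --test-only). Below is a minimal caller sketch, assuming only the public slurm.h interface; the job-description values filled in here are illustrative, not required by the API.

#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>
#include <slurm/slurm.h>
#include <slurm/slurm_errno.h>

/* Sketch only: ask the controller when a two-node job could start,
 * without actually submitting it.  Field values are illustrative. */
int main(void)
{
	job_desc_msg_t desc;

	slurm_init_job_desc_msg(&desc);		/* fill in defaults */
	desc.name       = "willrun-test";
	desc.min_nodes  = 2;
	desc.time_limit = 10;			/* minutes */
	desc.user_id    = getuid();
	desc.group_id   = getgid();

	/* On success slurm_job_will_run() logs the expected start time,
	 * processor count and node list via info(). */
	if (slurm_job_will_run(&desc) != 0) {
		slurm_perror("slurm_job_will_run");
		return 1;
	}
	return 0;
}
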
Code example #2
File: allocate.c  Project: jabl/slurm
/*
 * slurm_job_will_run - determine if a job would execute immediately if
 *	submitted now
 * IN job_desc_msg - description of resource allocation request
 * RET 0 on success, otherwise return -1 and set errno to indicate the error
 */
int slurm_job_will_run (job_desc_msg_t *req)
{
	will_run_response_msg_t *will_run_resp = NULL;
	char buf[64];
	bool host_set = false;
	int rc;
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();
	char *type = "processors";

	if ((req->alloc_node == NULL) &&
	    (gethostname_short(buf, sizeof(buf)) == 0)) {
		req->alloc_node = buf;
		host_set = true;
	}

	rc = slurm_job_will_run2(req, &will_run_resp);

	if ((rc == 0) && will_run_resp) {
		if (cluster_flags & CLUSTER_FLAG_BG)
			type = "cnodes";
		slurm_make_time_str(&will_run_resp->start_time,
				    buf, sizeof(buf));
		info("Job %u to start at %s using %u %s"
		     " on %s",
		     will_run_resp->job_id, buf,
		     will_run_resp->proc_cnt, type,
		     will_run_resp->node_list);
		if (will_run_resp->preemptee_job_id) {
			ListIterator itr;
			uint32_t *job_id_ptr;
			char *job_list = NULL, *sep = "";
			itr = list_iterator_create(will_run_resp->
						   preemptee_job_id);
			while ((job_id_ptr = list_next(itr))) {
				if (job_list)
					sep = ",";
				xstrfmtcat(job_list, "%s%u", sep, *job_id_ptr);
			}
			list_iterator_destroy(itr);
			info("  Preempts: %s", job_list);
			xfree(job_list);
		}

		slurm_free_will_run_response_msg(will_run_resp);
	}

	if (host_set)
		req->alloc_node = NULL;

	return rc;
}
Code example #3
File: allocate.c  Project: alepharchives/slurm
/*
 * slurm_job_will_run - determine if a job would execute immediately if
 *	submitted now
 * IN job_desc_msg - description of resource allocation request
 * RET 0 on success, otherwise return -1 and set errno to indicate the error
 */
int slurm_job_will_run (job_desc_msg_t *req)
{
	slurm_msg_t req_msg, resp_msg;
	will_run_response_msg_t *will_run_resp;
	char buf[64];
	bool host_set = false;
	int rc;
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();
	char *type = "processors";
	/* req.immediate = true;    implicit */
	if ((req->alloc_node == NULL) &&
	    (gethostname_short(buf, sizeof(buf)) == 0)) {
		req->alloc_node = buf;
		host_set = true;
	}
	slurm_msg_t_init(&req_msg);
	req_msg.msg_type = REQUEST_JOB_WILL_RUN;
	req_msg.data     = req;

	rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg);

	if (host_set)
		req->alloc_node = NULL;

	if (rc < 0)
		return SLURM_SOCKET_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		if (_handle_rc_msg(&resp_msg) < 0)
			return SLURM_PROTOCOL_ERROR;
		break;
	case RESPONSE_JOB_WILL_RUN:
		if (cluster_flags & CLUSTER_FLAG_BG)
			type = "cnodes";
		will_run_resp = (will_run_response_msg_t *) resp_msg.data;
		slurm_make_time_str(&will_run_resp->start_time,
				    buf, sizeof(buf));
		info("Job %u to start at %s using %u %s"
		     " on %s",
		     will_run_resp->job_id, buf,
		     will_run_resp->proc_cnt, type,
		     will_run_resp->node_list);
		if (will_run_resp->preemptee_job_id) {
			ListIterator itr;
			uint32_t *job_id_ptr;
			char *job_list = NULL, *sep = "";
			itr = list_iterator_create(will_run_resp->
						   preemptee_job_id);
			while ((job_id_ptr = list_next(itr))) {
				if (job_list)
					sep = ",";
				xstrfmtcat(job_list, "%s%u", sep, *job_id_ptr);
			}
			list_iterator_destroy(itr);
			info("  Preempts: %s", job_list);
			xfree(job_list);
		}

		slurm_free_will_run_response_msg(will_run_resp);
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
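
Example #3 calls a static helper _handle_rc_msg() that is not part of this listing. Judging from how example #9 handles a RESPONSE_SLURM_RC reply inline, a plausible sketch of it is:

/* Sketch only: unpack a RESPONSE_SLURM_RC reply, free it, and push any
 * non-zero return code into errno (mirrors the inline handling in
 * example #9). */
static int _handle_rc_msg(slurm_msg_t *msg)
{
	int rc = ((return_code_msg_t *) msg->data)->return_code;

	slurm_free_return_code_msg(msg->data);
	if (rc)
		slurm_seterrno_ret(rc);	/* sets errno, returns error code */
	return SLURM_SUCCESS;
}
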
Code example #4
File: allocate.c  Project: HPCNow/slurm
/*
 * slurm_pack_job_will_run - determine if a heterogeneous job would execute
 *	immediately if submitted now
 * IN job_req_list - List of job_desc_msg_t structures describing the resource
 *		allocation request
 * RET 0 on success, otherwise return -1 and set errno to indicate the error
 */
extern int slurm_pack_job_will_run(List job_req_list)
{
	job_desc_msg_t *req;
	will_run_response_msg_t *will_run_resp;
	char buf[64], local_hostname[64] = "", *sep = "";
	int rc = SLURM_SUCCESS;
	char *type = "processors";
	ListIterator iter, itr;
	time_t first_start = (time_t) 0;
	uint32_t first_job_id = 0, tot_proc_count = 0, *job_id_ptr;
	hostset_t hs = NULL;
	char *job_list = NULL;

	if (!job_req_list || (list_count(job_req_list) == 0)) {
		error("No job descriptors input");
		return SLURM_ERROR;
	}

	(void) gethostname_short(local_hostname, sizeof(local_hostname));
	iter = list_iterator_create(job_req_list);
	while ((req = (job_desc_msg_t *) list_next(iter))) {
		if ((req->alloc_node == NULL) && local_hostname[0])
			req->alloc_node = local_hostname;

		will_run_resp = NULL;
		rc = slurm_job_will_run2(req, &will_run_resp);
		if ((rc == SLURM_SUCCESS) && will_run_resp) {
			if (first_job_id == 0)
				first_job_id = will_run_resp->job_id;
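			/*
			 * A heterogeneous (pack) job can only start once
			 * every component can start, so keep the latest
			 * component start time.
			 */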
			if ((first_start == 0) ||
			    (first_start < will_run_resp->start_time))
				first_start = will_run_resp->start_time;
			tot_proc_count += will_run_resp->proc_cnt;
			if (hs)
				hostset_insert(hs, will_run_resp->node_list);
			else
				hs = hostset_create(will_run_resp->node_list);

			if (will_run_resp->preemptee_job_id) {
				itr = list_iterator_create(will_run_resp->
							   preemptee_job_id);
				while ((job_id_ptr = list_next(itr))) {
					if (job_list)
						sep = ",";
					xstrfmtcat(job_list, "%s%u", sep,
						   *job_id_ptr);
				}
				list_iterator_destroy(itr);
			}

			slurm_free_will_run_response_msg(will_run_resp);
		}
		if (req->alloc_node == local_hostname)
			req->alloc_node = NULL;
		if (rc != SLURM_SUCCESS)
			break;
	}
	list_iterator_destroy(iter);


	if (rc == SLURM_SUCCESS) {
		uint32_t cluster_flags = slurmdb_setup_cluster_flags();
		char node_list[1028] = "";

		if (cluster_flags & CLUSTER_FLAG_BG)
			type = "cnodes";
		if (hs)
			hostset_ranged_string(hs, sizeof(node_list), node_list);
		slurm_make_time_str(&first_start, buf, sizeof(buf));
		info("Job %u to start at %s using %u %s on %s",
		     first_job_id, buf, tot_proc_count, type, node_list);
		if (job_list)
			info("  Preempts: %s", job_list);
	}

	if (hs)
		hostset_destroy(hs);
	xfree(job_list);

	return rc;
}
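
A caller of slurm_pack_job_will_run() must supply a List holding one job_desc_msg_t per heterogeneous-job component. A minimal sketch, assuming Slurm's internal list API (list_create()/list_append()), with purely illustrative component sizes:

/* Sketch only: test a two-component heterogeneous job without submitting.
 * The descriptors live on the stack, so the list gets no destructor. */
static int _test_pack_job_will_run(void)
{
	job_desc_msg_t comp1, comp2;
	List job_req_list = list_create(NULL);
	int rc;

	slurm_init_job_desc_msg(&comp1);
	comp1.min_nodes = 1;			/* illustrative sizes */
	slurm_init_job_desc_msg(&comp2);
	comp2.min_nodes = 4;

	list_append(job_req_list, &comp1);
	list_append(job_req_list, &comp2);

	rc = slurm_pack_job_will_run(job_req_list);

	FREE_NULL_LIST(job_req_list);
	return rc;
}
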
Code example #5
File: allocate.c  Project: HPCNow/slurm
static int _fed_job_will_run(job_desc_msg_t *req,
			     will_run_response_msg_t **will_run_resp,
			     slurmdb_federation_rec_t *fed)
{
	List resp_msg_list;
	int pthread_count = 0, i;
	pthread_t *load_thread = 0;
	load_willrun_req_struct_t *load_args;
	pthread_attr_t load_attr;
	ListIterator iter;
	will_run_response_msg_t *earliest_resp = NULL;
	load_willrun_resp_struct_t *tmp_resp;
	slurmdb_cluster_rec_t *cluster;

	xassert(req);
	xassert(will_run_resp);

	slurm_attr_init(&load_attr);

	*will_run_resp = NULL;

	/* Spawn one pthread per cluster to collect job information */
	resp_msg_list = list_create(NULL);
	load_thread = xmalloc(sizeof(pthread_t) *
			      list_count(fed->cluster_list));
	iter = list_iterator_create(fed->cluster_list);
	while ((cluster = (slurmdb_cluster_rec_t *)list_next(iter))) {
		int retries = 0;
		if ((cluster->control_host == NULL) ||
		    (cluster->control_host[0] == '\0'))
			continue;	/* Cluster down */

		load_args = xmalloc(sizeof(load_willrun_req_struct_t));
		load_args->cluster       = cluster;
		load_args->req           = req;
		load_args->resp_msg_list = resp_msg_list;
		while (pthread_create(&load_thread[pthread_count], &load_attr,
				      _load_willrun_thread, (void *)load_args)) {
			error("pthread_create error %m");
			if (++retries > MAX_RETRIES)
				fatal("Can't create pthread");
			usleep(10000);	/* sleep and retry */
		}
		pthread_count++;
	}
	list_iterator_destroy(iter);
	slurm_attr_destroy(&load_attr);

	/* Wait for all pthreads to complete */
	for (i = 0; i < pthread_count; i++)
		pthread_join(load_thread[i], NULL);
	xfree(load_thread);

	iter = list_iterator_create(resp_msg_list);
	while ((tmp_resp = (load_willrun_resp_struct_t *)list_next(iter))) {
		if (!tmp_resp->willrun_resp_msg)
			slurm_seterrno(tmp_resp->rc);
		else if ((!earliest_resp) ||
			 (tmp_resp->willrun_resp_msg->start_time <
			  earliest_resp->start_time)) {
			slurm_free_will_run_response_msg(earliest_resp);
			earliest_resp = tmp_resp->willrun_resp_msg;
			tmp_resp->willrun_resp_msg = NULL;
		}

		slurm_free_will_run_response_msg(tmp_resp->willrun_resp_msg);
		xfree(tmp_resp);
	}
	list_iterator_destroy(iter);
	FREE_NULL_LIST(resp_msg_list);

	*will_run_resp = earliest_resp;

	if (!earliest_resp)
		return SLURM_FAILURE;

	return SLURM_SUCCESS;
}
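
Examples #5 and #8 hand their per-cluster work to the threads through two helper structures defined elsewhere in allocate.c. From the fields the code actually touches (cluster, req, resp_msg_list, rc, willrun_resp_msg), their shape is roughly:

/* Inferred from usage above; the real definitions in allocate.c may
 * carry additional members. */
typedef struct {
	slurmdb_cluster_rec_t   *cluster;	/* cluster to query */
	job_desc_msg_t          *req;		/* job description under test */
	List                     resp_msg_list;	/* shared list of responses */
} load_willrun_req_struct_t;

typedef struct {
	int                      rc;		/* per-cluster return code */
	will_run_response_msg_t *willrun_resp_msg; /* NULL on failure */
} load_willrun_resp_struct_t;
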
Code example #6
File: allocate.c  Project: iakovos-panourgias/slurm
/*
 * slurm_pack_job_will_run - determine if a heterogeneous job would execute
 *	immediately if submitted now
 * IN job_req_list - List of job_desc_msg_t structures describing the resource
 *		allocation request
 * RET SLURM_SUCCESS on success, otherwise return SLURM_ERROR with errno set
 */
extern int slurm_pack_job_will_run(List job_req_list)
{
	job_desc_msg_t *req;
	will_run_response_msg_t *will_run_resp;
	char buf[64], *sep = "";
	int rc = SLURM_SUCCESS, inx = 0;
	ListIterator iter, itr;
	time_t first_start = (time_t) 0;
	uint32_t first_job_id = 0, tot_proc_count = 0, *job_id_ptr;
	hostset_t hs = NULL;
	char *job_list = NULL;

	if (!job_req_list || (list_count(job_req_list) == 0)) {
		error("No job descriptors input");
		return SLURM_ERROR;
	}

	iter = list_iterator_create(job_req_list);
	while ((req = (job_desc_msg_t *) list_next(iter))) {
		will_run_resp = NULL;
		rc = slurm_job_will_run2(req, &will_run_resp);

		if (will_run_resp)
			print_multi_line_string(
				will_run_resp->job_submit_user_msg,
				inx, LOG_LEVEL_INFO);

		if ((rc == SLURM_SUCCESS) && will_run_resp) {
			if (first_job_id == 0)
				first_job_id = will_run_resp->job_id;
			if ((first_start == 0) ||
			    (first_start < will_run_resp->start_time))
				first_start = will_run_resp->start_time;
			tot_proc_count += will_run_resp->proc_cnt;
			if (hs)
				hostset_insert(hs, will_run_resp->node_list);
			else
				hs = hostset_create(will_run_resp->node_list);

			if (will_run_resp->preemptee_job_id) {
				itr = list_iterator_create(will_run_resp->
							   preemptee_job_id);
				while ((job_id_ptr = list_next(itr))) {
					if (job_list)
						sep = ",";
					xstrfmtcat(job_list, "%s%u", sep,
						   *job_id_ptr);
				}
				list_iterator_destroy(itr);
			}

			slurm_free_will_run_response_msg(will_run_resp);
		}
		if (rc != SLURM_SUCCESS)
			break;
		inx++;
	}
	list_iterator_destroy(iter);


	if (rc == SLURM_SUCCESS) {
		char node_list[1028] = "";

		if (hs)
			hostset_ranged_string(hs, sizeof(node_list), node_list);
		slurm_make_time_str(&first_start, buf, sizeof(buf));
		info("Job %u to start at %s using %u processors on %s",
		     first_job_id, buf, tot_proc_count, node_list);
		if (job_list)
			info("  Preempts: %s", job_list);
	}

	if (hs)
		hostset_destroy(hs);
	xfree(job_list);

	return rc;
}
Code example #7
File: allocate.c  Project: iakovos-panourgias/slurm
/*
 * slurm_job_will_run - determine if a job would execute immediately if
 *	submitted now
 * IN job_desc_msg - description of resource allocation request
 * RET SLURM_SUCCESS on success, otherwise return SLURM_ERROR with errno set
 */
int slurm_job_will_run(job_desc_msg_t *req)
{
	will_run_response_msg_t *will_run_resp = NULL;
	char buf[64];
	int rc;
	char *cluster_name = NULL;
	void *ptr = NULL;

	if (working_cluster_rec)
		cluster_name = working_cluster_rec->name;
	else
		cluster_name = slurmctld_conf.cluster_name;
	if (!slurm_load_federation(&ptr) &&
	    cluster_in_federation(ptr, cluster_name))
		rc = _fed_job_will_run(req, &will_run_resp, ptr);
	else
		rc = slurm_job_will_run2(req, &will_run_resp);

	if (will_run_resp)
		print_multi_line_string(
			will_run_resp->job_submit_user_msg,
			-1, LOG_LEVEL_INFO);

	if ((rc == 0) && will_run_resp) {
		slurm_make_time_str(&will_run_resp->start_time,
				    buf, sizeof(buf));
		if (will_run_resp->part_name) {
			info("Job %u to start at %s using %u processors on nodes %s in partition %s",
			     will_run_resp->job_id, buf,
			     will_run_resp->proc_cnt,
			     will_run_resp->node_list,
			     will_run_resp->part_name);
		} else {
			/*
			 * Partition name not provided from slurmctld v17.11
			 * or earlier. Remove this in the future.
			 */
			info("Job %u to start at %s using %u processors on nodes %s",
			     will_run_resp->job_id, buf,
			     will_run_resp->proc_cnt,
			     will_run_resp->node_list);
		}
		if (will_run_resp->preemptee_job_id) {
			ListIterator itr;
			uint32_t *job_id_ptr;
			char *job_list = NULL, *sep = "";
			itr = list_iterator_create(will_run_resp->
						   preemptee_job_id);
			while ((job_id_ptr = list_next(itr))) {
				if (job_list)
					sep = ",";
				xstrfmtcat(job_list, "%s%u", sep, *job_id_ptr);
			}
			list_iterator_destroy(itr);
			info("  Preempts: %s", job_list);
			xfree(job_list);
		}

		slurm_free_will_run_response_msg(will_run_resp);
	}

	if (ptr)
		slurm_destroy_federation_rec(ptr);

	return rc;
}
Code example #8
File: allocate.c  Project: iakovos-panourgias/slurm
static int _fed_job_will_run(job_desc_msg_t *req,
			     will_run_response_msg_t **will_run_resp,
			     slurmdb_federation_rec_t *fed)
{
	List resp_msg_list;
	int pthread_count = 0, i;
	pthread_t *load_thread = 0;
	load_willrun_req_struct_t *load_args;
	ListIterator iter;
	will_run_response_msg_t *earliest_resp = NULL;
	load_willrun_resp_struct_t *tmp_resp;
	slurmdb_cluster_rec_t *cluster;
	List req_clusters = NULL;

	xassert(req);
	xassert(will_run_resp);

	*will_run_resp = NULL;

	/*
	 * If a subset of clusters was specified then only do a will_run to
	 * those clusters, otherwise check all clusters in the federation.
	 */
	if (req->clusters && xstrcasecmp(req->clusters, "all")) {
		req_clusters = list_create(slurm_destroy_char);
		slurm_addto_char_list(req_clusters, req->clusters);
	}

	/* Spawn one pthread per cluster to collect job information */
	resp_msg_list = list_create(NULL);
	load_thread = xmalloc(sizeof(pthread_t) *
			      list_count(fed->cluster_list));
	iter = list_iterator_create(fed->cluster_list);
	while ((cluster = (slurmdb_cluster_rec_t *)list_next(iter))) {
		if ((cluster->control_host == NULL) ||
		    (cluster->control_host[0] == '\0'))
			continue;	/* Cluster down */

		if (req_clusters &&
		    !list_find_first(req_clusters, slurm_find_char_in_list,
				     cluster->name))
			continue;

		load_args = xmalloc(sizeof(load_willrun_req_struct_t));
		load_args->cluster       = cluster;
		load_args->req           = req;
		load_args->resp_msg_list = resp_msg_list;
		slurm_thread_create(&load_thread[pthread_count],
				    _load_willrun_thread, load_args);
		pthread_count++;
	}
	list_iterator_destroy(iter);
	FREE_NULL_LIST(req_clusters);

	/* Wait for all pthreads to complete */
	for (i = 0; i < pthread_count; i++)
		pthread_join(load_thread[i], NULL);
	xfree(load_thread);

	iter = list_iterator_create(resp_msg_list);
	while ((tmp_resp = (load_willrun_resp_struct_t *)list_next(iter))) {
		if (!tmp_resp->willrun_resp_msg)
			slurm_seterrno(tmp_resp->rc);
		else if ((!earliest_resp) ||
			 (tmp_resp->willrun_resp_msg->start_time <
			  earliest_resp->start_time)) {
			slurm_free_will_run_response_msg(earliest_resp);
			earliest_resp = tmp_resp->willrun_resp_msg;
			tmp_resp->willrun_resp_msg = NULL;
		}

		slurm_free_will_run_response_msg(tmp_resp->willrun_resp_msg);
		xfree(tmp_resp);
	}
	list_iterator_destroy(iter);
	FREE_NULL_LIST(resp_msg_list);

	*will_run_resp = earliest_resp;

	if (!earliest_resp)
		return SLURM_ERROR;

	return SLURM_SUCCESS;
}
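
The worker body _load_willrun_thread() is not shown in either federation variant. Below is a sketch of what each thread plausibly does; the per-cluster RPC helper named here (_job_will_run_cluster) is an assumption, not taken from this listing:

/* Sketch only: query one cluster and append the result to the shared
 * response list.  _job_will_run_cluster() stands in for the real
 * per-cluster will-run RPC, which is not part of this listing. */
static void *_load_willrun_thread(void *args)
{
	load_willrun_req_struct_t *load_args = args;
	load_willrun_resp_struct_t *resp;
	will_run_response_msg_t *msg = NULL;
	int rc;

	rc = _job_will_run_cluster(load_args->req, &msg, load_args->cluster);

	resp = xmalloc(sizeof(load_willrun_resp_struct_t));
	resp->rc               = rc;
	resp->willrun_resp_msg = msg;	/* NULL if the query failed */
	list_append(load_args->resp_msg_list, resp);

	xfree(args);			/* caller never frees load_args */
	return NULL;
}
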
Code example #9
File: mult_cluster.c  Project: IFCA/slurm
/*
 * We don't use the api here because it does things we aren't needing
 * like printing out information and not returning times.
 */
local_cluster_rec_t *_job_will_run (job_desc_msg_t *req)
{
	slurm_msg_t req_msg, resp_msg;
	will_run_response_msg_t *will_run_resp;
	int rc;
	char buf[64];
	char *type = "processors";
	local_cluster_rec_t *local_cluster = NULL;
	/* req.immediate = true;    implicit */

	slurm_msg_t_init(&req_msg);
	req_msg.msg_type = REQUEST_JOB_WILL_RUN;
	req_msg.data     = req;

	rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg);

	if (rc < 0) {
		slurm_seterrno(SLURM_SOCKET_ERROR);
		return NULL;
	}
	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		slurm_free_return_code_msg(resp_msg.data);
		if (rc)
			slurm_seterrno(rc);
		break;
	case RESPONSE_JOB_WILL_RUN:
		if (working_cluster_rec->flags & CLUSTER_FLAG_BG)
			type = "cnodes";
		will_run_resp = (will_run_response_msg_t *) resp_msg.data;
		slurm_make_time_str(&will_run_resp->start_time,
				    buf, sizeof(buf));
		debug("Job %u to start at %s on cluster %s using %u %s on %s",
		      will_run_resp->job_id, buf, working_cluster_rec->name,
		      will_run_resp->proc_cnt, type,
		      will_run_resp->node_list);
		local_cluster = xmalloc(sizeof(local_cluster_rec_t));
		local_cluster->cluster_rec = working_cluster_rec;
		local_cluster->start_time = will_run_resp->start_time;
		if (will_run_resp->preemptee_job_id) {
			local_cluster->preempt_cnt =
				list_count(will_run_resp->preemptee_job_id);
			if (opt.verbose >= LOG_LEVEL_DEBUG) {
				ListIterator itr;
				uint32_t *job_id_ptr;
				char *job_list = NULL, *sep = "";
				itr = list_iterator_create(will_run_resp->
							   preemptee_job_id);
				while ((job_id_ptr = list_next(itr))) {
					if (job_list)
						sep = ",";
					xstrfmtcat(job_list, "%s%u",
						   sep, *job_id_ptr);
				}
				list_iterator_destroy(itr);
				debug("  Preempts: %s", job_list);
				xfree(job_list);
			}
		}

		slurm_free_will_run_response_msg(will_run_resp);
		break;
	default:
		slurm_seterrno(SLURM_UNEXPECTED_MSG_ERROR);
		return NULL;
		break;
	}
	return local_cluster;
}
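
Example #9 returns a local_cluster_rec_t, which is defined earlier in mult_cluster.c rather than in this excerpt. From the three fields the function fills in, it is roughly:

/* Inferred from the assignments in _job_will_run(); the real definition
 * in mult_cluster.c may contain more members. */
typedef struct {
	slurmdb_cluster_rec_t *cluster_rec;	/* cluster that answered */
	int                    preempt_cnt;	/* jobs that would be preempted */
	time_t                 start_time;	/* expected start time there */
} local_cluster_rec_t;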