Example #1
/*
 * load into the storage information about a job,
 * typically when it begins execution, but possibly earlier
 */
extern int jobacct_storage_g_job_start(void *db_conn,
				       struct job_record *job_ptr)
{
	if (slurm_acct_storage_init(NULL) < 0)
		return SLURM_ERROR;
	if (enforce & ACCOUNTING_ENFORCE_NO_JOBS)
		return SLURM_SUCCESS;

	/* A pending job's start_time is its expected initiation time
	 * (changed in slurm v2.1). Rather than changing a bunch of code
	 * in the accounting_storage plugins and SlurmDBD, just clear
	 * start_time before accounting and restore it later.
	 * If an update for a job that is being requeued[hold] happens,
	 * we don't want to modify the start_time of the old record.
	 * Pending + Completing is equivalent to Requeue.
	 */
	if (IS_JOB_PENDING(job_ptr) && !IS_JOB_COMPLETING(job_ptr)) {
		int rc;
		time_t orig_start_time = job_ptr->start_time;
		job_ptr->start_time = (time_t) 0;
		rc = (*(ops.job_start))(db_conn, job_ptr);
		job_ptr->start_time = orig_start_time;
		return rc;
	}

	return (*(ops.job_start))(db_conn, job_ptr);
}
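Every example in this collection keys off the IS_JOB_* state-test macros from src/common/slurm_protocol_defs.h. As a rough reference only (a sketch; the exact definitions vary slightly between Slurm releases), they are simple bit tests on job_ptr->job_state:

/* Approximate definitions, for orientation only; consult
 * slurm_protocol_defs.h in your Slurm release for the authoritative text. */
#define IS_JOB_PENDING(_X)	\
	((_X->job_state & JOB_STATE_BASE) == JOB_PENDING)
#define IS_JOB_RUNNING(_X)	\
	((_X->job_state & JOB_STATE_BASE) == JOB_RUNNING)
#define IS_JOB_SUSPENDED(_X)	\
	((_X->job_state & JOB_STATE_BASE) == JOB_SUSPENDED)
#define IS_JOB_COMPLETING(_X)	\
	(_X->job_state & JOB_COMPLETING)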
Example #2
extern List find_preemptable_jobs(struct job_record *job_ptr)
{
	ListIterator job_iterator;
	struct job_record *job_p;
	List preemptee_job_list = NULL;

	/* Validate the preemptor job */
	if (job_ptr == NULL) {
		error("find_preemptable_jobs: job_ptr is NULL");
		return preemptee_job_list;
	}
	if (!IS_JOB_PENDING(job_ptr)) {
		error("find_preemptable_jobs: job %u not pending",
		      job_ptr->job_id);
		return preemptee_job_list;
	}
	if (job_ptr->part_ptr == NULL) {
		error("find_preemptable_jobs: job %u has NULL partition ptr",
		      job_ptr->job_id);
		return preemptee_job_list;
	}
	if (job_ptr->part_ptr->node_bitmap == NULL) {
		error("find_preemptable_jobs: partition %s node_bitmap=NULL",
		      job_ptr->part_ptr->name);
		return preemptee_job_list;
	}

	/* Build an array of pointers to preemption candidates */
	job_iterator = list_iterator_create(job_list);
	while ((job_p = (struct job_record *) list_next(job_iterator))) {
		if (!IS_JOB_RUNNING(job_p) && !IS_JOB_SUSPENDED(job_p))
			continue;
		if ((job_p->part_ptr == NULL) ||
		    (job_p->part_ptr->priority_tier >=
		     job_ptr->part_ptr->priority_tier) ||
		    (job_p->part_ptr->preempt_mode == PREEMPT_MODE_OFF))
			continue;
		if ((job_p->node_bitmap == NULL) ||
		    (bit_overlap(job_p->node_bitmap,
				 job_ptr->part_ptr->node_bitmap) == 0))
			continue;
		if (job_ptr->details &&
		    (job_ptr->details->expanding_jobid == job_p->job_id))
			continue;

		/* This job is a preemption candidate */
		if (preemptee_job_list == NULL) {
			preemptee_job_list = list_create(NULL);
		}
		list_append(preemptee_job_list, job_p);
	}
	list_iterator_destroy(job_iterator);

	if (preemptee_job_list && youngest_order)
		list_sort(preemptee_job_list, _sort_by_youngest);
	else if (preemptee_job_list)
		list_sort(preemptee_job_list, _sort_by_prio);

	return preemptee_job_list;
}
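The _sort_by_youngest() and _sort_by_prio() comparators passed to list_sort() above are not included in this excerpt; they are presumably defined alongside this function in the preempt plugin. The following is only a hypothetical sketch of their shape — the sort direction and the comparator calling convention depend on the Slurm release, so treat every detail here as an assumption:

/* Hypothetical comparators for illustration; not the actual plugin code.
 * Assumes list_sort() hands the comparator pointers to the stored
 * job_record pointers. */
static int _sort_by_prio(void *x, void *y)
{
	struct job_record *j1 = *(struct job_record **) x;
	struct job_record *j2 = *(struct job_record **) y;

	/* Order candidates by job priority (assumed policy) */
	if (j1->priority < j2->priority)
		return -1;
	if (j1->priority > j2->priority)
		return 1;
	return 0;
}

static int _sort_by_youngest(void *x, void *y)
{
	struct job_record *j1 = *(struct job_record **) x;
	struct job_record *j2 = *(struct job_record **) y;

	/* Most recently started candidates sort first (assumed policy) */
	if (j1->start_time > j2->start_time)
		return -1;
	if (j1->start_time < j2->start_time)
		return 1;
	return 0;
}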
Example #3
/* NOTE: if job has already completed, we append "EXITCODE=#" to
 * the state name */
static char *	_get_job_state(struct job_record *job_ptr)
{
	char *state_str;
	static char return_msg[128];

	if (IS_JOB_COMPLETING(job_ptr)) {
		/* Give configured KillWait+10 for job
		 * to clear out, then consider job
		 * done. Moab will allocate jobs to
		 * nodes that are already Idle. */
		int age = (int) difftime(time(NULL),
			job_ptr->end_time);
		if (age < (kill_wait+10))
			return "Running";
	}

	if (IS_JOB_RUNNING(job_ptr))
		return "Running";
	if (IS_JOB_SUSPENDED(job_ptr))
		return "Suspended";
	if (IS_JOB_PENDING(job_ptr))
		return "Idle";

	if (IS_JOB_COMPLETE(job_ptr) || IS_JOB_FAILED(job_ptr))
		state_str = "Completed";
	else /* JOB_CANCELLED, JOB_TIMEOUT, JOB_NODE_FAIL, etc. */
		state_str = "Removed";
	snprintf(return_msg, sizeof(return_msg), "%s;EXITCODE=%u",
		state_str, WEXITSTATUS(job_ptr->exit_code));
	return return_msg;
}
Example #4
File: print.c  Project: Cray/slurm
static int _get_node_cnt(job_info_t * job)
{
	int node_cnt = 0;

	/*  For PENDING jobs, return the maximum of the requested nodelist,
	 *   requested maximum number of nodes, or requested CPUs rounded
	 *   to nearest node.
	 *
	 *  For COMPLETING jobs, the job->nodes nodelist has already been
	 *   altered to list only the nodes still in the comp state, and
	 *   thus we count only those nodes toward the total nodes still
	 *   allocated to this job.
	 */

	if (IS_JOB_PENDING(job)) {
		node_cnt = _nodes_in_list(job->req_nodes);
		node_cnt = MAX(node_cnt, job->num_nodes);
		if ((node_cnt == 1) && (job->num_cpus > 1)
		    && job->ntasks_per_node
		    && (job->ntasks_per_node != (uint16_t) NO_VAL)) {
			int num_tasks = job->num_cpus;
			if (job->cpus_per_task != (uint16_t) NO_VAL)
				num_tasks /= job->cpus_per_task;
			node_cnt = (num_tasks + 1) / job->ntasks_per_node;
			if (node_cnt > num_tasks)
				node_cnt = num_tasks;
			else if (!node_cnt)
				node_cnt = 1;
		}
	} else
		node_cnt = _nodes_in_list(job->nodes);
	return node_cnt;
}
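The _nodes_in_list() helper used above (and again in Example #15) is not part of the excerpt. It simply counts the hosts named in a Slurm node-list expression; a minimal sketch using the standard hostset API would be:

/* Sketch of the helper: count hosts in an expression like "tux[000-015]" */
static int _nodes_in_list(char *node_list)
{
	hostset_t host_set = hostset_create(node_list);
	int count = hostset_count(host_set);

	hostset_destroy(host_set);
	return count;
}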
Example #5
File: print.c  Project: mrhaoji/slurm
int _print_job_job_id(job_info_t * job, int width, bool right, char* suffix)
{
	if (job == NULL) {	/* Print the Header instead */
		_print_str("JOBID", width, right, true);
	} else if ((job->array_task_id != NO_VAL) &&
		   !params.array_flag && IS_JOB_PENDING(job)  &&
		   job->node_inx) {
		uint32_t i, max_task_id = 0;
		char id[FORMAT_STRING_SIZE], task_str[FORMAT_STRING_SIZE];
		bitstr_t *task_bits;
		for (i = 1; i <= job->node_inx[0]; i++)
			max_task_id = MAX(max_task_id, job->node_inx[i]);
		task_bits = bit_alloc(max_task_id + 1);
		for (i = 1; i <= job->node_inx[0]; i++)
			bit_set(task_bits, job->node_inx[i]);
		bit_fmt(task_str, sizeof(task_str), task_bits);
		snprintf(id, FORMAT_STRING_SIZE, "%u_[%s]",
			 job->array_job_id, task_str);
		_print_str(id, width, right, true);
		bit_free(task_bits);
	} else if (job->array_task_id != NO_VAL) {
		char id[FORMAT_STRING_SIZE];
		snprintf(id, FORMAT_STRING_SIZE, "%u_%u",
			 job->array_job_id, job->array_task_id);
		_print_str(id, width, right, true);
	} else {
		char id[FORMAT_STRING_SIZE];
		snprintf(id, FORMAT_STRING_SIZE, "%u", job->job_id);
		_print_str(id, width, right, true);
	}
	if (suffix)
		printf("%s", suffix);
	return SLURM_SUCCESS;
}
Example #6
/* Code taken from job_info.c: calculate cumulative run time for a job */
static time_t _get_job_runtime(struct job_record *job_ptr)
{
	time_t end_time, run_time;

	if (IS_JOB_PENDING(job_ptr))
		run_time = 0;
	else if (IS_JOB_SUSPENDED(job_ptr))
		run_time = job_ptr->pre_sus_time;
	else {
		if (IS_JOB_RUNNING(job_ptr) || (job_ptr->end_time == 0))
			end_time = time(NULL);
		else
			end_time = job_ptr->end_time;
		if (job_ptr->suspend_time) {
			run_time = (time_t)
				   (difftime(end_time, job_ptr->suspend_time)
				    + job_ptr->pre_sus_time);
		} else {
			run_time = (time_t)
				   difftime(end_time, job_ptr->start_time);
		}
	}

	return run_time;
}
Example #7
/*
 * srun_user_message - Send arbitrary message to an srun job (no job steps)
 */
extern int srun_user_message(struct job_record *job_ptr, char *msg)
{
	slurm_addr_t * addr;
	srun_user_msg_t *msg_arg;

	xassert(job_ptr);
	if (!IS_JOB_PENDING(job_ptr) && !IS_JOB_RUNNING(job_ptr))
		return ESLURM_ALREADY_DONE;

	if (job_ptr->other_port &&
	    job_ptr->resp_host && job_ptr->resp_host[0]) {
		addr = xmalloc(sizeof(struct sockaddr_in));
		slurm_set_addr(addr, job_ptr->other_port, job_ptr->resp_host);
		msg_arg = xmalloc(sizeof(srun_user_msg_t));
		msg_arg->job_id = job_ptr->job_id;
		msg_arg->msg    = xstrdup(msg);
		_srun_agent_launch(addr, job_ptr->resp_host, SRUN_USER_MSG,
				   msg_arg);
		return SLURM_SUCCESS;
	} else if (job_ptr->batch_flag && IS_JOB_RUNNING(job_ptr)) {
#ifndef HAVE_FRONT_END
		struct node_record *node_ptr;
#endif
		job_notify_msg_t *notify_msg_ptr;
		agent_arg_t *agent_arg_ptr;
#ifdef HAVE_FRONT_END
		if (job_ptr->batch_host == NULL)
			return ESLURM_DISABLED;	/* no allocated nodes */
		agent_arg_ptr = (agent_arg_t *) xmalloc(sizeof(agent_arg_t));
		agent_arg_ptr->hostlist = hostlist_create(job_ptr->batch_host);
#else
		node_ptr = find_first_node_record(job_ptr->node_bitmap);
		if (node_ptr == NULL)
			return ESLURM_DISABLED;	/* no allocated nodes */
		agent_arg_ptr = (agent_arg_t *) xmalloc(sizeof(agent_arg_t));
		agent_arg_ptr->hostlist = hostlist_create(node_ptr->name);
#endif
		if (agent_arg_ptr->hostlist == NULL)
			fatal("hostlist_create: malloc failure");
		notify_msg_ptr = (job_notify_msg_t *) 
				 xmalloc(sizeof(job_notify_msg_t));
		notify_msg_ptr->job_id = job_ptr->job_id;
		notify_msg_ptr->message = xstrdup(msg);
		agent_arg_ptr->node_count = 1;
		agent_arg_ptr->retry = 0;
		agent_arg_ptr->msg_type = REQUEST_JOB_NOTIFY;
		agent_arg_ptr->msg_args = (void *) notify_msg_ptr;
		/* Launch the RPC via agent */
		agent_queue_request(agent_arg_ptr);
		return SLURM_SUCCESS;
	}
	return ESLURM_DISABLED;
}
Example #8
File: print.c  Project: Cray/slurm
static bool _merge_job_array(List l, job_info_t * job_ptr)
{
	job_info_t *list_job_ptr;
	ListIterator iter;
	bool merge = false;

	if (params.array_flag)
		return merge;
	if (job_ptr->array_task_id == NO_VAL)
		return merge;
	if (!IS_JOB_PENDING(job_ptr))
		return merge;
	xfree(job_ptr->node_inx);
	if (!l)
		return merge;

	iter = list_iterator_create(l);
	while ((list_job_ptr = list_next(iter))) {
		if ((list_job_ptr->array_task_id ==  NO_VAL) ||
		    (job_ptr->array_job_id != list_job_ptr->array_job_id) ||
		    (!IS_JOB_PENDING(list_job_ptr)))
			continue;
		/* We re-purpose the job's node_inx array to store the
		 * array_task_id values */
		if (!list_job_ptr->node_inx) {
			list_job_ptr->node_inx = xmalloc(sizeof(int32_t) * 0xffff);
			list_job_ptr->node_inx[0] = 1;		/* offset */
			list_job_ptr->node_inx[1] =
				list_job_ptr->array_task_id;
		}
		list_job_ptr->node_inx[0]++;
		list_job_ptr->node_inx[list_job_ptr->node_inx[0]] =
				job_ptr->array_task_id;
		merge = true;
		break;
	}
	list_iterator_destroy(iter);

	return merge;
}
Example #9
File: fair_tree.c  Project: mart1nl/slurm
/* Apply usage with decay factor. Call standard functions */
static void _ft_decay_apply_new_usage(struct job_record *job, time_t *start)
{
	if (!decay_apply_new_usage(job, start))
		return;

	/* Priority 0 is reserved for held jobs. Also skip priority
	 * calculation for non-pending jobs. */
	if ((job->priority == 0) || !IS_JOB_PENDING(job))
		return;

	set_priority_factors(*start, job);
	last_job_update = time(NULL);
}
Example #10
extern int job_modify(struct job_descriptor *job_desc,
		      struct job_record *job_ptr, uint32_t submit_uid)
{
	uint16_t job_mcdram, job_numa;
	int mcdram_cnt, numa_cnt;
	char *tmp_str;

	if (!job_desc->features)
		return SLURM_SUCCESS;
	if (!IS_JOB_PENDING(job_ptr))
		return ESLURM_JOB_NOT_PENDING;

	job_mcdram = knl_mcdram_parse(job_desc->features, "&");
	mcdram_cnt = knl_mcdram_bits_cnt(job_mcdram);
	if (mcdram_cnt > 1) {			/* Multiple MCDRAM options */
		return ESLURM_INVALID_KNL;
	} else if (mcdram_cnt == 0) {
		if (job_desc->features && job_desc->features[0])
			xstrcat(job_desc->features, "&");
		tmp_str = knl_mcdram_str(default_mcdram);
		xstrcat(job_desc->features, tmp_str);
		xfree(tmp_str);
	} else if ((job_mcdram & avail_mcdram) == 0) { /* Unavailable option */
		return ESLURM_INVALID_KNL;
	}

	job_numa = knl_numa_parse(job_desc->features, "&");
	numa_cnt = knl_numa_bits_cnt(job_numa);
	if (numa_cnt > 1) {			/* Multiple NUMA options */
		return ESLURM_INVALID_KNL;
	} else if (numa_cnt == 0) {
		if (job_desc->features && job_desc->features[0])
			xstrcat(job_desc->features, "&");
		tmp_str = knl_numa_str(default_numa);
		xstrcat(job_desc->features, tmp_str);
		xfree(tmp_str);
	} else if ((job_numa & avail_numa) == 0) { /* Unavailable NUMA option */
		return ESLURM_INVALID_KNL;
	}

	return SLURM_SUCCESS;
}
Example #11
File: print.c  Project: Cray/slurm
long job_time_used(job_info_t * job_ptr)
{
	time_t end_time;

	if ((job_ptr->start_time == 0) || IS_JOB_PENDING(job_ptr))
		return 0L;

	if (IS_JOB_SUSPENDED(job_ptr))
		return (long) job_ptr->pre_sus_time;

	if (IS_JOB_RUNNING(job_ptr) || (job_ptr->end_time == 0))
		end_time = time(NULL);
	else
		end_time = job_ptr->end_time;

	if (job_ptr->suspend_time)
		return (long) (difftime(end_time, job_ptr->suspend_time)
				+ job_ptr->pre_sus_time);
	return (long) (difftime(end_time, job_ptr->start_time));
}
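Callers typically feed the seconds returned by job_time_used() (or the run_time computed in Example #6) into secs2time_str(), the same formatter seen in Example #19, to get squeue-style elapsed-time strings. A short usage sketch (the _print_elapsed wrapper name is hypothetical):

/* Usage sketch: print a job's elapsed time in squeue's format */
static void _print_elapsed(job_info_t *job_ptr)
{
	char time_buf[32];

	/* secs2time_str() renders [days-]hours:minutes:seconds */
	secs2time_str(job_time_used(job_ptr), time_buf, sizeof(time_buf));
	printf("TIME=%s\n", time_buf);
}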
Example #12
/*
 * load into the storage information about a job,
 * typically when it begins execution, but possibly earlier
 */
extern int jobacct_storage_g_job_start(void *db_conn,
				       struct job_record *job_ptr)
{
	if (slurm_acct_storage_init(NULL) < 0)
		return SLURM_ERROR;

	/* A pending job's start_time is its expected initiation time
	 * (changed in slurm v2.1). Rather than changing a bunch of code
	 * in the accounting_storage plugins and SlurmDBD, just clear
	 * start_time before accounting and restore it later. */
	if (IS_JOB_PENDING(job_ptr)) {
		int rc;
		time_t orig_start_time = job_ptr->start_time;
		job_ptr->start_time = (time_t) 0;
		rc = (*(g_acct_storage_context->ops.job_start))(
			db_conn, job_ptr);
		job_ptr->start_time = orig_start_time;
		return rc;
	}

	return (*(g_acct_storage_context->ops.job_start))(db_conn, job_ptr);
}
Example #13
File: print.c  Project: Cray/slurm
int _print_job_reason_list(job_info_t * job, int width, bool right,
		char* suffix)
{
	if (job == NULL) {	/* Print the Header instead */
		char *title = "NODELIST(REASON)";
		if (params.cluster_flags & CLUSTER_FLAG_BG)
			title = "MIDPLANELIST(REASON)";
		_print_str(title, width, right, false);
	} else if (!IS_JOB_COMPLETING(job)
		   && (IS_JOB_PENDING(job)
		       || IS_JOB_TIMEOUT(job)
		       || IS_JOB_FAILED(job))) {
		char id[FORMAT_STRING_SIZE], *reason;
		if (job->state_desc)
			reason = job->state_desc;
		else
			reason = job_reason_string(job->state_reason);
		snprintf(id, FORMAT_STRING_SIZE, "(%s)", reason);
		_print_str(id, width, right, true);
	} else {
		char *nodes = xstrdup(job->nodes);
		char *ionodes = NULL;

		select_g_select_jobinfo_get(job->select_jobinfo,
					    SELECT_JOBDATA_IONODES,
					    &ionodes);
		if (ionodes) {
			xstrfmtcat(nodes, "[%s]", ionodes);
			xfree(ionodes);
			_print_str(nodes, width, right, false);
		} else
			_print_nodes(nodes, width, right, false);
		xfree(nodes);
	}
	if (suffix)
		printf("%s", suffix);
	return SLURM_SUCCESS;
}
Example #14
static bool _pending_pack_jobs(struct job_record *job_ptr)
{
	struct job_record *pack_leader, *pack_job;
	ListIterator iter;
	bool pending_job = false;

	if (job_ptr->pack_job_id == 0)
		return false;

	pack_leader = find_job_record(job_ptr->pack_job_id);
	if (!pack_leader) {
		error("Job pack leader %pJ not found", job_ptr);
		return false;
	}
	if (!pack_leader->pack_job_list) {
		error("Job pack leader %pJ lacks pack_job_list",
		      job_ptr);
		return false;
	}

	iter = list_iterator_create(pack_leader->pack_job_list);
	while ((pack_job = (struct job_record *) list_next(iter))) {
		if (pack_leader->pack_job_id != pack_job->pack_job_id) {
			error("%s: Bad pack_job_list for %pJ",
			      __func__, pack_leader);
			continue;
		}
		if (IS_JOB_PENDING(pack_job)) {
			pending_job = true;
			break;
		}
	}
	list_iterator_destroy(iter);

	return pending_job;
}
Example #15
File: print.c  Project: IFCA/slurm
static int _get_node_cnt(job_info_t * job)
{
	int node_cnt = 0, round;

	/*  For PENDING jobs, return the maximum of the requested nodelist,
	 *   requested maximum number of nodes, or requested CPUs rounded
	 *   to nearest node.
	 *
	 *  For COMPLETING jobs, the job->nodes nodelist has already been
	 *   altered to list only the nodes still in the comp state, and
	 *   thus we count only those nodes toward the total nodes still
	 *   allocated to this job.
	 */

	if (IS_JOB_PENDING(job)) {
		node_cnt = _nodes_in_list(job->req_nodes);
		node_cnt = MAX(node_cnt, job->num_nodes);
		round  = job->num_cpus + params.max_cpus - 1;
		round /= params.max_cpus;	/* round up */
		node_cnt = MAX(node_cnt, round);
	} else
		node_cnt = _nodes_in_list(job->nodes);
	return node_cnt;
}
Example #16
File: acct_policy.c  Project: VURM/slurm
/*
 * acct_policy_update_pending_job - Make sure the limits imposed on a
 *	job on submission are correct after an update to a qos or
 *	association.  If the association/qos limits prevent
 *	the job from ever running (lowered limits since job submission),
 *	then cancel the job.
 */
extern int acct_policy_update_pending_job(struct job_record *job_ptr)
{
	job_desc_msg_t job_desc;
	uint16_t limit_set_max_cpus = 0;
	uint16_t limit_set_max_nodes = 0;
	uint16_t limit_set_time = 0;
	bool update_accounting = false;
	struct job_details *details_ptr;
	int rc = SLURM_SUCCESS;

	/* check to see if we are enforcing associations and the job
	 * is pending or if we are even enforcing limits. */
	if (!accounting_enforce || !IS_JOB_PENDING(job_ptr)
	    || !(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS))
		return SLURM_SUCCESS;

	details_ptr = job_ptr->details;

	if (!details_ptr) {
		error("acct_policy_update_pending_job: no details");
		return SLURM_ERROR;
	}

	/* set up the job desc to make sure things are the way we
	 * need.
	 */
	slurm_init_job_desc_msg(&job_desc);

	job_desc.min_cpus = details_ptr->min_cpus;
	/* Only set this value if not set from a limit */
	if (job_ptr->limit_set_max_cpus == ADMIN_SET_LIMIT)
		limit_set_max_cpus = job_ptr->limit_set_max_cpus;
	else if ((details_ptr->max_cpus != NO_VAL)
		 && !job_ptr->limit_set_max_cpus)
		job_desc.max_cpus = details_ptr->max_cpus;

	job_desc.min_nodes = details_ptr->min_nodes;
	/* Only set this value if not set from a limit */
	if (job_ptr->limit_set_max_nodes == ADMIN_SET_LIMIT)
		limit_set_max_nodes = job_ptr->limit_set_max_nodes;
	else if ((details_ptr->max_nodes != NO_VAL)
		 && !job_ptr->limit_set_max_nodes)
		job_desc.max_nodes = details_ptr->max_nodes;
	else
		job_desc.max_nodes = 0;

	/* Only set this value if not set from a limit */
	if (job_ptr->limit_set_time == ADMIN_SET_LIMIT)
		limit_set_time = job_ptr->limit_set_time;
	else if ((job_ptr->time_limit != NO_VAL) && !job_ptr->limit_set_time)
		job_desc.time_limit = job_ptr->time_limit;

	if (!acct_policy_validate(&job_desc, job_ptr->part_ptr,
				  job_ptr->assoc_ptr, job_ptr->qos_ptr,
				  &limit_set_max_cpus,
				  &limit_set_max_nodes,
				  &limit_set_time, 0)) {
		info("acct_policy_update_pending_job: exceeded "
		     "association/qos's cpu, node or "
		     "time limit for job %d", job_ptr->job_id);
		_cancel_job(job_ptr);
		return SLURM_ERROR;
	}

	/* If it isn't an admin set limit replace it. */
	if (!limit_set_max_cpus && (job_ptr->limit_set_max_cpus == 1)) {
		details_ptr->max_cpus = NO_VAL;
		job_ptr->limit_set_max_cpus = 0;
		update_accounting = true;
	} else if (limit_set_max_cpus != ADMIN_SET_LIMIT) {
		if (details_ptr->max_cpus != job_desc.max_cpus) {
			details_ptr->max_cpus = job_desc.max_cpus;
			update_accounting = true;
		}
		job_ptr->limit_set_max_cpus = limit_set_max_cpus;
	}

	if (!limit_set_max_nodes && (job_ptr->limit_set_max_nodes == 1)) {
		details_ptr->max_nodes = 0;
		job_ptr->limit_set_max_nodes = 0;
		update_accounting = true;
	} else if (limit_set_max_nodes != ADMIN_SET_LIMIT) {
		if (details_ptr->max_nodes != job_desc.max_nodes) {
			details_ptr->max_nodes = job_desc.max_nodes;
			update_accounting = true;
		}
		job_ptr->limit_set_max_nodes = limit_set_max_nodes;
	}

	if (!limit_set_time && (job_ptr->limit_set_time == 1)) {
		job_ptr->time_limit = NO_VAL;
		job_ptr->limit_set_time = 0;
		update_accounting = true;
	} else if (limit_set_time != ADMIN_SET_LIMIT) {
		if (job_ptr->time_limit != job_desc.time_limit) {
			job_ptr->time_limit = job_desc.time_limit;
			update_accounting = true;
		}
		job_ptr->limit_set_time = limit_set_time;
	}

	if (update_accounting) {
		last_job_update = time(NULL);
		debug("limits changed for job %u: updating accounting",
		      job_ptr->job_id);
		if (details_ptr->begin_time) {
			/* Update job record in accounting to reflect changes */
			jobacct_storage_g_job_start(acct_db_conn, job_ptr);
		}
	}

	return rc;
}
Example #17
static char *	_will_run_test2(uint32_t jobid, time_t start_time,
				char *node_list,
				uint32_t *preemptee, int preemptee_cnt,
				int *err_code, char **err_msg)
{
	struct job_record *job_ptr = NULL, *pre_ptr;
	struct part_record *part_ptr;
	bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL;
	bitstr_t *exc_core_bitmap = NULL;
	time_t start_res;
	uint32_t min_nodes, max_nodes, req_nodes;
	List preemptee_candidates = NULL, preempted_jobs = NULL;
	time_t orig_start_time;
	char *reply_msg = NULL;
	int i, rc;
	bool resv_overlap = false;

	xassert(node_list);
	debug2("wiki2: will_run2 job_id=%u start_time=%u node_list=%s",
		jobid, (uint32_t)start_time, node_list);

	job_ptr = find_job_record(jobid);
	if (job_ptr == NULL) {
		*err_code = -700;
		*err_msg = "No such job";
		error("wiki: Failed to find job %u", jobid);
		return NULL;
	}
	if ((job_ptr->details == NULL) || (!IS_JOB_PENDING(job_ptr))) {
		*err_code = -700;
		*err_msg = "WillRun not applicable to non-pending job";
		error("wiki: WillRun on non-pending job %u", jobid);
		return NULL;
	}

	part_ptr = job_ptr->part_ptr;
	if (part_ptr == NULL) {
		*err_code = -700;
		*err_msg = "Job lacks a partition";
		error("wiki: Job %u lacks a partition", jobid);
		return NULL;
	}

	if (node_name2bitmap(node_list, false, &avail_bitmap) != 0) {
		*err_code = -700;
		*err_msg = "Invalid available nodes value";
		error("wiki: Attempt to set invalid available node "
		      "list for job %u, %s", jobid, node_list);
		return NULL;
	}

	/* Enforce reservation: access control, time and nodes */
	start_res = start_time;
	rc = job_test_resv(job_ptr, &start_res, true, &resv_bitmap,
			   &exc_core_bitmap, &resv_overlap);
	if (rc != SLURM_SUCCESS) {
		*err_code = -730;
		*err_msg = "Job denied access to reservation";
		error("wiki: reservation access denied for job %u", jobid);
		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		return NULL;
	}
	bit_and(avail_bitmap, resv_bitmap);
	FREE_NULL_BITMAP(resv_bitmap);

	/* Only consider nodes that are not DOWN or DRAINED */
	bit_and(avail_bitmap, avail_node_bitmap);

	/* Consider only nodes in this job's partition */
	if (part_ptr->node_bitmap)
		bit_and(avail_bitmap, part_ptr->node_bitmap);
	else {
		*err_code = -730;
		*err_msg = "Job's partition has no nodes";
		error("wiki: no nodes in partition %s for job %u",
			part_ptr->name, jobid);
		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		return NULL;
	}

	if (job_req_node_filter(job_ptr, avail_bitmap) != SLURM_SUCCESS) {
		/* Job probably has invalid feature list */
		*err_code = -730;
		*err_msg = "Job's required features not available "
			   "on selected nodes";
		error("wiki: job %u not runnable on hosts=%s",
			jobid, node_list);
		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		return NULL;
	}
	if (job_ptr->details->exc_node_bitmap) {
		bit_not(job_ptr->details->exc_node_bitmap);
		bit_and(avail_bitmap, job_ptr->details->exc_node_bitmap);
		bit_not(job_ptr->details->exc_node_bitmap);
	}
	if ((job_ptr->details->req_node_bitmap) &&
	    (!bit_super_set(job_ptr->details->req_node_bitmap,
			    avail_bitmap))) {
		*err_code = -730;
		*err_msg = "Job's required nodes not available";
		error("wiki: job %u not runnable on hosts=%s",
			jobid, node_list);
		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		return NULL;
	}

	min_nodes = MAX(job_ptr->details->min_nodes, part_ptr->min_nodes);
	if (job_ptr->details->max_nodes == 0)
		max_nodes = part_ptr->max_nodes;
	else
		max_nodes = MIN(job_ptr->details->max_nodes,
				part_ptr->max_nodes);
	max_nodes = MIN(max_nodes, 500000); /* prevent overflows */
	if (job_ptr->details->max_nodes)
		req_nodes = max_nodes;
	else
		req_nodes = min_nodes;
	if (min_nodes > max_nodes) {
		/* job's min_nodes exceeds partition's max_nodes */
		*err_code = -730;
		*err_msg = "Job's min_nodes > max_nodes";
		error("wiki: job %u not runnable on hosts=%s",
			jobid, node_list);
		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		return NULL;
	}

	if (preemptee_cnt) {
		preemptee_candidates = list_create(NULL);
		for (i=0; i<preemptee_cnt; i++) {
			if ((pre_ptr = find_job_record(preemptee[i])))
				list_append(preemptee_candidates, pre_ptr);
		}
	}

	orig_start_time = job_ptr->start_time;
	rc = select_g_job_test(job_ptr, avail_bitmap, min_nodes, max_nodes,
			       req_nodes, SELECT_MODE_WILL_RUN,
			       preemptee_candidates, &preempted_jobs,
			       exc_core_bitmap);
	FREE_NULL_LIST(preemptee_candidates);

	if (rc == SLURM_SUCCESS) {
		char *hostlist, *sep, tmp_str[128];
		uint32_t pre_cnt = 0, proc_cnt = 0;

#ifdef HAVE_BG
		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
				     SELECT_JOBDATA_NODE_CNT, &proc_cnt);
#else
		proc_cnt = job_ptr->total_cpus;
#endif
		snprintf(tmp_str, sizeof(tmp_str),
			 "STARTINFO=%u TASKS=%u STARTTIME=%u NODES=",
			 job_ptr->job_id, proc_cnt,
			 (uint32_t) job_ptr->start_time);
		xstrcat(reply_msg, tmp_str);
		hostlist = bitmap2node_name(avail_bitmap);
		xstrcat(reply_msg, hostlist);
		xfree(hostlist);

		if (preempted_jobs) {
			while ((pre_ptr = list_pop(preempted_jobs))) {
				if (pre_cnt++)
					sep = ",";
				else
					sep = " PREEMPT=";
				snprintf(tmp_str, sizeof(tmp_str), "%s%u",
					 sep, pre_ptr->job_id);
				xstrcat(reply_msg, tmp_str);
			}
			FREE_NULL_LIST(preempted_jobs);
		}
	} else {
		xstrcat(reply_msg, "Jobs not runable on selected nodes");
		error("wiki: jobs not runnable on nodes");
	}

	/* Restore pending job's expected start time */
	job_ptr->start_time = orig_start_time;

	FREE_NULL_BITMAP(avail_bitmap);
	FREE_NULL_BITMAP(exc_core_bitmap);
	return reply_msg;
}
Example #18
extern void get_job(void)
{
	int error_code = -1, i, recs;
	static int printed_jobs = 0;
	static int count = 0;
	static job_info_msg_t *job_info_ptr = NULL, *new_job_ptr = NULL;
	job_info_t *job_ptr = NULL;
	uint16_t show_flags = 0;
	bitstr_t *nodes_req = NULL;
	static uint16_t last_flags = 0;

	if (params.all_flag)
		show_flags |= SHOW_ALL;
	if (job_info_ptr) {
		if (show_flags != last_flags)
			job_info_ptr->last_update = 0;
		error_code = slurm_load_jobs(job_info_ptr->last_update,
					     &new_job_ptr, show_flags);
		if (error_code == SLURM_SUCCESS)
			slurm_free_job_info_msg(job_info_ptr);
		else if (slurm_get_errno() == SLURM_NO_CHANGE_IN_DATA) {
			error_code = SLURM_SUCCESS;
			new_job_ptr = job_info_ptr;
		}
	} else
		error_code = slurm_load_jobs((time_t) NULL, &new_job_ptr,
					     show_flags);

	last_flags = show_flags;
	if (error_code) {
		if (quiet_flag != 1) {
			if (!params.commandline) {
				mvwprintw(text_win,
					  main_ycord, 1,
					  "slurm_load_jobs: %s",
					  slurm_strerror(slurm_get_errno()));
				main_ycord++;
			} else {
				printf("slurm_load_jobs: %s\n",
				       slurm_strerror(slurm_get_errno()));
			}
		}
	}

	if (!params.no_header)
		_print_header_job();

	if (new_job_ptr)
		recs = new_job_ptr->record_count;
	else
		recs = 0;

	if (!params.commandline)
		if ((text_line_cnt+printed_jobs) > count)
			text_line_cnt--;
	printed_jobs = 0;
	count = 0;

	if (params.hl)
		nodes_req = get_requested_node_bitmap();
	for (i = 0; i < recs; i++) {
		job_ptr = &(new_job_ptr->job_array[i]);
		if (!IS_JOB_PENDING(job_ptr)   && !IS_JOB_RUNNING(job_ptr) &&
		    !IS_JOB_SUSPENDED(job_ptr) && !IS_JOB_COMPLETING(job_ptr))
			continue;	/* job has completed */
		if (nodes_req) {
			int overlap = 0;
			bitstr_t *loc_bitmap = bit_alloc(bit_size(nodes_req));
			inx2bitstr(loc_bitmap, job_ptr->node_inx);
			overlap = bit_overlap(loc_bitmap, nodes_req);
			FREE_NULL_BITMAP(loc_bitmap);
			if (!overlap)
				continue;
		}

		if (job_ptr->node_inx[0] != -1) {
			int j = 0;
			job_ptr->num_nodes = 0;
			while (job_ptr->node_inx[j] >= 0) {
				job_ptr->num_nodes +=
					(job_ptr->node_inx[j + 1] + 1) -
					 job_ptr->node_inx[j];
				set_grid_inx(job_ptr->node_inx[j],
					     job_ptr->node_inx[j + 1], count);
				j += 2;
			}

			if (!params.commandline) {
				if ((count >= text_line_cnt) &&
				    (printed_jobs < (getmaxy(text_win) - 4))) {
					job_ptr->num_cpus =
						(int)letters[count%62];
					wattron(text_win,
						COLOR_PAIR(colors[count%6]));
					_print_text_job(job_ptr);
					wattroff(text_win,
						 COLOR_PAIR(colors[count%6]));
					printed_jobs++;
				}
			} else {
				job_ptr->num_cpus = (int)letters[count%62];
				_print_text_job(job_ptr);
			}
			count++;
		}
		if (count == 128)
			count = 0;
	}

	for (i = 0; i < recs; i++) {
		job_ptr = &(new_job_ptr->job_array[i]);

		if (!IS_JOB_PENDING(job_ptr))
			continue;	/* job has completed */

		if (!params.commandline) {
			if ((count>=text_line_cnt) &&
			    (printed_jobs < (getmaxy(text_win) - 4))) {
				xfree(job_ptr->nodes);
				job_ptr->nodes = xstrdup("waiting...");
				job_ptr->num_cpus = (int) letters[count%62];
				wattron(text_win,
					COLOR_PAIR(colors[count%6]));
				_print_text_job(job_ptr);
				wattroff(text_win,
					 COLOR_PAIR(colors[count%6]));
				printed_jobs++;
			}
		} else {
			xfree(job_ptr->nodes);
			job_ptr->nodes = xstrdup("waiting...");
			job_ptr->num_cpus = (int) letters[count%62];
			_print_text_job(job_ptr);
			printed_jobs++;
		}
		count++;

		if (count == 128)
			count = 0;
	}

	if (params.commandline && params.iterate)
		printf("\n");

	if (!params.commandline)
		main_ycord++;

	job_info_ptr = new_job_ptr;
	return;
}
Example #19
File: job_info.c  Project: IFCA/slurm
/*
 * slurm_sprint_job_info - output information about a specific Slurm
 *	job based upon message as loaded using slurm_load_jobs
 * IN job_ptr - an individual job information record pointer
 * IN one_liner - print as a single line if true
 * RET out - char * containing formatted output (must be freed after call)
 *           NULL is returned on failure.
 */
extern char *
slurm_sprint_job_info ( job_info_t * job_ptr, int one_liner )
{
	int i, j;
	char time_str[32], *group_name, *user_name;
	char tmp1[128], tmp2[128], tmp3[128], tmp4[128], tmp5[128], *tmp6_ptr;
	char tmp_line[512];
	char *ionodes = NULL;
	uint16_t exit_status = 0, term_sig = 0;
	job_resources_t *job_resrcs = job_ptr->job_resrcs;
	char *out = NULL;
	time_t run_time;
	uint32_t min_nodes, max_nodes = 0;
	char *nodelist = "NodeList";
	bitstr_t *core_bitmap;
	char *host;
	int sock_inx, sock_reps, last;
	int abs_node_inx, rel_node_inx;
	int bit_inx, bit_reps;
	uint32_t *last_mem_alloc_ptr = NULL;
	uint32_t last_mem_alloc = NO_VAL;
	char *last_hosts;
	hostlist_t hl, hl_last;
	char select_buf[122];
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();

	if (cluster_flags & CLUSTER_FLAG_BG) {
		nodelist = "MidplaneList";
		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_IONODES,
					    &ionodes);
	}

	/****** Line 1 ******/
	snprintf(tmp_line, sizeof(tmp_line), "JobId=%u ", job_ptr->job_id);
	out = xstrdup(tmp_line);
	if (job_ptr->array_job_id) {
		snprintf(tmp_line, sizeof(tmp_line), 
			 "ArrayJobId=%u ArrayTaskId=%u ",
			 job_ptr->array_job_id, job_ptr->array_task_id);
		xstrcat(out, tmp_line);
	}
	snprintf(tmp_line, sizeof(tmp_line), "Name=%s", job_ptr->name);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 2 ******/
	user_name = uid_to_string((uid_t) job_ptr->user_id);
	group_name = gid_to_string((gid_t) job_ptr->group_id);
	snprintf(tmp_line, sizeof(tmp_line),
		 "UserId=%s(%u) GroupId=%s(%u)",
		 user_name, job_ptr->user_id, group_name, job_ptr->group_id);
	xfree(user_name);
	xfree(group_name);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 3 ******/
	snprintf(tmp_line, sizeof(tmp_line),
		 "Priority=%u Account=%s QOS=%s",
		 job_ptr->priority, job_ptr->account, job_ptr->qos);
	xstrcat(out, tmp_line);
	if (slurm_get_track_wckey()) {
		snprintf(tmp_line, sizeof(tmp_line),
			 " WCKey=%s", job_ptr->wckey);
		xstrcat(out, tmp_line);
	}
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 4 ******/
	if (job_ptr->state_desc) {
		/* Replace white space with underscore for easier parsing */
		for (j=0; job_ptr->state_desc[j]; j++) {
			if (isspace((int)job_ptr->state_desc[j]))
				job_ptr->state_desc[j] = '_';
		}
		tmp6_ptr = job_ptr->state_desc;
	} else
		tmp6_ptr = job_reason_string(job_ptr->state_reason);
	snprintf(tmp_line, sizeof(tmp_line),
		 "JobState=%s Reason=%s Dependency=%s",
		 job_state_string(job_ptr->job_state), tmp6_ptr,
		 job_ptr->dependency);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 5 ******/
	snprintf(tmp_line, sizeof(tmp_line),
		 "Requeue=%u Restarts=%u BatchFlag=%u ",
		 job_ptr->requeue, job_ptr->restart_cnt, job_ptr->batch_flag);
	xstrcat(out, tmp_line);
	if (WIFSIGNALED(job_ptr->exit_code))
		term_sig = WTERMSIG(job_ptr->exit_code);
	exit_status = WEXITSTATUS(job_ptr->exit_code);
	snprintf(tmp_line, sizeof(tmp_line),
		 "ExitCode=%u:%u", exit_status, term_sig);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 5a (optional) ******/
	if (!(job_ptr->show_flags & SHOW_DETAIL))
		goto line6;
	if (WIFSIGNALED(job_ptr->derived_ec))
		term_sig = WTERMSIG(job_ptr->derived_ec);
	else
		term_sig = 0;
	exit_status = WEXITSTATUS(job_ptr->derived_ec);
	snprintf(tmp_line, sizeof(tmp_line),
		 "DerivedExitCode=%u:%u", exit_status, term_sig);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 6 ******/
line6:
	snprintf(tmp_line, sizeof(tmp_line), "RunTime=");
	xstrcat(out, tmp_line);
	if (IS_JOB_PENDING(job_ptr))
		run_time = 0;
	else if (IS_JOB_SUSPENDED(job_ptr))
		run_time = job_ptr->pre_sus_time;
	else {
		time_t end_time;
		if (IS_JOB_RUNNING(job_ptr) || (job_ptr->end_time == 0))
			end_time = time(NULL);
		else
			end_time = job_ptr->end_time;
		if (job_ptr->suspend_time) {
			run_time = (time_t)
				(difftime(end_time, job_ptr->suspend_time)
				 + job_ptr->pre_sus_time);
		} else
			run_time = (time_t)
				difftime(end_time, job_ptr->start_time);
	}
	secs2time_str(run_time, tmp1, sizeof(tmp1));
	sprintf(tmp_line, "%s ", tmp1);
	xstrcat(out, tmp_line);

	snprintf(tmp_line, sizeof(tmp_line), "TimeLimit=");
	xstrcat(out, tmp_line);
	if (job_ptr->time_limit == NO_VAL)
		sprintf(tmp_line, "Partition_Limit");
	else {
		mins2time_str(job_ptr->time_limit, tmp_line,
			      sizeof(tmp_line));
	}
	xstrcat(out, tmp_line);
	snprintf(tmp_line, sizeof(tmp_line), " TimeMin=");
	xstrcat(out, tmp_line);
	if (job_ptr->time_min == 0)
		sprintf(tmp_line, "N/A");
	else {
		mins2time_str(job_ptr->time_min, tmp_line,
			      sizeof(tmp_line));
	}
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 7 ******/
	slurm_make_time_str((time_t *)&job_ptr->submit_time, time_str,
			    sizeof(time_str));
	snprintf(tmp_line, sizeof(tmp_line), "SubmitTime=%s ", time_str);
	xstrcat(out, tmp_line);

	slurm_make_time_str((time_t *)&job_ptr->eligible_time, time_str,
			    sizeof(time_str));
	snprintf(tmp_line, sizeof(tmp_line), "EligibleTime=%s", time_str);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 8 (optional) ******/
	if (job_ptr->resize_time) {
		slurm_make_time_str((time_t *)&job_ptr->resize_time, time_str,
				    sizeof(time_str));
		snprintf(tmp_line, sizeof(tmp_line), "ResizeTime=%s", time_str);
		xstrcat(out, tmp_line);
		if (one_liner)
			xstrcat(out, " ");
		else
			xstrcat(out, "\n   ");
	}

	/****** Line 9 ******/
	slurm_make_time_str((time_t *)&job_ptr->start_time, time_str,
			    sizeof(time_str));
	snprintf(tmp_line, sizeof(tmp_line), "StartTime=%s ", time_str);
	xstrcat(out, tmp_line);

	snprintf(tmp_line, sizeof(tmp_line), "EndTime=");
	xstrcat(out, tmp_line);
	if ((job_ptr->time_limit == INFINITE) &&
	    (job_ptr->end_time > time(NULL)))
		sprintf(tmp_line, "Unknown");
	else {
		slurm_make_time_str ((time_t *)&job_ptr->end_time, time_str,
				     sizeof(time_str));
		sprintf(tmp_line, "%s", time_str);
	}
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 10 ******/
	if (job_ptr->preempt_time == 0)
		sprintf(tmp_line, "PreemptTime=None ");
	else {
		slurm_make_time_str((time_t *)&job_ptr->preempt_time,
				    time_str, sizeof(time_str));
		snprintf(tmp_line, sizeof(tmp_line), "PreemptTime=%s ",
			 time_str);
	}
	xstrcat(out, tmp_line);
	if (job_ptr->suspend_time) {
		slurm_make_time_str ((time_t *)&job_ptr->suspend_time,
				     time_str, sizeof(time_str));
	} else {
		strncpy(time_str, "None", sizeof(time_str));
	}
	snprintf(tmp_line, sizeof(tmp_line),
		 "SuspendTime=%s SecsPreSuspend=%ld",
		 time_str, (long int)job_ptr->pre_sus_time);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 11 ******/
	snprintf(tmp_line, sizeof(tmp_line),
		 "Partition=%s AllocNode:Sid=%s:%u",
		 job_ptr->partition, job_ptr->alloc_node, job_ptr->alloc_sid);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 12 ******/
	snprintf(tmp_line, sizeof(tmp_line), "Req%s=%s Exc%s=%s",
		 nodelist, job_ptr->req_nodes, nodelist, job_ptr->exc_nodes);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 13 ******/
	xstrfmtcat(out, "%s=", nodelist);
	xstrcat(out, job_ptr->nodes);
	if (job_ptr->nodes && ionodes) {
		snprintf(tmp_line, sizeof(tmp_line), "[%s]", ionodes);
		xstrcat(out, tmp_line);
		xfree(ionodes);
	}
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 14 (optional) ******/
	if (job_ptr->batch_host) {
		snprintf(tmp_line, sizeof(tmp_line), "BatchHost=%s",
			 job_ptr->batch_host);
		xstrcat(out, tmp_line);
		if (one_liner)
			xstrcat(out, " ");
		else
			xstrcat(out, "\n   ");
	}

	/****** Line 15 ******/
	if (cluster_flags & CLUSTER_FLAG_BG) {
		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_NODE_CNT,
					    &min_nodes);
		if ((min_nodes == 0) || (min_nodes == NO_VAL)) {
			min_nodes = job_ptr->num_nodes;
			max_nodes = job_ptr->max_nodes;
		} else if (job_ptr->max_nodes)
			max_nodes = min_nodes;
	} else {
		min_nodes = job_ptr->num_nodes;
		max_nodes = job_ptr->max_nodes;
	}

	_sprint_range(tmp1, sizeof(tmp1), job_ptr->num_cpus, job_ptr->max_cpus);
	_sprint_range(tmp2, sizeof(tmp2), min_nodes, max_nodes);
	if (job_ptr->sockets_per_node == (uint16_t) NO_VAL)
		strcpy(tmp3, "*");
	else
		snprintf(tmp3, sizeof(tmp3), "%u", job_ptr->sockets_per_node);
	if (job_ptr->cores_per_socket == (uint16_t) NO_VAL)
		strcpy(tmp4, "*");
	else
		snprintf(tmp4, sizeof(tmp4), "%u", job_ptr->cores_per_socket);
	if (job_ptr->threads_per_core == (uint16_t) NO_VAL)
		strcpy(tmp5, "*");
	else
		snprintf(tmp5, sizeof(tmp5), "%u", job_ptr->threads_per_core);
	snprintf(tmp_line, sizeof(tmp_line),
		 "NumNodes=%s NumCPUs=%s CPUs/Task=%u ReqS:C:T=%s:%s:%s",
		 tmp2, tmp1, job_ptr->cpus_per_task, tmp3, tmp4, tmp5);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	if (!job_resrcs)
		goto line15;

	if (cluster_flags & CLUSTER_FLAG_BG) {
		if ((job_resrcs->cpu_array_cnt > 0) &&
		    (job_resrcs->cpu_array_value) &&
		    (job_resrcs->cpu_array_reps)) {
			int length = 0;
			xstrcat(out, "CPUs=");
			length += 10;
			for (i = 0; i < job_resrcs->cpu_array_cnt; i++) {
				if (length > 70) {
					/* skip to last CPU group entry */
					if (i < job_resrcs->cpu_array_cnt - 1) {
						continue;
					}
					/* add ellipsis before last entry */
					xstrcat(out, "...,");
					length += 4;
				}

				snprintf(tmp_line, sizeof(tmp_line), "%d",
					 job_resrcs->cpus[i]);
				xstrcat(out, tmp_line);
				length += strlen(tmp_line);
				if (job_resrcs->cpu_array_reps[i] > 1) {
					snprintf(tmp_line, sizeof(tmp_line),
						 "*%d",
						 job_resrcs->cpu_array_reps[i]);
					xstrcat(out, tmp_line);
					length += strlen(tmp_line);
				}
				if (i < job_resrcs->cpu_array_cnt - 1) {
					xstrcat(out, ",");
					length++;
				}
			}
			if (one_liner)
				xstrcat(out, " ");
			else
				xstrcat(out, "\n   ");
		}
	} else {
		if (!job_resrcs->core_bitmap)
			goto line15;

		last  = bit_fls(job_resrcs->core_bitmap);
		if (last == -1)
			goto line15;

		hl = hostlist_create(job_ptr->nodes);
		if (!hl) {
			error("slurm_sprint_job_info: hostlist_create: %s",
			      job_ptr->nodes);
			return NULL;
		}
		hl_last = hostlist_create(NULL);
		if (!hl_last) {
			error("slurm_sprint_job_info: hostlist_create: NULL");
			hostlist_destroy(hl);
			return NULL;
		}

		bit_inx = 0;
		i = sock_inx = sock_reps = 0;
		abs_node_inx = job_ptr->node_inx[i];

/*	tmp1[] stores the current cpu(s) allocated	*/
		tmp2[0] = '\0';	/* stores last cpu(s) allocated */
		for (rel_node_inx=0; rel_node_inx < job_resrcs->nhosts;
		     rel_node_inx++) {

			if (sock_reps >=
			    job_resrcs->sock_core_rep_count[sock_inx]) {
				sock_inx++;
				sock_reps = 0;
			}
			sock_reps++;

			bit_reps = job_resrcs->sockets_per_node[sock_inx] *
				job_resrcs->cores_per_socket[sock_inx];

			core_bitmap = bit_alloc(bit_reps);
			for (j=0; j < bit_reps; j++) {
				if (bit_test(job_resrcs->core_bitmap, bit_inx))
					bit_set(core_bitmap, j);
				bit_inx++;
			}

			bit_fmt(tmp1, sizeof(tmp1), core_bitmap);
			FREE_NULL_BITMAP(core_bitmap);
			host = hostlist_shift(hl);
/*
 *		If the allocation values for this host are not the same as the
 *		last host, print the report of the last group of hosts that had
 *		identical allocation values.
 */
			if (strcmp(tmp1, tmp2) ||
			    (last_mem_alloc_ptr != job_resrcs->memory_allocated) ||
			    (job_resrcs->memory_allocated &&
			     (last_mem_alloc !=
			      job_resrcs->memory_allocated[rel_node_inx]))) {
				if (hostlist_count(hl_last)) {
					last_hosts = 
						hostlist_ranged_string_xmalloc(
						hl_last);
					snprintf(tmp_line, sizeof(tmp_line),
						 "  Nodes=%s CPU_IDs=%s Mem=%u",
						 last_hosts, tmp2,
						 last_mem_alloc_ptr ?
						 last_mem_alloc : 0);
					xfree(last_hosts);
					xstrcat(out, tmp_line);
					if (one_liner)
						xstrcat(out, " ");
					else
						xstrcat(out, "\n   ");

					hostlist_destroy(hl_last);
					hl_last = hostlist_create(NULL);
				}
				strcpy(tmp2, tmp1);
				last_mem_alloc_ptr = job_resrcs->memory_allocated;
				if (last_mem_alloc_ptr)
					last_mem_alloc = job_resrcs->
						memory_allocated[rel_node_inx];
				else
					last_mem_alloc = NO_VAL;
			}
			hostlist_push_host(hl_last, host);
			free(host);

			if (bit_inx > last)
				break;

			if (abs_node_inx > job_ptr->node_inx[i+1]) {
				i += 2;
				abs_node_inx = job_ptr->node_inx[i];
			} else {
				abs_node_inx++;
			}
		}

		if (hostlist_count(hl_last)) {
			last_hosts = hostlist_ranged_string_xmalloc(hl_last);
			snprintf(tmp_line, sizeof(tmp_line),
				 "  Nodes=%s CPU_IDs=%s Mem=%u",
				 last_hosts, tmp2,
				 last_mem_alloc_ptr ? last_mem_alloc : 0);
			xfree(last_hosts);
			xstrcat(out, tmp_line);
			if (one_liner)
				xstrcat(out, " ");
			else
				xstrcat(out, "\n   ");
		}
		hostlist_destroy(hl);
		hostlist_destroy(hl_last);
	}
	/****** Line 15 ******/
line15:
	if (job_ptr->pn_min_memory & MEM_PER_CPU) {
		job_ptr->pn_min_memory &= (~MEM_PER_CPU);
		tmp6_ptr = "CPU";
	} else
		tmp6_ptr = "Node";

	if (cluster_flags & CLUSTER_FLAG_BG) {
		convert_num_unit((float)job_ptr->pn_min_cpus,
				 tmp1, sizeof(tmp1), UNIT_NONE);
		snprintf(tmp_line, sizeof(tmp_line), "MinCPUsNode=%s",	tmp1);
	} else {
		snprintf(tmp_line, sizeof(tmp_line), "MinCPUsNode=%u",
			 job_ptr->pn_min_cpus);
	}

	xstrcat(out, tmp_line);
	convert_num_unit((float)job_ptr->pn_min_memory, tmp1, sizeof(tmp1),
			 UNIT_MEGA);
	convert_num_unit((float)job_ptr->pn_min_tmp_disk, tmp2, sizeof(tmp2),
			 UNIT_MEGA);
	snprintf(tmp_line, sizeof(tmp_line),
		 " MinMemory%s=%s MinTmpDiskNode=%s",
		 tmp6_ptr, tmp1, tmp2);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 16 ******/
	snprintf(tmp_line, sizeof(tmp_line),
		 "Features=%s Gres=%s Reservation=%s",
		 job_ptr->features, job_ptr->gres, job_ptr->resv_name);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 17 ******/
	snprintf(tmp_line, sizeof(tmp_line),
		 "Shared=%s Contiguous=%d Licenses=%s Network=%s",
		 (job_ptr->shared == 0 ? "0" :
		  job_ptr->shared == 1 ? "1" : "OK"),
		 job_ptr->contiguous, job_ptr->licenses, job_ptr->network);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 18 ******/
	snprintf(tmp_line, sizeof(tmp_line), "Command=%s",
		 job_ptr->command);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 19 ******/
	snprintf(tmp_line, sizeof(tmp_line), "WorkDir=%s",
		 job_ptr->work_dir);
	xstrcat(out, tmp_line);

	if (cluster_flags & CLUSTER_FLAG_BG) {
		/****** Line 20 (optional) ******/
		select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
					       select_buf, sizeof(select_buf),
					       SELECT_PRINT_BG_ID);
		if (select_buf[0] != '\0') {
			if (one_liner)
				xstrcat(out, " ");
			else
				xstrcat(out, "\n   ");
			snprintf(tmp_line, sizeof(tmp_line),
				 "Block_ID=%s", select_buf);
			xstrcat(out, tmp_line);
		}

		/****** Line 21 (optional) ******/
		select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
					       select_buf, sizeof(select_buf),
					       SELECT_PRINT_MIXED_SHORT);
		if (select_buf[0] != '\0') {
			if (one_liner)
				xstrcat(out, " ");
			else
				xstrcat(out, "\n   ");
			xstrcat(out, select_buf);
		}

		if (cluster_flags & CLUSTER_FLAG_BGL) {
			/****** Line 22 (optional) ******/
			select_g_select_jobinfo_sprint(
				job_ptr->select_jobinfo,
				select_buf, sizeof(select_buf),
				SELECT_PRINT_BLRTS_IMAGE);
			if (select_buf[0] != '\0') {
				if (one_liner)
					xstrcat(out, " ");
				else
					xstrcat(out, "\n   ");
				snprintf(tmp_line, sizeof(tmp_line),
					 "BlrtsImage=%s", select_buf);
				xstrcat(out, tmp_line);
			}
		}
		/****** Line 23 (optional) ******/
		select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
					       select_buf, sizeof(select_buf),
					       SELECT_PRINT_LINUX_IMAGE);
		if (select_buf[0] != '\0') {
			if (one_liner)
				xstrcat(out, " ");
			else
				xstrcat(out, "\n   ");
			if (cluster_flags & CLUSTER_FLAG_BGL)
				snprintf(tmp_line, sizeof(tmp_line),
					 "LinuxImage=%s", select_buf);
			else
				snprintf(tmp_line, sizeof(tmp_line),
					 "CnloadImage=%s", select_buf);

			xstrcat(out, tmp_line);
		}
		/****** Line 24 (optional) ******/
		select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
					       select_buf, sizeof(select_buf),
					       SELECT_PRINT_MLOADER_IMAGE);
		if (select_buf[0] != '\0') {
			if (one_liner)
				xstrcat(out, " ");
			else
				xstrcat(out, "\n   ");
			snprintf(tmp_line, sizeof(tmp_line),
				 "MloaderImage=%s", select_buf);
			xstrcat(out, tmp_line);
		}
		/****** Line 25 (optional) ******/
		select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
					       select_buf, sizeof(select_buf),
					       SELECT_PRINT_RAMDISK_IMAGE);
		if (select_buf[0] != '\0') {
			if (one_liner)
				xstrcat(out, " ");
			else
				xstrcat(out, "\n   ");
			if (cluster_flags & CLUSTER_FLAG_BGL)
				snprintf(tmp_line, sizeof(tmp_line),
					 "RamDiskImage=%s", select_buf);
			else
				snprintf(tmp_line, sizeof(tmp_line),
					 "IoloadImage=%s", select_buf);
			xstrcat(out, tmp_line);
		}
	}

	/****** Line 26 (optional) ******/
	if (job_ptr->comment) {
		if (one_liner)
			xstrcat(out, " ");
		else
			xstrcat(out, "\n   ");
		snprintf(tmp_line, sizeof(tmp_line), "Comment=%s ",
			 job_ptr->comment);
		xstrcat(out, tmp_line);
	}

	/****** Line 27 (optional) ******/
	if (job_ptr->batch_script) {
		if (one_liner)
			xstrcat(out, " ");
		else
			xstrcat(out, "\n   ");
		xstrcat(out, "BatchScript=\n");
		xstrcat(out, job_ptr->batch_script);
	}

	/****** Line 28 (optional) ******/
	if (job_ptr->req_switch) {
		char time_buf[32];
		if (one_liner)
			xstrcat(out, " ");
		else
			xstrcat(out, "\n   ");
		secs2time_str((time_t) job_ptr->wait4switch, time_buf,
			      sizeof(time_buf));
		snprintf(tmp_line, sizeof(tmp_line), "Switches=%u@%s\n",
			 job_ptr->req_switch, time_buf);
		xstrcat(out, tmp_line);
	}

	/****** Line 29 (optional) ******/
	if (one_liner)
		xstrcat(out, "\n");
	else
		xstrcat(out, "\n\n");

	return out;

}
Example #20
File: print.c  Project: Cray/slurm
/* filter job records per input specifications,
 * returns >0 if job should be filter out (not printed) */
static int _filter_job(job_info_t * job)
{
	int filter;
	ListIterator iterator;
	uint32_t *user;
	uint16_t *state_id;
	char *account, *part, *qos, *name;
	squeue_job_step_t *job_step_id;

	if (params.job_list) {
		filter = 1;
		iterator = list_iterator_create(params.job_list);
		while ((job_step_id = list_next(iterator))) {
			if (((job_step_id->array_id == (uint16_t) NO_VAL)   &&
			     ((job_step_id->job_id   == job->array_job_id) ||
			      (job_step_id->job_id   == job->job_id)))      ||
			    ((job_step_id->array_id == job->array_task_id)  &&
			     (job_step_id->job_id   == job->array_job_id))) {
				filter = 0;
				break;
			}
		}
		list_iterator_destroy(iterator);
		if (filter == 1)
			return 1;
	}

	if (params.part_list) {
		char *token = NULL, *last = NULL, *tmp_name = NULL;

		filter = 1;
		if (job->partition) {
			tmp_name = xstrdup(job->partition);
			token = strtok_r(tmp_name, ",", &last);
		}
		while (token && filter) {
			iterator = list_iterator_create(params.part_list);
			while ((part = list_next(iterator))) {
				if (strcmp(part, token) == 0) {
					filter = 0;
					break;
				}
			}
			list_iterator_destroy(iterator);
			token = strtok_r(NULL, ",", &last);
		}
		xfree(tmp_name);
		if (filter == 1)
			return 2;
	}

	if (params.account_list) {
		filter = 1;
		iterator = list_iterator_create(params.account_list);
		while ((account = list_next(iterator))) {
			 if ((job->account != NULL) &&
			     (strcasecmp(account, job->account) == 0)) {
				filter = 0;
				break;
			}
		}
		list_iterator_destroy(iterator);
		if (filter == 1)
			return 2;
	}

	if (params.qos_list) {
		filter = 1;
		iterator = list_iterator_create(params.qos_list);
		while ((qos = list_next(iterator))) {
			 if ((job->qos != NULL) &&
			     (strcasecmp(qos, job->qos) == 0)) {
				filter = 0;
				break;
			}
		}
		list_iterator_destroy(iterator);
		if (filter == 1)
			return 2;
	}

	if (params.state_list) {
		filter = 1;
		iterator = list_iterator_create(params.state_list);
		while ((state_id = list_next(iterator))) {
			if ((*state_id == job->job_state) ||
			    ((*state_id == JOB_COMPLETING) &&
			     (*state_id & job->job_state)) ||
			    ((*state_id == JOB_CONFIGURING) &&
			     (*state_id & job->job_state))) {
				filter = 0;
				break;
			}
		}
		list_iterator_destroy(iterator);
		if (filter == 1)
			return 3;
	} else {
		if (!IS_JOB_PENDING(job) &&
		    !IS_JOB_RUNNING(job) &&
		    !IS_JOB_SUSPENDED(job) &&
		    !IS_JOB_COMPLETING(job))
			return 4;
	}

	if ((params.nodes)
	    && ((job->nodes == NULL)
		|| (!hostset_intersects(params.nodes, job->nodes))))
		return 5;

	if (params.user_list) {
		filter = 1;
		iterator = list_iterator_create(params.user_list);
		while ((user = list_next(iterator))) {
			if (*user == job->user_id) {
				filter = 0;
				break;
			}
		}
		list_iterator_destroy(iterator);
		if (filter == 1)
			return 6;
	}

	if (params.reservation) {
		if ((job->resv_name == NULL) ||
		    (strcmp(job->resv_name, params.reservation))) {
			return 7;
		}
	}

	if (params.name_list) {
		filter = 1;
		iterator = list_iterator_create(params.name_list);
		while ((name = list_next(iterator))) {
			if ((job->name != NULL) &&
			     (strcasecmp(name, job->name) == 0)) {
				filter = 0;
				break;
			}
		}
		list_iterator_destroy(iterator);
		if (filter == 1)
			return 8;
	}

	return 0;
}
Example #21
static char *	_will_run_test(uint32_t jobid, time_t start_time,
			       char *node_list, int *err_code, char **err_msg)
{
	struct job_record *job_ptr = NULL;
	struct part_record *part_ptr;
	bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL;
	bitstr_t *exc_core_bitmap = NULL;
	char *hostlist, *reply_msg = NULL;
	uint32_t min_nodes, max_nodes, req_nodes;
	int rc;
	time_t start_res, orig_start_time;
	List preemptee_candidates;

	debug2("wiki2: will_run job_id=%u start_time=%u node_list=%s",
		jobid, (uint32_t)start_time, node_list);

	job_ptr = find_job_record(jobid);
	if (job_ptr == NULL) {
		*err_code = -700;
		*err_msg = "No such job";
		error("wiki: Failed to find job %u", jobid);
		return NULL;
	}
	if ((job_ptr->details == NULL) || (!IS_JOB_PENDING(job_ptr))) {
		*err_code = -700;
		*err_msg = "WillRun not applicable to non-pending job";
		error("wiki: WillRun on non-pending job %u", jobid);
		return NULL;
	}

	part_ptr = job_ptr->part_ptr;
	if (part_ptr == NULL) {
		*err_code = -700;
		*err_msg = "Job lacks a partition";
		error("wiki: Job %u lacks a partition", jobid);
		return NULL;
	}

	if ((node_list == NULL) || (node_list[0] == '\0')) {
		/* assume all nodes available to job for testing */
		avail_bitmap = bit_copy(avail_node_bitmap);
	} else if (node_name2bitmap(node_list, false, &avail_bitmap) != 0) {
		*err_code = -700;
		*err_msg = "Invalid available nodes value";
		error("wiki: Attempt to set invalid available node "
		      "list for job %u, %s", jobid, node_list);
		return NULL;
	}

	/* Enforce reservation: access control, time and nodes */
	start_res = start_time;
	rc = job_test_resv(job_ptr, &start_res, true, &resv_bitmap,
			   &exc_core_bitmap);
	if (rc != SLURM_SUCCESS) {
		*err_code = -730;
		*err_msg = "Job denied access to reservation";
		error("wiki: reservation access denied for job %u", jobid);
		FREE_NULL_BITMAP(avail_bitmap);
		return NULL;
	}
	start_time = MAX(start_time, start_res);
	bit_and(avail_bitmap, resv_bitmap);
	FREE_NULL_BITMAP(resv_bitmap);

	/* Only consider nodes that are not DOWN or DRAINED */
	bit_and(avail_bitmap, avail_node_bitmap);

	/* Consider only nodes in this job's partition */
	if (part_ptr->node_bitmap)
		bit_and(avail_bitmap, part_ptr->node_bitmap);
	else {
		*err_code = -730;
		*err_msg = "Job's partition has no nodes";
		error("wiki: no nodes in partition %s for job %u",
			part_ptr->name, jobid);
		FREE_NULL_BITMAP(avail_bitmap);
		return NULL;
	}

	if (job_req_node_filter(job_ptr, avail_bitmap) != SLURM_SUCCESS) {
		/* Job probably has invalid feature list */
		*err_code = -730;
		*err_msg = "Job's required features not available "
			   "on selected nodes";
		error("wiki: job %u not runnable on hosts=%s",
		      jobid, node_list);
		FREE_NULL_BITMAP(avail_bitmap);
		return NULL;
	}
	if (job_ptr->details->exc_node_bitmap) {
		bit_not(job_ptr->details->exc_node_bitmap);
		bit_and(avail_bitmap, job_ptr->details->exc_node_bitmap);
		bit_not(job_ptr->details->exc_node_bitmap);
	}
	if ((job_ptr->details->req_node_bitmap) &&
	    (!bit_super_set(job_ptr->details->req_node_bitmap,
			    avail_bitmap))) {
		*err_code = -730;
		*err_msg = "Job's required nodes not available";
		error("wiki: job %u not runnable on hosts=%s",
		      jobid, node_list);
		FREE_NULL_BITMAP(avail_bitmap);
		return NULL;
	}

	min_nodes = MAX(job_ptr->details->min_nodes, part_ptr->min_nodes);
	if (job_ptr->details->max_nodes == 0)
		max_nodes = part_ptr->max_nodes;
	else
		max_nodes = MIN(job_ptr->details->max_nodes,
				part_ptr->max_nodes);
	max_nodes = MIN(max_nodes, 500000); /* prevent overflows */
	if (job_ptr->details->max_nodes)
		req_nodes = max_nodes;
	else
		req_nodes = min_nodes;
	if (min_nodes > max_nodes) {
		/* job's min_nodes exceeds partition's max_nodes */
		*err_code = -730;
		*err_msg = "Job's min_nodes > max_nodes";
		error("wiki: job %u not runnable on hosts=%s",
		      jobid, node_list);
		FREE_NULL_BITMAP(avail_bitmap);
		return NULL;
	}

	preemptee_candidates = slurm_find_preemptable_jobs(job_ptr);

	orig_start_time = job_ptr->start_time;
	rc = select_g_job_test(job_ptr, avail_bitmap,
			       min_nodes, max_nodes, req_nodes,
			       SELECT_MODE_WILL_RUN,
			       preemptee_candidates, NULL, exc_core_bitmap);
	if (preemptee_candidates)
		list_destroy(preemptee_candidates);

	if (rc == SLURM_SUCCESS) {
		char tmp_str[128];
		*err_code = 0;
		uint32_t proc_cnt = 0;

		xstrcat(reply_msg, "STARTINFO=");
#ifdef HAVE_BG
		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
                             		    SELECT_JOBDATA_NODE_CNT,
					    &proc_cnt);

#else
		proc_cnt = job_ptr->total_cpus;
#endif
		snprintf(tmp_str, sizeof(tmp_str), "%u:%u@%u,",
			 jobid, proc_cnt, (uint32_t) job_ptr->start_time);
		xstrcat(reply_msg, tmp_str);
		hostlist = bitmap2node_name(avail_bitmap);
		xstrcat(reply_msg, hostlist);
		xfree(hostlist);
	} else {
		xstrcat(reply_msg, "Jobs not runable on selected nodes");
		error("wiki: jobs not runnable on nodes");
	}

	/* Restore pending job's expected start time */
	job_ptr->start_time = orig_start_time;
	FREE_NULL_BITMAP(avail_bitmap);
	return reply_msg;
}
Example #22
0
static int	_job_modify(uint32_t jobid, char *bank_ptr,
			char *depend_ptr, char *new_hostlist,
			uint32_t new_node_cnt, char *part_name_ptr,
			uint32_t new_time_limit, char *name_ptr,
			char *start_ptr, char *feature_ptr, char *env_ptr,
			char *comment_ptr, char *gres_ptr, char *wckey_ptr)
{
	struct job_record *job_ptr;
	time_t now = time(NULL);
	bool update_accounting = false;

	job_ptr = find_job_record(jobid);
	if (job_ptr == NULL) {
		error("wiki: MODIFYJOB has invalid jobid %u", jobid);
		return ESLURM_INVALID_JOB_ID;
	}
	if (IS_JOB_FINISHED(job_ptr) || (job_ptr->details == NULL)) {
		info("wiki: MODIFYJOB jobid %u is finished", jobid);
		return ESLURM_DISABLED;
	}

	if (comment_ptr) {
		info("wiki: change job %u comment %s", jobid, comment_ptr);
		xfree(job_ptr->comment);
		job_ptr->comment = xstrdup(comment_ptr);
		last_job_update = now;
	}

	if (depend_ptr) {
		int rc = update_job_dependency(job_ptr, depend_ptr);
		if (rc == SLURM_SUCCESS) {
			info("wiki: changed job %u dependency to %s",
				jobid, depend_ptr);
		} else {
			error("wiki: changing job %u dependency to %s",
				jobid, depend_ptr);
			return EINVAL;
		}
	}

	if (env_ptr) {
		bool have_equal = false;
		char old_sep[1];
		int begin = 0, i;

		if (job_ptr->batch_flag == 0) {
			error("wiki: attempt to set environment variables "
			      "for non-batch job %u", jobid);
			return ESLURM_DISABLED;
		}
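		/* Parse env_ptr as a list of NAME=VALUE pairs separated by
		 * commas, with values optionally wrapped in single or double
		 * quotes (backslash escapes honored). Whitespace outside of
		 * quotes terminates the list. Each pair is appended to the
		 * job's env_sup array. */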
		for (i=0; ; i++) {
			if (env_ptr[i] == '=') {
				if (have_equal) {
					error("wiki: setting job %u invalid "
					      "environment variables: %s",
					      jobid, env_ptr);
					return EINVAL;
				}
				have_equal = true;
				if (env_ptr[i+1] == '\"') {
					for (i+=2; ; i++) {
						if (env_ptr[i] == '\0') {
							error("wiki: setting job %u "
							      "invalid environment "
							      "variables: %s",
							      jobid, env_ptr);
							return EINVAL;
						}
						if (env_ptr[i] == '\"') {
							i++;
							break;
						}
						if (env_ptr[i] == '\\') {
							i++;
						}
					}
				} else if (env_ptr[i+1] == '\'') {
					for (i+=2; ; i++) {
						if (env_ptr[i] == '\0') {
							error("wiki: setting job %u "
							      "invalid environment "
							      "variables: %s",
							      jobid, env_ptr);
							return EINVAL;
						}
						if (env_ptr[i] == '\'') {
							i++;
							break;
						}
						if (env_ptr[i] == '\\') {
							i++;
						}
					}
				}
			}
			if (isspace(env_ptr[i]) || (env_ptr[i] == ',')) {
				if (!have_equal) {
					error("wiki: setting job %u invalid "
					      "environment variables: %s",
					      jobid, env_ptr);
					return EINVAL;
				}
				old_sep[0] = env_ptr[i];
				env_ptr[i] = '\0';
				xrealloc(job_ptr->details->env_sup,
					 sizeof(char *) *
					 (job_ptr->details->env_cnt+1));
				job_ptr->details->env_sup
						[job_ptr->details->env_cnt++] =
						xstrdup(&env_ptr[begin]);
				info("wiki: for job %u add env: %s",
				     jobid, &env_ptr[begin]);
				env_ptr[i] = old_sep[0];
				if (isspace(old_sep[0]))
					break;
				begin = i + 1;
				have_equal = false;
			}
		}
	}

	if (new_time_limit) {
		time_t old_time = job_ptr->time_limit;
		job_ptr->time_limit = new_time_limit;
		info("wiki: change job %u time_limit to %u",
			jobid, new_time_limit);
		/* Update end_time based upon change
		 * to preserve suspend time info */
		job_ptr->end_time = job_ptr->end_time +
				((job_ptr->time_limit -
				  old_time) * 60);
		last_job_update = now;
	}

	if (bank_ptr &&
	    (update_job_account("wiki", job_ptr, bank_ptr) != SLURM_SUCCESS)) {
		return EINVAL;
	}

	if (feature_ptr) {
		if (IS_JOB_PENDING(job_ptr) && (job_ptr->details)) {
			info("wiki: change job %u features to %s",
				jobid, feature_ptr);
			xfree(job_ptr->details->features);
			job_ptr->details->features = xstrdup(feature_ptr);
			last_job_update = now;
		} else {
			error("wiki: MODIFYJOB features of non-pending "
				"job %u", jobid);
			return ESLURM_DISABLED;
		}
	}

	if (start_ptr) {
		char *end_ptr;
		uint32_t begin_time = strtol(start_ptr, &end_ptr, 10);
		if (IS_JOB_PENDING(job_ptr) && (job_ptr->details)) {
			info("wiki: change job %u begin time to %u",
				jobid, begin_time);
			job_ptr->details->begin_time = begin_time;
			last_job_update = now;
			update_accounting = true;
		} else {
			error("wiki: MODIFYJOB begin_time of non-pending "
				"job %u", jobid);
			return ESLURM_DISABLED;
		}
	}

	if (name_ptr) {
		if (IS_JOB_PENDING(job_ptr)) {
			info("wiki: change job %u name %s", jobid, name_ptr);
			xfree(job_ptr->name);
			job_ptr->name = xstrdup(name_ptr);
			last_job_update = now;
			update_accounting = true;
		} else {
			error("wiki: MODIFYJOB name of non-pending job %u",
			      jobid);
			return ESLURM_DISABLED;
		}
	}

	if (new_hostlist) {
		int rc = 0, task_cnt;
		hostlist_t hl;
		char *tasklist;

		if (!IS_JOB_PENDING(job_ptr) || !job_ptr->details) {
			/* Job is done, nothing to reset */
			if (new_hostlist[0] == '\0')
				goto host_fini;
			error("wiki: MODIFYJOB hostlist of non-pending "
				"job %u", jobid);
			return ESLURM_DISABLED;
		}

		xfree(job_ptr->details->req_nodes);
		FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap);
		if (new_hostlist[0] == '\0')
			goto host_fini;

		tasklist = moab2slurm_task_list(new_hostlist, &task_cnt);
		if (tasklist == NULL) {
			rc = 1;
			goto host_fini;
		}
		hl = hostlist_create(tasklist);
		if (hl == 0) {
			rc = 1;
			goto host_fini;
		}
		hostlist_uniq(hl);
		hostlist_sort(hl);
		job_ptr->details->req_nodes =
			hostlist_ranged_string_xmalloc(hl);
		hostlist_destroy(hl);
		if (job_ptr->details->req_nodes == NULL) {
			rc = 1;
			goto host_fini;
		}
		if (node_name2bitmap(job_ptr->details->req_nodes, false,
                                     &job_ptr->details->req_node_bitmap)) {
			rc = 1;
			goto host_fini;
		}

host_fini:	if (rc) {
			info("wiki: change job %u invalid hostlist %s",
				jobid, new_hostlist);
			xfree(job_ptr->details->req_nodes);
			return EINVAL;
		} else {
			info("wiki: change job %u hostlist %s",
				jobid, new_hostlist);
			update_accounting = true;
		}
	}

	if (part_name_ptr) {
		struct part_record *part_ptr;
		if (!IS_JOB_PENDING(job_ptr)) {
			error("wiki: MODIFYJOB partition of non-pending "
			      "job %u", jobid);
			return ESLURM_DISABLED;
		}

		part_ptr = find_part_record(part_name_ptr);
		if (part_ptr == NULL) {
			error("wiki: MODIFYJOB has invalid partition %s",
				part_name_ptr);
			return ESLURM_INVALID_PARTITION_NAME;
		}

		info("wiki: change job %u partition %s",
			jobid, part_name_ptr);
		xfree(job_ptr->partition);
		job_ptr->partition = xstrdup(part_name_ptr);
		job_ptr->part_ptr = part_ptr;
		last_job_update = now;
		update_accounting = true;
	}

	if (new_node_cnt) {
		job_desc_msg_t job_desc;
#ifdef HAVE_BG
		uint16_t geometry[SYSTEM_DIMENSIONS] = {(uint16_t) NO_VAL};
		static uint16_t cpus_per_node = 0;
		if (!cpus_per_node) {
			select_g_alter_node_cnt(SELECT_GET_NODE_CPU_CNT,
						&cpus_per_node);
		}
#endif
		if (!IS_JOB_PENDING(job_ptr) || !job_ptr->details) {
			error("wiki: MODIFYJOB node count of non-pending "
			      "job %u", jobid);
			return ESLURM_DISABLED;
		}
		memset(&job_desc, 0, sizeof(job_desc_msg_t));

		job_desc.min_nodes = new_node_cnt;
		job_desc.max_nodes = NO_VAL;
		job_desc.select_jobinfo = select_g_select_jobinfo_alloc();

		select_g_alter_node_cnt(SELECT_SET_NODE_CNT, &job_desc);

		select_g_select_jobinfo_free(job_desc.select_jobinfo);

		job_ptr->details->min_nodes = job_desc.min_nodes;
		if (job_ptr->details->max_nodes &&
		    (job_ptr->details->max_nodes < job_desc.min_nodes))
			job_ptr->details->max_nodes = job_desc.min_nodes;
		info("wiki: change job %u min_nodes to %u",
		     jobid, new_node_cnt);
#ifdef HAVE_BG
		job_ptr->details->min_cpus = job_desc.min_cpus;
		job_ptr->details->max_cpus = job_desc.max_cpus;
		job_ptr->details->pn_min_cpus = job_desc.pn_min_cpus;

		new_node_cnt = job_ptr->details->min_cpus;
		if (cpus_per_node)
			new_node_cnt /= cpus_per_node;

		/* This is only set up so accounting is set up correctly */
		select_g_select_jobinfo_set(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_NODE_CNT,
					    &new_node_cnt);
		/* reset geo since changing this makes any geo
		   potentially invalid */
		select_g_select_jobinfo_set(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_GEOMETRY,
					    geometry);
#endif
		last_job_update = now;
		update_accounting = true;
	}

	if (gres_ptr) {
		char *orig_gres;

		if (!IS_JOB_PENDING(job_ptr)) {
			error("wiki: MODIFYJOB GRES of non-pending job %u",
			      jobid);
			return ESLURM_DISABLED;
		}

		orig_gres = job_ptr->gres;
		job_ptr->gres = NULL;
		if (gres_ptr[0])
			job_ptr->gres = xstrdup(gres_ptr);
		if (gres_plugin_job_state_validate(job_ptr->gres,
						   &job_ptr->gres_list)) {
			error("wiki: MODIFYJOB Invalid GRES=%s", gres_ptr);
			xfree(job_ptr->gres);
			job_ptr->gres = orig_gres;
			return ESLURM_INVALID_GRES;
		}
		xfree(orig_gres);
	}

	if (wckey_ptr) {
		int rc = update_job_wckey("update_job", job_ptr, wckey_ptr);
		if (rc != SLURM_SUCCESS) {
			error("wiki: MODIFYJOB Invalid WCKEY=%s", wckey_ptr);
			return rc;
		}
	}

	if (update_accounting) {
		if (job_ptr->details && job_ptr->details->begin_time) {
			/* Update job record in accounting to reflect
			 * the changes */
			jobacct_storage_g_job_start(acct_db_conn, job_ptr);
		}
	}

	return SLURM_SUCCESS;
}
Example #23
0
static int	_job_modify(uint32_t jobid, char *bank_ptr,
			char *depend_ptr, char *new_hostlist,
			uint32_t new_node_cnt, char *part_name_ptr,
			uint32_t new_time_limit)
{
	struct job_record *job_ptr;
	bool update_accounting = false;

	job_ptr = find_job_record(jobid);
	if (job_ptr == NULL) {
		error("wiki: MODIFYJOB has invalid jobid %u", jobid);
		return ESLURM_INVALID_JOB_ID;
	}
	if (IS_JOB_FINISHED(job_ptr)) {
		error("wiki: MODIFYJOB jobid %u is finished", jobid);
		return ESLURM_DISABLED;
	}

	if (depend_ptr) {
		int rc = update_job_dependency(job_ptr, depend_ptr);
		if (rc == SLURM_SUCCESS) {
			info("wiki: changed job %u dependency to %s",
				jobid, depend_ptr);
		} else {
			error("wiki: changing job %u dependency to %s",
				jobid, depend_ptr);
			return EINVAL;
		}
	}

	if (new_time_limit) {
		time_t old_time = job_ptr->time_limit;
		job_ptr->time_limit = new_time_limit;
		info("wiki: change job %u time_limit to %u",
			jobid, new_time_limit);
		/* Update end_time based upon change
		 * to preserve suspend time info */
		job_ptr->end_time = job_ptr->end_time +
				((job_ptr->time_limit -
				  old_time) * 60);
		last_job_update = time(NULL);
	}

	if (bank_ptr) {
		if (update_job_account("wiki", job_ptr, bank_ptr)
		   != SLURM_SUCCESS)
			return EINVAL;
		else
			update_accounting = true;
	}

	if (new_hostlist) {
		int rc = 0, task_cnt;
		hostlist_t hl;
		char *tasklist;

		if (!IS_JOB_PENDING(job_ptr) || !job_ptr->details) {
			/* Job is done, nothing to reset */
			if (new_hostlist[0] == '\0')
				goto host_fini;
			error("wiki: MODIFYJOB tasklist of non-pending "
				"job %u", jobid);
			return ESLURM_DISABLED;
		}

		xfree(job_ptr->details->req_nodes);
		FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap);
		if (new_hostlist[0] == '\0')
			goto host_fini;

		tasklist = moab2slurm_task_list(new_hostlist, &task_cnt);
		if (tasklist == NULL) {
			rc = 1;
			goto host_fini;
		}
		hl = hostlist_create(tasklist);
		if (hl == 0) {
			rc = 1;
			goto host_fini;
		}
		hostlist_uniq(hl);
		hostlist_sort(hl);
		job_ptr->details->req_nodes =
			hostlist_ranged_string_xmalloc(hl);
		hostlist_destroy(hl);
		if (job_ptr->details->req_nodes == NULL) {
			rc = 1;
			goto host_fini;
		}
		if (node_name2bitmap(job_ptr->details->req_nodes, false,
                                     &job_ptr->details->req_node_bitmap)) {
			rc = 1;
			goto host_fini;
		}

host_fini:	if (rc) {
			info("wiki: change job %u invalid hostlist %s",
			     jobid, new_hostlist);
			xfree(job_ptr->details->req_nodes);
			return EINVAL;
		} else {
			info("wiki: change job %u hostlist %s",
			     jobid, new_hostlist);
			update_accounting = true;
		}
	}

	if (part_name_ptr) {
		struct part_record *part_ptr;
		if (!IS_JOB_PENDING(job_ptr)) {
			error("wiki: MODIFYJOB partition of non-pending "
			      "job %u", jobid);
			return ESLURM_DISABLED;
		}

		part_ptr = find_part_record(part_name_ptr);
		if (part_ptr == NULL) {
			error("wiki: MODIFYJOB has invalid partition %s",
				part_name_ptr);
			return ESLURM_INVALID_PARTITION_NAME;
		}
		info("wiki: change job %u partition %s",
			jobid, part_name_ptr);
		xfree(job_ptr->partition);
		job_ptr->partition = xstrdup(part_name_ptr);
		job_ptr->part_ptr = part_ptr;
		last_job_update = time(NULL);
		update_accounting = true;
	}
	if (new_node_cnt) {
		if (IS_JOB_PENDING(job_ptr) && job_ptr->details) {
			job_ptr->details->min_nodes = new_node_cnt;
			if (job_ptr->details->max_nodes
			&&  (job_ptr->details->max_nodes < new_node_cnt))
				job_ptr->details->max_nodes = new_node_cnt;
			info("wiki: change job %u min_nodes to %u",
				jobid, new_node_cnt);
			last_job_update = time(NULL);
			update_accounting = true;
		} else {
			error("wiki: MODIFYJOB node count of non-pending "
				"job %u", jobid);
			return ESLURM_DISABLED;
		}
	}

	if (update_accounting) {
		/* Update job record in accounting to reflect changes */
		jobacct_storage_job_start_direct(acct_db_conn, job_ptr);
	}

	return SLURM_SUCCESS;
}
Example #24
0
File: backfill.c Project: perryh/slurm
static int _attempt_backfill(void)
{
	DEF_TIMERS;
	bool filter_root = false;
	List job_queue;
	job_queue_rec_t *job_queue_rec;
	slurmdb_qos_rec_t *qos_ptr = NULL;
	int i, j, node_space_recs;
	struct job_record *job_ptr;
	struct part_record *part_ptr;
	uint32_t end_time, end_reserve;
	uint32_t time_limit, comp_time_limit, orig_time_limit;
	uint32_t min_nodes, max_nodes, req_nodes;
	bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL;
	bitstr_t *exc_core_bitmap = NULL;
	time_t now, sched_start, later_start, start_res, resv_end;
	node_space_map_t *node_space;
	struct timeval bf_time1, bf_time2;
	int sched_timeout = 2, yield_sleep = 1;
	int rc = 0;
	int job_test_count = 0;
	uint32_t *uid = NULL, nuser = 0;
	uint16_t *njobs = NULL;
	bool already_counted;
	uint32_t reject_array_job_id = 0;

#ifdef HAVE_CRAY
	/*
	 * Run a Basil Inventory immediately before setting up the schedule
	 * plan, to avoid race conditions caused by ALPS node state change.
	 * Needs to be done with the node-state lock taken.
	 */
	START_TIMER;
	if (select_g_reconfigure()) {
		debug4("backfill: not scheduling due to ALPS");
		return SLURM_SUCCESS;
	}
	END_TIMER;
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		info("backfill: ALPS inventory completed, %s", TIME_STR);

	/* The Basil inventory can take a long time to complete. Process
	 * pending RPCs before starting the backfill scheduling logic */
	_yield_locks(1);
#endif

	START_TIMER;
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		info("backfill: beginning");
	sched_start = now = time(NULL);

	if (slurm_get_root_filter())
		filter_root = true;

	job_queue = build_job_queue(true);
	if (list_count(job_queue) == 0) {
		debug("backfill: no jobs to backfill");
		list_destroy(job_queue);
		return 0;
	}

	gettimeofday(&bf_time1, NULL);

	slurmctld_diag_stats.bf_queue_len = list_count(job_queue);
	slurmctld_diag_stats.bf_queue_len_sum += slurmctld_diag_stats.
						 bf_queue_len;
	slurmctld_diag_stats.bf_last_depth = 0;
	slurmctld_diag_stats.bf_last_depth_try = 0;
	slurmctld_diag_stats.bf_when_last_cycle = now;
	bf_last_yields = 0;
	slurmctld_diag_stats.bf_active = 1;

	node_space = xmalloc(sizeof(node_space_map_t) *
			     (max_backfill_job_cnt + 3));
	node_space[0].begin_time = sched_start;
	node_space[0].end_time = sched_start + backfill_window;
	node_space[0].avail_bitmap = bit_copy(avail_node_bitmap);
	node_space[0].next = 0;
	node_space_recs = 1;
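	/* node_space is a linked list (chained through "next") of time
	 * slices; each record holds the node bitmap expected to be
	 * available between begin_time and end_time. Record 0 spans the
	 * whole backfill window with the currently available nodes. */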
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		_dump_node_space_table(node_space);

	if (max_backfill_job_per_user) {
		uid = xmalloc(BF_MAX_USERS * sizeof(uint32_t));
		njobs = xmalloc(BF_MAX_USERS * sizeof(uint16_t));
	}
	while ((job_queue_rec = (job_queue_rec_t *)
				list_pop_bottom(job_queue, sort_job_queue2))) {
		job_ptr  = job_queue_rec->job_ptr;
		orig_time_limit = job_ptr->time_limit;

		if ((time(NULL) - sched_start) >= sched_timeout) {
			uint32_t save_time_limit = job_ptr->time_limit;
			job_ptr->time_limit = orig_time_limit;
			if (debug_flags & DEBUG_FLAG_BACKFILL) {
				END_TIMER;
				info("backfill: completed yielding locks "
				     "after testing %d jobs, %s",
				     job_test_count, TIME_STR);
			}
			if (_yield_locks(yield_sleep) && !backfill_continue) {
				if (debug_flags & DEBUG_FLAG_BACKFILL) {
					info("backfill: system state changed, "
					     "breaking out after testing %d "
					     "jobs", job_test_count);
				}
				rc = 1;
				break;
			}
			job_ptr->time_limit = save_time_limit;
			/* Reset backfill scheduling timers, resume testing */
			sched_start = time(NULL);
			job_test_count = 0;
			START_TIMER;
		}

		part_ptr = job_queue_rec->part_ptr;
		job_test_count++;

		xfree(job_queue_rec);
		if (!IS_JOB_PENDING(job_ptr))
			continue;	/* started in other partition */
		if (!avail_front_end(job_ptr))
			continue;	/* No available frontend for this job */
		if (job_ptr->array_task_id != (uint16_t) NO_VAL) {
			if (reject_array_job_id == job_ptr->array_job_id)
				continue;  /* already rejected array element */
			/* assume reject whole array for now, clear if OK */
			reject_array_job_id = job_ptr->array_job_id;
		}
		job_ptr->part_ptr = part_ptr;

		if (debug_flags & DEBUG_FLAG_BACKFILL)
			info("backfill test for job %u", job_ptr->job_id);

		slurmctld_diag_stats.bf_last_depth++;
		already_counted = false;

		if (max_backfill_job_per_user) {
			for (j = 0; j < nuser; j++) {
				if (job_ptr->user_id == uid[j]) {
					njobs[j]++;
					if (debug_flags & DEBUG_FLAG_BACKFILL)
						debug("backfill: user %u: "
						      "#jobs %u",
						      uid[j], njobs[j]);
					break;
				}
			}
			if (j == nuser) { /* user not found */
				if (nuser < BF_MAX_USERS) {
					uid[j] = job_ptr->user_id;
					njobs[j] = 1;
					nuser++;
				} else {
					error("backfill: too many users in "
					      "queue. Consider increasing "
					      "BF_MAX_USERS");
				}
				if (debug_flags & DEBUG_FLAG_BACKFILL)
					debug2("backfill: found new user %u. "
					       "Total #users now %u",
					       job_ptr->user_id, nuser);
			} else {
				if (njobs[j] > max_backfill_job_per_user) {
					/* skip job */
					if (debug_flags & DEBUG_FLAG_BACKFILL)
						debug("backfill: have already "
						      "checked %u jobs for "
						      "user %u; skipping "
						      "job %u",
						      max_backfill_job_per_user,
						      job_ptr->user_id,
						      job_ptr->job_id);
					continue;
				}
			}
		}

		if (((part_ptr->state_up & PARTITION_SCHED) == 0) ||
		    (part_ptr->node_bitmap == NULL))
			continue;
		if ((part_ptr->flags & PART_FLAG_ROOT_ONLY) && filter_root)
			continue;

		if ((!job_independent(job_ptr, 0)) ||
		    (license_job_test(job_ptr, time(NULL)) != SLURM_SUCCESS))
			continue;

		/* Determine minimum and maximum node counts */
		min_nodes = MAX(job_ptr->details->min_nodes,
				part_ptr->min_nodes);
		if (job_ptr->details->max_nodes == 0)
			max_nodes = part_ptr->max_nodes;
		else
			max_nodes = MIN(job_ptr->details->max_nodes,
					part_ptr->max_nodes);
		max_nodes = MIN(max_nodes, 500000);     /* prevent overflows */
		if (job_ptr->details->max_nodes)
			req_nodes = max_nodes;
		else
			req_nodes = min_nodes;
		if (min_nodes > max_nodes) {
			/* job's min_nodes exceeds partition's max_nodes */
			continue;
		}

		/* Determine job's expected completion time */
		if (job_ptr->time_limit == NO_VAL) {
			if (part_ptr->max_time == INFINITE)
				time_limit = 365 * 24 * 60; /* one year */
			else
				time_limit = part_ptr->max_time;
		} else {
			if (part_ptr->max_time == INFINITE)
				time_limit = job_ptr->time_limit;
			else
				time_limit = MIN(job_ptr->time_limit,
						 part_ptr->max_time);
		}
		comp_time_limit = time_limit;
		qos_ptr = job_ptr->qos_ptr;
		if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE) &&
		    slurm_get_preempt_mode())
			time_limit = job_ptr->time_limit = 1;
		else if (job_ptr->time_min && (job_ptr->time_min < time_limit))
			time_limit = job_ptr->time_limit = job_ptr->time_min;
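		/* A job in a NO_RESERVE QOS (with preemption enabled) is
		 * tested with a nominal 1-minute limit and gets no
		 * reservation added below; a job with time_min set is
		 * tested at that smaller limit so it can fit a shorter
		 * backfill window. orig_time_limit holds the real limit
		 * for later restoration. */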

		/* Determine impact of any resource reservations */
		later_start = now;
 TRY_LATER:
		if ((time(NULL) - sched_start) >= sched_timeout) {
			uint32_t save_time_limit = job_ptr->time_limit;
			job_ptr->time_limit = orig_time_limit;
			if (debug_flags & DEBUG_FLAG_BACKFILL) {
				END_TIMER;
				info("backfill: completed yielding locks 2"
				     "after testing %d jobs, %s",
				     job_test_count, TIME_STR);
			}
			if (_yield_locks(yield_sleep) && !backfill_continue) {
				if (debug_flags & DEBUG_FLAG_BACKFILL) {
					info("backfill: system state changed, "
					     "breaking out after testing %d "
					     "jobs", job_test_count);
				}
				rc = 1;
				break;
			}
			job_ptr->time_limit = save_time_limit;
			/* Reset backfill scheduling timers, resume testing */
			sched_start = time(NULL);
			job_test_count = 1;
			START_TIMER;
		}

		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		start_res   = later_start;
		later_start = 0;
		j = job_test_resv(job_ptr, &start_res, true, &avail_bitmap,
				  &exc_core_bitmap);
		if (j != SLURM_SUCCESS) {
			job_ptr->time_limit = orig_time_limit;
			continue;
		}
		if (start_res > now)
			end_time = (time_limit * 60) + start_res;
		else
			end_time = (time_limit * 60) + now;
		resv_end = find_resv_end(start_res);
		/* Identify usable nodes for this job */
		bit_and(avail_bitmap, part_ptr->node_bitmap);
		bit_and(avail_bitmap, up_node_bitmap);
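		/* Walk the node_space time slices: AND into avail_bitmap the
		 * availability of every slice overlapping [start_res,
		 * end_time], and remember the first slice boundary after
		 * start_res as a candidate later_start retry time. */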
		for (j=0; ; ) {
			if ((node_space[j].end_time > start_res) &&
			     node_space[j].next && (later_start == 0))
				later_start = node_space[j].end_time;
			if (node_space[j].end_time <= start_res)
				;
			else if (node_space[j].begin_time <= end_time) {
				bit_and(avail_bitmap,
					node_space[j].avail_bitmap);
			} else
				break;
			if ((j = node_space[j].next) == 0)
				break;
		}
		if ((resv_end++) &&
		    ((later_start == 0) || (resv_end < later_start))) {
			later_start = resv_end;
		}

		if (job_ptr->details->exc_node_bitmap) {
			bit_not(job_ptr->details->exc_node_bitmap);
			bit_and(avail_bitmap,
				job_ptr->details->exc_node_bitmap);
			bit_not(job_ptr->details->exc_node_bitmap);
		}

		/* Test if insufficient nodes remain OR
		 *	required nodes missing OR
		 *	nodes lack features */
		if ((bit_set_count(avail_bitmap) < min_nodes) ||
		    ((job_ptr->details->req_node_bitmap) &&
		     (!bit_super_set(job_ptr->details->req_node_bitmap,
				     avail_bitmap))) ||
		    (job_req_node_filter(job_ptr, avail_bitmap))) {
			if (later_start) {
				job_ptr->start_time = 0;
				goto TRY_LATER;
			}
			/* Job can not start until too far in the future */
			job_ptr->time_limit = orig_time_limit;
			job_ptr->start_time = sched_start + backfill_window;
			continue;
		}

		/* Identify nodes which are definitely off limits */
		FREE_NULL_BITMAP(resv_bitmap);
		resv_bitmap = bit_copy(avail_bitmap);
		bit_not(resv_bitmap);

		/* this is the time consuming operation */
		debug2("backfill: entering _try_sched for job %u.",
		       job_ptr->job_id);

		if (!already_counted) {
			slurmctld_diag_stats.bf_last_depth_try++;
			already_counted = true;
		}

		j = _try_sched(job_ptr, &avail_bitmap, min_nodes, max_nodes,
			       req_nodes, exc_core_bitmap);

		now = time(NULL);
		if (j != SLURM_SUCCESS) {
			job_ptr->time_limit = orig_time_limit;
			job_ptr->start_time = 0;
			continue;	/* not runable */
		}

		if (start_res > job_ptr->start_time) {
			job_ptr->start_time = start_res;
			last_job_update = now;
		}
		if (job_ptr->start_time <= now) {
			uint32_t save_time_limit = job_ptr->time_limit;
			int rc = _start_job(job_ptr, resv_bitmap);
			if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) {
				if (orig_time_limit == NO_VAL)
					job_ptr->time_limit = comp_time_limit;
				else
					job_ptr->time_limit = orig_time_limit;
				job_ptr->end_time = job_ptr->start_time +
						    (job_ptr->time_limit * 60);
			} else if ((rc == SLURM_SUCCESS) && job_ptr->time_min) {
				/* Set time limit as high as possible */
				job_ptr->time_limit = comp_time_limit;
				job_ptr->end_time = job_ptr->start_time +
						    (comp_time_limit * 60);
				_reset_job_time_limit(job_ptr, now,
						      node_space);
				time_limit = job_ptr->time_limit;
			} else {
				job_ptr->time_limit = orig_time_limit;
			}
			if (rc == ESLURM_ACCOUNTING_POLICY) {
				/* Unknown future start time, just skip job */
				job_ptr->start_time = 0;
				continue;
			} else if (rc != SLURM_SUCCESS) {
				/* Planned to start job, but something bad
				 * happened. */
				job_ptr->start_time = 0;
				break;
			} else {
				/* Started this job, move to next one */
				reject_array_job_id = 0;

				/* Update the database if job time limit
				 * changed and move to next job */
				if (save_time_limit != job_ptr->time_limit)
					jobacct_storage_g_job_start(acct_db_conn,
								    job_ptr);
				continue;
			}
		} else
			job_ptr->time_limit = orig_time_limit;

		if (later_start && (job_ptr->start_time > later_start)) {
			/* Try later when some nodes currently reserved for
			 * pending jobs are free */
			job_ptr->start_time = 0;
			goto TRY_LATER;
		}

		if (job_ptr->start_time > (sched_start + backfill_window)) {
			/* Starts too far in the future to worry about */
			continue;
		}

		if (node_space_recs >= max_backfill_job_cnt) {
			/* Already have too many jobs to deal with */
			break;
		}

		end_reserve = job_ptr->start_time + (time_limit * 60);
		if (_test_resv_overlap(node_space, avail_bitmap,
				       job_ptr->start_time, end_reserve)) {
			/* This job overlaps with an existing reservation for
			 * job to be backfill scheduled, which the sched
			 * plugin does not know about. Try again later. */
			later_start = job_ptr->start_time;
			job_ptr->start_time = 0;
			goto TRY_LATER;
		}

		/*
		 * Add reservation to scheduling table if appropriate
		 */
		if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE))
			continue;
		reject_array_job_id = 0;
		bit_not(avail_bitmap);
		_add_reservation(job_ptr->start_time, end_reserve,
				 avail_bitmap, node_space, &node_space_recs);
		if (debug_flags & DEBUG_FLAG_BACKFILL)
			_dump_node_space_table(node_space);
	}
	xfree(uid);
	xfree(njobs);
	FREE_NULL_BITMAP(avail_bitmap);
	FREE_NULL_BITMAP(exc_core_bitmap);
	FREE_NULL_BITMAP(resv_bitmap);

	for (i=0; ; ) {
		FREE_NULL_BITMAP(node_space[i].avail_bitmap);
		if ((i = node_space[i].next) == 0)
			break;
	}
	xfree(node_space);
	list_destroy(job_queue);
	gettimeofday(&bf_time2, NULL);
	_do_diag_stats(&bf_time1, &bf_time2, yield_sleep);
	if (debug_flags & DEBUG_FLAG_BACKFILL) {
		END_TIMER;
		info("backfill: completed testing %d jobs, %s",
		     job_test_count, TIME_STR);
	}
	return rc;
}
Example #25
0
File: gang.c Project: corburn/slurm
/* ensure that all jobs running in SLURM are accounted for.
 * this procedure assumes that the gs data has already been
 * locked by the caller!
 */
static void _scan_slurm_job_list(void)
{
	struct job_record *job_ptr;
	struct gs_part *p_ptr;
	int i;
	ListIterator job_iterator;
	char *part_name;

	if (!job_list) {	/* no jobs */
		if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
			info("gang: _scan_slurm_job_list: job_list NULL");
		return;
	}
	if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
		info("gang: _scan_slurm_job_list: job_list exists...");
	job_iterator = list_iterator_create(job_list);
	while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
		if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) {
			info("gang: _scan_slurm_job_list: checking job %u",
			    job_ptr->job_id);
		}
		if (IS_JOB_PENDING(job_ptr))
			continue;
		if (IS_JOB_SUSPENDED(job_ptr) && (job_ptr->priority == 0))
			continue;	/* not suspended by us */

		if (job_ptr->part_ptr && job_ptr->part_ptr->name)
			part_name = job_ptr->part_ptr->name;
		else
			part_name = job_ptr->partition;

		if (IS_JOB_SUSPENDED(job_ptr) || IS_JOB_RUNNING(job_ptr)) {
			/* are we tracking this job already? */
			p_ptr = list_find_first(gs_part_list, _find_gs_part,
						part_name);
			if (!p_ptr) /* no partition */
				continue;
			i = _find_job_index(p_ptr, job_ptr->job_id);
			if (i >= 0) /* we're tracking it, so continue */
				continue;

			/* We're not tracking this job. Resume it if it's
			 * suspended, and then add it to the job list. */

			if (IS_JOB_SUSPENDED(job_ptr) && job_ptr->priority) {
			/* The likely scenario here is that the controller
				 * failed over, and this is a job that gang
				 * had previously suspended. It's not possible
				 * to determine the previous order of jobs
				 * without preserving gang state, which is not
				 * worth the extra infrastructure. Just resume
				 * the job and then add it to the job list.
				 */
				_resume_job(job_ptr->job_id);
			}

			_add_job_to_part(p_ptr, job_ptr);
			continue;
		}

		/* if the job is not pending, suspended, or running, then
		 * it's completing or completed. Make sure we've released
		 * this job */
		p_ptr = list_find_first(gs_part_list, _find_gs_part, part_name);
		if (!p_ptr) /* no partition */
			continue;
		_remove_job_from_part(job_ptr->job_id, p_ptr, false);
	}
	list_iterator_destroy(job_iterator);

	/* now that all of the old jobs have been flushed out,
	 * update the active row of all partitions */
	_update_all_active_rows();

	return;
}
Example #26
0
/*
 * scontrol_hold - perform some job hold/release operation
 * IN op	- hold/release operation
 * IN job_str	- a job ID or job name
 * RET 0 if no slurm error, errno otherwise. parsing error prints
 *		error message and returns 0
 */
extern int
scontrol_hold(char *op, char *job_str)
{
	static uint32_t last_job_id = NO_VAL;
	static job_info_msg_t *jobs = NULL;
	job_array_resp_msg_t *resp = NULL;
	int i, rc = SLURM_SUCCESS, rc2;
	int j;
	job_desc_msg_t job_msg;
	uint32_t job_id = 0;
	char *job_name = NULL;
	char *job_id_str = NULL;
	slurm_job_info_t *job_ptr;

	if (job_str && !xstrncasecmp(job_str, "JobID=", 6))
		job_str += 6;
	if (job_str && !xstrncasecmp(job_str, "Job=", 4))
		job_str += 4;

	slurm_init_job_desc_msg (&job_msg);
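	/* Setting priority to 0 requests a hold (ALLOC_SID_USER_HOLD marks
	 * it as a user hold); setting it to INFINITE requests a release,
	 * after which the controller recomputes the job's priority. */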
	if ((xstrncasecmp(op, "holdu", 5) == 0) ||
	    (xstrncasecmp(op, "uhold", 5) == 0)) {
		job_msg.priority = 0;
		job_msg.alloc_sid = ALLOC_SID_USER_HOLD;
	} else if (xstrncasecmp(op, "hold", 4) == 0) {
		job_msg.priority = 0;
		job_msg.alloc_sid = 0;
	} else
		job_msg.priority = INFINITE;

	if (_is_job_id(job_str)) {
		while ((job_msg.job_id_str = _next_job_id())) {
			rc2 = slurm_update_job2(&job_msg, &resp);
			if (rc2 != SLURM_SUCCESS) {
				rc2 = slurm_get_errno();
				rc = MAX(rc, rc2);
				exit_code = 1;
				if (quiet_flag != 1) {
					fprintf(stderr, "%s for job %s\n",
						slurm_strerror(rc2),
						job_msg.job_id_str);
				}
			} else if (resp) {
				for (i = 0; i < resp->job_array_count; i++) {
					if ((resp->error_code[i]==SLURM_SUCCESS)
					    && (resp->job_array_count == 1))
						continue;
					exit_code = 1;
					if (quiet_flag == 1)
						continue;
					fprintf(stderr, "%s: %s\n",
						resp->job_array_id[i],
						slurm_strerror(resp->
							       error_code[i]));
				}
				slurm_free_job_array_resp(resp);
				resp = NULL;
			}
		}
		return rc;
	} else if (job_str) {
		if (!xstrncasecmp(job_str, "Name=", 5)) {
			job_str += 5;
			job_id = 0;
			job_name = job_str;
			last_job_id = NO_VAL;
		} else {
			exit_code = 1;
			rc = ESLURM_INVALID_JOB_ID;
			slurm_seterrno(rc);
			if (quiet_flag != 1) {
				fprintf(stderr, "%s for job %s\n",
					slurm_strerror(rc), job_str);
			}
			return rc;
		}
	} else {
		last_job_id = NO_VAL;	/* Refresh cache on next call */
		return 0;
	}

	if (last_job_id != job_id) {
		if (scontrol_load_job(&jobs, job_id)) {
			if (quiet_flag == -1)
				slurm_perror ("slurm_load_job error");
			return 1;
		}
		last_job_id = job_id;
	}

	/* Update every loaded job whose name matches the requested job name */
	for (i = 0, job_ptr = jobs->job_array; i < jobs->record_count;
	     i++, job_ptr++) {
		if (xstrcmp(job_name, job_ptr->name))
			continue;

		if (!IS_JOB_PENDING(job_ptr)) {
			if (job_ptr->array_task_id != NO_VAL)
				continue;
			slurm_seterrno(ESLURM_JOB_NOT_PENDING);
			rc = MAX(rc, ESLURM_JOB_NOT_PENDING);
		}

		if (job_ptr->array_task_str) {
			xstrfmtcat(job_id_str, "%u_%s",
				   job_ptr->array_job_id,
				   job_ptr->array_task_str);
		} else if (job_ptr->array_task_id != NO_VAL) {
			xstrfmtcat(job_id_str, "%u_%u",
				   job_ptr->array_job_id,
				   job_ptr->array_task_id);
		} else {
			xstrfmtcat(job_id_str, "%u", job_ptr->job_id);
		}
		job_msg.job_id_str = job_id_str;
		rc2 = slurm_update_job2(&job_msg, &resp);
		if (rc2 != SLURM_SUCCESS) {
			rc2 = slurm_get_errno();
			rc = MAX(rc, rc2);
			exit_code = 1;
			if (quiet_flag != 1) {
				fprintf(stderr, "%s for job %s\n",
					slurm_strerror(rc2),
					job_msg.job_id_str);
			}
		} else if (resp) {
			for (j = 0; j < resp->job_array_count; j++) {
				if ((resp->error_code[j] == SLURM_SUCCESS) &&
				    (resp->job_array_count == 1))
					continue;
				exit_code = 1;
				if (quiet_flag == 1)
					continue;
				fprintf(stderr, "%s: %s\n",
					resp->job_array_id[j],
					slurm_strerror(resp->error_code[j]));
			}
			slurm_free_job_array_resp(resp);
			resp = NULL;
		}
		xfree(job_id_str);
	}

	return rc;
}
Example #27
0
File: start_job.c Project: VURM/slurm
/*
 * Attempt to start a job
 * jobid     (IN) - job id
 * task_cnt  (IN) - total count of tasks to start
 * hostlist  (IN) - SLURM hostlist expression with no repeated hostnames
 * tasklist  (IN/OUT) - comma separated list of hosts with tasks to be started,
 *                  list hostname once per task to start
 * comment_ptr (IN) - new comment field for the job or NULL for no change
 * err_code (OUT) - Moab error code
 * err_msg  (OUT) - Moab error message
 */
static int	_start_job(uint32_t jobid, int task_cnt, char *hostlist,
			char *tasklist, char *comment_ptr,
			int *err_code, char **err_msg)
{
	int rc = 0, old_task_cnt = 1;
	struct job_record *job_ptr;
	/* Write lock on job info, read lock on node info */
	slurmctld_lock_t job_write_lock = {
		NO_LOCK, WRITE_LOCK, READ_LOCK, NO_LOCK };
	char *new_node_list = NULL;
	static char tmp_msg[128];
	bitstr_t *new_bitmap = (bitstr_t *) NULL;
	bitstr_t *save_req_bitmap = (bitstr_t *) NULL;
	bitoff_t i, bsize;
	int ll; /* layout info index */
	char *node_name, *node_idx, *node_cur, *save_req_nodes = NULL;
	size_t node_name_len;
	static uint32_t cr_test = 0, cr_enabled = 0;

	if (cr_test == 0) {
		select_g_get_info_from_plugin(SELECT_CR_PLUGIN, NULL,
						&cr_enabled);
		cr_test = 1;
	}

	lock_slurmctld(job_write_lock);
	job_ptr = find_job_record(jobid);
	if (job_ptr == NULL) {
		*err_code = -700;
		*err_msg = "No such job";
		error("wiki: Failed to find job %u", jobid);
		rc = -1;
		goto fini;
	}

	if ((job_ptr->details == NULL) || (!IS_JOB_PENDING(job_ptr))) {
		*err_code = -700;
		*err_msg = "Job not pending, can't start";
		error("wiki: Attempt to start job %u in state %s",
			jobid, job_state_string(job_ptr->job_state));
		rc = -1;
		goto fini;
	}

	if (comment_ptr) {
		char *reserved = strstr(comment_ptr, "RESERVED:");
		if (reserved) {
			reserved += 9;
			job_ptr->details->reserved_resources =
				strtol(reserved, NULL, 10);
		}
		xfree(job_ptr->comment);
		job_ptr->comment = xstrdup(comment_ptr);
	}

	if (task_cnt) {
		new_node_list = xstrdup(hostlist);
		if (node_name2bitmap(new_node_list, false, &new_bitmap) != 0) {
			*err_code = -700;
			*err_msg = "Invalid TASKLIST";
			error("wiki: Attempt to set invalid node list for "
				"job %u, %s",
				jobid, hostlist);
			xfree(new_node_list);
			rc = -1;
			goto fini;
		}

		if (!bit_super_set(new_bitmap, avail_node_bitmap)) {
			/* Selected node is UP and not responding
			 * or it just went DOWN */
			*err_code = -700;
			*err_msg = "TASKLIST includes non-responsive node";
			error("wiki: Attempt to use non-responsive nodes for "
				"job %u, %s",
				jobid, hostlist);
			xfree(new_node_list);
			FREE_NULL_BITMAP(new_bitmap);
			rc = -1;
			goto fini;
		}

		/* User excluded node list incompatible with Wiki
		 * Exclude all nodes not explicitly requested */
		FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap);
		job_ptr->details->exc_node_bitmap = bit_copy(new_bitmap);
		bit_not(job_ptr->details->exc_node_bitmap);
	}

	/* Build layout information from tasklist (assuming that Moab
	 * sends a non-bracketed list of nodes, repeated as many times
	 * as cpus should be used per node); at this point, node names
	 * are comma-separated. This is _not_ a fast algorithm as it
	 * performs many string compares. */
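	/* Hypothetical illustration: a tasklist of "tux1,tux1,tux2" with
	 * cpus_per_task=1 would produce a req_node_layout of 2 CPUs on
	 * tux1 and 1 CPU on tux2 (node names here are made up). */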
	xfree(job_ptr->details->req_node_layout);
	if (task_cnt && cr_enabled) {
		uint16_t cpus_per_task = MAX(1, job_ptr->details->cpus_per_task);
		job_ptr->details->req_node_layout = (uint16_t *)
			xmalloc(bit_set_count(new_bitmap) * sizeof(uint16_t));
		bsize = bit_size(new_bitmap);
		for (i = 0, ll = -1; i < bsize; i++) {
			if (!bit_test(new_bitmap, i))
				continue;
			ll++;
			node_name = node_record_table_ptr[i].name;
			node_name_len  = strlen(node_name);
			if (node_name_len == 0)
				continue;
			node_cur = tasklist;
			while (*node_cur) {
				if ((node_idx = strstr(node_cur, node_name))) {
					if ((node_idx[node_name_len] == ',') ||
				 	    (node_idx[node_name_len] == '\0')) {
						job_ptr->details->
							req_node_layout[ll] +=
							cpus_per_task;
					}
					node_cur = strchr(node_idx, ',');
					if (node_cur)
						continue;
				}
				break;
			}
		}
	}

	/* save and update job state to start now */
	save_req_nodes = job_ptr->details->req_nodes;
	job_ptr->details->req_nodes = new_node_list;
	save_req_bitmap = job_ptr->details->req_node_bitmap;
	job_ptr->details->req_node_bitmap = new_bitmap;
	old_task_cnt = job_ptr->details->min_cpus;
	job_ptr->details->min_cpus = MAX(task_cnt, old_task_cnt);
	job_ptr->priority = 100000000;
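	/* Bump the priority so the schedule() call below considers this job
	 * first; on failure the priority is reset to 0 (held) further down. */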

 fini:	unlock_slurmctld(job_write_lock);
	if (rc)
		return rc;

	/* No errors so far */
	(void) schedule(INFINITE);	/* provides own locking */

	/* Check to insure the job was actually started */
	lock_slurmctld(job_write_lock);
	if (job_ptr->job_id != jobid)
		job_ptr = find_job_record(jobid);

	if (job_ptr && (job_ptr->job_id == jobid) &&
	    (!IS_JOB_RUNNING(job_ptr))) {
		uint16_t wait_reason = 0;
		char *wait_string;

		if (IS_JOB_FAILED(job_ptr))
			wait_string = "Invalid request, job aborted";
		else {
			wait_reason = job_ptr->state_reason;
			if (wait_reason == WAIT_HELD) {
				/* some job is completing, slurmctld did
				 * not even try to schedule this job */
				wait_reason = WAIT_RESOURCES;
			}
			wait_string = job_reason_string(wait_reason);
			job_ptr->state_reason = WAIT_HELD;
			xfree(job_ptr->state_desc);
		}
		*err_code = -910 - wait_reason;
		snprintf(tmp_msg, sizeof(tmp_msg),
			"Could not start job %u(%s): %s",
			jobid, new_node_list, wait_string);
		*err_msg = tmp_msg;
		error("wiki: %s", tmp_msg);

		/* restore some of job state */
		job_ptr->priority = 0;
		job_ptr->details->min_cpus = old_task_cnt;
		rc = -1;
	}

	if (job_ptr && (job_ptr->job_id == jobid) && job_ptr->details) {
		/* Restore required node list in case job requeued */
		xfree(job_ptr->details->req_nodes);
		job_ptr->details->req_nodes = save_req_nodes;
		FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap);
		job_ptr->details->req_node_bitmap = save_req_bitmap;
		FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap);
		xfree(job_ptr->details->req_node_layout);
	} else {
		error("wiki: start_job(%u) job missing", jobid);
		xfree(save_req_nodes);
		FREE_NULL_BITMAP(save_req_bitmap);
	}

	unlock_slurmctld(job_write_lock);
	schedule_node_save();	/* provides own locking */
	schedule_job_save();	/* provides own locking */
	return rc;
}
Example #28
0
File: scancel.c Project: adammoody/slurm
static int _verify_job_ids(void)
{
	job_info_t *job_ptr;
	int i, j, rc = 0;

	if (opt.job_cnt == 0)
		return rc;

	opt.job_found = xmalloc(sizeof(bool) * opt.job_cnt);
	opt.job_pend  = xmalloc(sizeof(bool) * opt.job_cnt);
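	/* job_found[j] records whether the user-specified job ID at index j
	 * matched any loaded job record; job_pend[j] records whether that
	 * job is still pending. Both parallel the opt.job_id[] array. */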
	job_ptr = job_buffer_ptr->job_array;
	for (i = 0; i < job_buffer_ptr->record_count; i++, job_ptr++) {
		/* NOTE: We re-use the job's "assoc_id" value as a flag to
		 * record if the job is referenced in the job list supplied
		 * by the user. */
		job_ptr->assoc_id = 0;
		if (IS_JOB_FINISHED(job_ptr))
			job_ptr->job_id = 0;
		if (job_ptr->job_id == 0)
			continue;

		for (j = 0; j < opt.job_cnt; j++) {
			if (opt.array_id[j] == NO_VAL) {
				if ((opt.job_id[j] == job_ptr->job_id) ||
				    ((opt.job_id[j] == job_ptr->array_job_id) &&
				     (opt.step_id[j] == SLURM_BATCH_SCRIPT))) {
					opt.job_found[j] = true;
				}
			} else if (opt.array_id[j] == INFINITE) {
				if (opt.job_id[j] == job_ptr->array_job_id) {
					opt.job_found[j] = true;
				}
			} else if (opt.job_id[j] != job_ptr->array_job_id) {
				continue;
			} else if (_is_task_in_job(job_ptr, opt.array_id[j])) {
				opt.job_found[j] = true;
			}
			if (opt.job_found[j]) {
				if (IS_JOB_PENDING(job_ptr))
					opt.job_pend[j] = true;
				job_ptr->assoc_id = 1;
			}
		}
		if (job_ptr->assoc_id == 0)
			job_ptr->job_id = 0;
	}

	for (j = 0; j < opt.job_cnt; j++) {
		char *job_id_str = NULL;
		if (!opt.job_found[j])
			rc = 1;
		else
			continue;

		if (opt.verbose < 0) {
			;
		} else if (opt.array_id[j] == NO_VAL) {
			xstrfmtcat(job_id_str, "%u", opt.job_id[j]);
		} else if (opt.array_id[j] == INFINITE) {
			xstrfmtcat(job_id_str, "%u_*", opt.job_id[j]);
		} else {
			xstrfmtcat(job_id_str, "%u_%u", opt.job_id[j],
				   opt.array_id[j]);
		}

		if (opt.verbose < 0) {
			;
		} else if (opt.step_id[j] == SLURM_BATCH_SCRIPT) {
			error("Kill job error on job id %s: %s",
			      job_id_str,
			      slurm_strerror(ESLURM_INVALID_JOB_ID));
		} else {
			error("Kill job error on job step id %s.%u: %s",
			      job_id_str, opt.step_id[j],
			      slurm_strerror(ESLURM_INVALID_JOB_ID));
		}
		xfree(job_id_str);

		/* Avoid this job in the cancel_job logic */
		opt.job_id[j] = 0;
	}

	return rc;
}
Example #29
0
/*
 * Attempt to allocate resources and begin file staging for pending jobs.
 */
extern int bb_p_job_try_stage_in(List job_queue)
{
	bb_job_queue_rec_t *job_rec;
	List job_candidates;
	ListIterator job_iter;
	struct job_record *job_ptr;
	uint64_t bb_size;
	int rc;

	if (bb_state.bb_config.debug_flag)
		info("%s: %s", plugin_type,  __func__);

	if (!bb_state.bb_config.start_stage_in)
		return SLURM_ERROR;

	/* Identify candidates to be allocated burst buffers */
	job_candidates = list_create(bb_job_queue_del);
	job_iter = list_iterator_create(job_queue);
	while ((job_ptr = list_next(job_iter))) {
		if (!IS_JOB_PENDING(job_ptr) ||
		    (job_ptr->start_time == 0) ||
		    (job_ptr->burst_buffer == NULL) ||
		    (job_ptr->burst_buffer[0] == '\0'))
			continue;
		if (job_ptr->array_recs && (job_ptr->array_task_id == NO_VAL))
			continue;
		bb_size = _get_bb_size(job_ptr);
		if (bb_size == 0)
			continue;
		job_rec = xmalloc(sizeof(bb_job_queue_rec_t));
		job_rec->job_ptr = job_ptr;
		job_rec->bb_size = bb_size;
		list_push(job_candidates, job_rec);
	}
	list_iterator_destroy(job_iter);

	/* Sort in order of expected start time */
	list_sort(job_candidates, bb_job_queue_sort);

	pthread_mutex_lock(&bb_state.bb_mutex);
	bb_set_use_time(&bb_state);
	job_iter = list_iterator_create(job_candidates);
	while ((job_rec = list_next(job_iter))) {
		job_ptr = job_rec->job_ptr;
		bb_size = job_rec->bb_size;

		if (bb_find_alloc_rec(&bb_state, job_ptr))
			continue;
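		/* _test_size_limit() results, as used here: 1 means skip
		 * just this job for now, 2 means stop considering any
		 * further candidates in this pass. */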

		rc = _test_size_limit(job_ptr, bb_size);
		if (rc == 1)
			continue;
		else if (rc == 2)
			break;

		_alloc_job_bb(job_ptr, bb_size);
	}
	list_iterator_destroy(job_iter);
	pthread_mutex_unlock(&bb_state.bb_mutex);
	FREE_NULL_LIST(job_candidates);

	return SLURM_SUCCESS;
}
Example #30
0
File: scancel.c Project: adammoody/slurm
static void _cancel_jobid_by_state(uint32_t job_state, int filter_cnt, int *rc)
{
	job_cancel_info_t *cancel_info;
	job_info_t *job_ptr;
	pthread_t dummy;
	int err, i, j;

	if (opt.job_cnt == 0)
		return;

	for (j = 0; j < opt.job_cnt; j++) {
		if (opt.job_id[j] == 0)
			continue;
		if ((job_state == JOB_PENDING) && !opt.job_pend[j])
			continue;

		job_ptr = job_buffer_ptr->job_array;
		for (i = 0; i < job_buffer_ptr->record_count; i++, job_ptr++) {
			if (IS_JOB_FINISHED(job_ptr))
				job_ptr->job_id = 0;
			if (job_ptr->job_id == 0)
				continue;
			if ((opt.step_id[j] != SLURM_BATCH_SCRIPT) &&
			    IS_JOB_PENDING(job_ptr)) {
				/* User specified #.# for step, but the job ID
				 * may be job array leader with part of job
				 * array running with other tasks pending */
				continue;
			}

			opt.job_found[j] = false;
			if (opt.array_id[j] == NO_VAL) {
				if ((opt.job_id[j] == job_ptr->job_id) ||
				    ((opt.job_id[j] == job_ptr->array_job_id) &&
				     (opt.step_id[j] == SLURM_BATCH_SCRIPT))) {
					opt.job_found[j] = true;
				}
			} else if (opt.array_id[j] == INFINITE) {
				if (opt.job_id[j] == job_ptr->array_job_id) {
					opt.job_found[j] = true;
				}
			} else if (opt.job_id[j] != job_ptr->array_job_id) {
				continue;
			} else if (_is_task_in_job(job_ptr, opt.array_id[j])) {
				opt.job_found[j] = true;
			}

			if (!opt.job_found[j])
				continue;

			if (opt.interactive &&
			    (_confirmation(job_ptr, opt.step_id[j]) == 0)) {
				job_ptr->job_id = 0;	/* Don't check again */
				continue;
			}

			slurm_mutex_lock(&num_active_threads_lock);
			num_active_threads++;
			while (num_active_threads > MAX_THREADS) {
				slurm_cond_wait(&num_active_threads_cond,
						&num_active_threads_lock);
			}
			slurm_mutex_unlock(&num_active_threads_lock);

			cancel_info = (job_cancel_info_t *)
				      xmalloc(sizeof(job_cancel_info_t));
			cancel_info->rc      = rc;
			cancel_info->sig     = opt.signal;
			cancel_info->num_active_threads = &num_active_threads;
			cancel_info->num_active_threads_lock =
					&num_active_threads_lock;
			cancel_info->num_active_threads_cond =
					&num_active_threads_cond;
			if (opt.step_id[j] == SLURM_BATCH_SCRIPT) {
				cancel_info->job_id_str =
					_build_jobid_str(job_ptr);
				err = pthread_create(&dummy, &attr,
						     _cancel_job_id,
						     cancel_info);
				if (err)  /* Run in-line as needed */
					_cancel_job_id(cancel_info);
				job_ptr->job_id = 0;
			} else {
				cancel_info->job_id = job_ptr->job_id;
				cancel_info->step_id = opt.step_id[j];
				err = pthread_create(&dummy, &attr,
						     _cancel_step_id,
						     cancel_info);
				if (err)  /* Run in-line as needed */
					_cancel_step_id(cancel_info);
			}

			if (opt.interactive) {
				/* Print any error message for first job before
				 * starting confirmation of next job */
				slurm_mutex_lock(&num_active_threads_lock);
				while (num_active_threads > 0) {
					slurm_cond_wait(&num_active_threads_cond,
							&num_active_threads_lock);
				}
				slurm_mutex_unlock(&num_active_threads_lock);
			}
		}
	}
}