Example #1
static void _requeue_when_finished(uint32_t job_id)
{
	/* Locks: Write job */
	slurmctld_lock_t job_write_lock = {
		NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK };
	struct job_record *job_ptr;

	while (1) {
		lock_slurmctld(job_write_lock);
		job_ptr = find_job_record(job_id);
		if (!job_ptr) {
			/* Job record purged; nothing left to requeue */
			unlock_slurmctld(job_write_lock);
			break;
		}
		if (IS_JOB_FINISHED(job_ptr)) {
			job_ptr->job_state = JOB_PENDING;
			job_ptr->details->submit_time = time(NULL);
			job_ptr->restart_cnt++;
			/* The job completion logger removes the submit
			 * count, so we need to add it again. */
			acct_policy_add_job_submit(job_ptr);
			unlock_slurmctld(job_write_lock);
			break;
		} else {
			unlock_slurmctld(job_write_lock);
			sleep(1);
		}
	}
}
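
Every listing here pivots on the IS_JOB_FINISHED() test. For reference, a
minimal sketch of the state-test macro family, assuming the definitions in
slurm/slurm_protocol_defs.h (paraphrased; check the header in your tree for
the authoritative form):

/* JOB_STATE_BASE masks off the flag bits; the base states are ordered so
 * that anything past JOB_SUSPENDED (COMPLETE, CANCELLED, FAILED, TIMEOUT,
 * NODE_FAIL, ...) counts as finished. */
#define IS_JOB_PENDING(_X)	\
	((_X->job_state & JOB_STATE_BASE) == JOB_PENDING)
#define IS_JOB_RUNNING(_X)	\
	((_X->job_state & JOB_STATE_BASE) == JOB_RUNNING)
#define IS_JOB_SUSPENDED(_X)	\
	((_X->job_state & JOB_STATE_BASE) == JOB_SUSPENDED)
#define IS_JOB_FINISHED(_X)	\
	((_X->job_state & JOB_STATE_BASE) >  JOB_SUSPENDED)
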
Example #2
static int _ckpt_step(struct step_record * step_ptr, uint16_t wait, int vacate)
{
	struct check_job_info *check_ptr;
	struct job_record *job_ptr;
	char *argv[3];

	xassert(step_ptr);
	check_ptr = (struct check_job_info *) step_ptr->check_job;
	xassert(check_ptr);
	job_ptr = step_ptr->job_ptr;
	xassert(job_ptr);

	if (IS_JOB_FINISHED(job_ptr))
		return ESLURM_ALREADY_DONE;

	if (check_ptr->disabled)
		return ESLURM_DISABLED;

	argv[0] = "ompi-checkpoint";
	if (vacate) {
		argv[1] = "--term";
		argv[2] = NULL;
	} else
		argv[1] = NULL;
	srun_exec(step_ptr, argv);
	check_ptr->time_stamp = time(NULL);
	check_ptr->wait_time  = wait;
	info("checkpoint requested for job %u.%u",
		job_ptr->job_id, step_ptr->step_id);
	return SLURM_SUCCESS;
}
Example #3
static int _list_find_sicp_old(void *sicp_entry, void *key)
{
	sicp_job_t *sicp_ptr = (sicp_job_t *)sicp_entry;
	time_t old;

//FIXME: Do not purge if we lack current information from this cluster
	if (!(IS_JOB_FINISHED(sicp_ptr)))
		return 0;	/* Job still active */

	old = time(NULL) - (24 * 60 * 60);	/* One day */
	if (sicp_ptr->update_time > old)
		return 0;	/* Record updated within the last day */

	return 1;
}
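
The 0/1 return follows Slurm's ListFindF callback convention, so the finder
can drive list_find_first() or list_delete_all(). A hypothetical call site
(the sicp_job_list name is assumed here):

/* Hypothetical purge pass: delete every sicp record whose job finished
 * more than a day ago; the key argument is unused by this finder. */
list_delete_all(sicp_job_list, _list_find_sicp_old, NULL);
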
Example #4
static int	_job_notify(uint32_t jobid, char *msg_ptr)
{
	struct job_record *job_ptr;

	job_ptr = find_job_record(jobid);
	if (job_ptr == NULL) {
		error("wiki: NOTIFYJOB has invalid jobid %u", jobid);
		return ESLURM_INVALID_JOB_ID;
	}
	if (IS_JOB_FINISHED(job_ptr)) {
		error("wiki: NOTIFYJOB jobid %u is finished", jobid);
		return ESLURM_INVALID_JOB_ID;
	}
	srun_user_message(job_ptr, msg_ptr);
	return SLURM_SUCCESS;
}
Example #5
/* Send specified signal only to the process launched on node 0.
 * If the request times out, send sig_timeout. */
static int _step_sig(struct step_record * step_ptr, uint16_t wait,
		uint16_t signal, uint16_t sig_timeout)
{
	struct check_job_info *check_ptr;
	struct job_record *job_ptr;
	int i;

	xassert(step_ptr);
	check_ptr = (struct check_job_info *) step_ptr->check_job;
	xassert(check_ptr);
	job_ptr = step_ptr->job_ptr;
	xassert(job_ptr);

	if (IS_JOB_FINISHED(job_ptr))
		return ESLURM_ALREADY_DONE;

	if (check_ptr->disabled)
		return ESLURM_DISABLED;

	check_ptr->node_cnt = 0;	/* re-calculate below */
	for (i = 0; i < node_record_count; i++) {
		if (bit_test(step_ptr->step_node_bitmap, i) == 0)
			continue;
		if (check_ptr->node_cnt++ > 0)
			continue;
		_send_sig(step_ptr->job_ptr->job_id, step_ptr->step_id,
			signal, node_record_table_ptr[i].name,
			node_record_table_ptr[i].slurm_addr);
		_ckpt_enqueue_timeout(step_ptr->job_ptr->job_id,
			step_ptr->step_id, check_ptr->time_stamp,
			sig_timeout, wait, node_record_table_ptr[i].name,
			node_record_table_ptr[i].slurm_addr);
	}

	if (!check_ptr->node_cnt) {
		error("_step_sig: job %u.%u has no nodes", job_ptr->job_id,
			step_ptr->step_id);
		return ESLURM_INVALID_NODE_NAME;
	}

	check_ptr->time_stamp = time(NULL);
	check_ptr->wait_time  = wait;

	info("checkpoint requested for job %u.%u", job_ptr->job_id,
		step_ptr->step_id);
	return SLURM_SUCCESS;
}
Example #6
static int	_job_signal(uint32_t jobid, uint16_t sig_num)
{
	struct job_record *job_ptr;
	int rc = SLURM_SUCCESS;

	job_ptr = find_job_record(jobid);
	if (job_ptr == NULL)
		return ESLURM_INVALID_JOB_ID;
	if (IS_JOB_FINISHED(job_ptr))
		return ESLURM_ALREADY_DONE;

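	/* Signal the batch script first, then every step; the third
	 * argument to job_signal() is taken here to be the batch flag
	 * (an assumption; check job_signal()'s prototype in your tree). */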
	if (job_ptr->batch_flag)
		rc = job_signal(jobid, sig_num, 1, 0, false);
	if (rc == SLURM_SUCCESS)
		rc = job_signal(jobid, sig_num, 0, 0, false);
	return rc;
}
Example #7
static int	_job_modify(uint32_t jobid, char *bank_ptr,
			char *depend_ptr, char *new_hostlist,
			uint32_t new_node_cnt, char *part_name_ptr,
			uint32_t new_time_limit, char *name_ptr,
			char *start_ptr, char *feature_ptr, char *env_ptr,
			char *comment_ptr, char *gres_ptr, char *wckey_ptr)
{
	struct job_record *job_ptr;
	time_t now = time(NULL);
	bool update_accounting = false;

	job_ptr = find_job_record(jobid);
	if (job_ptr == NULL) {
		error("wiki: MODIFYJOB has invalid jobid %u", jobid);
		return ESLURM_INVALID_JOB_ID;
	}
	if (IS_JOB_FINISHED(job_ptr) || (job_ptr->details == NULL)) {
		info("wiki: MODIFYJOB jobid %u is finished", jobid);
		return ESLURM_DISABLED;
	}

	if (comment_ptr) {
		info("wiki: change job %u comment %s", jobid, comment_ptr);
		xfree(job_ptr->comment);
		job_ptr->comment = xstrdup(comment_ptr);
		last_job_update = now;
	}

	if (depend_ptr) {
		int rc = update_job_dependency(job_ptr, depend_ptr);
		if (rc == SLURM_SUCCESS) {
			info("wiki: changed job %u dependency to %s",
				jobid, depend_ptr);
		} else {
			error("wiki: changing job %u dependency to %s",
				jobid, depend_ptr);
			return EINVAL;
		}
	}

	if (env_ptr) {
		bool have_equal = false;
		char old_sep[1];
		int begin = 0, i;

		if (job_ptr->batch_flag == 0) {
			error("wiki: attempt to set environment variables "
			      "for non-batch job %u", jobid);
			return ESLURM_DISABLED;
		}
		for (i=0; ; i++) {
			if (env_ptr[i] == '=') {
				if (have_equal) {
					error("wiki: setting job %u invalid "
					      "environment variables: %s",
					      jobid, env_ptr);
					return EINVAL;
				}
				have_equal = true;
				if (env_ptr[i+1] == '\"') {
					for (i+=2; ; i++) {
						if (env_ptr[i] == '\0') {
							error("wiki: setting job %u "
							      "invalid environment "
							      "variables: %s",
						      jobid, env_ptr);
							return EINVAL;
						}
						if (env_ptr[i] == '\"') {
							i++;
							break;
						}
						if (env_ptr[i] == '\\') {
							i++;
						}
					}
				} else if (env_ptr[i+1] == '\'') {
					for (i+=2; ; i++) {
						if (env_ptr[i] == '\0') {
							error("wiki: setting job %u "
							      "invalid environment "
							      "variables: %s",
						      jobid, env_ptr);
							return EINVAL;
						}
						if (env_ptr[i] == '\'') {
							i++;
							break;
						}
						if (env_ptr[i] == '\\') {
							i++;
						}
					}
				}
			}
			/* Treat end-of-string as a final separator so the
			 * scan cannot run past the terminating NUL. */
			if (isspace(env_ptr[i]) || (env_ptr[i] == ',') ||
			    (env_ptr[i] == '\0')) {
				if (!have_equal) {
					error("wiki: setting job %u invalid "
					      "environment variables: %s",
					      jobid, env_ptr);
					return EINVAL;
				}
				old_sep[0] = env_ptr[i];
				env_ptr[i] = '\0';
				xrealloc(job_ptr->details->env_sup,
					 sizeof(char *) *
					 (job_ptr->details->env_cnt+1));
				job_ptr->details->env_sup
						[job_ptr->details->env_cnt++] =
						xstrdup(&env_ptr[begin]);
				info("wiki: for job %u add env: %s",
				     jobid, &env_ptr[begin]);
				env_ptr[i] = old_sep[0];
				if (isspace(old_sep[0]) ||
				    (old_sep[0] == '\0'))
					break;
				begin = i + 1;
				have_equal = false;
			}
		}
	}

	if (new_time_limit) {
		time_t old_time = job_ptr->time_limit;
		job_ptr->time_limit = new_time_limit;
		info("wiki: change job %u time_limit to %u",
			jobid, new_time_limit);
		/* Update end_time based upon change
		 * to preserve suspend time info */
		job_ptr->end_time = job_ptr->end_time +
				((job_ptr->time_limit -
				  old_time) * 60);
		last_job_update = now;
	}

	if (bank_ptr &&
	    (update_job_account("wiki", job_ptr, bank_ptr) != SLURM_SUCCESS)) {
		return EINVAL;
	}

	if (feature_ptr) {
		if (IS_JOB_PENDING(job_ptr) && (job_ptr->details)) {
			info("wiki: change job %u features to %s",
				jobid, feature_ptr);
			xfree(job_ptr->details->features);	/* avoid leak */
			job_ptr->details->features = xstrdup(feature_ptr);
			last_job_update = now;
		} else {
			error("wiki: MODIFYJOB features of non-pending "
				"job %u", jobid);
			return ESLURM_DISABLED;
		}
	}

	if (start_ptr) {
		char *end_ptr;
		uint32_t begin_time = strtol(start_ptr, &end_ptr, 10);
		if (IS_JOB_PENDING(job_ptr) && (job_ptr->details)) {
			info("wiki: change job %u begin time to %u",
				jobid, begin_time);
			job_ptr->details->begin_time = begin_time;
			last_job_update = now;
			update_accounting = true;
		} else {
			error("wiki: MODIFYJOB begin_time of non-pending "
				"job %u", jobid);
			return ESLURM_DISABLED;
		}
	}

	if (name_ptr) {
		if (IS_JOB_PENDING(job_ptr)) {
			info("wiki: change job %u name %s", jobid, name_ptr);
			xfree(job_ptr->name);
			job_ptr->name = xstrdup(name_ptr);
			last_job_update = now;
			update_accounting = true;
		} else {
			error("wiki: MODIFYJOB name of non-pending job %u",
			      jobid);
			return ESLURM_DISABLED;
		}
	}

	if (new_hostlist) {
		int rc = 0, task_cnt;
		hostlist_t hl;
		char *tasklist;

		if (!IS_JOB_PENDING(job_ptr) || !job_ptr->details) {
			/* Job is done, nothing to reset */
			if (new_hostlist[0] == '\0')
				goto host_fini;
			error("wiki: MODIFYJOB hostlist of non-pending "
				"job %u", jobid);
			return ESLURM_DISABLED;
		}

		xfree(job_ptr->details->req_nodes);
		FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap);
		if (new_hostlist[0] == '\0')
			goto host_fini;

		tasklist = moab2slurm_task_list(new_hostlist, &task_cnt);
		if (tasklist == NULL) {
			rc = 1;
			goto host_fini;
		}
		hl = hostlist_create(tasklist);
		if (hl == 0) {
			rc = 1;
			goto host_fini;
		}
		hostlist_uniq(hl);
		hostlist_sort(hl);
		job_ptr->details->req_nodes =
			hostlist_ranged_string_xmalloc(hl);
		hostlist_destroy(hl);
		if (job_ptr->details->req_nodes == NULL) {
			rc = 1;
			goto host_fini;
		}
		if (node_name2bitmap(job_ptr->details->req_nodes, false,
                                     &job_ptr->details->req_node_bitmap)) {
			rc = 1;
			goto host_fini;
		}

host_fini:	if (rc) {
			info("wiki: change job %u invalid hostlist %s",
				jobid, new_hostlist);
			xfree(job_ptr->details->req_nodes);
			return EINVAL;
		} else {
			info("wiki: change job %u hostlist %s",
				jobid, new_hostlist);
			update_accounting = true;
		}
	}

	if (part_name_ptr) {
		struct part_record *part_ptr;
		if (!IS_JOB_PENDING(job_ptr)) {
			error("wiki: MODIFYJOB partition of non-pending "
			      "job %u", jobid);
			return ESLURM_DISABLED;
		}

		part_ptr = find_part_record(part_name_ptr);
		if (part_ptr == NULL) {
			error("wiki: MODIFYJOB has invalid partition %s",
				part_name_ptr);
			return ESLURM_INVALID_PARTITION_NAME;
		}

		info("wiki: change job %u partition %s",
			jobid, part_name_ptr);
		xfree(job_ptr->partition);
		job_ptr->partition = xstrdup(part_name_ptr);
		job_ptr->part_ptr = part_ptr;
		last_job_update = now;
		update_accounting = true;
	}

	if (new_node_cnt) {
		job_desc_msg_t job_desc;
#ifdef HAVE_BG
		uint16_t geometry[SYSTEM_DIMENSIONS] = {(uint16_t) NO_VAL};
		static uint16_t cpus_per_node = 0;
		if (!cpus_per_node) {
			select_g_alter_node_cnt(SELECT_GET_NODE_CPU_CNT,
						&cpus_per_node);
		}
#endif
		if (!IS_JOB_PENDING(job_ptr) || !job_ptr->details) {
			error("wiki: MODIFYJOB node count of non-pending "
			      "job %u", jobid);
			return ESLURM_DISABLED;
		}
		memset(&job_desc, 0, sizeof(job_desc_msg_t));

		job_desc.min_nodes = new_node_cnt;
		job_desc.max_nodes = NO_VAL;
		job_desc.select_jobinfo = select_g_select_jobinfo_alloc();

		select_g_alter_node_cnt(SELECT_SET_NODE_CNT, &job_desc);

		select_g_select_jobinfo_free(job_desc.select_jobinfo);

		job_ptr->details->min_nodes = job_desc.min_nodes;
		if (job_ptr->details->max_nodes &&
		    (job_ptr->details->max_nodes < job_desc.min_nodes))
			job_ptr->details->max_nodes = job_desc.min_nodes;
		info("wiki: change job %u min_nodes to %u",
		     jobid, new_node_cnt);
#ifdef HAVE_BG
		job_ptr->details->min_cpus = job_desc.min_cpus;
		job_ptr->details->max_cpus = job_desc.max_cpus;
		job_ptr->details->pn_min_cpus = job_desc.pn_min_cpus;

		new_node_cnt = job_ptr->details->min_cpus;
		if (cpus_per_node)
			new_node_cnt /= cpus_per_node;

		/* This is only set up so accounting is set up correctly */
		select_g_select_jobinfo_set(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_NODE_CNT,
					    &new_node_cnt);
		/* reset geo since changing this makes any geo
		   potentially invalid */
		select_g_select_jobinfo_set(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_GEOMETRY,
					    geometry);
#endif
		last_job_update = now;
		update_accounting = true;
	}

	if (gres_ptr) {
		char *orig_gres;

		if (!IS_JOB_PENDING(job_ptr)) {
			error("wiki: MODIFYJOB GRES of non-pending job %u",
			      jobid);
			return ESLURM_DISABLED;
		}

		orig_gres = job_ptr->gres;
		job_ptr->gres = NULL;
		if (gres_ptr[0])
			job_ptr->gres = xstrdup(gres_ptr);
		if (gres_plugin_job_state_validate(job_ptr->gres,
						   &job_ptr->gres_list)) {
			error("wiki: MODIFYJOB Invalid GRES=%s", gres_ptr);
			xfree(job_ptr->gres);
			job_ptr->gres = orig_gres;
			return ESLURM_INVALID_GRES;
		}
		xfree(orig_gres);
	}

	if (wckey_ptr) {
		int rc = update_job_wckey("update_job", job_ptr, wckey_ptr);
		if (rc != SLURM_SUCCESS) {
			error("wiki: MODIFYJOB Invalid WCKEY=%s", wckey_ptr);
			return rc;
		}
	}

	if (update_accounting) {
		if (job_ptr->details && job_ptr->details->begin_time) {
			/* Update job record in accounting to reflect
			 * the changes */
			jobacct_storage_g_job_start(acct_db_conn, job_ptr);
		}
	}

	return SLURM_SUCCESS;
}
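
For orientation, the environment parser in this example accepts NAME=value
pairs terminated by a comma or whitespace, with double- or single-quoted
values that may themselves contain separators. A hypothetical input, with
invented variable names, and what would land in env_sup:

/* The parser above would append two entries to job_ptr->details->env_sup:
 *   TZ=UTC
 *   GREETING="hello, world"   (quotes are retained)
 * The trailing space terminates the final pair. */
static const char env_example[] = "TZ=UTC,GREETING=\"hello, world\" ";
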
Example #8
static int	_job_modify(uint32_t jobid, char *bank_ptr,
			char *depend_ptr, char *new_hostlist,
			uint32_t new_node_cnt, char *part_name_ptr,
			uint32_t new_time_limit)
{
	struct job_record *job_ptr;
	bool update_accounting = false;

	job_ptr = find_job_record(jobid);
	if (job_ptr == NULL) {
		error("wiki: MODIFYJOB has invalid jobid %u", jobid);
		return ESLURM_INVALID_JOB_ID;
	}
	if (IS_JOB_FINISHED(job_ptr)) {
		error("wiki: MODIFYJOB jobid %u is finished", jobid);
		return ESLURM_DISABLED;
	}

	if (depend_ptr) {
		int rc = update_job_dependency(job_ptr, depend_ptr);
		if (rc == SLURM_SUCCESS) {
			info("wiki: changed job %u dependency to %s",
				jobid, depend_ptr);
		} else {
			error("wiki: changing job %u dependency to %s",
				jobid, depend_ptr);
			return EINVAL;
		}
	}

	if (new_time_limit) {
		time_t old_time = job_ptr->time_limit;
		job_ptr->time_limit = new_time_limit;
		info("wiki: change job %u time_limit to %u",
			jobid, new_time_limit);
		/* Update end_time based upon change
		 * to preserve suspend time info */
		job_ptr->end_time = job_ptr->end_time +
				((job_ptr->time_limit -
				  old_time) * 60);
		last_job_update = time(NULL);
	}

	if (bank_ptr) {
		if (update_job_account("wiki", job_ptr, bank_ptr)
		   != SLURM_SUCCESS)
			return EINVAL;
		else
			update_accounting = true;
	}

	if (new_hostlist) {
		int rc = 0, task_cnt;
		hostlist_t hl;
		char *tasklist;

		if (!IS_JOB_PENDING(job_ptr) || !job_ptr->details) {
			/* Job is done, nothing to reset */
			if (new_hostlist[0] == '\0')
				goto host_fini;
			error("wiki: MODIFYJOB tasklist of non-pending "
				"job %u", jobid);
			return ESLURM_DISABLED;
		}

		xfree(job_ptr->details->req_nodes);
		FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap);
		if (new_hostlist[0] == '\0')
			goto host_fini;

		tasklist = moab2slurm_task_list(new_hostlist, &task_cnt);
		if (tasklist == NULL) {
			rc = 1;
			goto host_fini;
		}
		hl = hostlist_create(tasklist);
		if (hl == 0) {
			rc = 1;
			goto host_fini;
		}
		hostlist_uniq(hl);
		hostlist_sort(hl);
		job_ptr->details->req_nodes =
			hostlist_ranged_string_xmalloc(hl);
		hostlist_destroy(hl);
		if (job_ptr->details->req_nodes == NULL) {
			rc = 1;
			goto host_fini;
		}
		if (node_name2bitmap(job_ptr->details->req_nodes, false,
                                     &job_ptr->details->req_node_bitmap)) {
			rc = 1;
			goto host_fini;
		}

host_fini:	if (rc) {
			info("wiki: change job %u invalid hostlist %s",
			     jobid, new_hostlist);
			xfree(job_ptr->details->req_nodes);
			return EINVAL;
		} else {
			info("wiki: change job %u hostlist %s",
			     jobid, new_hostlist);
			update_accounting = true;
		}
	}

	if (part_name_ptr) {
		struct part_record *part_ptr;
		if (!IS_JOB_PENDING(job_ptr)) {
			error("wiki: MODIFYJOB partition of non-pending "
			      "job %u", jobid);
			return ESLURM_DISABLED;
		}

		part_ptr = find_part_record(part_name_ptr);
		if (part_ptr == NULL) {
			error("wiki: MODIFYJOB has invalid partition %s",
				part_name_ptr);
			return ESLURM_INVALID_PARTITION_NAME;
		}
		info("wiki: change job %u partition %s",
			jobid, part_name_ptr);
		xfree(job_ptr->partition);
		job_ptr->partition = xstrdup(part_name_ptr);
		job_ptr->part_ptr = part_ptr;
		last_job_update = time(NULL);
		update_accounting = true;
	}
	if (new_node_cnt) {
		if (IS_JOB_PENDING(job_ptr) && job_ptr->details) {
			job_ptr->details->min_nodes = new_node_cnt;
			if (job_ptr->details->max_nodes
			&&  (job_ptr->details->max_nodes < new_node_cnt))
				job_ptr->details->max_nodes = new_node_cnt;
			info("wiki: change job %u min_nodes to %u",
				jobid, new_node_cnt);
			last_job_update = time(NULL);
			update_accounting = true;
		} else {
			error("wiki: MODIFYJOB node count of non-pending "
				"job %u", jobid);
			return ESLURM_DISABLED;
		}
	}

	if (update_accounting) {
		/* Update job record in accounting to reflect changes */
		jobacct_storage_job_start_direct(acct_db_conn, job_ptr);
	}

	return SLURM_SUCCESS;
}
Example #9
static int _check_for_booted_overlapping_blocks(
	List block_list, ListIterator bg_record_itr,
	bg_record_t *bg_record, int overlap_check, List overlapped_list,
	uint16_t query_mode)
{
	bg_record_t *found_record = NULL;
	ListIterator itr = NULL;
	int rc = 0;
	int overlap = 0;
	bool is_test = SELECT_IS_TEST(query_mode);

	/* This test is only for actually picking a block, not for testing */
	if (is_test && bg_conf->layout_mode == LAYOUT_DYNAMIC)
		return rc;

	/* Make sure no other blocks under this block are booted
	   and running jobs
	*/
	itr = list_iterator_create(block_list);
	while ((found_record = (bg_record_t*)list_next(itr)) != NULL) {
		if ((!found_record->bg_block_id)
		    || (bg_record == found_record)) {
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
				info("Don't need to look at myself %s %s",
				     bg_record->bg_block_id,
				     found_record->bg_block_id);
			continue;
		}

		slurm_mutex_lock(&block_state_mutex);
		overlap = blocks_overlap(bg_record, found_record);
		slurm_mutex_unlock(&block_state_mutex);

		if (overlap) {
			overlap = 0;
			/* Make the available time on this block
			 * (bg_record) the max of this found_record's job
			 * or of the one already set if in overlapped_list.
			 * Since we aren't setting job_running we
			 * don't have to remove them, as the
			 * block_list should always be destroyed afterwards.
			 */
			if (is_test && overlapped_list
			    && found_record->job_ptr
			    && bg_record->job_running == NO_JOB_RUNNING) {
				ListIterator itr = list_iterator_create(
					overlapped_list);
				bg_record_t *tmp_rec = NULL;

				if (bg_conf->slurm_debug_flags
				    & DEBUG_FLAG_BG_PICK)
					info("found overlapping block %s "
					     "overlapped %s with job %u",
					     found_record->bg_block_id,
					     bg_record->bg_block_id,
					     found_record->job_ptr->job_id);

				while ((tmp_rec = list_next(itr))) {
					if (tmp_rec == bg_record)
						break;
				}
				list_iterator_destroy(itr);
				if (tmp_rec && tmp_rec->job_ptr->end_time
				    < found_record->job_ptr->end_time)
					tmp_rec->job_ptr =
						found_record->job_ptr;
				else if (!tmp_rec) {
					bg_record->job_ptr =
						found_record->job_ptr;
					list_append(overlapped_list,
						    bg_record);
				}
			}
			/* We already know this block doesn't work
			 * right now, so see if there is another
			 * overlapping block that ends later
			 */
			if (rc)
				continue;
			/* This test is here to check if the block we
			 * chose is not booted or if there is a block
			 * overlapping that we could avoid freeing if
			 * we choose something else
			 */
			if (bg_conf->layout_mode == LAYOUT_OVERLAP
			    && ((overlap_check == 0 && bg_record->state
				 != BG_BLOCK_INITED)
				|| (overlap_check == 1 && found_record->state
				    != BG_BLOCK_FREE))) {

				if (!is_test) {
					rc = 1;
					break;
				}
			}

			if (((bg_conf->layout_mode == LAYOUT_DYNAMIC)
			     || ((!SELECT_IS_CHECK_FULL_SET(query_mode)
				  || SELECT_IS_MODE_RUN_NOW(query_mode))
				 && (bg_conf->layout_mode != LAYOUT_DYNAMIC)))
			    && ((found_record->job_running != NO_JOB_RUNNING)
				|| (found_record->state
				    & BG_BLOCK_ERROR_FLAG))) {
				if ((found_record->job_running
				     == BLOCK_ERROR_STATE)
				    || (found_record->state
					& BG_BLOCK_ERROR_FLAG))
					error("can't use %s, "
					      "overlapping block %s "
					      "is in an error state.",
					      bg_record->bg_block_id,
					      found_record->bg_block_id);
				else if (bg_conf->slurm_debug_flags
					 & DEBUG_FLAG_BG_PICK)
					info("can't use %s, there is "
					     "a job (%d) running on "
					     "an overlapping "
					     "block %s",
					     bg_record->bg_block_id,
					     found_record->job_running,
					     found_record->bg_block_id);

				if (bg_conf->layout_mode == LAYOUT_DYNAMIC) {
					List tmp_list = list_create(NULL);
					/* this will remove and
					 * destroy the memory for
					 * bg_record
					 */
					list_remove(bg_record_itr);
					slurm_mutex_lock(&block_state_mutex);

					if (bg_record->original) {
						if (bg_conf->slurm_debug_flags
						    & DEBUG_FLAG_BG_PICK)
							info("This was a "
							     "copy %s",
							     bg_record->
							     bg_block_id);
						found_record =
							bg_record->original;
					} else {
						if (bg_conf->slurm_debug_flags
						    & DEBUG_FLAG_BG_PICK)
							info("looking for "
							     "original");
						found_record =
							find_org_in_bg_list(
								bg_lists->main,
								bg_record);
					}

					if (bg_conf->slurm_debug_flags
					    & DEBUG_FLAG_BG_PICK)
						info("Removing unusable block "
						     "%s from the system.",
						     bg_record->bg_block_id);

					if (!found_record) {
						if (bg_conf->slurm_debug_flags
						    & DEBUG_FLAG_BG_PICK)
							info("This record %s "
							     "wasn't found in "
							     "the "
							     "bg_lists->main, "
							     "no big deal, it "
							     "probably wasn't "
							     "added",
							     bg_record->
							     bg_block_id);
						found_record = bg_record;
					} else
						destroy_bg_record(bg_record);

					list_push(tmp_list, found_record);
					slurm_mutex_unlock(&block_state_mutex);

					/* We need to make sure if a
					   job is running here to not
					   call the regular method since
					   we are inside the job write
					   lock already.
					*/
					if (found_record->job_ptr
					    && !IS_JOB_FINISHED(
						    found_record->job_ptr)) {
						info("Somehow block %s "
						     "is being freed, but "
						     "appears to already have "
						     "a job %u(%u) running "
						     "on it.",
						     found_record->bg_block_id,
						     found_record->
						     job_ptr->job_id,
						     found_record->job_running);
						/* Capture the return code
						 * so the error below reports
						 * the real cause */
						if ((rc = job_requeue(0,
								found_record->
								job_ptr->job_id,
								-1,
								(uint16_t)
								NO_VAL,
								false))) {
							error("Couldn't "
							      "requeue job %u, "
							      "failing it: %s",
							      found_record->
							      job_ptr->job_id,
							      slurm_strerror(
								      rc));
							job_fail(found_record->
								 job_ptr->
								 job_id);
						}
					}

					free_block_list(NO_VAL, tmp_list, 0, 0);
					list_destroy(tmp_list);
				}
				rc = 1;

				if (!is_test)
					break;
			}
		}
	}
	list_iterator_destroy(itr);

	return rc;
}
Example #10
static void
_cancel_jobs_by_state(uint32_t job_state, int filter_cnt, int *rc)
{
	int i, err;
	job_cancel_info_t *cancel_info;
	job_info_t *job_ptr = job_buffer_ptr->job_array;
	pthread_t dummy;

	/* Spawn a thread to cancel each job or job step marked for
	 * cancellation */
	if (opt.job_cnt) {
		_cancel_jobid_by_state(job_state, filter_cnt, rc);
		return;
	}

	for (i = 0; i < job_buffer_ptr->record_count; i++, job_ptr++) {
		if (IS_JOB_FINISHED(job_ptr))
			job_ptr->job_id = 0;
		if (job_ptr->job_id == 0)
			continue;

		if ((job_state < JOB_END) &&
		    (job_ptr->job_state != job_state))
			continue;

		if (opt.interactive &&
		    (_confirmation(job_ptr, SLURM_BATCH_SCRIPT) == 0)) {
			job_ptr->job_id = 0;
			continue;
		}

		cancel_info = (job_cancel_info_t *)
			xmalloc(sizeof(job_cancel_info_t));
		cancel_info->job_id_str = _build_jobid_str(job_ptr);
		cancel_info->rc      = rc;
		cancel_info->sig     = opt.signal;
		cancel_info->num_active_threads = &num_active_threads;
		cancel_info->num_active_threads_lock =
			&num_active_threads_lock;
		cancel_info->num_active_threads_cond =
			&num_active_threads_cond;

		slurm_mutex_lock(&num_active_threads_lock);
		num_active_threads++;
		while (num_active_threads > MAX_THREADS) {
			slurm_cond_wait(&num_active_threads_cond,
					&num_active_threads_lock);
		}
		slurm_mutex_unlock(&num_active_threads_lock);

		err = pthread_create(&dummy, &attr, _cancel_job_id, cancel_info);
		if (err)   /* Run in-line if thread create fails */
			_cancel_job_id(cancel_info);
		job_ptr->job_id = 0;

		if (opt.interactive) {
			/* Print any error message for first job before
			 * starting confirmation of next job */
			slurm_mutex_lock(&num_active_threads_lock);
			while (num_active_threads > 0) {
				slurm_cond_wait(&num_active_threads_cond,
						&num_active_threads_lock);
			}
			slurm_mutex_unlock(&num_active_threads_lock);
		}
	}
}
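
Neither this example nor Example #17 shows the job_cancel_info_t definition.
A plausible shape, reconstructed purely from the fields these listings
assign (not the authoritative scancel definition):

/* Reconstructed from usage; field order and any extra members are guesses. */
typedef struct job_cancel_info {
	char		*job_id_str;	/* set for whole-job cancels */
	uint32_t	job_id;		/* set for step cancels */
	uint32_t	step_id;
	uint16_t	sig;		/* signal to deliver */
	int		*rc;		/* shared return code */
	int		*num_active_threads;
	pthread_mutex_t	*num_active_threads_lock;
	pthread_cond_t	*num_active_threads_cond;
} job_cancel_info_t;
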
Example #11
/* _filter_job_records - filter job information per user specification
 * RET count of jobs filtered out OTHER than for job ID value */
static int _filter_job_records (void)
{
	int filter_cnt = 0;
	int i;
	job_info_t *job_ptr = NULL;
	uint32_t job_base_state;

	job_ptr = job_buffer_ptr->job_array;
	for (i = 0; i < job_buffer_ptr->record_count; i++, job_ptr++) {
		if (IS_JOB_FINISHED(job_ptr))
			job_ptr->job_id = 0;
		if (job_ptr->job_id == 0)
			continue;

		job_base_state = job_ptr->job_state & JOB_STATE_BASE;
		if ((job_base_state != JOB_PENDING) &&
		    (job_base_state != JOB_RUNNING) &&
		    (job_base_state != JOB_SUSPENDED)) {
			job_ptr->job_id = 0;
			filter_cnt++;
			continue;
		}

		if (opt.account != NULL &&
		    xstrcmp(job_ptr->account, opt.account)) {
			job_ptr->job_id = 0;
			filter_cnt++;
			continue;
		}

		if (opt.job_name != NULL &&
		    xstrcmp(job_ptr->name, opt.job_name)) {
			job_ptr->job_id = 0;
			filter_cnt++;
			continue;
		}

		if ((opt.partition != NULL) &&
		    xstrcmp(job_ptr->partition, opt.partition)) {
			job_ptr->job_id = 0;
			filter_cnt++;
			continue;
		}

		if ((opt.qos != NULL) &&
		    xstrcmp(job_ptr->qos, opt.qos)) {
			job_ptr->job_id = 0;
			filter_cnt++;
			continue;
		}

		if ((opt.reservation != NULL) &&
		    xstrcmp(job_ptr->resv_name, opt.reservation)) {
			job_ptr->job_id = 0;
			filter_cnt++;
			continue;
		}

		if ((opt.state != JOB_END) &&
		    (job_ptr->job_state != opt.state)) {
			job_ptr->job_id = 0;
			filter_cnt++;
			continue;
		}

		if ((opt.user_name != NULL) &&
		    (job_ptr->user_id != opt.user_id)) {
			job_ptr->job_id = 0;
			filter_cnt++;
			continue;
		}

		if (opt.nodelist != NULL) {
			/* If nodelist contains a '/', treat it as a file name */
			if (strchr(opt.nodelist, '/') != NULL) {
				char *reallist;
				reallist = slurm_read_hostfile(opt.nodelist,
							       NO_VAL);
				if (reallist) {
					xfree(opt.nodelist);
					opt.nodelist = reallist;
				}
			}

			hostset_t hs = hostset_create(job_ptr->nodes);
			if (!hostset_intersects(hs, opt.nodelist)) {
				job_ptr->job_id = 0;
				filter_cnt++;
				hostset_destroy(hs);
				continue;
			} else {
				hostset_destroy(hs);
			}
		}

		if (opt.wckey != NULL) {
			char *job_key = job_ptr->wckey;

			/*
			 * A wckey that begins with '*' indicates that the wckey
			 * was applied by default.  When the --wckey option does
			 * not begin with a '*', act on all wckeys with the same
			 * name, default or not.
			 */
			if ((opt.wckey[0] != '*') && (job_key[0] == '*'))
				job_key++;

			if (xstrcmp(job_key, opt.wckey) != 0) {
				job_ptr->job_id = 0;
				filter_cnt++;
				continue;
			}
		}
	}

	return filter_cnt;
}
Example #12
static int _verify_job_ids(void)
{
	job_info_t *job_ptr;
	int i, j, rc = 0;

	if (opt.job_cnt == 0)
		return rc;

	opt.job_found = xmalloc(sizeof(bool) * opt.job_cnt);
	opt.job_pend  = xmalloc(sizeof(bool) * opt.job_cnt);
	job_ptr = job_buffer_ptr->job_array;
	for (i = 0; i < job_buffer_ptr->record_count; i++, job_ptr++) {
		/* NOTE: We re-use the job's "assoc_id" value as a flag to
		 * record if the job is referenced in the job list supplied
		 * by the user. */
		job_ptr->assoc_id = 0;
		if (IS_JOB_FINISHED(job_ptr))
			job_ptr->job_id = 0;
		if (job_ptr->job_id == 0)
			continue;

		for (j = 0; j < opt.job_cnt; j++) {
			if (opt.array_id[j] == NO_VAL) {
				if ((opt.job_id[j] == job_ptr->job_id) ||
				    ((opt.job_id[j] == job_ptr->array_job_id) &&
				     (opt.step_id[j] == SLURM_BATCH_SCRIPT))) {
					opt.job_found[j] = true;
				}
			} else if (opt.array_id[j] == INFINITE) {
				if (opt.job_id[j] == job_ptr->array_job_id) {
					opt.job_found[j] = true;
				}
			} else if (opt.job_id[j] != job_ptr->array_job_id) {
				continue;
			} else if (_is_task_in_job(job_ptr, opt.array_id[j])) {
				opt.job_found[j] = true;
			}
			if (opt.job_found[j]) {
				if (IS_JOB_PENDING(job_ptr))
					opt.job_pend[j] = true;
				job_ptr->assoc_id = 1;
			}
		}
		if (job_ptr->assoc_id == 0)
			job_ptr->job_id = 0;
	}

	for (j = 0; j < opt.job_cnt; j++) {
		char *job_id_str = NULL;
		if (!opt.job_found[j])
			rc = 1;
		else
			continue;

		if (opt.verbose < 0) {
			;
		} else if (opt.array_id[j] == NO_VAL) {
			xstrfmtcat(job_id_str, "%u", opt.job_id[j]);
		} else if (opt.array_id[j] == INFINITE) {
			xstrfmtcat(job_id_str, "%u_*", opt.job_id[j]);
		} else {
			xstrfmtcat(job_id_str, "%u_%u", opt.job_id[j],
				   opt.array_id[j]);
		}

		if (opt.verbose < 0) {
			;
		} else if (opt.step_id[j] == SLURM_BATCH_SCRIPT) {
			error("Kill job error on job id %s: %s",
			      job_id_str,
			      slurm_strerror(ESLURM_INVALID_JOB_ID));
		} else {
			error("Kill job error on job step id %s.%u: %s",
			      job_id_str, opt.step_id[j],
			      slurm_strerror(ESLURM_INVALID_JOB_ID));
		}
		xfree(job_id_str);

		/* Avoid this job in the cancel_job logic */
		opt.job_id[j] = 0;
	}

	return rc;
}
Example #13
/* block_state_mutex should be unlocked before calling this */
extern int free_block_list(uint32_t job_id, List track_list,
			   bool destroy, bool wait)
{
	bg_record_t *bg_record = NULL;
	int retries;
	ListIterator itr = NULL;
	bg_free_block_list_t *bg_free_list;
	pthread_attr_t attr_agent;
	pthread_t thread_agent;

	if (!track_list || !list_count(track_list))
		return SLURM_SUCCESS;

	bg_free_list = xmalloc(sizeof(bg_free_block_list_t));
	bg_free_list->track_list = list_create(NULL);
	bg_free_list->destroy = destroy;
	bg_free_list->job_id = job_id;

	slurm_mutex_lock(&block_state_mutex);
	list_transfer(bg_free_list->track_list, track_list);
	itr = list_iterator_create(bg_free_list->track_list);
	while ((bg_record = list_next(itr))) {
		if (bg_record->magic != BLOCK_MAGIC) {
			error("block was already destroyed %p", bg_record);
			continue;
		}
		bg_record->free_cnt++;

		if (bg_record->job_ptr
		    && !IS_JOB_FINISHED(bg_record->job_ptr)) {
			info("We are freeing a block (%s) that has job %u(%u).",
			     bg_record->bg_block_id,
			     bg_record->job_ptr->job_id,
			     bg_record->job_running);
			/* This is not thread safe if called from
			   bg_job_place.c anywhere from within
			   submit_job() or at startup. */
			slurm_mutex_unlock(&block_state_mutex);
			bg_requeue_job(bg_record->job_ptr->job_id, 0);
			slurm_mutex_lock(&block_state_mutex);
		}
		if (remove_from_bg_list(bg_lists->job_running, bg_record)
		    == SLURM_SUCCESS)
			num_unused_cpus += bg_record->cpu_cnt;
	}
	list_iterator_destroy(itr);
	slurm_mutex_unlock(&block_state_mutex);

	if (wait) {
		/* _track_freeing_blocks() waits until the list is done
		   and frees the memory of bg_free_list.
		*/
		_track_freeing_blocks(bg_free_list);
		return SLURM_SUCCESS;
	}

	/* _track_freeing_blocks handles cleanup */
	slurm_attr_init(&attr_agent);
	if (pthread_attr_setdetachstate(&attr_agent, PTHREAD_CREATE_DETACHED))
		error("pthread_attr_setdetachstate error %m");
	retries = 0;
	while (pthread_create(&thread_agent, &attr_agent,
			      _track_freeing_blocks,
			      bg_free_list)) {
		error("pthread_create error %m");
		if (++retries > MAX_PTHREAD_RETRIES)
			fatal("Can't create "
			      "pthread");
		/* sleep and retry */
		usleep(1000);
	}
	slurm_attr_destroy(&attr_agent);
	return SLURM_SUCCESS;
}
Example #14
static uint32_t	_get_job_end_time(struct job_record *job_ptr)
{
	if (IS_JOB_FINISHED(job_ptr))
		return (uint32_t) job_ptr->end_time;
	return (uint32_t) 0;
}
Example #15
static char *	_dump_job(struct job_record *job_ptr, time_t update_time)
{
	char *buf = NULL, tmp[16384];
	char *gname, *pname, *quote, *uname;
	uint32_t end_time, suspend_time;
	uint64_t min_mem;
	int i, rej_sent = 0;

	if (!job_ptr)
		return NULL;

	snprintf(tmp, sizeof(tmp), "%u:STATE=%s;",
		job_ptr->job_id, _get_job_state(job_ptr));
	xstrcat(buf, tmp);

	if (update_time > last_job_update)
		return buf;

	if (IS_JOB_PENDING(job_ptr)) {
		char *req_features = _get_job_features(job_ptr);
		if (req_features) {
			snprintf(tmp, sizeof(tmp),
				"RFEATURES=%s;", req_features);
			xstrcat(buf, tmp);
			xfree(req_features);
		}
		if ((job_ptr->details) &&
		    (job_ptr->details->req_nodes) &&
		    (job_ptr->details->req_nodes[0])) {
			char *hosts = bitmap2wiki_node_name(
				job_ptr->details->req_node_bitmap);
			snprintf(tmp, sizeof(tmp),
				"HOSTLIST=%s;", hosts);
			xstrcat(buf, tmp);
			xfree(hosts);
		}
		if ((job_ptr->details) &&
		    (job_ptr->details->exc_nodes) &&
		    (job_ptr->details->exc_nodes[0])) {
			char *hosts = bitmap2wiki_node_name(
				job_ptr->details->exc_node_bitmap);
			snprintf(tmp, sizeof(tmp),
				"EXCLUDE_HOSTLIST=%s;", hosts);
			xstrcat(buf, tmp);
			xfree(hosts);
		}
		if ((job_ptr->details) && (job_ptr->details->begin_time)) {
			snprintf(tmp, sizeof(tmp),
				"STARTDATE=%u;", (uint32_t)
				job_ptr->details->begin_time);
			xstrcat(buf, tmp);
		}
		if (job_ptr->details) {
			snprintf(tmp, sizeof(tmp),
				 "MAXNODES=%u;", _get_job_max_nodes(job_ptr));
			xstrcat(buf, tmp);
		}
	} else if (!IS_JOB_FINISHED(job_ptr)) {
		char *hosts;
		hosts = slurm_job2moab_task_list(job_ptr);
		xstrcat(buf, "TASKLIST=");
		xstrcat(buf, hosts);
		xstrcat(buf, ";");
		xfree(hosts);
	}

	if (reject_msg_cnt) {
		/* Possible job requeue/reject message */
		for (i=0; i<REJECT_MSG_MAX; i++) {
			if (reject_msgs[i].job_id != job_ptr->job_id)
				continue;
			snprintf(tmp, sizeof(tmp),
				"REJMESSAGE=\"%s\";",
				reject_msgs[i].reason);
			xstrcat(buf, tmp);
			reject_msgs[i].job_id = 0;
			reject_msg_cnt--;
			rej_sent = 1;
			break;
		}
	}
	if ((rej_sent == 0) && IS_JOB_FAILED(job_ptr)) {
		snprintf(tmp, sizeof(tmp),
			"REJMESSAGE=\"%s\";",
			job_reason_string(job_ptr->state_reason));
		xstrcat(buf, tmp);
	}

	if (!IS_JOB_FINISHED(job_ptr) && job_ptr->details &&
	    job_ptr->details->work_dir) {
		if ((quote = strchr(job_ptr->details->work_dir, (int) '\"'))) {
			/* Moab does not like strings containing a quote */
			*quote = '\0';
			snprintf(tmp, sizeof(tmp), "IWD=\"%s\";",
				 job_ptr->details->work_dir);
			*quote = '\"';
		} else {
			snprintf(tmp, sizeof(tmp), "IWD=\"%s\";",
				 job_ptr->details->work_dir);
		}
		xstrcat(buf, tmp);
	}

	if (job_ptr->batch_flag == 0)
		xstrcat(buf, "FLAGS=INTERACTIVE;");

	if (job_ptr->gres) {
		snprintf(tmp, sizeof(tmp),"GRES=\"%s\";", job_ptr->gres);
		xstrcat(buf, tmp);
	}

	if (job_ptr->resp_host) {
		snprintf(tmp, sizeof(tmp),"SUBMITHOST=\"%s\";", job_ptr->resp_host);
		xstrcat(buf, tmp);
	}

	if (job_ptr->wckey) {
		if ((quote = strchr(job_ptr->wckey, (int) '\"'))) {
			/* Moab does not like strings containing a quote */
			*quote = '\0';
			snprintf(tmp, sizeof(tmp),
				"WCKEY=\"%s\";", job_ptr->wckey);
			*quote = '\"';
		} else {
			snprintf(tmp, sizeof(tmp),
				"WCKEY=\"%s\";", job_ptr->wckey);
		}
		xstrcat(buf, tmp);
	}

	snprintf(tmp, sizeof(tmp),
		"UPDATETIME=%u;WCLIMIT=%u;TASKS=%u;",
		(uint32_t) job_ptr->time_last_active,
		(uint32_t) _get_job_time_limit(job_ptr),
		_get_job_tasks(job_ptr));
	xstrcat(buf, tmp);

	if (!IS_JOB_FINISHED(job_ptr)) {
		snprintf(tmp, sizeof(tmp),
			"NODES=%u;",
			_get_pn_min_nodes(job_ptr));
		xstrcat(buf, tmp);
	}

	snprintf(tmp, sizeof(tmp),
		"DPROCS=%u;",
		_get_job_cpus_per_task(job_ptr));
	xstrcat(buf, tmp);

	if (job_ptr->part_ptr)
		pname = job_ptr->part_ptr->name;
	else
		pname = "UNKNOWN";	/* should never see this */
	snprintf(tmp, sizeof(tmp),
		"QUEUETIME=%u;STARTTIME=%u;RCLASS=\"%s\";",
		_get_job_submit_time(job_ptr),
		(uint32_t) job_ptr->start_time, pname);
	xstrcat(buf, tmp);

	min_mem = _get_pn_min_mem(job_ptr);
	if (min_mem & MEM_PER_CPU) {
		min_mem &= ~MEM_PER_CPU;
	}
	snprintf(tmp, sizeof(tmp),
		"RMEM=%"PRIu64";RDISK=%u;",
		 min_mem,
		_get_pn_min_disk(job_ptr));
	xstrcat(buf, tmp);

	_get_job_comment(job_ptr, tmp, sizeof(tmp));
	xstrcat(buf, tmp);

	end_time = _get_job_end_time(job_ptr);
	if (end_time) {
		snprintf(tmp, sizeof(tmp),
			"COMPLETETIME=%u;", end_time);
		xstrcat(buf, tmp);
	}

	suspend_time = _get_job_suspend_time(job_ptr);
	if (suspend_time) {
		snprintf(tmp, sizeof(tmp),
			"SUSPENDTIME=%u;", suspend_time);
		xstrcat(buf, tmp);
	}

	if (job_ptr->account) {
		snprintf(tmp, sizeof(tmp),
			"ACCOUNT=\"%s\";", job_ptr->account);
		xstrcat(buf, tmp);
	}

	if (job_ptr->name && (quote = strchr(job_ptr->name, (int) '\"'))) {
		/* Moab does not like job names containing a quote */
		*quote = '\0';
		snprintf(tmp, sizeof(tmp),
			"NAME=\"%s\";", job_ptr->name);
		*quote = '\"';
	} else {
		snprintf(tmp, sizeof(tmp),
			"NAME=\"%s\";", job_ptr->name);
	}
	xstrcat(buf, tmp);

	if (job_ptr->details &&
	    (update_time > job_ptr->details->submit_time))
		return buf;

	uname = uid_to_string((uid_t) job_ptr->user_id);
	gname = gid_to_string(job_ptr->group_id);
	snprintf(tmp, sizeof(tmp),
		"UNAME=%s;GNAME=%s;", uname, gname);
	xstrcat(buf, tmp);
	xfree(uname);
	xfree(gname);

	return buf;
}
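
Pieced together from the snprintf formats above, a record for a running job
would look roughly like the line below (all field values are invented for
illustration, and optional fields such as GRES, WCKEY, and ACCOUNT are
omitted):

123:STATE=Running;TASKLIST=tux01;IWD="/home/alice";UPDATETIME=1400000000;WCLIMIT=3600;TASKS=4;NODES=1;DPROCS=1;QUEUETIME=1399999000;STARTTIME=1399999900;RCLASS="debug";RMEM=1024;RDISK=1;NAME="myjob";UNAME=alice;GNAME=users;
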
Example #16
/* block_state_mutex should be unlocked before calling this */
extern void free_block_list(uint32_t job_id, List track_list,
			    bool destroy, bool wait)
{
	bg_record_t *bg_record = NULL;
	int retries;
	ListIterator itr = NULL;
	bg_free_block_list_t *bg_free_list;
	pthread_attr_t attr_agent;
	pthread_t thread_agent;
	List kill_job_list = NULL;
	kill_job_struct_t *freeit;

	if (!track_list || !list_count(track_list))
		return;

	bg_free_list = xmalloc(sizeof(bg_free_block_list_t));
	bg_free_list->track_list = list_create(NULL);
	bg_free_list->destroy = destroy;
	bg_free_list->job_id = job_id;

	slurm_mutex_lock(&block_state_mutex);
	list_transfer(bg_free_list->track_list, track_list);
	itr = list_iterator_create(bg_free_list->track_list);
	while ((bg_record = list_next(itr))) {
		if (bg_record->magic != BLOCK_MAGIC) {
			error("block was already destroyed %p", bg_record);
			continue;
		}
		bg_record->free_cnt++;

		/* just so we don't overwrite a different thread that
		   wants this block destroyed */
		if (destroy && !bg_record->destroy)
			bg_record->destroy = destroy;

		if (destroy && (bg_record->state & BG_BLOCK_ERROR_FLAG))
			resume_block(bg_record);

		/* This means we want this block free so we can
		   run this job on it, so it is ok for the job to
		   remain here.  Only checks for jobs should go
		   below this point.
		*/
		if (bg_record->modifying) {
			debug("free_block_list: Just FYI, we are "
			      "freeing a block (%s) that "
			      "has at least one pending job.",
			      bg_record->bg_block_id);
			continue;
		}

		if (bg_record->job_ptr
		    && !IS_JOB_FINISHED(bg_record->job_ptr)) {
			info("We are freeing a block (%s) that "
			     "has job %u(%u).",
			     bg_record->bg_block_id,
			     bg_record->job_ptr->job_id,
			     bg_record->job_running);
			if (!kill_job_list)
				kill_job_list =
					bg_status_create_kill_job_list();
			freeit = xmalloc(sizeof(kill_job_struct_t));
			freeit->jobid = bg_record->job_ptr->job_id;
			list_push(kill_job_list, freeit);
		} else if (bg_record->job_list
			   && list_count(bg_record->job_list)) {
			struct job_record *job_ptr;
			ListIterator itr;

			if (!kill_job_list)
				kill_job_list =
					bg_status_create_kill_job_list();
			info("We are freeing a block (%s) that has at "
			     "least 1 job.",
			     bg_record->bg_block_id);
			itr = list_iterator_create(bg_record->job_list);
			while ((job_ptr = list_next(itr))) {
				if ((job_ptr->magic != JOB_MAGIC)
				    || IS_JOB_FINISHED(job_ptr))
					continue;
				freeit = xmalloc(sizeof(kill_job_struct_t));
				freeit->jobid = job_ptr->job_id;
				list_push(kill_job_list, freeit);
			}
			list_iterator_destroy(itr);
		}
	}
	list_iterator_destroy(itr);
	slurm_mutex_unlock(&block_state_mutex);

	if (kill_job_list) {
		bg_status_process_kill_job_list(kill_job_list, JOB_FAILED, 0);
		FREE_NULL_LIST(kill_job_list);
	}

	if (wait) {
		/* _track_freeing_blocks() waits until the list is done
		   and frees the memory of bg_free_list.
		*/
		_track_freeing_blocks(bg_free_list);
		return;
	}

	/* _track_freeing_blocks handles cleanup */
	slurm_attr_init(&attr_agent);
	if (pthread_attr_setdetachstate(&attr_agent, PTHREAD_CREATE_DETACHED))
		error("pthread_attr_setdetachstate error %m");
	retries = 0;
	while (pthread_create(&thread_agent, &attr_agent,
			      _track_freeing_blocks,
			      bg_free_list)) {
		error("pthread_create error %m");
		if (++retries > MAX_PTHREAD_RETRIES)
			fatal("Can't create pthread");
		/* sleep and retry */
		usleep(1000);
	}
	slurm_attr_destroy(&attr_agent);
	return;
}
Example #17
static void _cancel_jobid_by_state(uint32_t job_state, int filter_cnt, int *rc)
{
	job_cancel_info_t *cancel_info;
	job_info_t *job_ptr;
	pthread_t dummy;
	int err, i, j;

	if (opt.job_cnt == 0)
		return;

	for (j = 0; j < opt.job_cnt; j++) {
		if (opt.job_id[j] == 0)
			continue;
		if ((job_state == JOB_PENDING) && !opt.job_pend[j])
			continue;

		job_ptr = job_buffer_ptr->job_array;
		for (i = 0; i < job_buffer_ptr->record_count; i++, job_ptr++) {
			if (IS_JOB_FINISHED(job_ptr))
				job_ptr->job_id = 0;
			if (job_ptr->job_id == 0)
				continue;
			if ((opt.step_id[j] != SLURM_BATCH_SCRIPT) &&
			    IS_JOB_PENDING(job_ptr)) {
				/* User specified #.# for step, but the job ID
				 * may be job array leader with part of job
				 * array running with other tasks pending */
				continue;
			}

			opt.job_found[j] = false;
			if (opt.array_id[j] == NO_VAL) {
				if ((opt.job_id[j] == job_ptr->job_id) ||
				    ((opt.job_id[j] == job_ptr->array_job_id) &&
				     (opt.step_id[j] == SLURM_BATCH_SCRIPT))) {
					opt.job_found[j] = true;
				}
			} else if (opt.array_id[j] == INFINITE) {
				if (opt.job_id[j] == job_ptr->array_job_id) {
					opt.job_found[j] = true;
				}
			} else if (opt.job_id[j] != job_ptr->array_job_id) {
				continue;
			} else if (_is_task_in_job(job_ptr, opt.array_id[j])) {
				opt.job_found[j] = true;
			}

			if (!opt.job_found[j])
				continue;

			if (opt.interactive &&
			    (_confirmation(job_ptr, opt.step_id[j]) == 0)) {
				job_ptr->job_id = 0;	/* Don't check again */
				continue;
			}

			slurm_mutex_lock(&num_active_threads_lock);
			num_active_threads++;
			while (num_active_threads > MAX_THREADS) {
				slurm_cond_wait(&num_active_threads_cond,
						&num_active_threads_lock);
			}
			slurm_mutex_unlock(&num_active_threads_lock);

			cancel_info = (job_cancel_info_t *)
				      xmalloc(sizeof(job_cancel_info_t));
			cancel_info->rc      = rc;
			cancel_info->sig     = opt.signal;
			cancel_info->num_active_threads = &num_active_threads;
			cancel_info->num_active_threads_lock =
					&num_active_threads_lock;
			cancel_info->num_active_threads_cond =
					&num_active_threads_cond;
			if (opt.step_id[j] == SLURM_BATCH_SCRIPT) {
				cancel_info->job_id_str =
					_build_jobid_str(job_ptr);
				err = pthread_create(&dummy, &attr,
						     _cancel_job_id,
						     cancel_info);
				if (err)  /* Run in-line as needed */
					_cancel_job_id(cancel_info);
				job_ptr->job_id = 0;
			} else {
				cancel_info->job_id = job_ptr->job_id;
				cancel_info->step_id = opt.step_id[j];
				err = pthread_create(&dummy, &attr,
						     _cancel_step_id,
						     cancel_info);
				if (err)  /* Run in-line as needed */
					_cancel_step_id(cancel_info);
			}

			if (opt.interactive) {
				/* Print any error message for first job before
				 * starting confirmation of next job */
				slurm_mutex_lock(&num_active_threads_lock);
				while (num_active_threads > 0) {
					slurm_cond_wait(&num_active_threads_cond,
							&num_active_threads_lock);
				}
				slurm_mutex_unlock(&num_active_threads_lock);
			}
		}
	}
}