예제 #1
0
extern int up_nodecard(char *mp_name, bitstr_t *ionode_bitmap)
{
	ListIterator itr = NULL;
	bg_record_t *bg_record = NULL;
	struct node_record *node_ptr = NULL;
	int mp_bit = 0;
	int ret = 0;

	xassert(mp_name);
	xassert(ionode_bitmap);

	node_ptr = find_node_record(mp_name);
	if (!node_ptr) {
		error ("down_sub_node_blocks: invalid node specified %s",
		       mp_name);
		return EINVAL;
	}
	mp_bit = (node_ptr - node_record_table_ptr);

	slurm_mutex_lock(&block_state_mutex);
	itr = list_iterator_create(bg_lists->main);
	while ((bg_record = list_next(itr))) {
		if (bg_record->job_running != BLOCK_ERROR_STATE)
			continue;
		if (!bit_test(bg_record->mp_bitmap, mp_bit))
			continue;

		if (!bit_overlap(bg_record->ionode_bitmap, ionode_bitmap)) {
			continue;
		}
		resume_block(bg_record);
	}
	list_iterator_destroy(itr);
	slurm_mutex_unlock(&block_state_mutex);

	/* FIX ME: This needs to call the opposite of
	   slurm_drain_nodes which does not yet exist.
	*/
	if ((ret = node_already_down(mp_name))) {
		/* means it was drained */
		if (ret == 2) {
			/* debug("node %s put back into service after " */
/* 			      "being in an error state", */
/* 			      mp_name); */
		}
	}

	return SLURM_SUCCESS;
}
예제 #2
0
파일: bg_core.c 프로젝트: fafik23/slurm
/* block_state_mutex should be unlocked before calling this */
extern void free_block_list(uint32_t job_id, List track_list,
			    bool destroy, bool wait)
{
	bg_record_t *bg_record = NULL;
	int retries;
	ListIterator itr = NULL;
	bg_free_block_list_t *bg_free_list;
	pthread_attr_t attr_agent;
	pthread_t thread_agent;
	List kill_job_list = NULL;
	kill_job_struct_t *freeit;

	if (!track_list || !list_count(track_list))
		return;

	bg_free_list = xmalloc(sizeof(bg_free_block_list_t));
	bg_free_list->track_list = list_create(NULL);
	bg_free_list->destroy = destroy;
	bg_free_list->job_id = job_id;

	slurm_mutex_lock(&block_state_mutex);
	list_transfer(bg_free_list->track_list, track_list);
	itr = list_iterator_create(bg_free_list->track_list);
	while ((bg_record = list_next(itr))) {
		if (bg_record->magic != BLOCK_MAGIC) {
			error("block was already destroyed %p", bg_record);
			continue;
		}
		bg_record->free_cnt++;

		/* just so we don't over write a different thread that
		   wants this block destroyed */
		if (destroy && !bg_record->destroy)
			bg_record->destroy = destroy;

		if (destroy && (bg_record->state & BG_BLOCK_ERROR_FLAG))
			resume_block(bg_record);

		/* This means we are wanting this block free so we can
		   run this job on it, so it is ok to have the job
		   remain here.  Only checking for jobs should go
		   below this.
		*/
		if (bg_record->modifying) {
			debug("free_block_list: Just FYI, we are "
			      "freeing a block (%s) that "
			      "has at least one pending job.",
			      bg_record->bg_block_id);
			continue;
		}

		if (bg_record->job_ptr
		    && !IS_JOB_FINISHED(bg_record->job_ptr)) {
			info("We are freeing a block (%s) that "
			     "has job %u(%u).",
			     bg_record->bg_block_id,
			     bg_record->job_ptr->job_id,
			     bg_record->job_running);
			if (!kill_job_list)
				kill_job_list =
					bg_status_create_kill_job_list();
			freeit = xmalloc(sizeof(kill_job_struct_t));
			freeit->jobid = bg_record->job_ptr->job_id;
			list_push(kill_job_list, freeit);
		} else if (bg_record->job_list
			   && list_count(bg_record->job_list)) {
			struct job_record *job_ptr;
			ListIterator itr;

			if (!kill_job_list)
				kill_job_list =
					bg_status_create_kill_job_list();
			info("We are freeing a block (%s) that has at "
			     "least 1 job.",
			     bg_record->bg_block_id);
			itr = list_iterator_create(bg_record->job_list);
			while ((job_ptr = list_next(itr))) {
				if ((job_ptr->magic != JOB_MAGIC)
				    || IS_JOB_FINISHED(job_ptr))
					continue;
				freeit = xmalloc(sizeof(kill_job_struct_t));
				freeit->jobid = job_ptr->job_id;
				list_push(kill_job_list, freeit);
			}
			list_iterator_destroy(itr);
		}
	}
	list_iterator_destroy(itr);
	slurm_mutex_unlock(&block_state_mutex);

	if (kill_job_list) {
		bg_status_process_kill_job_list(kill_job_list, JOB_FAILED, 0);
		FREE_NULL_LIST(kill_job_list);
	}

	if (wait) {
		/* Track_freeing_blocks waits until the list is done
		   and frees the memory of bg_free_list.
		*/
		_track_freeing_blocks(bg_free_list);
		return;
	}

	/* _track_freeing_blocks handles cleanup */
	slurm_attr_init(&attr_agent);
	if (pthread_attr_setdetachstate(&attr_agent, PTHREAD_CREATE_DETACHED))
		error("pthread_attr_setdetachstate error %m");
	retries = 0;
	while (pthread_create(&thread_agent, &attr_agent,
			      _track_freeing_blocks,
			      bg_free_list)) {
		error("pthread_create error %m");
		if (++retries > MAX_PTHREAD_RETRIES)
			fatal("Can't create pthread");
		/* sleep and retry */
		usleep(1000);
	}
	slurm_attr_destroy(&attr_agent);
	return;
}