示例#1
0
/*
 * Log the contents of a block record through info().
 *
 * A NULL record is reported with error() and nothing else is logged.
 * When _DEBUG is set every field gets its own line; otherwise a single
 * summary line (block id, formatted node name, connection type) is
 * written.
 *
 * IN bg_record - record to describe
 */
extern void print_bg_record(bg_record_t* bg_record)
{
	if (!bg_record) {
		error("print_bg_record, record given is null");
		return;
	}

#if !_DEBUG
	{
		/* terse form: one line for the whole record */
		char node_names[256];

		format_node_name(bg_record, node_names, sizeof(node_names));
		info("Record: BlockID:%s Nodes:%s Conn:%s",
		     bg_record->bg_block_id,
		     node_names,
		     conn_type_string(bg_record->conn_type[0]));
	}
#else
	/* verbose form: one line per field */
	info(" bg_record: ");
	if (bg_record->bg_block_id)
		info("\tbg_block_id: %s", bg_record->bg_block_id);
	info("\tnodes: %s", bg_record->mp_str);
	info("\tsize: %d MPs %u Nodes %d cpus",
	     bg_record->mp_count, bg_record->cnode_cnt, bg_record->cpu_cnt);
	info("\tgeo: %ux%ux%u",
	     bg_record->geo[X], bg_record->geo[Y], bg_record->geo[Z]);
	info("\tconn_type: %s", conn_type_string(bg_record->conn_type[0]));
#ifdef HAVE_BGL
	info("\tnode_use: %s", node_use_string(bg_record->node_use));
#endif
	/* the bitmap line is only emitted when a bitmap is attached */
	if (bg_record->mp_bitmap) {
		char bitmap_text[BITSIZE];

		bit_fmt(bitmap_text, BITSIZE, bg_record->mp_bitmap);
		info("\tbitmap: %s", bitmap_text);
	}
#endif
}
示例#2
0
static int _breakup_blocks(List block_list, List new_blocks,
			   select_ba_request_t *request, List my_block_list,
			   bool only_free, bool only_small)
{
	int rc = SLURM_ERROR;
	bg_record_t *bg_record = NULL;
	ListIterator itr = NULL, bit_itr = NULL;
	int total_cnode_cnt=0;
	char start_char[SYSTEM_DIMENSIONS+1];
	bitstr_t *ionodes = bit_alloc(bg_conf->ionodes_per_mp);
	int cnodes = request->procs / bg_conf->cpu_ratio;
	int curr_mp_bit = -1;
	int dim;

	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
		info("cpu_count=%d cnodes=%d o_free=%d o_small=%d",
		     request->procs, cnodes, only_free, only_small);

	switch(cnodes) {
	case 16:
		/* a 16 can go anywhere */
		break;
	case 32:
		bit_itr = list_iterator_create(bg_lists->valid_small32);
		break;
	case 64:
		bit_itr = list_iterator_create(bg_lists->valid_small64);
		break;
	case 128:
		bit_itr = list_iterator_create(bg_lists->valid_small128);
		break;
	case 256:
		bit_itr = list_iterator_create(bg_lists->valid_small256);
		break;
	default:
		error("We shouldn't be here with this size %d", cnodes);
		goto finished;
		break;
	}

	/* First try with free blocks a midplane or less.  Then try with the
	 * smallest blocks.
	 */
	itr = list_iterator_create(block_list);
	while ((bg_record = list_next(itr))) {
		if (bg_record->free_cnt) {
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
				info("%s being freed by other job(s), skipping",
				     bg_record->bg_block_id);
			continue;
		}
		/* never look at a block if a job is running */
		if (bg_record->job_running != NO_JOB_RUNNING)
			continue;
		/* on the third time through look for just a block
		 * that isn't used */

		/* check for free blocks on the first and second time */
		if (only_free && (bg_record->state != BG_BLOCK_FREE))
			continue;

		/* check small blocks first */
		if (only_small
		    && (bg_record->cnode_cnt > bg_conf->mp_cnode_cnt))
			continue;

		if (request->avail_mp_bitmap &&
		    !bit_super_set(bg_record->bitmap,
				   request->avail_mp_bitmap)) {
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
				info("bg block %s has nodes not usable "
				     "by this job",
				     bg_record->bg_block_id);
			continue;
		}

		if (bg_record->cnode_cnt == cnodes) {
			ba_mp_t *ba_mp = NULL;
			if (bg_record->ba_mp_list)
				ba_mp = list_peek(bg_record->ba_mp_list);
			if (!ba_mp) {
				for (dim=0; dim<SYSTEM_DIMENSIONS; dim++)
					start_char[dim] = alpha_num[
						bg_record->start[dim]];
				start_char[dim] = '\0';
				request->save_name = xstrdup(start_char);
			} else
				request->save_name = xstrdup(ba_mp->coord_str);

			rc = SLURM_SUCCESS;
			goto finished;
		}
		/* lets see if we can combine some small ones */
		if (bg_record->cnode_cnt < cnodes) {
			char bitstring[BITSIZE];
			bitstr_t *bitstr = NULL;
			int num_over = 0;
			int num_cnodes = bg_record->cnode_cnt;
			int rec_mp_bit = bit_ffs(bg_record->bitmap);

			if (curr_mp_bit != rec_mp_bit) {
				/* Got a different node than
				 * previously, since the list should
				 * be in order of nodes for small blocks
				 * just clear here since the last node
				 * doesn't have any more. */
				curr_mp_bit = rec_mp_bit;
				bit_nclear(ionodes, 0,
					   (bg_conf->ionodes_per_mp-1));
				total_cnode_cnt = 0;
			}

			/* On really busy systems we can get
			   overlapping blocks here.  If that is the
			   case only add that which doesn't overlap.
			*/
			if ((num_over = bit_overlap(
				     ionodes, bg_record->ionode_bitmap))) {
				/* Since the smallest block size is
				   the number of cnodes in an io node,
				   just multiply the num_over by that to
				   get the number of cnodes to remove.
				*/
				if ((num_cnodes -=
				     num_over * bg_conf->smallest_block) <= 0)
					continue;
			}
			bit_or(ionodes, bg_record->ionode_bitmap);

			/* check and see if the bits set are a valid
			   combo */
			if (bit_itr) {
				while ((bitstr = list_next(bit_itr))) {
					if (bit_super_set(ionodes, bitstr))
						break;
				}
				list_iterator_reset(bit_itr);
			}
			if (!bitstr) {
				bit_nclear(ionodes, 0,
					   (bg_conf->ionodes_per_mp-1));
				bit_or(ionodes, bg_record->ionode_bitmap);
				total_cnode_cnt = num_cnodes =
					bg_record->cnode_cnt;
			} else
				total_cnode_cnt += num_cnodes;

			bit_fmt(bitstring, BITSIZE, ionodes);
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
				info("combine adding %s %s %d got %d set "
				     "ionodes %s total is %s",
				     bg_record->bg_block_id, bg_record->mp_str,
				     num_cnodes, total_cnode_cnt,
				     bg_record->ionode_str, bitstring);
			if (total_cnode_cnt == cnodes) {
				ba_mp_t *ba_mp = NULL;
				if (bg_record->ba_mp_list)
					ba_mp = list_peek(
						bg_record->ba_mp_list);
				if (!ba_mp) {
					for (dim=0; dim<SYSTEM_DIMENSIONS;
					     dim++)
						start_char[dim] = alpha_num[
							bg_record->start[dim]];
					start_char[dim] = '\0';
					request->save_name =
						xstrdup(start_char);
				} else
					request->save_name =
						xstrdup(ba_mp->coord_str);

				if (!my_block_list) {
					rc = SLURM_SUCCESS;
					goto finished;
				}

				bg_record = create_small_record(bg_record,
								ionodes,
								cnodes);
				list_append(new_blocks, bg_record);

				rc = SLURM_SUCCESS;
				goto finished;
			}
			continue;
		}
		/* we found a block that is bigger than requested */
		break;
	}

	if (bg_record) {
		ba_mp_t *ba_mp = NULL;
		if (bg_record->ba_mp_list)
			ba_mp = list_peek(bg_record->ba_mp_list);
		if (!ba_mp) {
			for (dim=0; dim<SYSTEM_DIMENSIONS; dim++)
				start_char[dim] = alpha_num[
					bg_record->start[dim]];
			start_char[dim] = '\0';
			request->save_name = xstrdup(start_char);
		} else
			request->save_name = xstrdup(ba_mp->coord_str);
		/* It appears we don't need this original record
		 * anymore, just work off the copy if indeed it is a copy. */

		/* bg_record_t *found_record = NULL; */

		/* if (bg_record->original) { */
		/* 	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) */
		/* 		info("1 This was a copy %s", */
		/* 		     bg_record->bg_block_id); */
		/* 	found_record = bg_record->original; */
		/* } else { */
		/* 	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) */
		/* 		info("looking for original"); */
		/* 	found_record = find_org_in_bg_list( */
		/* 		bg_lists->main, bg_record); */
		/* } */
		if ((bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
		    && bg_record->original
		    && (bg_record->original->magic != BLOCK_MAGIC)) {
			info("This record %s has bad magic, it must be "
			     "getting freed.  No worries it will all be "
			     "figured out later.",
			     bg_record->bg_block_id);
		}

		/* if (!found_record || found_record->magic != BLOCK_MAGIC) { */
		/* 	error("this record wasn't found in the list!"); */
		/* 	rc = SLURM_ERROR; */
		/* 	goto finished; */
		/* } */

		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) {
			char tmp_char[256];
			format_node_name(bg_record, tmp_char,
					 sizeof(tmp_char));
			info("going to split %s, %s",
			     bg_record->bg_block_id,
			     tmp_char);
		}

		if (!my_block_list) {
			rc = SLURM_SUCCESS;
			goto finished;
		}
		_split_block(block_list, new_blocks, bg_record, cnodes);
		rc = SLURM_SUCCESS;
		goto finished;
	}

finished:
	if (bit_itr)
		list_iterator_destroy(bit_itr);

	FREE_NULL_BITMAP(ionodes);
	if (itr)
		list_iterator_destroy(itr);

	return rc;
}
示例#3
0
/*
 * Find the best block for a given job request.
 *
 * IN block_list - blocks to search; SLURM_ERROR is returned when NULL
 * IN/OUT blocks_added - handed to _dynamically_request()
 * IN job_ptr - job being placed (details and select_jobinfo are read)
 * IN/OUT slurm_block_bitmap - and'ed with the mp_bitmap of the block
 *	that gets picked; also used as the available bitmap when the job
 *	has no req_node_bitmap
 * IN min_nodes, max_nodes, req_nodes - node limits of the request
 * OUT found_bg_record - the matched block, NULL otherwise.  Not written
 *	at all on the early error returns that happen before the search.
 * IN query_mode - examined with SELECT_IS_TEST() and
 *	SELECT_IS_CHECK_FULL_SET()
 * IN avail_cpus - cpus available; only enforced when not a test
 * RET SLURM_SUCCESS when a block was found.  Otherwise SLURM_ERROR, or
 *	the value returned by _dynamically_request() when the
 *	"empty machine" attempt fails as well.
 */
static int _find_best_block_match(List block_list,
				  int *blocks_added,
				  struct job_record* job_ptr,
				  bitstr_t* slurm_block_bitmap,
				  uint32_t min_nodes, uint32_t max_nodes,
				  uint32_t req_nodes,
				  bg_record_t** found_bg_record,
				  uint16_t query_mode, int avail_cpus)
{
	bg_record_t *bg_record = NULL;
	uint16_t req_geometry[SYSTEM_DIMENSIONS];
	uint16_t target_size = 0;
	uint32_t req_procs = job_ptr->details->min_cpus;
	select_ba_request_t request;
	int i, dim;
	/* State carried from one pass of the search loop to the next. */
	int overlap_check = 0;
	int allow = 0;
	int check_image = 1;
	uint32_t max_cpus = job_ptr->details->max_cpus;
	char tmp_char[256];
	static int total_cpus = 0;
	int rc = SLURM_SUCCESS;
	/* Set once _dynamically_request() has succeeded, so that it is
	 * only attempted a single time. */
	int create_try = 0;
	List overlapped_list = NULL;
	bool is_test = SELECT_IS_TEST(query_mode);

	/* One-time initialisation: product of the cluster dimensions
	 * times cpus_per_mp.
	 * NOTE(review): total_cpus is not read again anywhere in this
	 * function - confirm whether it is still needed. */
	if (!total_cpus) {
		int *cluster_dims = select_g_ba_get_dims();
		total_cpus = 1;
		for (dim=0; dim<SYSTEM_DIMENSIONS; dim++)
			total_cpus *= cluster_dims[dim];
		total_cpus *= bg_conf->cpus_per_mp;
	}

	/* Sanity checks.  Nothing has been allocated yet, so these can
	 * return directly instead of going through end_it. */
	if (req_nodes > max_nodes) {
		error("can't run this job max mps is %u asking for %u",
		      max_nodes, req_nodes);
		return SLURM_ERROR;
	}

	if (!is_test && (req_procs > avail_cpus)) {
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
			info("asking for %u I only have %d",
			     req_procs, avail_cpus);
		return SLURM_ERROR;
	}

	if (!block_list) {
		error("_find_best_block_match: There is no block_list");
		return SLURM_ERROR;
	}

	memset(&request, 0, sizeof(select_ba_request_t));

	/* Pull connection type, geometry and rotate out of the job. */
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_CONN_TYPE, &request.conn_type);
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_GEOMETRY, &req_geometry);
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_ROTATE, &request.rotate);

	if ((rc = _check_images(job_ptr, &request)) == SLURM_ERROR)
		goto end_it;

	/* A geometry, when given, decides the size: min_nodes is
	 * overridden with the product of the requested dimensions. */
	if (req_geometry[0] != 0 && req_geometry[0] != (uint16_t)NO_VAL) {
		char tmp_geo[SYSTEM_DIMENSIONS+1];

		target_size = 1;
		for (i=0; i<SYSTEM_DIMENSIONS; i++) {
			target_size *= req_geometry[i];
			tmp_geo[i] = alpha_num[req_geometry[i]];
		}
		tmp_geo[i] = '\0';

		if (target_size != min_nodes) {
			debug2("min_nodes not set correctly %u "
			       "should be %u from %s",
			       min_nodes, target_size,
			       tmp_geo);
			min_nodes = target_size;
		}
		if (!req_nodes)
			req_nodes = min_nodes;
	} else {
		/* no geometry requested: mark it unset and size by nodes */
		req_geometry[0] = (uint16_t)NO_VAL;
		target_size = min_nodes;
	}

	*found_bg_record = NULL;
	allow = 0;

	memcpy(request.geometry, req_geometry, sizeof(req_geometry));

	request.deny_pass = (uint16_t)NO_VAL;
	request.save_name = NULL;
	request.size = target_size;
	request.procs = req_procs;
	request.elongate = request.rotate;
	/* request.start[0] = 1; */
	/* request.start[1] = 2; */
	/* request.start[2] = 0; */
	/* request.start[3] = 2; */
	/* request.start_req = 1; */

	/* Nodes explicitly required by the job take precedence over the
	 * bitmap handed in by the caller. */
	if (job_ptr->details->req_node_bitmap)
		request.avail_mp_bitmap = job_ptr->details->req_node_bitmap;
	else
		request.avail_mp_bitmap = slurm_block_bitmap;

	/* since we only look at procs after this and not nodes we
	 *  need to set a max_cpus if given
	 */
	if (max_cpus == (uint32_t)NO_VAL)
		max_cpus = max_nodes * bg_conf->cpus_per_mp;

	/* Search loop.  Each pass looks for a matching block; when none is
	 * found the pass either relaxes a constraint (image check, overlap
	 * level), asks for a dynamic block, or gives up. */
	while (1) {
		/* Here we are creating a list of all the blocks that
		 * have overlapped jobs so if we don't find one that
		 * works we can look and see the earliest
		 * the job can start.  This doesn't apply to Dynamic mode.
		 */
		if (is_test && SELECT_IS_CHECK_FULL_SET(query_mode)
		    && bg_conf->layout_mode != LAYOUT_DYNAMIC)
			overlapped_list = list_create(NULL);

		bg_record = _find_matching_block(block_list,
						 job_ptr,
						 slurm_block_bitmap,
						 &request,
						 max_cpus,
						 &allow, check_image,
						 overlap_check,
						 overlapped_list,
						 query_mode);
		/* this could get altered in _find_matching_block so we
		   need to reset it */
		memcpy(request.geometry, req_geometry, sizeof(req_geometry));

		/* No direct match: fall back to the overlapped block whose
		 * job has the smallest end_time. */
		if (!bg_record && overlapped_list
		    && list_count(overlapped_list)) {
			ListIterator itr =
				list_iterator_create(overlapped_list);
			bg_record_t *tmp_rec = NULL;
			while ((tmp_rec = list_next(itr))) {
				if (!bg_record ||
				    (tmp_rec->job_ptr->end_time <
				     bg_record->job_ptr->end_time))
					bg_record = tmp_rec;
			}
			list_iterator_destroy(itr);
		}

		/* The pointer is left as-is after the destroy; the
		 * condition that creates the list does not change between
		 * passes, so a new list is made before the next use. */
		if (overlapped_list)
			list_destroy(overlapped_list);

		/* set the bitmap and do other allocation activities */
		if (bg_record) {
#ifdef HAVE_BG_L_P
			if (!is_test) {
				if (bridge_block_check_mp_states(
					    bg_record->bg_block_id, 1)
				    != SLURM_SUCCESS) {
					/* check_block_mp_states will
					   set this block in the main
					   list to an error state, but
					   we aren't looking
					   at the main list, so we
					   need to set this copy of
					   the block in an
					   error state as well.
					*/
					bg_record->job_running =
						BLOCK_ERROR_STATE;
					bg_record->state |= BG_BLOCK_ERROR_FLAG;
					error("_find_best_block_match: Picked "
					      "block (%s) had some issues with "
					      "hardware, trying a different "
					      "one.",
					      bg_record->bg_block_id);
					continue;
				}
			}
#endif
			format_node_name(bg_record, tmp_char, sizeof(tmp_char));

			debug("_find_best_block_match %s <%s>",
			      bg_record->bg_block_id, tmp_char);
			/* restrict the caller's bitmap to the chosen block */
			bit_and(slurm_block_bitmap, bg_record->mp_bitmap);
			rc = SLURM_SUCCESS;
			*found_bg_record = bg_record;
			goto end_it;
		}

		/* see if we can just reset the image and reboot the block */
		if (allow) {
			check_image = 0;
			allow = 0;
			continue;
		}

		check_image = 1;

		/* all these assume that the *bg_record is NULL */

		/* Overlap mode, real run: retry with overlap_check raised,
		 * up to a value of 2. */
		if (bg_conf->layout_mode == LAYOUT_OVERLAP
		    && !is_test && overlap_check < 2) {
			overlap_check++;
			continue;
		}

		/* Dynamic creation is only tried once, and only in
		 * dynamic layout mode. */
		if (create_try || bg_conf->layout_mode != LAYOUT_DYNAMIC)
			goto no_match;

		if ((rc = _dynamically_request(block_list, blocks_added,
					       &request,
					       job_ptr->details->req_nodes,
					       query_mode))
		    == SLURM_SUCCESS) {
			/* go around again so the search can pick up
			 * whatever was added */
			create_try = 1;
			continue;
		}

		/* Only look at the full system if we aren't going to
		   preempt jobs later and look.
		*/
		if (is_test && SELECT_IS_CHECK_FULL_SET(query_mode)) {
			List new_blocks = NULL;
			List job_list = list_create(NULL);
			ListIterator itr = NULL;
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
				info("trying with empty machine");

			/* Here we need to make sure the blocks in the
			   job list are those in the block list so go
			   through and grab them and add them to a
			   separate list.
			*/
			itr = list_iterator_create(block_list);
			while ((bg_record = list_next(itr))) {
				if (bg_record->job_running != NO_JOB_RUNNING)
					list_append(job_list, bg_record);
				/* Since the error blocks are at the
				   end we only really need to look at
				   the first one to make sure it will
				   work, so don't add more than one to
				   the job list.
				   We do need to check for at least
				   one error block because that lets
				   us know not to hold up the entire
				   machine for a job that won't run
				   until the error is removed which
				   could be a very long time.
				*/
				if (bg_record->job_running == BLOCK_ERROR_STATE)
					break;
			}
			list_iterator_destroy(itr);

			/* Block list is already in the correct order,
			   earliest available first,
			   so the job list will also be. No need to
			   sort. */
			/* Take one block off job_list per pass, treat it as
			 * idle, and retry the dynamic creation until it
			 * works or nothing is left to take off. */
			while (1) {
				bool track_down_nodes = true;

				if ((bg_record = list_pop(job_list))) {
					if (bg_record->job_ptr) {
						if (bg_conf->slurm_debug_flags
						    & DEBUG_FLAG_BG_PICK)
							info("taking off "
							     "%d(%s) started "
							     "at %ld ends "
							     "at %ld",
							     bg_record->
							     job_running,
							     bg_record->
							     bg_block_id,
							     bg_record->
							     job_ptr->
							     start_time,
							     bg_record->
							     job_ptr->
							     end_time);
						/* Mark the block as
						   not running a job,
						   this should
						   correspond to the
						   pointer in the
						   block_list.  We
						   only look at the
						   job_running var so
						   don't remove the
						   job_ptr.
						*/
						bg_record->job_running =
							NO_JOB_RUNNING;
					} else if ((bg_record->job_running
						    == BLOCK_ERROR_STATE)
						   && (bg_conf->
						       slurm_debug_flags
						       & DEBUG_FLAG_BG_PICK))
						info("taking off (%s) "
						     "which is in an "
						     "error state",
						     bg_record->bg_block_id);
				} else
					/* This means we didn't have
					   any jobs to take off
					   anymore so we are making
					   sure we can look at every
					   node on the system.
					*/
					track_down_nodes = false;

				if (!(new_blocks = create_dynamic_block(
					      block_list, &request, job_list,
					      track_down_nodes))) {
					/* Stop on an interconnect failure or
					 * once job_list is empty; otherwise
					 * take the next block off. */
					if (errno == ESLURM_INTERCONNECT_FAILURE
					    || !list_count(job_list)) {
						char *nodes;
						/* skip building the node
						 * string at low debug levels */
						if (slurmctld_conf.
						    slurmctld_debug < 5)
							break;
						nodes = bitmap2node_name(
							slurm_block_bitmap);
						debug("job %u not "
						      "runable on %s",
						      job_ptr->job_id,
						      nodes);
						xfree(nodes);
						break;
					}
					continue;
				}
				rc = SLURM_SUCCESS;
				/* outside of the job_test_list this
				 * gets destroyed later, so don't worry
				 * about it now
				 */
				(*found_bg_record) = list_pop(new_blocks);
				if (!(*found_bg_record)) {
					list_destroy(new_blocks);
					if (!bg_record) {
						/* This should never happen */
						error("got an empty list back");
						rc = SLURM_ERROR;
						break;
					}

					if (bg_conf->slurm_debug_flags
					    & DEBUG_FLAG_BG_PICK)
						info("Appears we are trying "
						     "to place this job on "
						     "the block we just "
						     "removed %s.",
						     bg_record->bg_block_id);
					/* This means we placed the job on
					   the block we just popped off.
					*/
					bit_and(slurm_block_bitmap,
						bg_record->mp_bitmap);
					*found_bg_record = bg_record;
					break;
				}
				bit_and(slurm_block_bitmap,
					(*found_bg_record)->mp_bitmap);

				/* Carry the job information of the block
				 * that was taken off over to the new
				 * block. */
				if (bg_record) {
					(*found_bg_record)->job_running =
						bg_record->job_running;
					(*found_bg_record)->job_ptr
						= bg_record->job_ptr;
				}
				list_destroy(new_blocks);
				break;
			}

			list_destroy(job_list);

			/* rc is whatever the attempt above left behind */
			goto end_it;
		} else {
			break;
		}
	}

no_match:
	debug("_find_best_block_match none found");
	rc = SLURM_ERROR;

end_it:

	/* Release the image names held in the request.
	 * NOTE(review): request.save_name is not freed here - presumably
	 * it is released by whoever sets it; confirm. */
	xfree(request.blrtsimage);
	xfree(request.linuximage);
	xfree(request.mloaderimage);
	xfree(request.ramdiskimage);

	return rc;
}