/*
 * print_bg_record - dump the interesting fields of a bg_record_t to the
 * SLURM log via info().
 *
 * IN bg_record - record to print; a NULL pointer is reported via error()
 *                and ignored.
 *
 * With _DEBUG set, each field is logged on its own line (block id, node
 * list, size, geometry, connection type, and on BGL the node use, plus the
 * midplane bitmap if present).  Otherwise a single condensed summary line
 * is emitted.  Log output only; the record is not modified.
 */
extern void print_bg_record(bg_record_t* bg_record)
{
	if (!bg_record) {
		error("print_bg_record, record given is null");
		return;
	}
#if _DEBUG
	/* Verbose one-field-per-line dump (debug builds). */
	info(" bg_record: ");
	if (bg_record->bg_block_id)
		info("\tbg_block_id: %s", bg_record->bg_block_id);
	info("\tnodes: %s", bg_record->mp_str);
	info("\tsize: %d MPs %u Nodes %d cpus",
	     bg_record->mp_count,
	     bg_record->cnode_cnt,
	     bg_record->cpu_cnt);
	info("\tgeo: %ux%ux%u", bg_record->geo[X], bg_record->geo[Y],
	     bg_record->geo[Z]);
	/* Only the first dimension's connection type is reported here. */
	info("\tconn_type: %s", conn_type_string(bg_record->conn_type[0]));
#ifdef HAVE_BGL
	info("\tnode_use: %s", node_use_string(bg_record->node_use));
#endif
	if (bg_record->mp_bitmap) {
		char bitstring[BITSIZE];
		bit_fmt(bitstring, BITSIZE, bg_record->mp_bitmap);
		info("\tbitmap: %s", bitstring);
	}
#else
	/* Non-debug builds: single condensed summary line. */
	{
		char tmp_char[256];
		format_node_name(bg_record, tmp_char, sizeof(tmp_char));
		info("Record: BlockID:%s Nodes:%s Conn:%s",
		     bg_record->bg_block_id, tmp_char,
		     conn_type_string(bg_record->conn_type[0]));
	}
#endif
}
/*
 * _breakup_blocks - try to satisfy a sub-midplane (small block) request
 * either from an existing block of exactly the right size, by combining
 * adjacent small blocks on the same midplane, or by splitting a larger
 * block.
 *
 * IN block_list     - candidate blocks, expected to be ordered so that small
 *                     blocks on the same midplane are adjacent (the
 *                     combination logic relies on this ordering).
 * IN/OUT new_blocks - list to which a newly created small record is appended.
 * IN/OUT request    - the allocation request; on success request->save_name
 *                     receives an xstrdup'd coordinate string the caller
 *                     must free.
 * IN my_block_list  - if NULL we are only testing feasibility, so no new
 *                     record is actually created.
 * IN only_free      - restrict the scan to blocks in BG_BLOCK_FREE state.
 * IN only_small     - restrict the scan to blocks of one midplane or less.
 *
 * RET SLURM_SUCCESS if a placement was found, else SLURM_ERROR.
 */
static int _breakup_blocks(List block_list, List new_blocks,
			   select_ba_request_t *request, List my_block_list,
			   bool only_free, bool only_small)
{
	int rc = SLURM_ERROR;
	bg_record_t *bg_record = NULL;
	ListIterator itr = NULL, bit_itr = NULL;
	int total_cnode_cnt=0;	/* cnodes accumulated on the current midplane */
	char start_char[SYSTEM_DIMENSIONS+1];
	bitstr_t *ionodes = bit_alloc(bg_conf->ionodes_per_mp);
	int cnodes = request->procs / bg_conf->cpu_ratio;
	int curr_mp_bit = -1;	/* midplane the accumulator refers to */
	int dim;

	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
		info("cpu_count=%d cnodes=%d o_free=%d o_small=%d",
		     request->procs, cnodes, only_free, only_small);

	/* Pick the list of valid ionode combinations for this size; a
	 * 16-cnode block has no placement restriction so bit_itr stays
	 * NULL for it. */
	switch(cnodes) {
	case 16:
		/* a 16 can go anywhere */
		break;
	case 32:
		bit_itr = list_iterator_create(bg_lists->valid_small32);
		break;
	case 64:
		bit_itr = list_iterator_create(bg_lists->valid_small64);
		break;
	case 128:
		bit_itr = list_iterator_create(bg_lists->valid_small128);
		break;
	case 256:
		bit_itr = list_iterator_create(bg_lists->valid_small256);
		break;
	default:
		error("We shouldn't be here with this size %d", cnodes);
		goto finished;
		break;
	}

	/* First try with free blocks a midplane or less.  Then try with the
	 * smallest blocks. */
	itr = list_iterator_create(block_list);
	while ((bg_record = list_next(itr))) {
		/* Skip blocks other jobs are in the middle of freeing. */
		if (bg_record->free_cnt) {
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
				info("%s being freed by other job(s), skipping",
				     bg_record->bg_block_id);
			continue;
		}
		/* never look at a block if a job is running */
		if (bg_record->job_running != NO_JOB_RUNNING)
			continue;

		/* on the third time through look for just a block
		 * that isn't used */

		/* check for free blocks on the first and second time */
		if (only_free && (bg_record->state != BG_BLOCK_FREE))
			continue;

		/* check small blocks first */
		if (only_small
		    && (bg_record->cnode_cnt > bg_conf->mp_cnode_cnt))
			continue;

		/* The block must sit entirely inside the midplanes the job
		 * is allowed to use. */
		if (request->avail_mp_bitmap &&
		    !bit_super_set(bg_record->bitmap,
				   request->avail_mp_bitmap)) {
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
				info("bg block %s has nodes not usable "
				     "by this job",
				     bg_record->bg_block_id);
			continue;
		}

		/* Exact-size match: record its starting coordinates and
		 * we are done. */
		if (bg_record->cnode_cnt == cnodes) {
			ba_mp_t *ba_mp = NULL;
			if (bg_record->ba_mp_list)
				ba_mp = list_peek(bg_record->ba_mp_list);
			if (!ba_mp) {
				/* No ba_mp record: derive the coordinate
				 * string from the start[] array instead. */
				for (dim=0; dim<SYSTEM_DIMENSIONS; dim++)
					start_char[dim] = alpha_num[
						bg_record->start[dim]];
				start_char[dim] = '\0';
				request->save_name = xstrdup(start_char);
			} else
				request->save_name = xstrdup(ba_mp->coord_str);

			rc = SLURM_SUCCESS;
			goto finished;
		}

		/* lets see if we can combine some small ones */
		if (bg_record->cnode_cnt < cnodes) {
			char bitstring[BITSIZE];
			bitstr_t *bitstr = NULL;
			int num_over = 0;
			int num_cnodes = bg_record->cnode_cnt;
			int rec_mp_bit = bit_ffs(bg_record->bitmap);

			if (curr_mp_bit != rec_mp_bit) {
				/* Got a different node than previously,
				 * since the list should be in order of
				 * nodes for small blocks just clear here
				 * since the last node doesn't have any
				 * more. */
				curr_mp_bit = rec_mp_bit;
				bit_nclear(ionodes, 0,
					   (bg_conf->ionodes_per_mp-1));
				total_cnode_cnt = 0;
			}

			/* On really busy systems we can get overlapping
			 * blocks here.  If that is the case only add that
			 * which doesn't overlap. */
			if ((num_over = bit_overlap(
				     ionodes, bg_record->ionode_bitmap))) {
				/* Since the smallest block size is the
				 * number of cnodes in an io node, just
				 * multiply the num_over by that to get
				 * the number of cnodes to remove. */
				if ((num_cnodes -=
				     num_over * bg_conf->smallest_block) <= 0)
					continue;
			}
			bit_or(ionodes, bg_record->ionode_bitmap);

			/* check and see if the bits set are a valid combo */
			if (bit_itr) {
				while ((bitstr = list_next(bit_itr))) {
					if (bit_super_set(ionodes, bitstr))
						break;
				}
				list_iterator_reset(bit_itr);
			}
			if (!bitstr) {
				/* Not a valid combination: restart the
				 * accumulator from just this block. */
				bit_nclear(ionodes, 0,
					   (bg_conf->ionodes_per_mp-1));
				bit_or(ionodes, bg_record->ionode_bitmap);
				total_cnode_cnt = num_cnodes =
					bg_record->cnode_cnt;
			} else
				total_cnode_cnt += num_cnodes;

			bit_fmt(bitstring, BITSIZE, ionodes);
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
				info("combine adding %s %s %d got %d set "
				     "ionodes %s total is %s",
				     bg_record->bg_block_id,
				     bg_record->mp_str,
				     num_cnodes, total_cnode_cnt,
				     bg_record->ionode_str, bitstring);

			/* Accumulated exactly the requested size: build
			 * the small record (unless this is only a test). */
			if (total_cnode_cnt == cnodes) {
				ba_mp_t *ba_mp = NULL;
				if (bg_record->ba_mp_list)
					ba_mp = list_peek(
						bg_record->ba_mp_list);
				if (!ba_mp) {
					for (dim=0; dim<SYSTEM_DIMENSIONS;
					     dim++)
						start_char[dim] = alpha_num[
							bg_record->start[dim]];
					start_char[dim] = '\0';
					request->save_name =
						xstrdup(start_char);
				} else
					request->save_name =
						xstrdup(ba_mp->coord_str);

				if (!my_block_list) {
					/* test-only mode, don't create */
					rc = SLURM_SUCCESS;
					goto finished;
				}

				bg_record = create_small_record(bg_record,
								ionodes,
								cnodes);
				list_append(new_blocks, bg_record);

				rc = SLURM_SUCCESS;
				goto finished;
			}
			continue;
		}
		/* we found a block that is bigger than requested */
		break;
	}

	/* bg_record non-NULL here means the loop broke on a larger-than-
	 * requested block which we can split. */
	if (bg_record) {
		ba_mp_t *ba_mp = NULL;
		if (bg_record->ba_mp_list)
			ba_mp = list_peek(bg_record->ba_mp_list);
		if (!ba_mp) {
			for (dim=0; dim<SYSTEM_DIMENSIONS; dim++)
				start_char[dim] = alpha_num[
					bg_record->start[dim]];
			start_char[dim] = '\0';
			request->save_name = xstrdup(start_char);
		} else
			request->save_name = xstrdup(ba_mp->coord_str);

		/* It appears we don't need this original record anymore,
		 * just work off the copy if indeed it is a copy. */

		if ((bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
		    && bg_record->original
		    && (bg_record->original->magic != BLOCK_MAGIC)) {
			info("This record %s has bad magic, it must be "
			     "getting freed.  No worries it will all be "
			     "figured out later.",
			     bg_record->bg_block_id);
		}

		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) {
			char tmp_char[256];
			format_node_name(bg_record, tmp_char,
					 sizeof(tmp_char));
			info("going to split %s, %s",
			     bg_record->bg_block_id, tmp_char);
		}

		if (!my_block_list) {
			/* test-only mode, don't actually split */
			rc = SLURM_SUCCESS;
			goto finished;
		}
		_split_block(block_list, new_blocks, bg_record, cnodes);
		rc = SLURM_SUCCESS;
		goto finished;
	}

finished:
	/* Common cleanup for every exit path. */
	if (bit_itr)
		list_iterator_destroy(bit_itr);
	FREE_NULL_BITMAP(ionodes);
	if (itr)
		list_iterator_destroy(itr);

	return rc;
}
/*
 * _find_best_block_match - find the best existing (or, in dynamic layout
 * mode, newly created) block for a given job request.
 *
 * IN block_list         - list of candidate bg_record_t blocks.
 * IN/OUT blocks_added   - incremented by _dynamically_request when new
 *                         blocks are created (presumably -- managed by the
 *                         callee, not touched here directly).
 * IN job_ptr            - the job being scheduled.
 * IN/OUT slurm_block_bitmap - usable midplanes; on success it is ANDed
 *                         down to the chosen block's midplanes.
 * IN min_nodes/max_nodes/req_nodes - node (midplane) limits for the job.
 * OUT found_bg_record   - set to the matched block, NULL otherwise.
 * IN query_mode         - SELECT_IS_TEST / SELECT_IS_CHECK_FULL_SET flags.
 * IN avail_cpus         - cpus currently available to the job.
 *
 * RET SLURM_SUCCESS on a match, SLURM_ERROR otherwise.
 */
static int _find_best_block_match(List block_list,
				  int *blocks_added,
				  struct job_record* job_ptr,
				  bitstr_t* slurm_block_bitmap,
				  uint32_t min_nodes, uint32_t max_nodes,
				  uint32_t req_nodes,
				  bg_record_t** found_bg_record,
				  uint16_t query_mode, int avail_cpus)
{
	bg_record_t *bg_record = NULL;
	uint16_t req_geometry[SYSTEM_DIMENSIONS];
	uint16_t target_size = 0;
	uint32_t req_procs = job_ptr->details->min_cpus;
	select_ba_request_t request;
	int i, dim;
	int overlap_check = 0;
	int allow = 0;
	int check_image = 1;
	uint32_t max_cpus = job_ptr->details->max_cpus;
	char tmp_char[256];
	/* Whole-machine cpu count; computed once and cached across calls. */
	static int total_cpus = 0;
	int rc = SLURM_SUCCESS;
	int create_try = 0;
	List overlapped_list = NULL;
	bool is_test = SELECT_IS_TEST(query_mode);

	if (!total_cpus) {
		int *cluster_dims = select_g_ba_get_dims();
		total_cpus = 1;
		for (dim=0; dim<SYSTEM_DIMENSIONS; dim++)
			total_cpus *= cluster_dims[dim];
		total_cpus *= bg_conf->cpus_per_mp;
	}

	if (req_nodes > max_nodes) {
		error("can't run this job max mps is %u asking for %u",
		      max_nodes, req_nodes);
		return SLURM_ERROR;
	}

	/* In test mode we ignore currently available cpus. */
	if (!is_test && (req_procs > avail_cpus)) {
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
			info("asking for %u I only have %d",
			     req_procs, avail_cpus);
		return SLURM_ERROR;
	}

	if (!block_list) {
		error("_find_best_block_match: There is no block_list");
		return SLURM_ERROR;
	}

	memset(&request, 0, sizeof(select_ba_request_t));

	/* Pull the job's placement preferences out of its select jobinfo. */
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_CONN_TYPE, &request.conn_type);
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_GEOMETRY, &req_geometry);
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_ROTATE, &request.rotate);

	/* _check_images xstrdup's image names into request; they are
	 * released at end_it. */
	if ((rc = _check_images(job_ptr, &request)) == SLURM_ERROR)
		goto end_it;

	if (req_geometry[0] != 0 && req_geometry[0] != (uint16_t)NO_VAL) {
		/* Explicit geometry given: its product dictates the
		 * midplane count, overriding min_nodes if inconsistent. */
		char tmp_geo[SYSTEM_DIMENSIONS+1];

		target_size = 1;
		for (i=0; i<SYSTEM_DIMENSIONS; i++) {
			target_size *= req_geometry[i];
			tmp_geo[i] = alpha_num[req_geometry[i]];
		}
		tmp_geo[i] = '\0';

		if (target_size != min_nodes) {
			debug2("min_nodes not set correctly %u "
			       "should be %u from %s",
			       min_nodes, target_size, tmp_geo);
			min_nodes = target_size;
		}
		if (!req_nodes)
			req_nodes = min_nodes;
	} else {
		req_geometry[0] = (uint16_t)NO_VAL;
		target_size = min_nodes;
	}

	*found_bg_record = NULL;
	allow = 0;

	memcpy(request.geometry, req_geometry, sizeof(req_geometry));

	request.deny_pass = (uint16_t)NO_VAL;
	request.save_name = NULL;
	request.size = target_size;
	request.procs = req_procs;
	request.elongate = request.rotate;
	/* request.start[0] = 1; */
	/* request.start[1] = 2; */
	/* request.start[2] = 0; */
	/* request.start[3] = 2; */
	/* request.start_req = 1; */

	if (job_ptr->details->req_node_bitmap)
		request.avail_mp_bitmap = job_ptr->details->req_node_bitmap;
	else
		request.avail_mp_bitmap = slurm_block_bitmap;

	/* since we only look at procs after this and not nodes we
	 * need to set a max_cpus if given */
	if (max_cpus == (uint32_t)NO_VAL)
		max_cpus = max_nodes * bg_conf->cpus_per_mp;

	while (1) {
		/* Here we are creating a list of all the blocks that
		 * have overlapped jobs so if we don't find one that
		 * works we will have can look and see the earliest
		 * the job can start.  This doesn't apply to Dynamic mode. */
		if (is_test && SELECT_IS_CHECK_FULL_SET(query_mode)
		    && bg_conf->layout_mode != LAYOUT_DYNAMIC)
			overlapped_list = list_create(NULL);

		bg_record = _find_matching_block(block_list,
						 job_ptr,
						 slurm_block_bitmap,
						 &request,
						 max_cpus,
						 &allow, check_image,
						 overlap_check,
						 overlapped_list,
						 query_mode);
		/* this could get altered in _find_matching_block so we
		 * need to reset it */
		memcpy(request.geometry, req_geometry, sizeof(req_geometry));

		/* No direct match: fall back to the overlapped block whose
		 * running job ends earliest. */
		if (!bg_record && overlapped_list
		    && list_count(overlapped_list)) {
			ListIterator itr =
				list_iterator_create(overlapped_list);
			bg_record_t *tmp_rec = NULL;
			while ((tmp_rec = list_next(itr))) {
				if (!bg_record ||
				    (tmp_rec->job_ptr->end_time <
				     bg_record->job_ptr->end_time))
					bg_record = tmp_rec;
			}
			list_iterator_destroy(itr);
		}

		if (overlapped_list)
			list_destroy(overlapped_list);

		/* set the bitmap and do other allocation activities */
		if (bg_record) {
#ifdef HAVE_BG_L_P
			if (!is_test) {
				if (bridge_block_check_mp_states(
					    bg_record->bg_block_id, 1)
				    != SLURM_SUCCESS) {
					/* check_block_mp_states will set
					 * this block in the main list to an
					 * error state, but we aren't
					 * looking at the main list, so we
					 * need to set this copy of the
					 * block in an error state as
					 * well. */
					bg_record->job_running =
						BLOCK_ERROR_STATE;
					bg_record->state
						|= BG_BLOCK_ERROR_FLAG;
					error("_find_best_block_match: Picked "
					      "block (%s) had some issues with "
					      "hardware, trying a different "
					      "one.",
					      bg_record->bg_block_id);
					continue;
				}
			}
#endif
			format_node_name(bg_record, tmp_char,
					 sizeof(tmp_char));

			debug("_find_best_block_match %s <%s>",
			      bg_record->bg_block_id, tmp_char);
			bit_and(slurm_block_bitmap, bg_record->mp_bitmap);
			rc = SLURM_SUCCESS;
			*found_bg_record = bg_record;
			goto end_it;
		}

		/* see if we can just reset the image and reboot the block */
		if (allow) {
			check_image = 0;
			allow = 0;
			continue;
		}

		check_image = 1;

		/* all these assume that the *bg_record is NULL */

		if (bg_conf->layout_mode == LAYOUT_OVERLAP
		    && !is_test && overlap_check < 2) {
			overlap_check++;
			continue;
		}

		/* Only dynamic layout mode can create new blocks, and only
		 * one creation attempt is made. */
		if (create_try
		    || bg_conf->layout_mode != LAYOUT_DYNAMIC)
			goto no_match;

		if ((rc = _dynamically_request(block_list, blocks_added,
					       &request,
					       job_ptr->details->req_nodes,
					       query_mode))
		    == SLURM_SUCCESS) {
			create_try = 1;
			continue;
		}

		/* Only look at the full system if we aren't going to
		 * preempt jobs later and look. */
		if (is_test && SELECT_IS_CHECK_FULL_SET(query_mode)) {
			List new_blocks = NULL;
			List job_list = list_create(NULL);
			ListIterator itr = NULL;

			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
				info("trying with empty machine");

			/* Here we need to make sure the blocks in the job
			 * list are those in the block list so go through
			 * and grab them and add them to a separate list. */
			itr = list_iterator_create(block_list);
			while ((bg_record = list_next(itr))) {
				if (bg_record->job_running
				    != NO_JOB_RUNNING)
					list_append(job_list, bg_record);
				/* Since the error blocks are at the end we
				 * only really need to look at the first
				 * one to make sure it will work, so don't
				 * add more than one to the job list.
				 * We do need to check for at least one
				 * error block because that lets us know
				 * not to hold up the entire machine for a
				 * job that won't run until the error is
				 * removed which could be a very long
				 * time. */
				if (bg_record->job_running
				    == BLOCK_ERROR_STATE)
					break;
			}
			list_iterator_destroy(itr);

			/* Block list is already in the correct order,
			 * earliest avaliable first, so the job list will
			 * also be.  No need to sort. */
			while (1) {
				bool track_down_nodes = true;

				/* Peel jobs off one at a time (earliest
				 * ending first) until a dynamic block can
				 * be created. */
				if ((bg_record = list_pop(job_list))) {
					if (bg_record->job_ptr) {
						if (bg_conf->
						    slurm_debug_flags
						    & DEBUG_FLAG_BG_PICK)
							info("taking off "
							     "%d(%s) started "
							     "at %ld ends "
							     "at %ld",
							     bg_record->
							     job_running,
							     bg_record->
							     bg_block_id,
							     bg_record->
							     job_ptr->
							     start_time,
							     bg_record->
							     job_ptr->
							     end_time);
						/* Mark the block as not
						 * running a job, this
						 * should correspond to the
						 * pointer in the
						 * block_list.  We only
						 * look at the job_running
						 * var so don't remove the
						 * job_ptr. */
						bg_record->job_running =
							NO_JOB_RUNNING;
					} else if ((bg_record->job_running
						    == BLOCK_ERROR_STATE)
						   && (bg_conf->
						       slurm_debug_flags
						       & DEBUG_FLAG_BG_PICK))
						info("taking off (%s) "
						     "which is in an "
						     "error state",
						     bg_record->bg_block_id);
				} else
					/* This means we didn't have any
					 * jobs to take off anymore so we
					 * are making sure we can look at
					 * every node on the system. */
					track_down_nodes = false;

				if (!(new_blocks = create_dynamic_block(
					      block_list, &request, job_list,
					      track_down_nodes))) {
					if (errno == ESLURM_INTERCONNECT_FAILURE
					    || !list_count(job_list)) {
						char *nodes;
						if (slurmctld_conf.
						    slurmctld_debug < 5)
							break;
						nodes = bitmap2node_name(
							slurm_block_bitmap);
						debug("job %u not "
						      "runable on %s",
						      job_ptr->job_id,
						      nodes);
						xfree(nodes);
						break;
					}
					continue;
				}

				rc = SLURM_SUCCESS;
				/* outside of the job_test_list this
				 * gets destroyed later, so don't worry
				 * about it now */
				(*found_bg_record) = list_pop(new_blocks);
				if (!(*found_bg_record)) {
					list_destroy(new_blocks);
					if (!bg_record) {
						/* This should never
						 * happen */
						error("got an empty list "
						      "back");
						rc = SLURM_ERROR;
						break;
					}

					if (bg_conf->slurm_debug_flags
					    & DEBUG_FLAG_BG_PICK)
						info("Appears we are trying "
						     "to place this job on "
						     "the block we just "
						     "removed %s.",
						     bg_record->bg_block_id);
					/* This means we placed the job on
					 * the block we just popped off. */
					bit_and(slurm_block_bitmap,
						bg_record->mp_bitmap);
					*found_bg_record = bg_record;
					break;
				}
				bit_and(slurm_block_bitmap,
					(*found_bg_record)->mp_bitmap);

				/* Carry the displaced job's state over to
				 * the new record so start-time estimation
				 * sees it. */
				if (bg_record) {
					(*found_bg_record)->job_running =
						bg_record->job_running;
					(*found_bg_record)->job_ptr
						= bg_record->job_ptr;
				}
				list_destroy(new_blocks);
				break;
			}

			list_destroy(job_list);

			goto end_it;
		} else {
			break;
		}
	}

no_match:
	debug("_find_best_block_match none found");
	rc = SLURM_ERROR;

end_it:
	/* Free any image names _check_images allocated into request. */
	xfree(request.blrtsimage);
	xfree(request.linuximage);
	xfree(request.mloaderimage);
	xfree(request.ramdiskimage);

	return rc;
}