/* * create_dynamic_block - create new block(s) to be used for a new * job allocation. * RET - a list of created block(s) or NULL on failure errno is set. */ extern List create_dynamic_block(List block_list, select_ba_request_t *request, List my_block_list, bool track_down_nodes) { int rc = SLURM_SUCCESS; ListIterator itr, itr2; bg_record_t *bg_record = NULL, *found_record = NULL; List results = NULL; List new_blocks = NULL; bitstr_t *my_bitmap = NULL; select_ba_request_t blockreq; int cnodes = request->procs / bg_conf->cpu_ratio; uint16_t start_geo[SYSTEM_DIMENSIONS]; if (cnodes < bg_conf->smallest_block) { error("Can't create this size %d " "on this system ionodes_per_mp is %d", request->procs, bg_conf->ionodes_per_mp); goto finished; } memset(&blockreq, 0, sizeof(select_ba_request_t)); memcpy(start_geo, request->geometry, sizeof(start_geo)); /* We need to lock this just incase a blocks_overlap is called which will in turn reset and set the system as it sees fit. */ slurm_mutex_lock(&block_state_mutex); if (my_block_list) { reset_ba_system(track_down_nodes); itr = list_iterator_create(my_block_list); while ((bg_record = list_next(itr))) { if (bg_record->magic != BLOCK_MAGIC) { /* This should never happen since we only call this on copies of blocks and we check on this during the copy. 
*/ error("create_dynamic_block: " "got a block with bad magic?"); continue; } if (bg_record->free_cnt) { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) { int dim; char start_geo[SYSTEM_DIMENSIONS+1]; char geo[SYSTEM_DIMENSIONS+1]; for (dim=0; dim<SYSTEM_DIMENSIONS; dim++) { start_geo[dim] = alpha_num[ bg_record->start[dim]]; geo[dim] = alpha_num[ bg_record->geo[dim]]; } start_geo[dim] = '\0'; geo[dim] = '\0'; info("not adding %s(%s) %s %s %s %u " "(free_cnt)", bg_record->bg_block_id, bg_record->mp_str, bg_block_state_string( bg_record->state), start_geo, geo, bg_record->cnode_cnt); } continue; } if (!my_bitmap) { my_bitmap = bit_alloc(bit_size(bg_record->bitmap)); } if (!bit_super_set(bg_record->bitmap, my_bitmap)) { bit_or(my_bitmap, bg_record->bitmap); if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) { int dim; char start_geo[SYSTEM_DIMENSIONS+1]; char geo[SYSTEM_DIMENSIONS+1]; for (dim=0; dim<SYSTEM_DIMENSIONS; dim++) { start_geo[dim] = alpha_num[ bg_record->start[dim]]; geo[dim] = alpha_num[ bg_record->geo[dim]]; } start_geo[dim] = '\0'; geo[dim] = '\0'; info("adding %s(%s) %s %s %s %u", bg_record->bg_block_id, bg_record->mp_str, bg_block_state_string( bg_record->state), start_geo, geo, bg_record->cnode_cnt); } if (check_and_set_mp_list( bg_record->ba_mp_list) == SLURM_ERROR) { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) info("something happened in " "the load of %s", bg_record->bg_block_id); list_iterator_destroy(itr); FREE_NULL_BITMAP(my_bitmap); rc = SLURM_ERROR; goto finished; } } else { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) { int dim; char start_geo[SYSTEM_DIMENSIONS+1]; char geo[SYSTEM_DIMENSIONS+1]; for (dim=0; dim<SYSTEM_DIMENSIONS; dim++) { start_geo[dim] = alpha_num[ bg_record->start[dim]]; geo[dim] = alpha_num[ bg_record->geo[dim]]; } start_geo[dim] = '\0'; geo[dim] = '\0'; info("not adding %s(%s) %s %s %s %u ", bg_record->bg_block_id, bg_record->mp_str, bg_block_state_string( bg_record->state), start_geo, geo, 
bg_record->cnode_cnt); } /* just so we don't look at it later */ bg_record->free_cnt = -1; } } list_iterator_destroy(itr); FREE_NULL_BITMAP(my_bitmap); } else { reset_ba_system(false); if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) info("No list was given"); } if (request->avail_mp_bitmap) ba_set_removable_mps(request->avail_mp_bitmap, 1); if (request->size==1 && cnodes < bg_conf->mp_cnode_cnt) { switch(cnodes) { #ifdef HAVE_BGL case 32: blockreq.small32 = 4; blockreq.small128 = 3; break; case 128: blockreq.small128 = 4; break; #else case 16: blockreq.small16 = 2; blockreq.small32 = 1; blockreq.small64 = 1; blockreq.small128 = 1; blockreq.small256 = 1; break; case 32: blockreq.small32 = 2; blockreq.small64 = 1; blockreq.small128 = 1; blockreq.small256 = 1; break; case 64: blockreq.small64 = 2; blockreq.small128 = 1; blockreq.small256 = 1; break; case 128: blockreq.small128 = 2; blockreq.small256 = 1; break; case 256: blockreq.small256 = 2; break; #endif default: error("This size %d is unknown on this system", cnodes); goto finished; break; } /* Sort the list so the small blocks are in the order * of ionodes. */ list_sort(block_list, (ListCmpF)bg_record_cmpf_inc); request->conn_type[0] = SELECT_SMALL; new_blocks = list_create(destroy_bg_record); /* check only blocks that are free and small */ if (_breakup_blocks(block_list, new_blocks, request, my_block_list, true, true) == SLURM_SUCCESS) goto finished; /* check only blocks that are free and any size */ if (_breakup_blocks(block_list, new_blocks, request, my_block_list, true, false) == SLURM_SUCCESS) goto finished; /* check usable blocks that are small with any state */ if (_breakup_blocks(block_list, new_blocks, request, my_block_list, false, true) == SLURM_SUCCESS) goto finished; /* check all usable blocks */ if (_breakup_blocks(block_list, new_blocks, request, my_block_list, false, false) == SLURM_SUCCESS) goto finished; /* Re-sort the list back to the original order. 
*/ list_sort(block_list, (ListCmpF)bg_record_sort_aval_inc); list_destroy(new_blocks); new_blocks = NULL; if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) info("small block not able to be placed inside others"); } if (request->conn_type[0] == SELECT_NAV) request->conn_type[0] = SELECT_TORUS; //debug("going to create %d", request->size); if (!new_ba_request(request)) { if (request->geometry[0] != (uint16_t)NO_VAL) { char *geo = give_geo(request->geometry); error("Problems with request for size %d geo %s", request->size, geo); xfree(geo); } else { error("Problems with request for size %d. " "No geo given.", request->size); } rc = ESLURM_INTERCONNECT_FAILURE; goto finished; } /* try on free midplanes */ rc = SLURM_SUCCESS; if (results) list_flush(results); else { #ifdef HAVE_BGQ results = list_create(destroy_ba_mp); #else results = list_create(NULL); #endif } rc = allocate_block(request, results); /* This could be changed in allocate_block so set it back up */ memcpy(request->geometry, start_geo, sizeof(start_geo)); if (rc) { rc = SLURM_SUCCESS; goto setup_records; } if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) info("allocate failure for size %d base " "partitions of free midplanes", request->size); rc = SLURM_ERROR; if (!list_count(my_block_list) || !my_block_list) goto finished; /*Try to put block starting in the smallest of the exisiting blocks*/ itr = list_iterator_create(my_block_list); itr2 = list_iterator_create(my_block_list); while ((bg_record = (bg_record_t *) list_next(itr)) != NULL) { bool is_small = 0; /* never check a block with a job running */ if (bg_record->free_cnt || bg_record->job_running != NO_JOB_RUNNING) continue; /* Here we are only looking for the first block on the midplane. So either the count is greater or equal than bg_conf->mp_cnode_cnt or the first bit is set in the ionode_bitmap. 
*/ if (bg_record->cnode_cnt < bg_conf->mp_cnode_cnt) { bool found = 0; if (bit_ffs(bg_record->ionode_bitmap) != 0) continue; /* Check to see if we have other blocks in this midplane that have jobs running. */ while ((found_record = list_next(itr2))) { if (!found_record->free_cnt && (found_record->job_running != NO_JOB_RUNNING) && bit_overlap(bg_record->bitmap, found_record->bitmap)) { found = 1; break; } } list_iterator_reset(itr2); if (found) continue; is_small = 1; } if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) info("removing %s(%s) for request %d", bg_record->bg_block_id, bg_record->mp_str, request->size); remove_block(bg_record->ba_mp_list, is_small); rc = SLURM_SUCCESS; if (results) list_flush(results); else { #ifdef HAVE_BGQ results = list_create(destroy_ba_mp); #else results = list_create(NULL); #endif } rc = allocate_block(request, results); /* This could be changed in allocate_block so set it back up */ memcpy(request->geometry, start_geo, sizeof(start_geo)); if (rc) { rc = SLURM_SUCCESS; break; } if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) info("allocate failure for size %d base partitions", request->size); rc = SLURM_ERROR; } list_iterator_destroy(itr); list_iterator_destroy(itr2); setup_records: if (rc == SLURM_SUCCESS) { /*set up bg_record(s) here */ new_blocks = list_create(destroy_bg_record); blockreq.save_name = request->save_name; #ifdef HAVE_BGL blockreq.blrtsimage = request->blrtsimage; #endif blockreq.linuximage = request->linuximage; blockreq.mloaderimage = request->mloaderimage; blockreq.ramdiskimage = request->ramdiskimage; memcpy(blockreq.conn_type, request->conn_type, sizeof(blockreq.conn_type)); add_bg_record(new_blocks, &results, &blockreq, 0, 0); } finished: if (request->avail_mp_bitmap && (bit_ffc(request->avail_mp_bitmap) == -1)) ba_reset_all_removed_mps(); slurm_mutex_unlock(&block_state_mutex); /* reset the ones we mucked with */ itr = list_iterator_create(my_block_list); while ((bg_record = (bg_record_t *) 
list_next(itr))) { if (bg_record->free_cnt == -1) bg_record->free_cnt = 0; } list_iterator_destroy(itr); xfree(request->save_name); if (results) list_destroy(results); errno = rc; return new_blocks; }
/*
 * create_defined_blocks - create the static blocks that will be used
 * for scheduling, all partitions must be able to be created and booted
 * at once.
 * IN overlapped - layout mode; when it is LAYOUT_OVERLAP the ba system is
 *	reset before each block is placed, otherwise blocks accumulate.
 * IN bg_found_block_list - blocks that already exist; records found in
 *	it are not handed to bridge_block_create() again.
 * RET - success of fitting all configurations
 *
 * Takes block_state_mutex internally (released around
 * create_full_system_block()).  Calls fatal() when no partition has any
 * nodes, when a block has no usable mp_bitmap, or when the midplanes
 * that were wired do not match the record's mp_str.
 */
extern int create_defined_blocks(bg_layout_t overlapped,
				 List bg_found_block_list)
{
	int rc = SLURM_SUCCESS;

	ListIterator itr;
	bg_record_t *bg_record = NULL;
	int i;
	uint16_t geo[SYSTEM_DIMENSIONS];
	char temp[256];
	struct part_record *part_ptr = NULL;
	bitstr_t *usable_mp_bitmap = bit_alloc(node_record_count);

	/* Locks are already in place to protect part_list here */
	itr = list_iterator_create(part_list);
	while ((part_ptr = list_next(itr))) {
		/* we only want to use mps that are in
		 * partitions */
		if (!part_ptr->node_bitmap) {
			debug4("Partition %s doesn't have any nodes in it.",
			       part_ptr->name);
			continue;
		}
		bit_or(usable_mp_bitmap, part_ptr->node_bitmap);
	}
	list_iterator_destroy(itr);

	if (bit_ffs(usable_mp_bitmap) == -1) {
		fatal("We don't have any nodes in any partitions. "
		      "Can't create blocks. "
		      "Please check your slurm.conf.");
	}

	slurm_mutex_lock(&block_state_mutex);
	reset_ba_system(false);
	ba_set_removable_mps(usable_mp_bitmap, 1);
	if (bg_lists->main) {
		itr = list_iterator_create(bg_lists->main);
		while ((bg_record = list_next(itr))) {
			/* Only blocks of at least one midplane that are
			   not the full system block get wired here. */
			if (bg_record->mp_count > 0
			    && !bg_record->full_block
			    && bg_record->cpu_cnt >= bg_conf->cpus_per_mp) {
				char *name = NULL;
				char start_char[SYSTEM_DIMENSIONS+1];
				char geo_char[SYSTEM_DIMENSIONS+1];

				if (overlapped == LAYOUT_OVERLAP) {
					reset_ba_system(false);
					ba_set_removable_mps(usable_mp_bitmap,
							     1);
				}

				/* we want the mps that aren't
				 * in this record to mark them as used
				 */
				if (ba_set_removable_mps(
					    bg_record->mp_bitmap, 1)
				    != SLURM_SUCCESS)
					fatal("It doesn't seem we have a "
					      "bitmap for %s",
					      bg_record->bg_block_id);

				for (i=0; i<SYSTEM_DIMENSIONS; i++) {
					geo[i] = bg_record->geo[i];
					start_char[i] = alpha_num[
						bg_record->start[i]];
					geo_char[i] = alpha_num[geo[i]];
				}
				start_char[i] = '\0';
				geo_char[i] = '\0';

				debug2("adding %s %s %s",
				       bg_record->mp_str,
				       start_char, geo_char);
				if (bg_record->ba_mp_list
				    && list_count(bg_record->ba_mp_list)) {
					/* The record already carries its
					   wiring, just load it. */
					if ((rc = check_and_set_mp_list(
						     bg_record->ba_mp_list))
					    != SLURM_SUCCESS) {
						debug2("something happened in "
						       "the load of %s. "
						       "Did you use smap to "
						       "make the "
						       "bluegene.conf file?",
						       bg_record->bg_block_id);
						break;
					}
				} else {
					/* No wiring yet, have the ba system
					   work it out from start/geo. */
#ifdef HAVE_BGQ
					List results =
						list_create(destroy_ba_mp);
#else
					List results = list_create(NULL);
#endif
					name = set_bg_block(
						results,
						bg_record->start,
						geo,
						bg_record->conn_type);
					ba_reset_all_removed_mps();
					if (!name) {
						error("I was unable to "
						      "make the "
						      "requested block.");
						list_destroy(results);
						rc = SLURM_ERROR;
						break;
					}

					snprintf(temp, sizeof(temp), "%s%s",
						 bg_conf->slurm_node_prefix,
						 name);

					xfree(name);
					if (strcmp(temp, bg_record->mp_str)) {
						fatal("given list of %s "
						      "but allocated %s, "
						      "your order might be "
						      "wrong in bluegene.conf",
						      bg_record->mp_str,
						      temp);
					}
					if (bg_record->ba_mp_list)
						list_destroy(
							bg_record->ba_mp_list);
#ifdef HAVE_BGQ
					/* the record takes over the list */
					bg_record->ba_mp_list = results;
					results = NULL;
#else
					bg_record->ba_mp_list =
						list_create(destroy_ba_mp);
					copy_node_path(
						results,
						&bg_record->ba_mp_list);
					list_destroy(results);
#endif
				}
			}
			if (!block_exist_in_list(
				    bg_found_block_list, bg_record)) {
				if (bg_record->full_block) {
					/* if this is defined we need
					   to remove it since we are
					   going to try to create it
					   later on overlap systems
					   this doesn't matter, but
					   since we don't clear the
					   table on static mode we
					   can't do it here or it just
					   won't work since other
					   wires will be or are
					   already set
					*/
					/* NOTE(review): the removed record is
					   not freed here - confirm ownership */
					list_remove(itr);
					continue;
				}
				if ((rc = bridge_block_create(bg_record))
				    != SLURM_SUCCESS)
					break;
				print_bg_record(bg_record);
			}
		}
		list_iterator_destroy(itr);
		if (rc != SLURM_SUCCESS)
			goto end_it;
	} else {
		error("create_defined_blocks: no bg_lists->main 2");
		rc = SLURM_ERROR;
		goto end_it;
	}

	/* create_full_system_block() is called without the mutex held;
	   it is re-taken afterwards so end_it can always unlock. */
	slurm_mutex_unlock(&block_state_mutex);
	create_full_system_block(bg_found_block_list);

	slurm_mutex_lock(&block_state_mutex);
	sort_bg_record_inc_size(bg_lists->main);

end_it:
	ba_reset_all_removed_mps();
	FREE_NULL_BITMAP(usable_mp_bitmap);
	slurm_mutex_unlock(&block_state_mutex);

#ifdef _PRINT_BLOCKS_AND_EXIT
	if (bg_lists->main) {
		itr = list_iterator_create(bg_lists->main);
		debug("\n\n");
		while ((bg_record = (bg_record_t *) list_next(itr)) != NULL) {
			print_bg_record(bg_record);
		}
		list_iterator_destroy(itr);
	} else {
		error("create_defined_blocks: no bg_lists->main 5");
	}
	exit(0);
#endif	/* _PRINT_BLOCKS_AND_EXIT */
	return rc;
}