/* * Read and process the bluegene.conf configuration file so to interpret what * blocks are static/dynamic, torus/mesh, etc. */ extern int read_bg_conf(void) { int i; int count = 0; s_p_hashtbl_t *tbl = NULL; char *layout = NULL; select_ba_request_t **blockreq_array = NULL; image_t **image_array = NULL; image_t *image = NULL; static time_t last_config_update = (time_t) 0; struct stat config_stat; ListIterator itr = NULL; char* bg_conf_file = NULL; static int *dims = NULL; if (!dims) dims = select_g_ba_get_dims(); if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("Reading the bluegene.conf file"); /* check if config file has changed */ bg_conf_file = _get_bg_conf(); if (stat(bg_conf_file, &config_stat) < 0) fatal("can't stat bluegene.conf file %s: %m", bg_conf_file); if (last_config_update) { _reopen_bridge_log(); if (last_config_update == config_stat.st_mtime) { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("%s unchanged", bg_conf_file); } else { info("Restart slurmctld for %s changes " "to take effect", bg_conf_file); } last_config_update = config_stat.st_mtime; xfree(bg_conf_file); return SLURM_SUCCESS; } last_config_update = config_stat.st_mtime; /* initialization */ /* bg_conf defined in bg_node_alloc.h */ tbl = s_p_hashtbl_create(bg_conf_file_options); if (s_p_parse_file(tbl, NULL, bg_conf_file, false) == SLURM_ERROR) fatal("something wrong with opening/reading bluegene " "conf file"); xfree(bg_conf_file); #ifdef HAVE_BGL if (s_p_get_array((void ***)&image_array, &count, "AltBlrtsImage", tbl)) { for (i = 0; i < count; i++) { list_append(bg_conf->blrts_list, image_array[i]); image_array[i] = NULL; } } if (!s_p_get_string(&bg_conf->default_blrtsimage, "BlrtsImage", tbl)) { if (!list_count(bg_conf->blrts_list)) fatal("BlrtsImage not configured " "in bluegene.conf"); itr = list_iterator_create(bg_conf->blrts_list); image = list_next(itr); image->def = true; list_iterator_destroy(itr); bg_conf->default_blrtsimage = xstrdup(image->name); info("Warning: using %s as the default BlrtsImage. " "If this isn't correct please set BlrtsImage", bg_conf->default_blrtsimage); } else { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("default BlrtsImage %s", bg_conf->default_blrtsimage); image = xmalloc(sizeof(image_t)); image->name = xstrdup(bg_conf->default_blrtsimage); image->def = true; image->groups = NULL; /* we want it to be first */ list_push(bg_conf->blrts_list, image); } if (s_p_get_array((void ***)&image_array, &count, "AltLinuxImage", tbl)) { for (i = 0; i < count; i++) { list_append(bg_conf->linux_list, image_array[i]); image_array[i] = NULL; } } if (!s_p_get_string(&bg_conf->default_linuximage, "LinuxImage", tbl)) { if (!list_count(bg_conf->linux_list)) fatal("LinuxImage not configured " "in bluegene.conf"); itr = list_iterator_create(bg_conf->linux_list); image = list_next(itr); image->def = true; list_iterator_destroy(itr); bg_conf->default_linuximage = xstrdup(image->name); info("Warning: using %s as the default LinuxImage. " "If this isn't correct please set LinuxImage", bg_conf->default_linuximage); } else { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("default LinuxImage %s", bg_conf->default_linuximage); image = xmalloc(sizeof(image_t)); image->name = xstrdup(bg_conf->default_linuximage); image->def = true; image->groups = NULL; /* we want it to be first */ list_push(bg_conf->linux_list, image); } if (s_p_get_array((void ***)&image_array, &count, "AltRamDiskImage", tbl)) { for (i = 0; i < count; i++) { list_append(bg_conf->ramdisk_list, image_array[i]); image_array[i] = NULL; } } if (!s_p_get_string(&bg_conf->default_ramdiskimage, "RamDiskImage", tbl)) { if (!list_count(bg_conf->ramdisk_list)) fatal("RamDiskImage not configured " "in bluegene.conf"); itr = list_iterator_create(bg_conf->ramdisk_list); image = list_next(itr); image->def = true; list_iterator_destroy(itr); bg_conf->default_ramdiskimage = xstrdup(image->name); info("Warning: using %s as the default RamDiskImage. " "If this isn't correct please set RamDiskImage", bg_conf->default_ramdiskimage); } else { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("default RamDiskImage %s", bg_conf->default_ramdiskimage); image = xmalloc(sizeof(image_t)); image->name = xstrdup(bg_conf->default_ramdiskimage); image->def = true; image->groups = NULL; /* we want it to be first */ list_push(bg_conf->ramdisk_list, image); } #elif defined HAVE_BGP if (s_p_get_array((void ***)&image_array, &count, "AltCnloadImage", tbl)) { for (i = 0; i < count; i++) { list_append(bg_conf->linux_list, image_array[i]); image_array[i] = NULL; } } if (!s_p_get_string(&bg_conf->default_linuximage, "CnloadImage", tbl)) { if (!list_count(bg_conf->linux_list)) fatal("CnloadImage not configured " "in bluegene.conf"); itr = list_iterator_create(bg_conf->linux_list); image = list_next(itr); image->def = true; list_iterator_destroy(itr); bg_conf->default_linuximage = xstrdup(image->name); info("Warning: using %s as the default CnloadImage. " "If this isn't correct please set CnloadImage", bg_conf->default_linuximage); } else { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("default CnloadImage %s", bg_conf->default_linuximage); image = xmalloc(sizeof(image_t)); image->name = xstrdup(bg_conf->default_linuximage); image->def = true; image->groups = NULL; /* we want it to be first */ list_push(bg_conf->linux_list, image); } if (s_p_get_array((void ***)&image_array, &count, "AltIoloadImage", tbl)) { for (i = 0; i < count; i++) { list_append(bg_conf->ramdisk_list, image_array[i]); image_array[i] = NULL; } } if (!s_p_get_string(&bg_conf->default_ramdiskimage, "IoloadImage", tbl)) { if (!list_count(bg_conf->ramdisk_list)) fatal("IoloadImage not configured " "in bluegene.conf"); itr = list_iterator_create(bg_conf->ramdisk_list); image = list_next(itr); image->def = true; list_iterator_destroy(itr); bg_conf->default_ramdiskimage = xstrdup(image->name); info("Warning: using %s as the default IoloadImage. " "If this isn't correct please set IoloadImage", bg_conf->default_ramdiskimage); } else { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("default IoloadImage %s", bg_conf->default_ramdiskimage); image = xmalloc(sizeof(image_t)); image->name = xstrdup(bg_conf->default_ramdiskimage); image->def = true; image->groups = NULL; /* we want it to be first */ list_push(bg_conf->ramdisk_list, image); } #endif if (s_p_get_array((void ***)&image_array, &count, "AltMloaderImage", tbl)) { for (i = 0; i < count; i++) { list_append(bg_conf->mloader_list, image_array[i]); image_array[i] = NULL; } } if (!s_p_get_string(&bg_conf->default_mloaderimage, "MloaderImage", tbl)) { if (!list_count(bg_conf->mloader_list)) fatal("MloaderImage not configured " "in bluegene.conf"); itr = list_iterator_create(bg_conf->mloader_list); image = list_next(itr); image->def = true; list_iterator_destroy(itr); bg_conf->default_mloaderimage = xstrdup(image->name); info("Warning: using %s as the default MloaderImage. " "If this isn't correct please set MloaderImage", bg_conf->default_mloaderimage); } else { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("default MloaderImage %s", bg_conf->default_mloaderimage); image = xmalloc(sizeof(image_t)); image->name = xstrdup(bg_conf->default_mloaderimage); image->def = true; image->groups = NULL; /* we want it to be first */ list_push(bg_conf->mloader_list, image); } if (!s_p_get_uint16( &bg_conf->mp_cnode_cnt, "BasePartitionNodeCnt", tbl)) { error("BasePartitionNodeCnt not configured in bluegene.conf " "defaulting to 512 as BasePartitionNodeCnt"); bg_conf->mp_cnode_cnt = 512; bg_conf->quarter_cnode_cnt = 128; } else { if (bg_conf->mp_cnode_cnt <= 0) fatal("You should have more than 0 nodes " "per base partition"); bg_conf->quarter_cnode_cnt = bg_conf->mp_cnode_cnt/4; } /* bg_conf->cpus_per_mp should had already been set from the * node_init */ if (bg_conf->cpus_per_mp < bg_conf->mp_cnode_cnt) { fatal("For some reason we have only %u cpus per mp, but " "have %u cnodes per mp. You need at least the same " "number of cpus as you have cnodes per mp. " "Check the NodeName Procs= " "definition in the slurm.conf.", bg_conf->cpus_per_mp, bg_conf->mp_cnode_cnt); } bg_conf->cpu_ratio = bg_conf->cpus_per_mp/bg_conf->mp_cnode_cnt; if (!bg_conf->cpu_ratio) fatal("We appear to have less than 1 cpu on a cnode. " "You specified %u for BasePartitionNodeCnt " "in the blugene.conf and %u cpus " "for each node in the slurm.conf", bg_conf->mp_cnode_cnt, bg_conf->cpus_per_mp); num_unused_cpus = 1; for (i = 0; i<SYSTEM_DIMENSIONS; i++) num_unused_cpus *= dims[i]; num_unused_cpus *= bg_conf->cpus_per_mp; if (!s_p_get_uint16( &bg_conf->nodecard_cnode_cnt, "NodeCardNodeCnt", tbl)) { error("NodeCardNodeCnt not configured in bluegene.conf " "defaulting to 32 as NodeCardNodeCnt"); bg_conf->nodecard_cnode_cnt = 32; } if (bg_conf->nodecard_cnode_cnt<=0) fatal("You should have more than 0 nodes per nodecard"); bg_conf->mp_nodecard_cnt = bg_conf->mp_cnode_cnt / bg_conf->nodecard_cnode_cnt; if (!s_p_get_uint16(&bg_conf->ionodes_per_mp, "Numpsets", tbl)) fatal("Warning: Numpsets not configured in bluegene.conf"); if (!bg_conf->ionodes_per_mp) { if (!s_p_get_uint16(&bg_conf->ionodes_per_mp, "IONodesPerMP", tbl)) fatal("Warning: IONodesPerMP not configured " "in bluegene.conf"); } #ifdef HAVE_BGQ /* You can only have 16 ionodes per midplane */ if (bg_conf->ionodes_per_mp > bg_conf->mp_nodecard_cnt) bg_conf->ionodes_per_mp = bg_conf->mp_nodecard_cnt; #endif if (bg_conf->ionodes_per_mp) { bitstr_t *tmp_bitmap = NULL; int small_size = 1; /* THIS IS A HACK TO MAKE A 1 NODECARD SYSTEM WORK */ if (bg_conf->mp_cnode_cnt == bg_conf->nodecard_cnode_cnt) { bg_conf->quarter_ionode_cnt = 2; bg_conf->nodecard_ionode_cnt = 2; } else { bg_conf->quarter_ionode_cnt = bg_conf->ionodes_per_mp/4; bg_conf->nodecard_ionode_cnt = bg_conf->quarter_ionode_cnt/4; } /* How many nodecards per ionode */ bg_conf->nc_ratio = ((double)bg_conf->mp_cnode_cnt / (double)bg_conf->nodecard_cnode_cnt) / (double)bg_conf->ionodes_per_mp; /* How many ionodes per nodecard */ bg_conf->io_ratio = (double)bg_conf->ionodes_per_mp / ((double)bg_conf->mp_cnode_cnt / (double)bg_conf->nodecard_cnode_cnt); //info("got %f %f", bg_conf->nc_ratio, bg_conf->io_ratio); /* figure out the smallest block we can have on the system */ #ifdef HAVE_BGL if (bg_conf->io_ratio >= 1) bg_conf->smallest_block=32; else bg_conf->smallest_block=128; #else if (bg_conf->io_ratio >= 2) bg_conf->smallest_block=16; else if (bg_conf->io_ratio == 1) bg_conf->smallest_block=32; else if (bg_conf->io_ratio == .5) bg_conf->smallest_block=64; else if (bg_conf->io_ratio == .25) bg_conf->smallest_block=128; else if (bg_conf->io_ratio == .125) bg_conf->smallest_block=256; else { error("unknown ioratio %f. Can't figure out " "smallest block size, setting it to midplane", bg_conf->io_ratio); bg_conf->smallest_block = 512; } #endif if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("Smallest block possible on this system is %u", bg_conf->smallest_block); /* below we are creating all the possible bitmaps for * each size of small block */ if ((int)bg_conf->nodecard_ionode_cnt < 1) { bg_conf->nodecard_ionode_cnt = 0; } else { bg_lists->valid_small32 = list_create(_destroy_bitmap); if ((small_size = bg_conf->nodecard_ionode_cnt)) small_size--; i = 0; while (i<bg_conf->ionodes_per_mp) { tmp_bitmap = bit_alloc(bg_conf->ionodes_per_mp); bit_nset(tmp_bitmap, i, i+small_size); i += small_size+1; list_append(bg_lists->valid_small32, tmp_bitmap); } } /* If we only have 1 nodecard just jump to the end since this will never need to happen below. Pretty much a hack to avoid seg fault;). */ if (bg_conf->mp_cnode_cnt == bg_conf->nodecard_cnode_cnt) goto no_calc; bg_lists->valid_small128 = list_create(_destroy_bitmap); if ((small_size = bg_conf->quarter_ionode_cnt)) small_size--; i = 0; while (i<bg_conf->ionodes_per_mp) { tmp_bitmap = bit_alloc(bg_conf->ionodes_per_mp); bit_nset(tmp_bitmap, i, i+small_size); i += small_size+1; list_append(bg_lists->valid_small128, tmp_bitmap); } #ifndef HAVE_BGL bg_lists->valid_small64 = list_create(_destroy_bitmap); if ((small_size = bg_conf->nodecard_ionode_cnt * 2)) small_size--; i = 0; while (i<bg_conf->ionodes_per_mp) { tmp_bitmap = bit_alloc(bg_conf->ionodes_per_mp); bit_nset(tmp_bitmap, i, i+small_size); i += small_size+1; list_append(bg_lists->valid_small64, tmp_bitmap); } bg_lists->valid_small256 = list_create(_destroy_bitmap); if ((small_size = bg_conf->quarter_ionode_cnt * 2)) small_size--; i = 0; while (i<bg_conf->ionodes_per_mp) { tmp_bitmap = bit_alloc(bg_conf->ionodes_per_mp); bit_nset(tmp_bitmap, i, i+small_size); i += small_size+1; list_append(bg_lists->valid_small256, tmp_bitmap); } #endif } else { fatal("your ionodes_per_mp is 0"); } no_calc: if (!s_p_get_uint16(&bg_conf->bridge_api_verb, "BridgeAPIVerbose", tbl)) info("Warning: BridgeAPIVerbose not configured " "in bluegene.conf"); if (!s_p_get_string(&bg_conf->bridge_api_file, "BridgeAPILogFile", tbl)) info("BridgeAPILogFile not configured in bluegene.conf"); else _reopen_bridge_log(); if (s_p_get_string(&layout, "DenyPassthrough", tbl)) { if (strstr(layout, "A")) ba_deny_pass |= PASS_DENY_A; if (strstr(layout, "X")) ba_deny_pass |= PASS_DENY_X; if (strstr(layout, "Y")) ba_deny_pass |= PASS_DENY_Y; if (strstr(layout, "Z")) ba_deny_pass |= PASS_DENY_Z; if (!strcasecmp(layout, "ALL")) ba_deny_pass |= PASS_DENY_ALL; bg_conf->deny_pass = ba_deny_pass; xfree(layout); } if (!s_p_get_string(&layout, "LayoutMode", tbl)) { info("Warning: LayoutMode was not specified in bluegene.conf " "defaulting to STATIC partitioning"); bg_conf->layout_mode = LAYOUT_STATIC; } else { if (!strcasecmp(layout,"STATIC")) bg_conf->layout_mode = LAYOUT_STATIC; else if (!strcasecmp(layout,"OVERLAP")) bg_conf->layout_mode = LAYOUT_OVERLAP; else if (!strcasecmp(layout,"DYNAMIC")) bg_conf->layout_mode = LAYOUT_DYNAMIC; else { fatal("I don't understand this LayoutMode = %s", layout); } xfree(layout); } /* add blocks defined in file */ if (bg_conf->layout_mode != LAYOUT_DYNAMIC) { if (!s_p_get_array((void ***)&blockreq_array, &count, "BPs", tbl)) { info("WARNING: no blocks defined in bluegene.conf, " "only making full system block"); /* create_full_system_block(NULL); */ } for (i = 0; i < count; i++) { add_bg_record(bg_lists->main, NULL, blockreq_array[i], 0, 0); } } s_p_hashtbl_destroy(tbl); return SLURM_SUCCESS; }
/* * create_dynamic_block - create new block(s) to be used for a new * job allocation. * RET - a list of created block(s) or NULL on failure errno is set. */ extern List create_dynamic_block(List block_list, select_ba_request_t *request, List my_block_list, bool track_down_nodes) { int rc = SLURM_SUCCESS; ListIterator itr, itr2; bg_record_t *bg_record = NULL, *found_record = NULL; List results = NULL; List new_blocks = NULL; bitstr_t *my_bitmap = NULL; select_ba_request_t blockreq; int cnodes = request->procs / bg_conf->cpu_ratio; uint16_t start_geo[SYSTEM_DIMENSIONS]; if (cnodes < bg_conf->smallest_block) { error("Can't create this size %d " "on this system ionodes_per_mp is %d", request->procs, bg_conf->ionodes_per_mp); goto finished; } memset(&blockreq, 0, sizeof(select_ba_request_t)); memcpy(start_geo, request->geometry, sizeof(start_geo)); /* We need to lock this just incase a blocks_overlap is called which will in turn reset and set the system as it sees fit. */ slurm_mutex_lock(&block_state_mutex); if (my_block_list) { reset_ba_system(track_down_nodes); itr = list_iterator_create(my_block_list); while ((bg_record = list_next(itr))) { if (bg_record->magic != BLOCK_MAGIC) { /* This should never happen since we only call this on copies of blocks and we check on this during the copy. */ error("create_dynamic_block: " "got a block with bad magic?"); continue; } if (bg_record->free_cnt) { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) { int dim; char start_geo[SYSTEM_DIMENSIONS+1]; char geo[SYSTEM_DIMENSIONS+1]; for (dim=0; dim<SYSTEM_DIMENSIONS; dim++) { start_geo[dim] = alpha_num[ bg_record->start[dim]]; geo[dim] = alpha_num[ bg_record->geo[dim]]; } start_geo[dim] = '\0'; geo[dim] = '\0'; info("not adding %s(%s) %s %s %s %u " "(free_cnt)", bg_record->bg_block_id, bg_record->mp_str, bg_block_state_string( bg_record->state), start_geo, geo, bg_record->cnode_cnt); } continue; } if (!my_bitmap) { my_bitmap = bit_alloc(bit_size(bg_record->bitmap)); } if (!bit_super_set(bg_record->bitmap, my_bitmap)) { bit_or(my_bitmap, bg_record->bitmap); if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) { int dim; char start_geo[SYSTEM_DIMENSIONS+1]; char geo[SYSTEM_DIMENSIONS+1]; for (dim=0; dim<SYSTEM_DIMENSIONS; dim++) { start_geo[dim] = alpha_num[ bg_record->start[dim]]; geo[dim] = alpha_num[ bg_record->geo[dim]]; } start_geo[dim] = '\0'; geo[dim] = '\0'; info("adding %s(%s) %s %s %s %u", bg_record->bg_block_id, bg_record->mp_str, bg_block_state_string( bg_record->state), start_geo, geo, bg_record->cnode_cnt); } if (check_and_set_mp_list( bg_record->ba_mp_list) == SLURM_ERROR) { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) info("something happened in " "the load of %s", bg_record->bg_block_id); list_iterator_destroy(itr); FREE_NULL_BITMAP(my_bitmap); rc = SLURM_ERROR; goto finished; } } else { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) { int dim; char start_geo[SYSTEM_DIMENSIONS+1]; char geo[SYSTEM_DIMENSIONS+1]; for (dim=0; dim<SYSTEM_DIMENSIONS; dim++) { start_geo[dim] = alpha_num[ bg_record->start[dim]]; geo[dim] = alpha_num[ bg_record->geo[dim]]; } start_geo[dim] = '\0'; geo[dim] = '\0'; info("not adding %s(%s) %s %s %s %u ", bg_record->bg_block_id, bg_record->mp_str, bg_block_state_string( bg_record->state), start_geo, geo, bg_record->cnode_cnt); } /* just so we don't look at it later */ bg_record->free_cnt = -1; } } list_iterator_destroy(itr); FREE_NULL_BITMAP(my_bitmap); } else { reset_ba_system(false); if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) info("No list was given"); } if (request->avail_mp_bitmap) ba_set_removable_mps(request->avail_mp_bitmap, 1); if (request->size==1 && cnodes < bg_conf->mp_cnode_cnt) { switch(cnodes) { #ifdef HAVE_BGL case 32: blockreq.small32 = 4; blockreq.small128 = 3; break; case 128: blockreq.small128 = 4; break; #else case 16: blockreq.small16 = 2; blockreq.small32 = 1; blockreq.small64 = 1; blockreq.small128 = 1; blockreq.small256 = 1; break; case 32: blockreq.small32 = 2; blockreq.small64 = 1; blockreq.small128 = 1; blockreq.small256 = 1; break; case 64: blockreq.small64 = 2; blockreq.small128 = 1; blockreq.small256 = 1; break; case 128: blockreq.small128 = 2; blockreq.small256 = 1; break; case 256: blockreq.small256 = 2; break; #endif default: error("This size %d is unknown on this system", cnodes); goto finished; break; } /* Sort the list so the small blocks are in the order * of ionodes. */ list_sort(block_list, (ListCmpF)bg_record_cmpf_inc); request->conn_type[0] = SELECT_SMALL; new_blocks = list_create(destroy_bg_record); /* check only blocks that are free and small */ if (_breakup_blocks(block_list, new_blocks, request, my_block_list, true, true) == SLURM_SUCCESS) goto finished; /* check only blocks that are free and any size */ if (_breakup_blocks(block_list, new_blocks, request, my_block_list, true, false) == SLURM_SUCCESS) goto finished; /* check usable blocks that are small with any state */ if (_breakup_blocks(block_list, new_blocks, request, my_block_list, false, true) == SLURM_SUCCESS) goto finished; /* check all usable blocks */ if (_breakup_blocks(block_list, new_blocks, request, my_block_list, false, false) == SLURM_SUCCESS) goto finished; /* Re-sort the list back to the original order. */ list_sort(block_list, (ListCmpF)bg_record_sort_aval_inc); list_destroy(new_blocks); new_blocks = NULL; if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) info("small block not able to be placed inside others"); } if (request->conn_type[0] == SELECT_NAV) request->conn_type[0] = SELECT_TORUS; //debug("going to create %d", request->size); if (!new_ba_request(request)) { if (request->geometry[0] != (uint16_t)NO_VAL) { char *geo = give_geo(request->geometry); error("Problems with request for size %d geo %s", request->size, geo); xfree(geo); } else { error("Problems with request for size %d. " "No geo given.", request->size); } rc = ESLURM_INTERCONNECT_FAILURE; goto finished; } /* try on free midplanes */ rc = SLURM_SUCCESS; if (results) list_flush(results); else { #ifdef HAVE_BGQ results = list_create(destroy_ba_mp); #else results = list_create(NULL); #endif } rc = allocate_block(request, results); /* This could be changed in allocate_block so set it back up */ memcpy(request->geometry, start_geo, sizeof(start_geo)); if (rc) { rc = SLURM_SUCCESS; goto setup_records; } if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) info("allocate failure for size %d base " "partitions of free midplanes", request->size); rc = SLURM_ERROR; if (!list_count(my_block_list) || !my_block_list) goto finished; /*Try to put block starting in the smallest of the exisiting blocks*/ itr = list_iterator_create(my_block_list); itr2 = list_iterator_create(my_block_list); while ((bg_record = (bg_record_t *) list_next(itr)) != NULL) { bool is_small = 0; /* never check a block with a job running */ if (bg_record->free_cnt || bg_record->job_running != NO_JOB_RUNNING) continue; /* Here we are only looking for the first block on the midplane. So either the count is greater or equal than bg_conf->mp_cnode_cnt or the first bit is set in the ionode_bitmap. */ if (bg_record->cnode_cnt < bg_conf->mp_cnode_cnt) { bool found = 0; if (bit_ffs(bg_record->ionode_bitmap) != 0) continue; /* Check to see if we have other blocks in this midplane that have jobs running. */ while ((found_record = list_next(itr2))) { if (!found_record->free_cnt && (found_record->job_running != NO_JOB_RUNNING) && bit_overlap(bg_record->bitmap, found_record->bitmap)) { found = 1; break; } } list_iterator_reset(itr2); if (found) continue; is_small = 1; } if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) info("removing %s(%s) for request %d", bg_record->bg_block_id, bg_record->mp_str, request->size); remove_block(bg_record->ba_mp_list, is_small); rc = SLURM_SUCCESS; if (results) list_flush(results); else { #ifdef HAVE_BGQ results = list_create(destroy_ba_mp); #else results = list_create(NULL); #endif } rc = allocate_block(request, results); /* This could be changed in allocate_block so set it back up */ memcpy(request->geometry, start_geo, sizeof(start_geo)); if (rc) { rc = SLURM_SUCCESS; break; } if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) info("allocate failure for size %d base partitions", request->size); rc = SLURM_ERROR; } list_iterator_destroy(itr); list_iterator_destroy(itr2); setup_records: if (rc == SLURM_SUCCESS) { /*set up bg_record(s) here */ new_blocks = list_create(destroy_bg_record); blockreq.save_name = request->save_name; #ifdef HAVE_BGL blockreq.blrtsimage = request->blrtsimage; #endif blockreq.linuximage = request->linuximage; blockreq.mloaderimage = request->mloaderimage; blockreq.ramdiskimage = request->ramdiskimage; memcpy(blockreq.conn_type, request->conn_type, sizeof(blockreq.conn_type)); add_bg_record(new_blocks, &results, &blockreq, 0, 0); } finished: if (request->avail_mp_bitmap && (bit_ffc(request->avail_mp_bitmap) == -1)) ba_reset_all_removed_mps(); slurm_mutex_unlock(&block_state_mutex); /* reset the ones we mucked with */ itr = list_iterator_create(my_block_list); while ((bg_record = (bg_record_t *) list_next(itr))) { if (bg_record->free_cnt == -1) bg_record->free_cnt = 0; } list_iterator_destroy(itr); xfree(request->save_name); if (results) list_destroy(results); errno = rc; return new_blocks; }
extern int create_full_system_block(List bg_found_block_list) { int rc = SLURM_SUCCESS; ListIterator itr; bg_record_t *bg_record = NULL; char *name = NULL; List records = NULL; uint16_t geo[SYSTEM_DIMENSIONS]; int i; select_ba_request_t blockreq; List results = NULL; struct part_record *part_ptr = NULL; bitstr_t *bitmap = bit_alloc(node_record_count); static int *dims = NULL; bool larger = 0; char start_char[SYSTEM_DIMENSIONS+1]; char geo_char[SYSTEM_DIMENSIONS+1]; if (!dims) { dims = select_g_ba_get_dims(); memset(start_char, 0, sizeof(start_char)); memset(geo_char, 0, sizeof(geo_char)); } /* Locks are already in place to protect part_list here */ itr = list_iterator_create(part_list); while ((part_ptr = list_next(itr))) { /* we only want to use mps that are in * partitions */ if (!part_ptr->node_bitmap) { debug4("Partition %s doesn't have any nodes in it.", part_ptr->name); continue; } bit_or(bitmap, part_ptr->node_bitmap); } list_iterator_destroy(itr); bit_not(bitmap); if (bit_ffs(bitmap) != -1) { error("We don't have the entire system covered by partitions, " "can't create full system block"); FREE_NULL_BITMAP(bitmap); return SLURM_ERROR; } FREE_NULL_BITMAP(bitmap); /* Here we are adding a block that in for the entire machine just in case it isn't in the bluegene.conf file. */ slurm_mutex_lock(&block_state_mutex); for (i=0; i<SYSTEM_DIMENSIONS; i++) { geo[i] = dims[i] - 1; if (geo[i] > 0) larger = 1; geo_char[i] = alpha_num[geo[i]]; start_char[i] = alpha_num[0]; } i = (10+strlen(bg_conf->slurm_node_prefix)); name = xmalloc(i); if (!larger) snprintf(name, i, "%s%s", bg_conf->slurm_node_prefix, start_char); else snprintf(name, i, "%s[%sx%s]", bg_conf->slurm_node_prefix, start_char, geo_char); if (bg_found_block_list) { itr = list_iterator_create(bg_found_block_list); while ((bg_record = (bg_record_t *) list_next(itr)) != NULL) { if (!strcmp(name, bg_record->mp_str)) { xfree(name); list_iterator_destroy(itr); /* don't create total already there */ goto no_total; } } list_iterator_destroy(itr); } else { error("create_full_system_block: no bg_found_block_list 2"); } if (bg_lists->main) { itr = list_iterator_create(bg_lists->main); while ((bg_record = (bg_record_t *) list_next(itr)) != NULL) { if (!strcmp(name, bg_record->mp_str)) { xfree(name); list_iterator_destroy(itr); /* don't create total already there */ goto no_total; } } list_iterator_destroy(itr); } else { xfree(name); error("create_overlapped_blocks: no bg_lists->main 3"); rc = SLURM_ERROR; goto no_total; } records = list_create(destroy_bg_record); memset(&blockreq, 0, sizeof(select_ba_request_t)); blockreq.save_name = name; for (i=0; i<SYSTEM_DIMENSIONS; i++) blockreq.conn_type[i] = SELECT_TORUS; add_bg_record(records, NULL, &blockreq, 0 , 0); xfree(name); bg_record = (bg_record_t *) list_pop(records); if (!bg_record) { error("Nothing was returned from full system create"); rc = SLURM_ERROR; goto no_total; } reset_ba_system(false); for(i=0; i<SYSTEM_DIMENSIONS; i++) { geo_char[i] = alpha_num[bg_record->geo[i]]; start_char[i] = alpha_num[bg_record->start[i]]; } debug2("adding %s %s %s", bg_record->mp_str, start_char, geo_char); if (bg_record->ba_mp_list) list_flush(bg_record->ba_mp_list); else bg_record->ba_mp_list = list_create(destroy_ba_mp); #ifdef HAVE_BGQ results = list_create(destroy_ba_mp); #else results = list_create(NULL); #endif name = set_bg_block(results, bg_record->start, bg_record->geo, bg_record->conn_type); if (!name) { error("I was unable to make the full system block."); list_destroy(results); list_iterator_destroy(itr); slurm_mutex_unlock(&block_state_mutex); return SLURM_ERROR; } xfree(name); if (bg_record->ba_mp_list) list_destroy(bg_record->ba_mp_list); #ifdef HAVE_BGQ bg_record->ba_mp_list = results; results = NULL; #else bg_record->ba_mp_list = list_create(destroy_ba_mp); copy_node_path(results, &bg_record->ba_mp_list); list_destroy(results); #endif if ((rc = bridge_block_create(bg_record)) == SLURM_ERROR) { error("create_full_system_block: " "unable to configure block in api"); destroy_bg_record(bg_record); goto no_total; } print_bg_record(bg_record); list_append(bg_lists->main, bg_record); no_total: if (records) list_destroy(records); slurm_mutex_unlock(&block_state_mutex); return rc; }
/* * Read and process the bluegene.conf configuration file so to interpret what * blocks are static/dynamic, torus/mesh, etc. */ extern int read_bg_conf(void) { int i; bool tmp_bool = 0; int count = 0; s_p_hashtbl_t *tbl = NULL; char *tmp_char = NULL; select_ba_request_t **blockreq_array = NULL; image_t **image_array = NULL; image_t *image = NULL; static time_t last_config_update = (time_t) 0; struct stat config_stat; ListIterator itr = NULL; char* bg_conf_file = NULL; static int *dims = NULL; if (!dims) dims = select_g_ba_get_dims(); if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("Reading the bluegene.conf file"); /* check if config file has changed */ bg_conf_file = get_extra_conf_path("bluegene.conf"); if (stat(bg_conf_file, &config_stat) < 0) fatal("can't stat bluegene.conf file %s: %m", bg_conf_file); if (last_config_update) { _reopen_bridge_log(); if (last_config_update == config_stat.st_mtime) { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("%s unchanged", bg_conf_file); } else { info("Restart slurmctld for %s changes " "to take effect", bg_conf_file); } last_config_update = config_stat.st_mtime; xfree(bg_conf_file); return SLURM_SUCCESS; } last_config_update = config_stat.st_mtime; /* initialization */ /* bg_conf defined in bg_node_alloc.h */ if (!(tbl = config_make_tbl(bg_conf_file))) fatal("something wrong with opening/reading bluegene " "conf file"); xfree(bg_conf_file); #ifdef HAVE_BGL if (s_p_get_array((void ***)&image_array, &count, "AltBlrtsImage", tbl)) { for (i = 0; i < count; i++) { list_append(bg_conf->blrts_list, image_array[i]); image_array[i] = NULL; } } if (!s_p_get_string(&bg_conf->default_blrtsimage, "BlrtsImage", tbl)) { if (!list_count(bg_conf->blrts_list)) fatal("BlrtsImage not configured " "in bluegene.conf"); itr = list_iterator_create(bg_conf->blrts_list); image = list_next(itr); image->def = true; list_iterator_destroy(itr); bg_conf->default_blrtsimage = xstrdup(image->name); info("Warning: using %s as the default BlrtsImage. " "If this isn't correct please set BlrtsImage", bg_conf->default_blrtsimage); } else { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("default BlrtsImage %s", bg_conf->default_blrtsimage); image = xmalloc(sizeof(image_t)); image->name = xstrdup(bg_conf->default_blrtsimage); image->def = true; image->groups = NULL; /* we want it to be first */ list_push(bg_conf->blrts_list, image); } if (s_p_get_array((void ***)&image_array, &count, "AltLinuxImage", tbl)) { for (i = 0; i < count; i++) { list_append(bg_conf->linux_list, image_array[i]); image_array[i] = NULL; } } if (!s_p_get_string(&bg_conf->default_linuximage, "LinuxImage", tbl)) { if (!list_count(bg_conf->linux_list)) fatal("LinuxImage not configured " "in bluegene.conf"); itr = list_iterator_create(bg_conf->linux_list); image = list_next(itr); image->def = true; list_iterator_destroy(itr); bg_conf->default_linuximage = xstrdup(image->name); info("Warning: using %s as the default LinuxImage. " "If this isn't correct please set LinuxImage", bg_conf->default_linuximage); } else { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("default LinuxImage %s", bg_conf->default_linuximage); image = xmalloc(sizeof(image_t)); image->name = xstrdup(bg_conf->default_linuximage); image->def = true; image->groups = NULL; /* we want it to be first */ list_push(bg_conf->linux_list, image); } if (s_p_get_array((void ***)&image_array, &count, "AltRamDiskImage", tbl)) { for (i = 0; i < count; i++) { list_append(bg_conf->ramdisk_list, image_array[i]); image_array[i] = NULL; } } if (!s_p_get_string(&bg_conf->default_ramdiskimage, "RamDiskImage", tbl)) { if (!list_count(bg_conf->ramdisk_list)) fatal("RamDiskImage not configured " "in bluegene.conf"); itr = list_iterator_create(bg_conf->ramdisk_list); image = list_next(itr); image->def = true; list_iterator_destroy(itr); bg_conf->default_ramdiskimage = xstrdup(image->name); info("Warning: using %s as the default RamDiskImage. " "If this isn't correct please set RamDiskImage", bg_conf->default_ramdiskimage); } else { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("default RamDiskImage %s", bg_conf->default_ramdiskimage); image = xmalloc(sizeof(image_t)); image->name = xstrdup(bg_conf->default_ramdiskimage); image->def = true; image->groups = NULL; /* we want it to be first */ list_push(bg_conf->ramdisk_list, image); } #elif defined HAVE_BGP if (s_p_get_array((void ***)&image_array, &count, "AltCnloadImage", tbl)) { for (i = 0; i < count; i++) { list_append(bg_conf->linux_list, image_array[i]); image_array[i] = NULL; } } if (!s_p_get_string(&bg_conf->default_linuximage, "CnloadImage", tbl)) { if (!list_count(bg_conf->linux_list)) fatal("CnloadImage not configured " "in bluegene.conf"); itr = list_iterator_create(bg_conf->linux_list); image = list_next(itr); image->def = true; list_iterator_destroy(itr); bg_conf->default_linuximage = xstrdup(image->name); info("Warning: using %s as the default CnloadImage. " "If this isn't correct please set CnloadImage", bg_conf->default_linuximage); } else { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("default CnloadImage %s", bg_conf->default_linuximage); image = xmalloc(sizeof(image_t)); image->name = xstrdup(bg_conf->default_linuximage); image->def = true; image->groups = NULL; /* we want it to be first */ list_push(bg_conf->linux_list, image); } if (s_p_get_array((void ***)&image_array, &count, "AltIoloadImage", tbl)) { for (i = 0; i < count; i++) { list_append(bg_conf->ramdisk_list, image_array[i]); image_array[i] = NULL; } } if (!s_p_get_string(&bg_conf->default_ramdiskimage, "IoloadImage", tbl)) { if (!list_count(bg_conf->ramdisk_list)) fatal("IoloadImage not configured " "in bluegene.conf"); itr = list_iterator_create(bg_conf->ramdisk_list); image = list_next(itr); image->def = true; list_iterator_destroy(itr); bg_conf->default_ramdiskimage = xstrdup(image->name); info("Warning: using %s as the default IoloadImage. " "If this isn't correct please set IoloadImage", bg_conf->default_ramdiskimage); } else { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("default IoloadImage %s", bg_conf->default_ramdiskimage); image = xmalloc(sizeof(image_t)); image->name = xstrdup(bg_conf->default_ramdiskimage); image->def = true; image->groups = NULL; /* we want it to be first */ list_push(bg_conf->ramdisk_list, image); } #endif if (s_p_get_array((void ***)&image_array, &count, "AltMloaderImage", tbl)) { for (i = 0; i < count; i++) { list_append(bg_conf->mloader_list, image_array[i]); image_array[i] = NULL; } } if (!s_p_get_string(&bg_conf->default_mloaderimage, "MloaderImage", tbl)) { if (!list_count(bg_conf->mloader_list)) fatal("MloaderImage not configured " "in bluegene.conf"); itr = list_iterator_create(bg_conf->mloader_list); image = list_next(itr); image->def = true; list_iterator_destroy(itr); bg_conf->default_mloaderimage = xstrdup(image->name); info("Warning: using %s as the default MloaderImage. " "If this isn't correct please set MloaderImage", bg_conf->default_mloaderimage); } else { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("default MloaderImage %s", bg_conf->default_mloaderimage); image = xmalloc(sizeof(image_t)); image->name = xstrdup(bg_conf->default_mloaderimage); image->def = true; image->groups = NULL; /* we want it to be first */ list_push(bg_conf->mloader_list, image); } if (!s_p_get_uint16(&bg_conf->mp_cnode_cnt, "MidplaneNodeCnt", tbl)) { if (!s_p_get_uint16(&bg_conf->mp_cnode_cnt, "BasePartitionNodeCnt", tbl)) { error("MidplaneNodeCnt not configured in bluegene.conf " "defaulting to 512 as MidplaneNodeCnt"); bg_conf->mp_cnode_cnt = 512; } } if (bg_conf->mp_cnode_cnt <= 0) fatal("You should have more than 0 nodes " "per midplane"); bg_conf->actual_cnodes_per_mp = bg_conf->mp_cnode_cnt; bg_conf->quarter_cnode_cnt = bg_conf->mp_cnode_cnt/4; /* bg_conf->cpus_per_mp should had already been set from the * node_init */ if (bg_conf->cpus_per_mp < bg_conf->mp_cnode_cnt) { fatal("For some reason we have only %u cpus per mp, but " "have %u cnodes per mp. You need at least the same " "number of cpus as you have cnodes per mp. " "Check the NodeName CPUs= " "definition in the slurm.conf.", bg_conf->cpus_per_mp, bg_conf->mp_cnode_cnt); } bg_conf->cpu_ratio = bg_conf->cpus_per_mp/bg_conf->mp_cnode_cnt; if (!bg_conf->cpu_ratio) fatal("We appear to have less than 1 cpu on a cnode. " "You specified %u for MidplaneNodeCnt " "in the blugene.conf and %u cpus " "for each node in the slurm.conf", bg_conf->mp_cnode_cnt, bg_conf->cpus_per_mp); num_unused_cpus = 1; for (i = 0; i<SYSTEM_DIMENSIONS; i++) num_unused_cpus *= dims[i]; num_unused_cpus *= bg_conf->cpus_per_mp; num_possible_unused_cpus = num_unused_cpus; if (!s_p_get_uint16(&bg_conf->nodecard_cnode_cnt, "NodeBoardNodeCnt", tbl)) { if (!s_p_get_uint16(&bg_conf->nodecard_cnode_cnt, "NodeCardNodeCnt", tbl)) { error("NodeCardNodeCnt not configured in bluegene.conf " "defaulting to 32 as NodeCardNodeCnt"); bg_conf->nodecard_cnode_cnt = 32; } } if (bg_conf->nodecard_cnode_cnt <= 0) fatal("You should have more than 0 nodes per nodecard"); bg_conf->mp_nodecard_cnt = bg_conf->mp_cnode_cnt / bg_conf->nodecard_cnode_cnt; if (!s_p_get_uint16(&bg_conf->ionodes_per_mp, "IONodesPerMP", tbl)) if (!s_p_get_uint16(&bg_conf->ionodes_per_mp, "Numpsets", tbl)) fatal("Warning: IONodesPerMP not configured " "in bluegene.conf"); s_p_get_uint16(&bg_conf->max_block_err, "MaxBlockInError", tbl); tmp_bool = 0; s_p_get_boolean(&tmp_bool, "SubMidplaneSystem", tbl); bg_conf->sub_mp_sys = tmp_bool; #ifdef HAVE_BGQ tmp_bool = 0; s_p_get_boolean(&tmp_bool, "AllowSubBlockAllocations", tbl); bg_conf->sub_blocks = tmp_bool; /* You can only have 16 ionodes per midplane */ if (bg_conf->ionodes_per_mp > bg_conf->mp_nodecard_cnt) bg_conf->ionodes_per_mp = bg_conf->mp_nodecard_cnt; #endif for (i=0; i<SYSTEM_DIMENSIONS; i++) bg_conf->default_conn_type[i] = (uint16_t)NO_VAL; s_p_get_string(&tmp_char, "DefaultConnType", tbl); if (tmp_char) { verify_conn_type(tmp_char, bg_conf->default_conn_type); if ((bg_conf->default_conn_type[0] != SELECT_MESH) && (bg_conf->default_conn_type[0] != SELECT_TORUS)) fatal("Can't have a DefaultConnType of %s " "(only Mesh or Torus values are valid).", tmp_char); xfree(tmp_char); } else bg_conf->default_conn_type[0] = SELECT_TORUS; #ifndef HAVE_BG_L_P int first_conn_type = bg_conf->default_conn_type[0]; for (i=1; i<SYSTEM_DIMENSIONS; i++) { if (bg_conf->default_conn_type[i] == (uint16_t)NO_VAL) bg_conf->default_conn_type[i] = first_conn_type; else if (bg_conf->default_conn_type[i] >= SELECT_SMALL) fatal("Can't have a DefaultConnType of %s " "(only Mesh or Torus values are valid).", tmp_char); } #endif if (bg_conf->ionodes_per_mp) { bitstr_t *tmp_bitmap = NULL; int small_size = 1; /* THIS IS A HACK TO MAKE A 1 NODECARD SYSTEM WORK, * Sometime on a Q system the nodecard isn't in the 0 * spot so only do this if you know it is in that * spot. Otherwise say the whole midplane is there * and just make blocks over the whole thing. They * you can error out the blocks that aren't usable. */ if (bg_conf->sub_mp_sys && bg_conf->mp_cnode_cnt == bg_conf->nodecard_cnode_cnt) { #ifdef HAVE_BGQ bg_conf->quarter_ionode_cnt = 1; bg_conf->nodecard_ionode_cnt = 1; #else bg_conf->quarter_ionode_cnt = 2; bg_conf->nodecard_ionode_cnt = 2; #endif } else { bg_conf->quarter_ionode_cnt = bg_conf->ionodes_per_mp/4; bg_conf->nodecard_ionode_cnt = bg_conf->quarter_ionode_cnt/4; } /* How many nodecards per ionode */ bg_conf->nc_ratio = ((double)bg_conf->mp_cnode_cnt / (double)bg_conf->nodecard_cnode_cnt) / (double)bg_conf->ionodes_per_mp; /* How many ionodes per nodecard */ bg_conf->io_ratio = (double)bg_conf->ionodes_per_mp / ((double)bg_conf->mp_cnode_cnt / (double)bg_conf->nodecard_cnode_cnt); /* How many cnodes per ionode */ bg_conf->ionode_cnode_cnt = bg_conf->nodecard_cnode_cnt * bg_conf->nc_ratio; //info("got %f %f", bg_conf->nc_ratio, bg_conf->io_ratio); /* figure out the smallest block we can have on the system */ #ifdef HAVE_BGL if (bg_conf->io_ratio >= 1) bg_conf->smallest_block=32; else bg_conf->smallest_block=128; #else if (bg_conf->io_ratio >= 2) bg_conf->smallest_block=16; else if (bg_conf->io_ratio == 1) bg_conf->smallest_block=32; else if (bg_conf->io_ratio == .5) bg_conf->smallest_block=64; else if (bg_conf->io_ratio == .25) bg_conf->smallest_block=128; else if (bg_conf->io_ratio == .125) bg_conf->smallest_block=256; else { error("unknown ioratio %f. Can't figure out " "smallest block size, setting it to midplane", bg_conf->io_ratio); bg_conf->smallest_block = 512; } #endif if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("Smallest block possible on this system is %u", bg_conf->smallest_block); /* below we are creating all the possible bitmaps for * each size of small block */ if ((int)bg_conf->nodecard_ionode_cnt < 1) { bg_conf->nodecard_ionode_cnt = 0; } else { bg_lists->valid_small32 = list_create(_destroy_bitmap); /* This is suppose to be = and not ==, we only want to decrement when small_size equals something. */ if ((small_size = bg_conf->nodecard_ionode_cnt)) small_size--; i = 0; while (i<bg_conf->ionodes_per_mp) { tmp_bitmap = bit_alloc(bg_conf->ionodes_per_mp); bit_nset(tmp_bitmap, i, i+small_size); i += small_size+1; list_append(bg_lists->valid_small32, tmp_bitmap); } } /* If we only have 1 nodecard just jump to the end since this will never need to happen below. Pretty much a hack to avoid seg fault;). */ if (bg_conf->mp_cnode_cnt == bg_conf->nodecard_cnode_cnt) goto no_calc; bg_lists->valid_small128 = list_create(_destroy_bitmap); if ((small_size = bg_conf->quarter_ionode_cnt)) small_size--; i = 0; while (i<bg_conf->ionodes_per_mp) { tmp_bitmap = bit_alloc(bg_conf->ionodes_per_mp); bit_nset(tmp_bitmap, i, i+small_size); i += small_size+1; list_append(bg_lists->valid_small128, tmp_bitmap); } #ifndef HAVE_BGL bg_lists->valid_small64 = list_create(_destroy_bitmap); if ((small_size = bg_conf->nodecard_ionode_cnt * 2)) small_size--; i = 0; while (i<bg_conf->ionodes_per_mp) { tmp_bitmap = bit_alloc(bg_conf->ionodes_per_mp); bit_nset(tmp_bitmap, i, i+small_size); i += small_size+1; list_append(bg_lists->valid_small64, tmp_bitmap); } bg_lists->valid_small256 = list_create(_destroy_bitmap); if ((small_size = bg_conf->quarter_ionode_cnt * 2)) small_size--; i = 0; while (i<bg_conf->ionodes_per_mp) { tmp_bitmap = bit_alloc(bg_conf->ionodes_per_mp); bit_nset(tmp_bitmap, i, i+small_size); i += small_size+1; list_append(bg_lists->valid_small256, tmp_bitmap); } #endif } else { fatal("your ionodes_per_mp is 0"); } no_calc: if (!s_p_get_uint16(&bg_conf->bridge_api_verb, "BridgeAPIVerbose", tbl)) info("Warning: BridgeAPIVerbose not configured " "in bluegene.conf"); if (!s_p_get_string(&bg_conf->bridge_api_file, "BridgeAPILogFile", tbl)) info("BridgeAPILogFile not configured in bluegene.conf"); else _reopen_bridge_log(); if (s_p_get_string(&tmp_char, "DenyPassthrough", tbl)) { if (strstr(tmp_char, "A")) ba_deny_pass |= PASS_DENY_A; if (strstr(tmp_char, "X")) ba_deny_pass |= PASS_DENY_X; if (strstr(tmp_char, "Y")) ba_deny_pass |= PASS_DENY_Y; if (strstr(tmp_char, "Z")) ba_deny_pass |= PASS_DENY_Z; if (!xstrcasecmp(tmp_char, "ALL")) ba_deny_pass |= PASS_DENY_ALL; bg_conf->deny_pass = ba_deny_pass; xfree(tmp_char); } if (!s_p_get_string(&tmp_char, "LayoutMode", tbl)) { info("Warning: LayoutMode was not specified in bluegene.conf " "defaulting to STATIC partitioning"); bg_conf->layout_mode = LAYOUT_STATIC; } else { if (!xstrcasecmp(tmp_char,"STATIC")) bg_conf->layout_mode = LAYOUT_STATIC; else if (!xstrcasecmp(tmp_char,"OVERLAP")) bg_conf->layout_mode = LAYOUT_OVERLAP; else if (!xstrcasecmp(tmp_char,"DYNAMIC")) bg_conf->layout_mode = LAYOUT_DYNAMIC; else { fatal("I don't understand this LayoutMode = %s", tmp_char); } xfree(tmp_char); } /* add blocks defined in file */ if (bg_conf->layout_mode != LAYOUT_DYNAMIC) { if (!s_p_get_array((void ***)&blockreq_array, &count, "MPs", tbl)) { if (!s_p_get_array((void ***)&blockreq_array, &count, "BPs", tbl)) { info("WARNING: no blocks defined in " "bluegene.conf, " "only making full system block"); /* create_full_system_block(NULL); */ if (bg_conf->sub_mp_sys || (bg_conf->mp_cnode_cnt == bg_conf->nodecard_cnode_cnt)) fatal("On a sub-midplane system you " "need to define the blocks you " "want on your system."); } } for (i = 0; i < count; i++) { add_bg_record(bg_lists->main, NULL, blockreq_array[i], 0, 0); } } else if (bg_conf->sub_mp_sys || (bg_conf->mp_cnode_cnt == bg_conf->nodecard_cnode_cnt)) /* we can't do dynamic here on a sub-midplane system */ fatal("On a sub-midplane system we can only do OVERLAP or " "STATIC LayoutMode. Please update your bluegene.conf."); #ifdef HAVE_BGQ if ((bg_recover != NOT_FROM_CONTROLLER) && assoc_mgr_qos_list && s_p_get_string(&tmp_char, "RebootQOSList", tbl)) { bool valid; char *token, *last = NULL; slurmdb_qos_rec_t *qos = NULL; assoc_mgr_lock_t locks = { NO_LOCK, NO_LOCK, NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; /* Lock here to avoid g_qos_count changing under us */ assoc_mgr_lock(&locks); bg_conf->reboot_qos_bitmap = bit_alloc(g_qos_count); itr = list_iterator_create(assoc_mgr_qos_list); token = strtok_r(tmp_char, ",", &last); while (token) { valid = false; while((qos = list_next(itr))) { if (!xstrcasecmp(token, qos->name)) { bit_set(bg_conf->reboot_qos_bitmap, qos->id); valid = true; break; } } if (!valid) error("Invalid RebootQOSList value: %s", token); list_iterator_reset(itr); token = strtok_r(NULL, ",", &last); } list_iterator_destroy(itr); xfree(tmp_char); assoc_mgr_unlock(&locks); } #endif s_p_hashtbl_destroy(tbl); return SLURM_SUCCESS; }
/* * This could potentially lock the node lock in the slurmctld with * slurm_drain_node, or slurm_fail_job so if slurmctld_locked is called we * will call the functions without locking the locks again. */ extern int down_nodecard(char *mp_name, bitoff_t io_start, bool slurmctld_locked) { List requests = NULL; List delete_list = NULL; ListIterator itr = NULL; bg_record_t *bg_record = NULL, *found_record = NULL, tmp_record; bg_record_t *smallest_bg_record = NULL; struct node_record *node_ptr = NULL; int mp_bit = 0; static int io_cnt = NO_VAL; static int create_size = NO_VAL; static select_ba_request_t blockreq; int rc = SLURM_SUCCESS; char *reason = "select_bluegene: nodecard down"; xassert(mp_name); if (io_cnt == NO_VAL) { io_cnt = 1; /* Translate 1 nodecard count to ionode count */ if ((io_cnt *= bg_conf->io_ratio)) io_cnt--; /* make sure we create something that is able to be created */ if (bg_conf->smallest_block < bg_conf->nodecard_cnode_cnt) create_size = bg_conf->nodecard_cnode_cnt; else create_size = bg_conf->smallest_block; } node_ptr = find_node_record(mp_name); if (!node_ptr) { error ("down_sub_node_blocks: invalid node specified '%s'", mp_name); return EINVAL; } /* this is here for sanity check to make sure we don't core on these bits when we set them below. */ if (io_start >= bg_conf->ionodes_per_mp || (io_start+io_cnt) >= bg_conf->ionodes_per_mp) { debug("io %d-%d not configured on this " "system, only %d ionodes per midplane", io_start, io_start+io_cnt, bg_conf->ionodes_per_mp); return EINVAL; } mp_bit = (node_ptr - node_record_table_ptr); memset(&blockreq, 0, sizeof(select_ba_request_t)); blockreq.conn_type[0] = SELECT_SMALL; blockreq.save_name = mp_name; debug3("here setting node %d of %d and ionodes %d-%d of %d", mp_bit, node_record_count, io_start, io_start+io_cnt, bg_conf->ionodes_per_mp); memset(&tmp_record, 0, sizeof(bg_record_t)); tmp_record.mp_count = 1; tmp_record.cnode_cnt = bg_conf->nodecard_cnode_cnt; tmp_record.mp_bitmap = bit_alloc(node_record_count); bit_set(tmp_record.mp_bitmap, mp_bit); tmp_record.ionode_bitmap = bit_alloc(bg_conf->ionodes_per_mp); bit_nset(tmp_record.ionode_bitmap, io_start, io_start+io_cnt); slurm_mutex_lock(&block_state_mutex); itr = list_iterator_create(bg_lists->main); while ((bg_record = list_next(itr))) { if (!bit_test(bg_record->mp_bitmap, mp_bit)) continue; if (!blocks_overlap(bg_record, &tmp_record)) continue; if (bg_record->job_running > NO_JOB_RUNNING) { if (slurmctld_locked) job_fail(bg_record->job_running); else slurm_fail_job(bg_record->job_running); } /* If Running Dynamic mode and the block is smaller than the create size just continue on. */ if ((bg_conf->layout_mode == LAYOUT_DYNAMIC) && (bg_record->cnode_cnt < create_size)) { if (!delete_list) delete_list = list_create(NULL); list_append(delete_list, bg_record); continue; } /* keep track of the smallest size that is at least the size of create_size. */ if (!smallest_bg_record || (smallest_bg_record->cnode_cnt > bg_record->cnode_cnt)) smallest_bg_record = bg_record; } list_iterator_destroy(itr); slurm_mutex_unlock(&block_state_mutex); if (bg_conf->layout_mode != LAYOUT_DYNAMIC) { debug3("running non-dynamic mode"); /* This should never happen, but just in case... */ if (delete_list) list_destroy(delete_list); /* If we found a block that is smaller or equal to a midplane we will just mark it in an error state as opposed to draining the node. */ if (smallest_bg_record && (smallest_bg_record->cnode_cnt < bg_conf->mp_cnode_cnt)){ if (smallest_bg_record->state & BG_BLOCK_ERROR_FLAG) { rc = SLURM_NO_CHANGE_IN_DATA; goto cleanup; } rc = put_block_in_error_state( smallest_bg_record, reason); goto cleanup; } debug("No block under 1 midplane available for this nodecard. " "Draining the whole node."); if (!node_already_down(mp_name)) { if (slurmctld_locked) drain_nodes(mp_name, reason, slurm_get_slurm_user_id()); else slurm_drain_nodes(mp_name, reason, slurm_get_slurm_user_id()); } rc = SLURM_SUCCESS; goto cleanup; } /* below is only for Dynamic mode */ if (delete_list) { int cnt_set = 0; bitstr_t *iobitmap = bit_alloc(bg_conf->ionodes_per_mp); /* don't lock here since it is handled inside the put_block_in_error_state */ itr = list_iterator_create(delete_list); while ((bg_record = list_next(itr))) { debug2("combining smaller than nodecard " "dynamic block %s", bg_record->bg_block_id); while (bg_record->job_running > NO_JOB_RUNNING) sleep(1); bit_or(iobitmap, bg_record->ionode_bitmap); cnt_set++; } list_iterator_destroy(itr); list_destroy(delete_list); if (!cnt_set) { FREE_NULL_BITMAP(iobitmap); rc = SLURM_ERROR; goto cleanup; } /* set the start to be the same as the start of the ionode_bitmap. If no ionodes set (not a small block) set io_start = 0. */ if ((io_start = bit_ffs(iobitmap)) == -1) { io_start = 0; if (create_size > bg_conf->nodecard_cnode_cnt) blockreq.small128 = 4; else blockreq.small32 = 16; } else if (create_size <= bg_conf->nodecard_cnode_cnt) blockreq.small32 = 1; else /* this should never happen */ blockreq.small128 = 1; FREE_NULL_BITMAP(iobitmap); } else if (smallest_bg_record) { debug2("smallest dynamic block is %s", smallest_bg_record->bg_block_id); if (smallest_bg_record->state & BG_BLOCK_ERROR_FLAG) { rc = SLURM_NO_CHANGE_IN_DATA; goto cleanup; } while (smallest_bg_record->job_running > NO_JOB_RUNNING) sleep(1); if (smallest_bg_record->cnode_cnt == create_size) { rc = put_block_in_error_state( smallest_bg_record, reason); goto cleanup; } if (create_size > smallest_bg_record->cnode_cnt) { /* we should never get here. This means we * have a create_size that is bigger than a * block that is already made. */ rc = put_block_in_error_state( smallest_bg_record, reason); goto cleanup; } debug3("node count is %d", smallest_bg_record->cnode_cnt); switch(smallest_bg_record->cnode_cnt) { #ifndef HAVE_BGL case 64: blockreq.small32 = 2; break; case 256: blockreq.small32 = 8; break; #endif case 128: blockreq.small32 = 4; break; case 512: default: blockreq.small32 = 16; break; } if (create_size != bg_conf->nodecard_cnode_cnt) { blockreq.small128 = blockreq.small32 / 4; blockreq.small32 = 0; io_start = 0; } else if ((io_start = bit_ffs(smallest_bg_record->ionode_bitmap)) == -1) /* set the start to be the same as the start of the ionode_bitmap. If no ionodes set (not a small block) set io_start = 0. */ io_start = 0; } else { switch(create_size) { #ifndef HAVE_BGL case 64: blockreq.small64 = 8; break; case 256: blockreq.small256 = 2; #endif case 32: blockreq.small32 = 16; break; case 128: blockreq.small128 = 4; break; case 512: if (!node_already_down(mp_name)) { char *reason = "select_bluegene: nodecard down"; if (slurmctld_locked) drain_nodes(mp_name, reason, slurm_get_slurm_user_id()); else slurm_drain_nodes( mp_name, reason, slurm_get_slurm_user_id()); } rc = SLURM_SUCCESS; goto cleanup; break; default: error("Unknown create size of %d", create_size); break; } /* since we don't have a block in this midplane we need to start at the beginning. */ io_start = 0; /* we also need a bg_block to pretend to be the smallest block that takes up the entire midplane. */ } /* Here we need to add blocks that take up nodecards on this midplane. Since Slurm only keeps track of midplanes natively this is the only want to handle this case. */ requests = list_create(destroy_bg_record); add_bg_record(requests, NULL, &blockreq, 1, io_start); slurm_mutex_lock(&block_state_mutex); delete_list = list_create(NULL); while ((bg_record = list_pop(requests))) { itr = list_iterator_create(bg_lists->main); while ((found_record = list_next(itr))) { if (!blocks_overlap(bg_record, found_record)) continue; list_push(delete_list, found_record); list_remove(itr); } list_iterator_destroy(itr); /* we need to add this record since it doesn't exist */ if (bridge_block_create(bg_record) == SLURM_ERROR) { destroy_bg_record(bg_record); error("down_sub_node_blocks: " "unable to configure block in api"); continue; } debug("adding block %s to fill in small blocks " "around bad nodecards", bg_record->bg_block_id); print_bg_record(bg_record); list_append(bg_lists->main, bg_record); if (bit_overlap(bg_record->ionode_bitmap, tmp_record.ionode_bitmap)) { /* here we know the error block doesn't exist so just set the state here */ slurm_mutex_unlock(&block_state_mutex); rc = put_block_in_error_state(bg_record, reason); slurm_mutex_lock(&block_state_mutex); } } list_destroy(requests); if (delete_list) { slurm_mutex_unlock(&block_state_mutex); free_block_list(NO_VAL, delete_list, 0, 0); list_destroy(delete_list); } slurm_mutex_lock(&block_state_mutex); sort_bg_record_inc_size(bg_lists->main); slurm_mutex_unlock(&block_state_mutex); last_bg_update = time(NULL); cleanup: FREE_NULL_BITMAP(tmp_record.mp_bitmap); FREE_NULL_BITMAP(tmp_record.ionode_bitmap); return rc; }