/*
 * _dynamically_request - try to create a dynamic block satisfying *request.
 *
 * Builds a list of candidate "usable midplane" lists (from least to most
 * disruptive) and asks create_dynamic_block() to carve a block out of each
 * in turn, stopping at the first success.
 *
 * IN/OUT block_list    - list of existing blocks; newly created records are
 *                        appended here (ownership transfers to block_list).
 * OUT    blocks_added  - set to 1 if at least one block was appended.
 * IN     request       - geometry/size description of the wanted block.
 * IN     user_req_nodes- if non-NULL the user pinned specific nodes, so only
 *                        the running-job list is considered.
 * IN     query_mode    - SELECT_* flags controlling test/preempt behavior.
 * RET SLURM_SUCCESS if a block was created (or appended in test mode),
 *     SLURM_ERROR otherwise.
 */
static int _dynamically_request(List block_list, int *blocks_added,
				select_ba_request_t *request,
				char *user_req_nodes,
				uint16_t query_mode)
{
	List list_of_lists = NULL;
	List temp_list = NULL;
	List new_blocks = NULL;
	List job_list = NULL, booted_list = NULL;
	ListIterator itr = NULL;
	int rc = SLURM_ERROR;
	int create_try = 0;

	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
		info("going to create %d", request->size);
	/* NULL destructor: list_of_lists only holds references to the
	 * member lists; the copies are freed individually below. */
	list_of_lists = list_create(NULL);

	/* If preempt is set and we are checking full system it means we
	   altered the block list so only look at it.
	*/
	if (SELECT_IS_PREEMPT_SET(query_mode)
	    && SELECT_IS_CHECK_FULL_SET(query_mode)) {
		list_append(list_of_lists, block_list);
	} else if (user_req_nodes) {
		/* User asked for specific nodes: only the list of blocks
		 * with running jobs matters. */
		slurm_mutex_lock(&block_state_mutex);
		job_list = copy_bg_list(bg_lists->job_running);
		list_append(list_of_lists, job_list);
		slurm_mutex_unlock(&block_state_mutex);
	} else {
		/* Normal case: try the existing blocks first, then (only if
		 * they differ in content) the booted list and the
		 * running-job list — each successive list permits a more
		 * disruptive placement. The list_count comparisons are a
		 * cheap way to skip identical lists. */
		slurm_mutex_lock(&block_state_mutex);
		list_append(list_of_lists, block_list);
		if (list_count(block_list) != list_count(bg_lists->booted)) {
			booted_list = copy_bg_list(bg_lists->booted);
			list_append(list_of_lists, booted_list);
			if (list_count(bg_lists->booted)
			    != list_count(bg_lists->job_running)) {
				job_list = copy_bg_list(bg_lists->job_running);
				list_append(list_of_lists, job_list);
			}
		} else if (list_count(block_list)
			   != list_count(bg_lists->job_running)) {
			job_list = copy_bg_list(bg_lists->job_running);
			list_append(list_of_lists, job_list);
		}
		slurm_mutex_unlock(&block_state_mutex);
	}
	itr = list_iterator_create(list_of_lists);
	while ((temp_list = (List)list_next(itr))) {
		create_try++;

		/* 1- try empty space
		   2- we see if we can create one in the
		   unused mps
		   3- see if we can create one in the non
		   job running mps
		*/
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
			info("trying with %d", create_try);
		if ((new_blocks = create_dynamic_block(
			     block_list, request, temp_list, true))) {
			bg_record_t *bg_record = NULL;
			while ((bg_record = list_pop(new_blocks))) {
				if (block_exist_in_list(block_list,
							bg_record))
					/* duplicate of an existing block */
					destroy_bg_record(bg_record);
				else if (SELECT_IS_TEST(query_mode)
					 || SELECT_IS_PREEMPT_ON_FULL_TEST(
						 query_mode)) {
					/* Here we don't really want
					   to create the block if we
					   are testing.
					   The second test here is to
					   make sure If we are able to
					   run here but we just
					   preempted we should wait a
					   bit to make sure the
					   preempted blocks have time
					   to clear out.
					*/
					list_append(block_list, bg_record);
					(*blocks_added) = 1;
				} else {
					if (bridge_block_create(bg_record)
					    == SLURM_ERROR) {
						destroy_bg_record(bg_record);
						error("_dynamically_request: "
						      "unable to configure "
						      "block");
						rc = SLURM_ERROR;
						break;
					}
					list_append(block_list, bg_record);
					print_bg_record(bg_record);
					(*blocks_added) = 1;
				}
			}
			/* any records still in new_blocks are freed by the
			 * list's destructor */
			list_destroy(new_blocks);
			if (!*blocks_added) {
				rc = SLURM_ERROR;
				continue;
			}
			list_sort(block_list,
				  (ListCmpF)bg_record_sort_aval_inc);
			rc = SLURM_SUCCESS;
			break;
		} else if (errno == ESLURM_INTERCONNECT_FAILURE) {
			/* hardware problem - retrying other lists won't help */
			rc = SLURM_ERROR;
			break;
		}
	}
	list_iterator_destroy(itr);

	if (list_of_lists)
		list_destroy(list_of_lists);
	if (job_list)
		list_destroy(job_list);
	if (booted_list)
		list_destroy(booted_list);

	return rc;
}
/*
 * create_full_system_block - ensure a block spanning the entire machine
 * exists in bg_lists->main, creating it if bluegene.conf did not define one.
 *
 * IN bg_found_block_list - blocks already discovered on the system; if a
 *                          full-system block is already present (here or in
 *                          bg_lists->main) nothing is created.
 * RET SLURM_SUCCESS, or SLURM_ERROR on allocation/configuration failure.
 *
 * Fixes vs. previous version:
 *  - start_char/geo_char are now cleared on EVERY call.  They are automatic
 *    arrays but the memset was guarded by "if (!dims)" (a static), so on any
 *    call after the first they were never NUL-terminated before being passed
 *    to snprintf().
 *  - the set_bg_block() failure path no longer destroys an iterator that was
 *    already destroyed, and no longer leaks bg_record and the records list;
 *    it cleans up through no_total like every other error path.
 */
extern int create_full_system_block(List bg_found_block_list)
{
	int rc = SLURM_SUCCESS;
	ListIterator itr;
	bg_record_t *bg_record = NULL;
	char *name = NULL;
	List records = NULL;
	uint16_t geo[SYSTEM_DIMENSIONS];
	int i;
	select_ba_request_t blockreq;
	List results = NULL;
	struct part_record *part_ptr = NULL;
	bitstr_t *bitmap = bit_alloc(node_record_count);
	static int *dims = NULL;
	bool larger = 0;
	char start_char[SYSTEM_DIMENSIONS+1];
	char geo_char[SYSTEM_DIMENSIONS+1];

	if (!dims)
		dims = select_g_ba_get_dims();

	/* Must happen every call: the fill loops below write only indices
	 * 0..SYSTEM_DIMENSIONS-1, so the terminating NUL comes from here. */
	memset(start_char, 0, sizeof(start_char));
	memset(geo_char, 0, sizeof(geo_char));

	/* Locks are already in place to protect part_list here */
	itr = list_iterator_create(part_list);
	while ((part_ptr = list_next(itr))) {
		/* we only want to use mps that are in partitions */
		if (!part_ptr->node_bitmap) {
			debug4("Partition %s doesn't have any nodes in it.",
			       part_ptr->name);
			continue;
		}
		bit_or(bitmap, part_ptr->node_bitmap);
	}
	list_iterator_destroy(itr);

	/* after the flip, any set bit is a node NOT covered by a partition */
	bit_not(bitmap);
	if (bit_ffs(bitmap) != -1) {
		error("We don't have the entire system covered by partitions, "
		      "can't create full system block");
		FREE_NULL_BITMAP(bitmap);
		return SLURM_ERROR;
	}
	FREE_NULL_BITMAP(bitmap);

	/* Here we are adding a block that in for the entire machine
	   just in case it isn't in the bluegene.conf file.
	*/
	slurm_mutex_lock(&block_state_mutex);

	for (i=0; i<SYSTEM_DIMENSIONS; i++) {
		geo[i] = dims[i] - 1;
		if (geo[i] > 0)
			larger = 1;	/* more than one midplane in some dim */
		geo_char[i] = alpha_num[geo[i]];
		start_char[i] = alpha_num[0];
	}

	i = (10+strlen(bg_conf->slurm_node_prefix));
	name = xmalloc(i);

	if (!larger)
		snprintf(name, i, "%s%s",
			 bg_conf->slurm_node_prefix, start_char);
	else
		snprintf(name, i, "%s[%sx%s]",
			 bg_conf->slurm_node_prefix,
			 start_char, geo_char);

	if (bg_found_block_list) {
		itr = list_iterator_create(bg_found_block_list);
		while ((bg_record = (bg_record_t *) list_next(itr))
		       != NULL) {
			if (!strcmp(name, bg_record->mp_str)) {
				xfree(name);
				list_iterator_destroy(itr);
				/* don't create total already there */
				goto no_total;
			}
		}
		list_iterator_destroy(itr);
	} else {
		error("create_full_system_block: no bg_found_block_list 2");
	}

	if (bg_lists->main) {
		itr = list_iterator_create(bg_lists->main);
		while ((bg_record = (bg_record_t *) list_next(itr))
		       != NULL) {
			if (!strcmp(name, bg_record->mp_str)) {
				xfree(name);
				list_iterator_destroy(itr);
				/* don't create total already there */
				goto no_total;
			}
		}
		list_iterator_destroy(itr);
	} else {
		xfree(name);
		error("create_overlapped_blocks: no bg_lists->main 3");
		rc = SLURM_ERROR;
		goto no_total;
	}

	records = list_create(destroy_bg_record);

	memset(&blockreq, 0, sizeof(select_ba_request_t));
	blockreq.save_name = name;
	for (i=0; i<SYSTEM_DIMENSIONS; i++)
		blockreq.conn_type[i] = SELECT_TORUS;

	add_bg_record(records, NULL, &blockreq, 0 , 0);
	xfree(name);

	/* the record is popped off records, so we own it from here on */
	bg_record = (bg_record_t *) list_pop(records);
	if (!bg_record) {
		error("Nothing was returned from full system create");
		rc = SLURM_ERROR;
		goto no_total;
	}
	reset_ba_system(false);
	for(i=0; i<SYSTEM_DIMENSIONS; i++) {
		geo_char[i] = alpha_num[bg_record->geo[i]];
		start_char[i] = alpha_num[bg_record->start[i]];
	}
	debug2("adding %s %s %s",  bg_record->mp_str, start_char, geo_char);
	if (bg_record->ba_mp_list)
		list_flush(bg_record->ba_mp_list);
	else
		bg_record->ba_mp_list = list_create(destroy_ba_mp);
#ifdef HAVE_BGQ
	results = list_create(destroy_ba_mp);
#else
	results = list_create(NULL);
#endif
	name = set_bg_block(results,
			    bg_record->start,
			    bg_record->geo,
			    bg_record->conn_type);
	if (!name) {
		error("I was unable to make the full system block.");
		list_destroy(results);
		/* bg_record is no longer in records; free it ourselves and
		 * let no_total release records and the mutex.  (The old code
		 * destroyed a stale iterator here and leaked both.) */
		destroy_bg_record(bg_record);
		rc = SLURM_ERROR;
		goto no_total;
	}
	xfree(name);
	if (bg_record->ba_mp_list)
		list_destroy(bg_record->ba_mp_list);
#ifdef HAVE_BGQ
	bg_record->ba_mp_list = results;
	results = NULL;
#else
	bg_record->ba_mp_list = list_create(destroy_ba_mp);
	copy_node_path(results, &bg_record->ba_mp_list);
	list_destroy(results);
#endif

	if ((rc = bridge_block_create(bg_record)) == SLURM_ERROR) {
		error("create_full_system_block: "
		      "unable to configure block in api");
		destroy_bg_record(bg_record);
		goto no_total;
	}

	print_bg_record(bg_record);
	/* ownership of bg_record passes to bg_lists->main */
	list_append(bg_lists->main, bg_record);

no_total:
	if (records)
		list_destroy(records);
	slurm_mutex_unlock(&block_state_mutex);
	return rc;
}
/*
 * create_defined_blocks - create the static blocks that will be used
 * for scheduling, all partitions must be able to be created and booted
 * at once.
 * IN - int overlapped, 1 if partitions are to be overlapped, 0 if they are
 * static.
 * IN/OUT bg_found_block_list - blocks already existing on the machine;
 * records already present there are not re-created through the bridge.
 * RET - success of fitting all configurations
 */
extern int create_defined_blocks(bg_layout_t overlapped,
				 List bg_found_block_list)
{
	int rc = SLURM_SUCCESS;
	ListIterator itr;
	bg_record_t *bg_record = NULL;
	int i;
	uint16_t geo[SYSTEM_DIMENSIONS];
	char temp[256];
	struct part_record *part_ptr = NULL;
	bitstr_t *usable_mp_bitmap = bit_alloc(node_record_count);

	/* Locks are already in place to protect part_list here */
	itr = list_iterator_create(part_list);
	while ((part_ptr = list_next(itr))) {
		/* we only want to use mps that are in
		 * partitions
		 */
		if (!part_ptr->node_bitmap) {
			debug4("Partition %s doesn't have any nodes in it.",
			       part_ptr->name);
			continue;
		}
		bit_or(usable_mp_bitmap, part_ptr->node_bitmap);
	}
	list_iterator_destroy(itr);

	if (bit_ffs(usable_mp_bitmap) == -1) {
		fatal("We don't have any nodes in any partitions. "
		      "Can't create blocks. "
		      "Please check your slurm.conf.");
	}

	slurm_mutex_lock(&block_state_mutex);
	reset_ba_system(false);
	/* mark everything outside the usable partitions as unusable */
	ba_set_removable_mps(usable_mp_bitmap, 1);
	if (bg_lists->main) {
		itr = list_iterator_create(bg_lists->main);
		while ((bg_record = list_next(itr))) {
			/* only place multi-midplane (full-cpu) records;
			 * full-system blocks are handled separately below */
			if (bg_record->mp_count > 0
			    && !bg_record->full_block
			    && bg_record->cpu_cnt >= bg_conf->cpus_per_mp) {
				char *name = NULL;
				char start_char[SYSTEM_DIMENSIONS+1];
				char geo_char[SYSTEM_DIMENSIONS+1];

				if (overlapped == LAYOUT_OVERLAP) {
					/* overlap mode: wiring state is reset
					 * per record instead of accumulating */
					reset_ba_system(false);
					ba_set_removable_mps(usable_mp_bitmap,
							     1);
				}

				/* we want the mps that aren't
				 * in this record to mark them as used
				 */
				if (ba_set_removable_mps(
					    bg_record->mp_bitmap, 1)
				    != SLURM_SUCCESS)
					fatal("It doesn't seem we have a "
					      "bitmap for %s",
					      bg_record->bg_block_id);

				for (i=0; i<SYSTEM_DIMENSIONS; i++) {
					geo[i] = bg_record->geo[i];
					start_char[i] = alpha_num[
						bg_record->start[i]];
					geo_char[i] = alpha_num[geo[i]];
				}
				/* i == SYSTEM_DIMENSIONS after the loop */
				start_char[i] = '\0';
				geo_char[i] = '\0';

				debug2("adding %s %s %s",
				       bg_record->mp_str,
				       start_char, geo_char);
				if (bg_record->ba_mp_list
				    && list_count(bg_record->ba_mp_list)) {
					/* midplane list came from the config
					 * file; verify it against the system */
					if ((rc = check_and_set_mp_list(
						     bg_record->ba_mp_list))
					    != SLURM_SUCCESS) {
						debug2("something happened in "
						       "the load of %s"
						       "Did you use smap to "
						       "make the "
						       "bluegene.conf file?",
						       bg_record->bg_block_id);
						break;
					}
				} else {
					/* no midplane list yet: allocate one
					 * from the block allocator */
#ifdef HAVE_BGQ
					List results =
						list_create(destroy_ba_mp);
#else
					List results = list_create(NULL);
#endif
					name = set_bg_block(
						results,
						bg_record->start,
						geo,
						bg_record->conn_type);
					ba_reset_all_removed_mps();
					if (!name) {
						error("I was unable to "
						      "make the "
						      "requested block.");
						list_destroy(results);
						rc = SLURM_ERROR;
						break;
					}

					snprintf(temp, sizeof(temp), "%s%s",
						 bg_conf->slurm_node_prefix,
						 name);

					xfree(name);
					/* allocator must return exactly the
					 * midplanes the config asked for */
					if (strcmp(temp, bg_record->mp_str)) {
						fatal("given list of %s "
						      "but allocated %s, "
						      "your order might be "
						      "wrong in bluegene.conf",
						      bg_record->mp_str,
						      temp);
					}
					if (bg_record->ba_mp_list)
						list_destroy(
							bg_record->ba_mp_list);
#ifdef HAVE_BGQ
					bg_record->ba_mp_list = results;
					results = NULL;
#else
					bg_record->ba_mp_list =
						list_create(destroy_ba_mp);
					copy_node_path(
						results,
						&bg_record->ba_mp_list);
					list_destroy(results);
#endif
				}
			}
			if (!block_exist_in_list(
				    bg_found_block_list, bg_record)) {
				if (bg_record->full_block) {
					/* if this is defined we need
					   to remove it since we are
					   going to try to create it
					   later on overlap systems
					   this doesn't matter, but
					   since we don't clear the
					   table on static mode we
					   can't do it here or it just
					   won't work since other
					   wires will be or are
					   already set
					*/
					list_remove(itr);
					continue;
				}
				if ((rc = bridge_block_create(bg_record))
				    != SLURM_SUCCESS)
					break;
				print_bg_record(bg_record);
			}
		}
		list_iterator_destroy(itr);
		if (rc != SLURM_SUCCESS)
			goto end_it;
	} else {
		error("create_defined_blocks: no bg_lists->main 2");
		rc = SLURM_ERROR;
		goto end_it;
	}
	/* full-system block creation takes the lock itself */
	slurm_mutex_unlock(&block_state_mutex);
	create_full_system_block(bg_found_block_list);

	slurm_mutex_lock(&block_state_mutex);
	sort_bg_record_inc_size(bg_lists->main);

end_it:
	/* NOTE: end_it is reached with block_state_mutex held */
	ba_reset_all_removed_mps();
	FREE_NULL_BITMAP(usable_mp_bitmap);
	slurm_mutex_unlock(&block_state_mutex);

#ifdef _PRINT_BLOCKS_AND_EXIT
	if (bg_lists->main) {
		itr = list_iterator_create(bg_lists->main);
		debug("\n\n");
		while ((found_record = (bg_record_t *) list_next(itr))
		       != NULL) {
			print_bg_record(found_record);
		}
		list_iterator_destroy(itr);
	} else {
		error("create_defined_blocks: no bg_lists->main 5");
	}
	exit(0);
#endif	/* _PRINT_BLOCKS_AND_EXIT */
	//exit(0);
	return rc;
}
/*
 * down_nodecard - mark a nodecard (and the smallest containing block) as
 * down, draining the midplane only as a last resort.
 *
 * This could potentially lock the node lock in the slurmctld with
 * slurm_drain_node, or slurm_fail_job so if slurmctld_locked is called we
 * will call the functions without locking the locks again.
 *
 * IN mp_name         - name of the midplane holding the bad nodecard.
 * IN io_start        - first ionode of the bad nodecard on that midplane.
 * IN slurmctld_locked- true if the caller already holds slurmctld locks.
 * RET SLURM_SUCCESS, SLURM_NO_CHANGE_IN_DATA if the covering block is
 *     already in an error state, EINVAL on bad input, or SLURM_ERROR.
 *
 * Fix vs. previous version: in the create_size switch (no existing block on
 * the midplane) "case 256" was missing its break and fell through into
 * "case 32", also setting blockreq.small32 = 16.
 */
extern int down_nodecard(char *mp_name, bitoff_t io_start,
			 bool slurmctld_locked)
{
	List requests = NULL;
	List delete_list = NULL;
	ListIterator itr = NULL;
	bg_record_t *bg_record = NULL, *found_record = NULL, tmp_record;
	bg_record_t *smallest_bg_record = NULL;
	struct node_record *node_ptr = NULL;
	int mp_bit = 0;
	static int io_cnt = NO_VAL;
	static int create_size = NO_VAL;
	static select_ba_request_t blockreq;
	int rc = SLURM_SUCCESS;
	char *reason = "select_bluegene: nodecard down";

	xassert(mp_name);

	if (io_cnt == NO_VAL) {
		io_cnt = 1;
		/* Translate 1 nodecard count to ionode count */
		if ((io_cnt *= bg_conf->io_ratio))
			io_cnt--;	/* io_cnt is now a last-offset */

		/* make sure we create something that is able to be
		   created */
		if (bg_conf->smallest_block < bg_conf->nodecard_cnode_cnt)
			create_size = bg_conf->nodecard_cnode_cnt;
		else
			create_size = bg_conf->smallest_block;
	}

	node_ptr = find_node_record(mp_name);
	if (!node_ptr) {
		error ("down_sub_node_blocks: invalid node specified '%s'",
		       mp_name);
		return EINVAL;
	}

	/* this is here for sanity check to make sure we don't core on
	   these bits when we set them below.
	*/
	if (io_start >= bg_conf->ionodes_per_mp
	    || (io_start+io_cnt) >= bg_conf->ionodes_per_mp) {
		debug("io %d-%d not configured on this "
		      "system, only %d ionodes per midplane",
		      io_start, io_start+io_cnt,
		      bg_conf->ionodes_per_mp);
		return EINVAL;
	}
	mp_bit = (node_ptr - node_record_table_ptr);

	memset(&blockreq, 0, sizeof(select_ba_request_t));

	blockreq.conn_type[0] = SELECT_SMALL;
	blockreq.save_name = mp_name;

	debug3("here setting node %d of %d and ionodes %d-%d of %d",
	       mp_bit, node_record_count, io_start,
	       io_start+io_cnt, bg_conf->ionodes_per_mp);

	/* tmp_record stands in for the bad nodecard so we can use the
	 * normal overlap tests against real blocks */
	memset(&tmp_record, 0, sizeof(bg_record_t));
	tmp_record.mp_count = 1;
	tmp_record.cnode_cnt = bg_conf->nodecard_cnode_cnt;
	tmp_record.mp_bitmap = bit_alloc(node_record_count);
	bit_set(tmp_record.mp_bitmap, mp_bit);

	tmp_record.ionode_bitmap = bit_alloc(bg_conf->ionodes_per_mp);
	bit_nset(tmp_record.ionode_bitmap, io_start, io_start+io_cnt);

	slurm_mutex_lock(&block_state_mutex);
	itr = list_iterator_create(bg_lists->main);
	while ((bg_record = list_next(itr))) {
		if (!bit_test(bg_record->mp_bitmap, mp_bit))
			continue;

		if (!blocks_overlap(bg_record, &tmp_record))
			continue;

		/* any job on an affected block has to go */
		if (bg_record->job_running > NO_JOB_RUNNING) {
			if (slurmctld_locked)
				job_fail(bg_record->job_running);
			else
				slurm_fail_job(bg_record->job_running);
		}
		/* If Running Dynamic mode and the block is
		   smaller than the create size just continue on.
		*/
		if ((bg_conf->layout_mode == LAYOUT_DYNAMIC)
		    && (bg_record->cnode_cnt < create_size)) {
			if (!delete_list)
				delete_list = list_create(NULL);
			list_append(delete_list, bg_record);
			continue;
		}

		/* keep track of the smallest size that is at least
		   the size of create_size. */
		if (!smallest_bg_record
		    || (smallest_bg_record->cnode_cnt
			> bg_record->cnode_cnt))
			smallest_bg_record = bg_record;
	}
	list_iterator_destroy(itr);
	slurm_mutex_unlock(&block_state_mutex);

	if (bg_conf->layout_mode != LAYOUT_DYNAMIC) {
		debug3("running non-dynamic mode");
		/* This should never happen, but just in case...
		*/
		if (delete_list)
			list_destroy(delete_list);

		/* If we found a block that is smaller or equal to a
		   midplane we will just mark it in an error state as
		   opposed to draining the node.
		*/
		if (smallest_bg_record
		    && (smallest_bg_record->cnode_cnt
			< bg_conf->mp_cnode_cnt)) {
			if (smallest_bg_record->state
			    & BG_BLOCK_ERROR_FLAG) {
				rc = SLURM_NO_CHANGE_IN_DATA;
				goto cleanup;
			}

			rc = put_block_in_error_state(
				smallest_bg_record, reason);
			goto cleanup;
		}

		debug("No block under 1 midplane available for this nodecard. "
		      "Draining the whole node.");
		if (!node_already_down(mp_name)) {
			if (slurmctld_locked)
				drain_nodes(mp_name, reason,
					    slurm_get_slurm_user_id());
			else
				slurm_drain_nodes(
					mp_name, reason,
					slurm_get_slurm_user_id());
		}
		rc = SLURM_SUCCESS;
		goto cleanup;
	}

	/* below is only for Dynamic mode */

	if (delete_list) {
		int cnt_set = 0;
		bitstr_t *iobitmap = bit_alloc(bg_conf->ionodes_per_mp);
		/* don't lock here since it is handled inside
		   the put_block_in_error_state
		*/
		itr = list_iterator_create(delete_list);
		while ((bg_record = list_next(itr))) {
			debug2("combining smaller than nodecard "
			       "dynamic block %s",
			       bg_record->bg_block_id);
			/* wait for the failed job to clear off */
			while (bg_record->job_running > NO_JOB_RUNNING)
				sleep(1);

			bit_or(iobitmap, bg_record->ionode_bitmap);
			cnt_set++;
		}
		list_iterator_destroy(itr);
		list_destroy(delete_list);
		if (!cnt_set) {
			FREE_NULL_BITMAP(iobitmap);
			rc = SLURM_ERROR;
			goto cleanup;
		}
		/* set the start to be the same as the start of the
		   ionode_bitmap.  If no ionodes set (not a small
		   block) set io_start = 0. */
		if ((io_start = bit_ffs(iobitmap)) == -1) {
			io_start = 0;
			if (create_size > bg_conf->nodecard_cnode_cnt)
				blockreq.small128 = 4;
			else
				blockreq.small32 = 16;
		} else if (create_size <= bg_conf->nodecard_cnode_cnt)
			blockreq.small32 = 1;
		else
			/* this should never happen */
			blockreq.small128 = 1;

		FREE_NULL_BITMAP(iobitmap);
	} else if (smallest_bg_record) {
		debug2("smallest dynamic block is %s",
		       smallest_bg_record->bg_block_id);
		if (smallest_bg_record->state & BG_BLOCK_ERROR_FLAG) {
			rc = SLURM_NO_CHANGE_IN_DATA;
			goto cleanup;
		}

		while (smallest_bg_record->job_running > NO_JOB_RUNNING)
			sleep(1);

		if (smallest_bg_record->cnode_cnt == create_size) {
			rc = put_block_in_error_state(
				smallest_bg_record, reason);
			goto cleanup;
		}

		if (create_size > smallest_bg_record->cnode_cnt) {
			/* we should never get here.  This means we
			 * have a create_size that is bigger than a
			 * block that is already made.
			 */
			rc = put_block_in_error_state(
				smallest_bg_record, reason);
			goto cleanup;
		}
		debug3("node count is %d", smallest_bg_record->cnode_cnt);
		/* split the smallest covering block into 32-cnode pieces */
		switch (smallest_bg_record->cnode_cnt) {
#ifndef HAVE_BGL
		case 64:
			blockreq.small32 = 2;
			break;
		case 256:
			blockreq.small32 = 8;
			break;
#endif
		case 128:
			blockreq.small32 = 4;
			break;
		case 512:
		default:
			blockreq.small32 = 16;
			break;
		}

		if (create_size != bg_conf->nodecard_cnode_cnt) {
			blockreq.small128 = blockreq.small32 / 4;
			blockreq.small32 = 0;
			io_start = 0;
		} else if ((io_start =
			    bit_ffs(smallest_bg_record->ionode_bitmap))
			   == -1)
			/* set the start to be the same as the start of the
			   ionode_bitmap.  If no ionodes set (not a small
			   block) set io_start = 0. */
			io_start = 0;
	} else {
		switch (create_size) {
#ifndef HAVE_BGL
		case 64:
			blockreq.small64 = 8;
			break;
		case 256:
			blockreq.small256 = 2;
			break;	/* was missing: fell through into case 32 */
#endif
		case 32:
			blockreq.small32 = 16;
			break;
		case 128:
			blockreq.small128 = 4;
			break;
		case 512:
			/* create_size is a whole midplane: nothing smaller
			 * to build, so drain the node instead */
			if (!node_already_down(mp_name)) {
				char *reason =
					"select_bluegene: nodecard down";
				if (slurmctld_locked)
					drain_nodes(
						mp_name, reason,
						slurm_get_slurm_user_id());
				else
					slurm_drain_nodes(
						mp_name, reason,
						slurm_get_slurm_user_id());
			}
			rc = SLURM_SUCCESS;
			goto cleanup;
			break;
		default:
			error("Unknown create size of %d", create_size);
			break;
		}
		/* since we don't have a block in this midplane
		   we need to start at the beginning. */
		io_start = 0;
		/* we also need a bg_block to pretend to be the
		   smallest block that takes up the entire midplane. */
	}

	/* Here we need to add blocks that take up nodecards on this
	   midplane.  Since Slurm only keeps track of midplanes
	   natively this is the only want to handle this case.
	*/
	requests = list_create(destroy_bg_record);
	add_bg_record(requests, NULL, &blockreq, 1, io_start);

	slurm_mutex_lock(&block_state_mutex);
	delete_list = list_create(NULL);
	while ((bg_record = list_pop(requests))) {
		/* pull out every existing block the new one overlaps */
		itr = list_iterator_create(bg_lists->main);
		while ((found_record = list_next(itr))) {
			if (!blocks_overlap(bg_record, found_record))
				continue;
			list_push(delete_list, found_record);
			list_remove(itr);
		}
		list_iterator_destroy(itr);

		/* we need to add this record since it doesn't exist */
		if (bridge_block_create(bg_record) == SLURM_ERROR) {
			destroy_bg_record(bg_record);
			error("down_sub_node_blocks: "
			      "unable to configure block in api");
			continue;
		}

		debug("adding block %s to fill in small blocks "
		      "around bad nodecards",
		      bg_record->bg_block_id);
		print_bg_record(bg_record);
		list_append(bg_lists->main, bg_record);
		if (bit_overlap(bg_record->ionode_bitmap,
				tmp_record.ionode_bitmap)) {
			/* here we know the error block doesn't exist
			   so just set the state here */
			slurm_mutex_unlock(&block_state_mutex);
			rc = put_block_in_error_state(bg_record, reason);
			slurm_mutex_lock(&block_state_mutex);
		}
	}
	list_destroy(requests);

	if (delete_list) {
		slurm_mutex_unlock(&block_state_mutex);
		free_block_list(NO_VAL, delete_list, 0, 0);
		list_destroy(delete_list);
	}
	slurm_mutex_lock(&block_state_mutex);
	sort_bg_record_inc_size(bg_lists->main);
	slurm_mutex_unlock(&block_state_mutex);
	last_bg_update = time(NULL);

cleanup:
	FREE_NULL_BITMAP(tmp_record.mp_bitmap);
	FREE_NULL_BITMAP(tmp_record.ionode_bitmap);

	return rc;
}