/*
 * Run one cycle of the task set: flag work as available for every thread,
 * wake them, and block until the completion bitmap is fully set.  The
 * completion bits are cleared again before returning so the next cycle
 * starts from a clean state.
 */
void
taskset_cycle( taskset_t *ts )
{
    hb_lock( ts->task_cond_lock );

    /*
     * Mark every thread's "begin" bit and wake all waiters.
     */
    bit_nset( ts->task_begin_bitmap, 0, ts->thread_count - 1 );
    hb_cond_broadcast( ts->task_begin );

    /*
     * Block on the completion condition at least once, then re-check the
     * completion bitmap after every wakeup: hb_cond_wait() may return
     * prematurely on some platforms (e.g. pthread_cond_wait), so a single
     * wakeup is not proof that all threads are done.
     */
    for( ;; )
    {
        hb_cond_wait( ts->task_complete, ts->task_cond_lock );
        if( allbits_set( ts->task_complete_bitmap, ts->bitmap_elements ) )
        {
            break;
        }
    }

    /*
     * Reset the completion indications for the next cycle.
     */
    bit_nclear( ts->task_complete_bitmap, 0, ts->thread_count - 1 );

    hb_unlock( ts->task_cond_lock );
}
/*
 * Set, in `bitmap`, every bit covered by the request: blocks req->blkno
 * through req->blkno + req->bcount - 1 inclusive.  When DEBUG_CACHEDEV is
 * defined the range is also traced to outputfile.
 */
static void cachedev_setbits (bitstr_t *bitmap, ioreq_event *req)
{
   const int first_blk = req->blkno;
   const int last_blk = req->blkno + req->bcount - 1;

#ifdef DEBUG_CACHEDEV
   fprintf(outputfile, "*** %f: Entered cachedev::cachedev_setbits from block %d to %d\n", simtime, first_blk, last_blk);
   fflush(outputfile);
#endif

   bit_nset (bitmap, first_blk, last_blk);
}
/*
 * Shut down a task set: set every thread's stop and begin bits, wake the
 * threads, wait on the completion condition, then close the threads and
 * synchronization objects and release the memory owned by the task set.
 *
 * The taskset_t structure itself is not freed here.  Freed members are
 * reset to NULL so that the structure does not keep dangling pointers.
 */
void
taskset_fini( taskset_t *ts )
{
    int i;

    hb_lock( ts->task_cond_lock );

    /*
     * Tell each thread to stop, and then cleanup.
     */
    bit_nset( ts->task_stop_bitmap, 0, ts->thread_count - 1 );
    bit_nset( ts->task_begin_bitmap, 0, ts->thread_count - 1 );
    hb_cond_broadcast( ts->task_begin );

    /*
     * Wait for all threads to exit.
     *
     * NOTE(review): unlike taskset_cycle() this wait is not re-checked in a
     * loop, so a premature wakeup falls straight through to the
     * hb_thread_close() calls below -- presumably those join the threads,
     * which would make that harmless; confirm against hb_thread_close().
     */
    hb_cond_wait( ts->task_complete, ts->task_cond_lock );
    hb_unlock( ts->task_cond_lock );

    /*
     * Clean up taskset memory.
     */
    for( i = 0; i < ts->thread_count; i++ )
    {
        hb_thread_close( &ts->task_threads[i] );
    }
    hb_lock_close( &ts->task_cond_lock );
    hb_cond_close( &ts->task_begin );
    hb_cond_close( &ts->task_complete );

    /* free(NULL) is a no-op, so no member needs a NULL guard. */
    free( ts->task_threads );
    ts->task_threads = NULL;
    free( ts->task_threads_args );
    ts->task_threads_args = NULL;
    free( ts->task_begin_bitmap );
    ts->task_begin_bitmap = NULL;
    free( ts->task_complete_bitmap );
    ts->task_complete_bitmap = NULL;
    free( ts->task_stop_bitmap );
    ts->task_stop_bitmap = NULL;
}
/*
 * Remove any specialized cores from those allocated to the job.
 *
 * The job's core bitmap is first set in full; then, for every node in the
 * job's node bitmap, each core that is not set in avail_core_bitmap is
 * cleared from the job's core bitmap, and the job's per-node CPU count is
 * rebuilt from the cores that remain (threads-per-core each).
 *
 * job_ptr           - job whose job_resrcs (core_bitmap, cpus[]) is updated
 * avail_core_bitmap - system-wide core bitmap; set bits are usable cores
 */
static void _clear_spec_cores(struct job_record *job_ptr,
			      bitstr_t *avail_core_bitmap)
{
	job_resources_t *res = job_ptr->job_resrcs;
	multi_core_data_t *mc = NULL;
	int node, node_end;
	int nbits;
	int job_node_inx = -1;	/* index into res->cpus[] */
	int job_core_inx = 0;	/* next bit in res->core_bitmap */

	if (job_ptr->details)
		mc = job_ptr->details->mc_ptr;

	/* Start from "every core allocated" and knock out the unusable ones */
	nbits = bit_size(res->core_bitmap);
	bit_nset(res->core_bitmap, 0, nbits - 1);

	node = bit_ffs(res->node_bitmap);
	if (node < 0)
		return;		/* no nodes in the job, nothing to walk */
	node_end = bit_fls(res->node_bitmap);

	for ( ; node <= node_end; node++) {
		uint16_t tpc;
		int core, core_end;

		if (!bit_test(res->node_bitmap, node))
			continue;

		job_node_inx++;
		res->cpus[job_node_inx] = 0;

		/* Threads counted per usable core: the node's value, lowered
		 * to the job's threads_per_core when that is set and smaller */
		tpc = select_node_record[node].vpus;
		if (mc && (mc->threads_per_core != NO_VAL16) &&
		    (mc->threads_per_core < tpc))
			tpc = mc->threads_per_core;

		core     = cr_get_coremap_offset(node);
		core_end = cr_get_coremap_offset(node + 1) - 1;
		for ( ; core <= core_end; core++, job_core_inx++) {
			if (bit_test(avail_core_bitmap, core))
				res->cpus[job_node_inx] += tpc;
			else
				bit_clear(res->core_bitmap, job_core_inx);
		}
	}
}
/*
 * helper function for _expand_masks()
 *
 * Treat the mask as consecutive groups ("blots") of `blot` bits.  For every
 * group that has at least one bit set, set all bits of that group.
 *
 * mask - bitmap to expand in place; NULL is ignored
 * blot - group width in bits; 0 is ignored (it would otherwise divide by
 *        zero)
 *
 * The last group is clipped to the size of the mask, so a mask whose size
 * is not a multiple of `blot` never has bits set past its end.
 */
static void _blot_mask(bitstr_t *mask, uint16_t blot)
{
	int i, size;

	if (!mask || !blot)
		return;

	size = bit_size(mask);
	for (i = 0; i < size; i++) {
		int start, stop;

		if (!bit_test(mask, i))
			continue;

		/* fill in this blot */
		start = (i / blot) * blot;
		stop = start + blot - 1;
		if (stop >= size)
			stop = size - 1;
		bit_nset(mask, start, stop);

		/* every bit up to stop is now set; resume at the next blot */
		i = stop;
	}
}
/*
 * Convert node name string to equivalent nid string
 *
 * Every run of decimal digits in node_names is read as a node id (leading
 * zeros skipped); two ids separated by '-' are read as an inclusive range.
 * The ids are collected in a bitmap and formatted with bit_fmt().
 *
 * node_names - node name expression; NULL yields NULL
 * RET xmalloc()ed string of at most strlen(node_names) characters, owned by
 *     the caller
 *
 * Ids that do not fit the internal bitmap (>= 100000) are ignored rather
 * than written outside the bitmap, and an over-long digit string can no
 * longer overflow the accumulator.
 */
static char *_node_names_2_nid_list(char *node_names)
{
	const int nid_limit = 100000;	/* size of node_bitmap in bits */
	char *nid_list = NULL;
	int i, last_nid_index = -1;
	bool is_dash = false;
	bitstr_t *node_bitmap;

	if (!node_names)
		return NULL;

	node_bitmap = bit_alloc(nid_limit);
	for (i = 0; node_names[i]; i++) {
		int nid_index = 0;
		bool in_range = true;

		/* skip "nid[" */
		if ((node_names[i] < '0') || (node_names[i] > '9'))
			continue;
		/* skip leading zeros */
		while (node_names[i] == '0')
			i++;
		if (node_names[i] == '[')
			i++;
		while ((node_names[i] >= '0') && (node_names[i] <= '9')) {
			/* stop accumulating once the value is out of range,
			 * but keep consuming the remaining digits */
			if (in_range) {
				nid_index = (nid_index * 10) +
					    (node_names[i] - '0');
				if (nid_index >= nid_limit)
					in_range = false;
			}
			i++;
		}

		if (in_range) {
			if (is_dash && (nid_index >= last_nid_index)) {
				bit_nset(node_bitmap,
					 last_nid_index, nid_index);
			} else {
				bit_set(node_bitmap, nid_index);
			}
		}

		/* an out-of-range id can not start a range */
		is_dash = in_range && (node_names[i] == '-');
		if (is_dash)
			last_nid_index = nid_index;
		else if (node_names[i] == '\0')
			break;
	}

	i = strlen(node_names) + 1;
	nid_list = xmalloc(i);
	bit_fmt(nid_list, i, node_bitmap);
	bit_free(node_bitmap);

	return nid_list;
}
/*
 * Unpack one node_subgrp_t from buffer.
 *
 * subgrp_pptr      - OUT: newly allocated subgroup, or NULL on error
 * buffer           - buffer to unpack from
 * bitmap_size      - number of bits to allocate for the subgroup's bitmap
 * protocol_version - version of the packed data; nothing is unpacked when it
 *                    is older than SLURM_MIN_PROTOCOL_VERSION
 * RET SLURM_SUCCESS, or SLURM_ERROR if the buffer could not be unpacked or
 *     the index ranges it carries do not fit a bitmap of bitmap_size bits
 *
 * The bitmap is rebuilt from the unpacked range string.  Those ranges come
 * from the buffer, so each one is checked against bitmap_size before it is
 * passed to bit_nset().
 */
static int _unpack_node_subgrp(node_subgrp_t **subgrp_pptr, Buf buffer,
			       uint16_t bitmap_size, uint16_t protocol_version)
{
	node_subgrp_t *subgrp = xmalloc(sizeof(node_subgrp_t));
	int j;
	uint32_t uint32_tmp;
	uint16_t uint16_tmp;

	*subgrp_pptr = subgrp;

	if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
		safe_unpackstr_xmalloc(&subgrp->str, &uint32_tmp, buffer);
		if (!subgrp->str)
			subgrp->inx = bitfmt2int("");
		else
			subgrp->inx = bitfmt2int(subgrp->str);
		if (!subgrp->inx)
			goto unpack_error;

		subgrp->bitmap = bit_alloc(bitmap_size);

		/* inx[] holds (first, last) pairs ended by a negative value */
		for (j = 0; subgrp->inx[j] >= 0; j += 2) {
			int first = subgrp->inx[j];
			int last = subgrp->inx[j+1];

			if ((last < first) || (last >= (int) bitmap_size))
				goto unpack_error;
			bit_nset(subgrp->bitmap, first, last);
		}

		safe_unpack16(&subgrp->cnode_cnt, buffer);
		safe_unpack16(&uint16_tmp, buffer);
		subgrp->state = uint16_tmp;
	}
	return SLURM_SUCCESS;

unpack_error:
	_free_node_subgrp(subgrp);
	*subgrp_pptr = NULL;
	return SLURM_ERROR;
}
/* a helper function for _add_job_to_active when GS_SOCKET
 * a job has just been added to p_ptr->active_resmap, so set all cores of
 * each used socket to avoid activating another job on the same socket
 *
 * job_nodemap - nodes used by the job
 * p_ptr       - partition whose active_resmap is updated in place
 *
 * Nothing is done if any argument (or p_ptr->active_resmap) is NULL.
 * A node reporting zero sockets is skipped instead of dividing by zero, and
 * the core offset is re-aligned at the end of every node so a core count
 * that is not a multiple of the socket count can not shift later nodes.
 */
static void _fill_sockets(bitstr_t *job_nodemap, struct gs_part *p_ptr)
{
	uint32_t c, i;
	int n, first_bit, last_bit;

	if (!job_nodemap || !p_ptr || !p_ptr->active_resmap)
		return;
	first_bit = bit_ffs(job_nodemap);
	last_bit  = bit_fls(job_nodemap);
	if ((first_bit < 0) || (last_bit < 0))
		fatal("gang: _fill_sockets: nodeless job?");

	/* c = offset in active_resmap of the first core of node first_bit */
	for (c = 0, n = 0; n < first_bit; n++) {
		c += _get_phys_bit_cnt(n);
	}
	for (n = first_bit; n <= last_bit; n++) {
		uint16_t s, socks, cps, cores_per_node;
		uint32_t next_node;

		cores_per_node = _get_phys_bit_cnt(n);
		next_node = c + cores_per_node;
		if (bit_test(job_nodemap, n) == 0) {
			c = next_node;
			continue;
		}
		socks = _get_socket_cnt(n);
		if (socks == 0) {
			/* no socket layout to work with on this node */
			c = next_node;
			continue;
		}
		cps = cores_per_node / socks;
		for (s = 0; s < socks; s++) {
			/* is any core of this socket in use? */
			for (i = c; i < c+cps; i++) {
				if (bit_test(p_ptr->active_resmap, i))
					break;
			}
			if (i < c+cps) {
				/* set all bits on this used socket */
				bit_nset(p_ptr->active_resmap, c, c+cps-1);
			}
			c += cps;
		}
		/* skip any cores left over by the integer division above */
		c = next_node;
	}
}
/* * Read and process the bluegene.conf configuration file so to interpret what * blocks are static/dynamic, torus/mesh, etc. */ extern int read_bg_conf(void) { int i; bool tmp_bool = 0; int count = 0; s_p_hashtbl_t *tbl = NULL; char *tmp_char = NULL; select_ba_request_t **blockreq_array = NULL; image_t **image_array = NULL; image_t *image = NULL; static time_t last_config_update = (time_t) 0; struct stat config_stat; ListIterator itr = NULL; char* bg_conf_file = NULL; static int *dims = NULL; if (!dims) dims = select_g_ba_get_dims(); if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("Reading the bluegene.conf file"); /* check if config file has changed */ bg_conf_file = get_extra_conf_path("bluegene.conf"); if (stat(bg_conf_file, &config_stat) < 0) fatal("can't stat bluegene.conf file %s: %m", bg_conf_file); if (last_config_update) { _reopen_bridge_log(); if (last_config_update == config_stat.st_mtime) { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("%s unchanged", bg_conf_file); } else { info("Restart slurmctld for %s changes " "to take effect", bg_conf_file); } last_config_update = config_stat.st_mtime; xfree(bg_conf_file); return SLURM_SUCCESS; } last_config_update = config_stat.st_mtime; /* initialization */ /* bg_conf defined in bg_node_alloc.h */ if (!(tbl = config_make_tbl(bg_conf_file))) fatal("something wrong with opening/reading bluegene " "conf file"); xfree(bg_conf_file); #ifdef HAVE_BGL if (s_p_get_array((void ***)&image_array, &count, "AltBlrtsImage", tbl)) { for (i = 0; i < count; i++) { list_append(bg_conf->blrts_list, image_array[i]); image_array[i] = NULL; } } if (!s_p_get_string(&bg_conf->default_blrtsimage, "BlrtsImage", tbl)) { if (!list_count(bg_conf->blrts_list)) fatal("BlrtsImage not configured " "in bluegene.conf"); itr = list_iterator_create(bg_conf->blrts_list); image = list_next(itr); image->def = true; list_iterator_destroy(itr); bg_conf->default_blrtsimage = xstrdup(image->name); info("Warning: using %s as 
the default BlrtsImage. " "If this isn't correct please set BlrtsImage", bg_conf->default_blrtsimage); } else { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("default BlrtsImage %s", bg_conf->default_blrtsimage); image = xmalloc(sizeof(image_t)); image->name = xstrdup(bg_conf->default_blrtsimage); image->def = true; image->groups = NULL; /* we want it to be first */ list_push(bg_conf->blrts_list, image); } if (s_p_get_array((void ***)&image_array, &count, "AltLinuxImage", tbl)) { for (i = 0; i < count; i++) { list_append(bg_conf->linux_list, image_array[i]); image_array[i] = NULL; } } if (!s_p_get_string(&bg_conf->default_linuximage, "LinuxImage", tbl)) { if (!list_count(bg_conf->linux_list)) fatal("LinuxImage not configured " "in bluegene.conf"); itr = list_iterator_create(bg_conf->linux_list); image = list_next(itr); image->def = true; list_iterator_destroy(itr); bg_conf->default_linuximage = xstrdup(image->name); info("Warning: using %s as the default LinuxImage. " "If this isn't correct please set LinuxImage", bg_conf->default_linuximage); } else { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("default LinuxImage %s", bg_conf->default_linuximage); image = xmalloc(sizeof(image_t)); image->name = xstrdup(bg_conf->default_linuximage); image->def = true; image->groups = NULL; /* we want it to be first */ list_push(bg_conf->linux_list, image); } if (s_p_get_array((void ***)&image_array, &count, "AltRamDiskImage", tbl)) { for (i = 0; i < count; i++) { list_append(bg_conf->ramdisk_list, image_array[i]); image_array[i] = NULL; } } if (!s_p_get_string(&bg_conf->default_ramdiskimage, "RamDiskImage", tbl)) { if (!list_count(bg_conf->ramdisk_list)) fatal("RamDiskImage not configured " "in bluegene.conf"); itr = list_iterator_create(bg_conf->ramdisk_list); image = list_next(itr); image->def = true; list_iterator_destroy(itr); bg_conf->default_ramdiskimage = xstrdup(image->name); info("Warning: using %s as the default RamDiskImage. 
" "If this isn't correct please set RamDiskImage", bg_conf->default_ramdiskimage); } else { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("default RamDiskImage %s", bg_conf->default_ramdiskimage); image = xmalloc(sizeof(image_t)); image->name = xstrdup(bg_conf->default_ramdiskimage); image->def = true; image->groups = NULL; /* we want it to be first */ list_push(bg_conf->ramdisk_list, image); } #elif defined HAVE_BGP if (s_p_get_array((void ***)&image_array, &count, "AltCnloadImage", tbl)) { for (i = 0; i < count; i++) { list_append(bg_conf->linux_list, image_array[i]); image_array[i] = NULL; } } if (!s_p_get_string(&bg_conf->default_linuximage, "CnloadImage", tbl)) { if (!list_count(bg_conf->linux_list)) fatal("CnloadImage not configured " "in bluegene.conf"); itr = list_iterator_create(bg_conf->linux_list); image = list_next(itr); image->def = true; list_iterator_destroy(itr); bg_conf->default_linuximage = xstrdup(image->name); info("Warning: using %s as the default CnloadImage. 
" "If this isn't correct please set CnloadImage", bg_conf->default_linuximage); } else { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("default CnloadImage %s", bg_conf->default_linuximage); image = xmalloc(sizeof(image_t)); image->name = xstrdup(bg_conf->default_linuximage); image->def = true; image->groups = NULL; /* we want it to be first */ list_push(bg_conf->linux_list, image); } if (s_p_get_array((void ***)&image_array, &count, "AltIoloadImage", tbl)) { for (i = 0; i < count; i++) { list_append(bg_conf->ramdisk_list, image_array[i]); image_array[i] = NULL; } } if (!s_p_get_string(&bg_conf->default_ramdiskimage, "IoloadImage", tbl)) { if (!list_count(bg_conf->ramdisk_list)) fatal("IoloadImage not configured " "in bluegene.conf"); itr = list_iterator_create(bg_conf->ramdisk_list); image = list_next(itr); image->def = true; list_iterator_destroy(itr); bg_conf->default_ramdiskimage = xstrdup(image->name); info("Warning: using %s as the default IoloadImage. " "If this isn't correct please set IoloadImage", bg_conf->default_ramdiskimage); } else { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("default IoloadImage %s", bg_conf->default_ramdiskimage); image = xmalloc(sizeof(image_t)); image->name = xstrdup(bg_conf->default_ramdiskimage); image->def = true; image->groups = NULL; /* we want it to be first */ list_push(bg_conf->ramdisk_list, image); } #endif if (s_p_get_array((void ***)&image_array, &count, "AltMloaderImage", tbl)) { for (i = 0; i < count; i++) { list_append(bg_conf->mloader_list, image_array[i]); image_array[i] = NULL; } } if (!s_p_get_string(&bg_conf->default_mloaderimage, "MloaderImage", tbl)) { if (!list_count(bg_conf->mloader_list)) fatal("MloaderImage not configured " "in bluegene.conf"); itr = list_iterator_create(bg_conf->mloader_list); image = list_next(itr); image->def = true; list_iterator_destroy(itr); bg_conf->default_mloaderimage = xstrdup(image->name); info("Warning: using %s as the default MloaderImage. 
" "If this isn't correct please set MloaderImage", bg_conf->default_mloaderimage); } else { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("default MloaderImage %s", bg_conf->default_mloaderimage); image = xmalloc(sizeof(image_t)); image->name = xstrdup(bg_conf->default_mloaderimage); image->def = true; image->groups = NULL; /* we want it to be first */ list_push(bg_conf->mloader_list, image); } if (!s_p_get_uint16(&bg_conf->mp_cnode_cnt, "MidplaneNodeCnt", tbl)) { if (!s_p_get_uint16(&bg_conf->mp_cnode_cnt, "BasePartitionNodeCnt", tbl)) { error("MidplaneNodeCnt not configured in bluegene.conf " "defaulting to 512 as MidplaneNodeCnt"); bg_conf->mp_cnode_cnt = 512; } } if (bg_conf->mp_cnode_cnt <= 0) fatal("You should have more than 0 nodes " "per midplane"); bg_conf->actual_cnodes_per_mp = bg_conf->mp_cnode_cnt; bg_conf->quarter_cnode_cnt = bg_conf->mp_cnode_cnt/4; /* bg_conf->cpus_per_mp should had already been set from the * node_init */ if (bg_conf->cpus_per_mp < bg_conf->mp_cnode_cnt) { fatal("For some reason we have only %u cpus per mp, but " "have %u cnodes per mp. You need at least the same " "number of cpus as you have cnodes per mp. " "Check the NodeName CPUs= " "definition in the slurm.conf.", bg_conf->cpus_per_mp, bg_conf->mp_cnode_cnt); } bg_conf->cpu_ratio = bg_conf->cpus_per_mp/bg_conf->mp_cnode_cnt; if (!bg_conf->cpu_ratio) fatal("We appear to have less than 1 cpu on a cnode. 
" "You specified %u for MidplaneNodeCnt " "in the blugene.conf and %u cpus " "for each node in the slurm.conf", bg_conf->mp_cnode_cnt, bg_conf->cpus_per_mp); num_unused_cpus = 1; for (i = 0; i<SYSTEM_DIMENSIONS; i++) num_unused_cpus *= dims[i]; num_unused_cpus *= bg_conf->cpus_per_mp; num_possible_unused_cpus = num_unused_cpus; if (!s_p_get_uint16(&bg_conf->nodecard_cnode_cnt, "NodeBoardNodeCnt", tbl)) { if (!s_p_get_uint16(&bg_conf->nodecard_cnode_cnt, "NodeCardNodeCnt", tbl)) { error("NodeCardNodeCnt not configured in bluegene.conf " "defaulting to 32 as NodeCardNodeCnt"); bg_conf->nodecard_cnode_cnt = 32; } } if (bg_conf->nodecard_cnode_cnt <= 0) fatal("You should have more than 0 nodes per nodecard"); bg_conf->mp_nodecard_cnt = bg_conf->mp_cnode_cnt / bg_conf->nodecard_cnode_cnt; if (!s_p_get_uint16(&bg_conf->ionodes_per_mp, "IONodesPerMP", tbl)) if (!s_p_get_uint16(&bg_conf->ionodes_per_mp, "Numpsets", tbl)) fatal("Warning: IONodesPerMP not configured " "in bluegene.conf"); s_p_get_uint16(&bg_conf->max_block_err, "MaxBlockInError", tbl); tmp_bool = 0; s_p_get_boolean(&tmp_bool, "SubMidplaneSystem", tbl); bg_conf->sub_mp_sys = tmp_bool; #ifdef HAVE_BGQ tmp_bool = 0; s_p_get_boolean(&tmp_bool, "AllowSubBlockAllocations", tbl); bg_conf->sub_blocks = tmp_bool; /* You can only have 16 ionodes per midplane */ if (bg_conf->ionodes_per_mp > bg_conf->mp_nodecard_cnt) bg_conf->ionodes_per_mp = bg_conf->mp_nodecard_cnt; #endif for (i=0; i<SYSTEM_DIMENSIONS; i++) bg_conf->default_conn_type[i] = (uint16_t)NO_VAL; s_p_get_string(&tmp_char, "DefaultConnType", tbl); if (tmp_char) { verify_conn_type(tmp_char, bg_conf->default_conn_type); if ((bg_conf->default_conn_type[0] != SELECT_MESH) && (bg_conf->default_conn_type[0] != SELECT_TORUS)) fatal("Can't have a DefaultConnType of %s " "(only Mesh or Torus values are valid).", tmp_char); xfree(tmp_char); } else bg_conf->default_conn_type[0] = SELECT_TORUS; #ifndef HAVE_BG_L_P int first_conn_type = bg_conf->default_conn_type[0]; 
for (i=1; i<SYSTEM_DIMENSIONS; i++) { if (bg_conf->default_conn_type[i] == (uint16_t)NO_VAL) bg_conf->default_conn_type[i] = first_conn_type; else if (bg_conf->default_conn_type[i] >= SELECT_SMALL) fatal("Can't have a DefaultConnType of %s " "(only Mesh or Torus values are valid).", tmp_char); } #endif if (bg_conf->ionodes_per_mp) { bitstr_t *tmp_bitmap = NULL; int small_size = 1; /* THIS IS A HACK TO MAKE A 1 NODECARD SYSTEM WORK, * Sometime on a Q system the nodecard isn't in the 0 * spot so only do this if you know it is in that * spot. Otherwise say the whole midplane is there * and just make blocks over the whole thing. They * you can error out the blocks that aren't usable. */ if (bg_conf->sub_mp_sys && bg_conf->mp_cnode_cnt == bg_conf->nodecard_cnode_cnt) { #ifdef HAVE_BGQ bg_conf->quarter_ionode_cnt = 1; bg_conf->nodecard_ionode_cnt = 1; #else bg_conf->quarter_ionode_cnt = 2; bg_conf->nodecard_ionode_cnt = 2; #endif } else { bg_conf->quarter_ionode_cnt = bg_conf->ionodes_per_mp/4; bg_conf->nodecard_ionode_cnt = bg_conf->quarter_ionode_cnt/4; } /* How many nodecards per ionode */ bg_conf->nc_ratio = ((double)bg_conf->mp_cnode_cnt / (double)bg_conf->nodecard_cnode_cnt) / (double)bg_conf->ionodes_per_mp; /* How many ionodes per nodecard */ bg_conf->io_ratio = (double)bg_conf->ionodes_per_mp / ((double)bg_conf->mp_cnode_cnt / (double)bg_conf->nodecard_cnode_cnt); /* How many cnodes per ionode */ bg_conf->ionode_cnode_cnt = bg_conf->nodecard_cnode_cnt * bg_conf->nc_ratio; //info("got %f %f", bg_conf->nc_ratio, bg_conf->io_ratio); /* figure out the smallest block we can have on the system */ #ifdef HAVE_BGL if (bg_conf->io_ratio >= 1) bg_conf->smallest_block=32; else bg_conf->smallest_block=128; #else if (bg_conf->io_ratio >= 2) bg_conf->smallest_block=16; else if (bg_conf->io_ratio == 1) bg_conf->smallest_block=32; else if (bg_conf->io_ratio == .5) bg_conf->smallest_block=64; else if (bg_conf->io_ratio == .25) bg_conf->smallest_block=128; else if 
(bg_conf->io_ratio == .125) bg_conf->smallest_block=256; else { error("unknown ioratio %f. Can't figure out " "smallest block size, setting it to midplane", bg_conf->io_ratio); bg_conf->smallest_block = 512; } #endif if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("Smallest block possible on this system is %u", bg_conf->smallest_block); /* below we are creating all the possible bitmaps for * each size of small block */ if ((int)bg_conf->nodecard_ionode_cnt < 1) { bg_conf->nodecard_ionode_cnt = 0; } else { bg_lists->valid_small32 = list_create(_destroy_bitmap); /* This is suppose to be = and not ==, we only want to decrement when small_size equals something. */ if ((small_size = bg_conf->nodecard_ionode_cnt)) small_size--; i = 0; while (i<bg_conf->ionodes_per_mp) { tmp_bitmap = bit_alloc(bg_conf->ionodes_per_mp); bit_nset(tmp_bitmap, i, i+small_size); i += small_size+1; list_append(bg_lists->valid_small32, tmp_bitmap); } } /* If we only have 1 nodecard just jump to the end since this will never need to happen below. Pretty much a hack to avoid seg fault;). 
*/ if (bg_conf->mp_cnode_cnt == bg_conf->nodecard_cnode_cnt) goto no_calc; bg_lists->valid_small128 = list_create(_destroy_bitmap); if ((small_size = bg_conf->quarter_ionode_cnt)) small_size--; i = 0; while (i<bg_conf->ionodes_per_mp) { tmp_bitmap = bit_alloc(bg_conf->ionodes_per_mp); bit_nset(tmp_bitmap, i, i+small_size); i += small_size+1; list_append(bg_lists->valid_small128, tmp_bitmap); } #ifndef HAVE_BGL bg_lists->valid_small64 = list_create(_destroy_bitmap); if ((small_size = bg_conf->nodecard_ionode_cnt * 2)) small_size--; i = 0; while (i<bg_conf->ionodes_per_mp) { tmp_bitmap = bit_alloc(bg_conf->ionodes_per_mp); bit_nset(tmp_bitmap, i, i+small_size); i += small_size+1; list_append(bg_lists->valid_small64, tmp_bitmap); } bg_lists->valid_small256 = list_create(_destroy_bitmap); if ((small_size = bg_conf->quarter_ionode_cnt * 2)) small_size--; i = 0; while (i<bg_conf->ionodes_per_mp) { tmp_bitmap = bit_alloc(bg_conf->ionodes_per_mp); bit_nset(tmp_bitmap, i, i+small_size); i += small_size+1; list_append(bg_lists->valid_small256, tmp_bitmap); } #endif } else { fatal("your ionodes_per_mp is 0"); } no_calc: if (!s_p_get_uint16(&bg_conf->bridge_api_verb, "BridgeAPIVerbose", tbl)) info("Warning: BridgeAPIVerbose not configured " "in bluegene.conf"); if (!s_p_get_string(&bg_conf->bridge_api_file, "BridgeAPILogFile", tbl)) info("BridgeAPILogFile not configured in bluegene.conf"); else _reopen_bridge_log(); if (s_p_get_string(&tmp_char, "DenyPassthrough", tbl)) { if (strstr(tmp_char, "A")) ba_deny_pass |= PASS_DENY_A; if (strstr(tmp_char, "X")) ba_deny_pass |= PASS_DENY_X; if (strstr(tmp_char, "Y")) ba_deny_pass |= PASS_DENY_Y; if (strstr(tmp_char, "Z")) ba_deny_pass |= PASS_DENY_Z; if (!xstrcasecmp(tmp_char, "ALL")) ba_deny_pass |= PASS_DENY_ALL; bg_conf->deny_pass = ba_deny_pass; xfree(tmp_char); } if (!s_p_get_string(&tmp_char, "LayoutMode", tbl)) { info("Warning: LayoutMode was not specified in bluegene.conf " "defaulting to STATIC partitioning"); 
bg_conf->layout_mode = LAYOUT_STATIC; } else { if (!xstrcasecmp(tmp_char,"STATIC")) bg_conf->layout_mode = LAYOUT_STATIC; else if (!xstrcasecmp(tmp_char,"OVERLAP")) bg_conf->layout_mode = LAYOUT_OVERLAP; else if (!xstrcasecmp(tmp_char,"DYNAMIC")) bg_conf->layout_mode = LAYOUT_DYNAMIC; else { fatal("I don't understand this LayoutMode = %s", tmp_char); } xfree(tmp_char); } /* add blocks defined in file */ if (bg_conf->layout_mode != LAYOUT_DYNAMIC) { if (!s_p_get_array((void ***)&blockreq_array, &count, "MPs", tbl)) { if (!s_p_get_array((void ***)&blockreq_array, &count, "BPs", tbl)) { info("WARNING: no blocks defined in " "bluegene.conf, " "only making full system block"); /* create_full_system_block(NULL); */ if (bg_conf->sub_mp_sys || (bg_conf->mp_cnode_cnt == bg_conf->nodecard_cnode_cnt)) fatal("On a sub-midplane system you " "need to define the blocks you " "want on your system."); } } for (i = 0; i < count; i++) { add_bg_record(bg_lists->main, NULL, blockreq_array[i], 0, 0); } } else if (bg_conf->sub_mp_sys || (bg_conf->mp_cnode_cnt == bg_conf->nodecard_cnode_cnt)) /* we can't do dynamic here on a sub-midplane system */ fatal("On a sub-midplane system we can only do OVERLAP or " "STATIC LayoutMode. 
Please update your bluegene.conf."); #ifdef HAVE_BGQ if ((bg_recover != NOT_FROM_CONTROLLER) && assoc_mgr_qos_list && s_p_get_string(&tmp_char, "RebootQOSList", tbl)) { bool valid; char *token, *last = NULL; slurmdb_qos_rec_t *qos = NULL; assoc_mgr_lock_t locks = { NO_LOCK, NO_LOCK, NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; /* Lock here to avoid g_qos_count changing under us */ assoc_mgr_lock(&locks); bg_conf->reboot_qos_bitmap = bit_alloc(g_qos_count); itr = list_iterator_create(assoc_mgr_qos_list); token = strtok_r(tmp_char, ",", &last); while (token) { valid = false; while((qos = list_next(itr))) { if (!xstrcasecmp(token, qos->name)) { bit_set(bg_conf->reboot_qos_bitmap, qos->id); valid = true; break; } } if (!valid) error("Invalid RebootQOSList value: %s", token); list_iterator_reset(itr); token = strtok_r(NULL, ",", &last); } list_iterator_destroy(itr); xfree(tmp_char); assoc_mgr_unlock(&locks); } #endif s_p_hashtbl_destroy(tbl); return SLURM_SUCCESS; }
extern int select_nodeinfo_set_all(void) { ListIterator itr = NULL; struct node_record *node_ptr = NULL; int i=0; bg_record_t *bg_record = NULL; static time_t last_set_all = 0; ba_mp_t *ba_mp; node_subgrp_t *subgrp = NULL; int bit_count; //uint32_t cluster_flags = slurmdb_setup_cluster_flags(); if (!blocks_are_created) return SLURM_NO_CHANGE_IN_DATA; if (!g_bitmap_size) { /* if (cluster_flags & CLUSTER_FLAG_BGQ) */ /* g_bitmap_size = bg_conf->mp_cnode_cnt; */ /* else */ g_bitmap_size = bg_conf->ionodes_per_mp; } /* only set this once when the last_bg_update is newer than the last time we set things up. */ if (last_set_all && (last_bg_update-1 < last_set_all)) { debug2("Node select info for set all hasn't " "changed since %ld", last_set_all); return SLURM_NO_CHANGE_IN_DATA; } last_set_all = last_bg_update; /* set this here so we know things have changed */ last_node_update = time(NULL); slurm_mutex_lock(&block_state_mutex); for (i=0; i<node_record_count; i++) { select_nodeinfo_t *nodeinfo; node_ptr = &(node_record_table_ptr[i]); xassert(node_ptr->select_nodeinfo); nodeinfo = node_ptr->select_nodeinfo->data; xassert(nodeinfo); xassert(nodeinfo->subgrp_list); list_flush(nodeinfo->subgrp_list); if (nodeinfo->bitmap_size != g_bitmap_size) nodeinfo->bitmap_size = g_bitmap_size; } itr = list_iterator_create(bg_lists->main); while ((bg_record = list_next(itr))) { enum node_states state = NODE_STATE_UNKNOWN; select_nodeinfo_t *nodeinfo; bitstr_t *bitmap; ListIterator itr2 = NULL; /* Only mark unidle blocks */ if (bg_record->job_list && list_count(bg_record->job_list)) { struct job_record *job_ptr; select_jobinfo_t *jobinfo; ListIterator itr = list_iterator_create(bg_record->job_list); ba_mp = list_peek(bg_record->ba_mp_list); node_ptr = &(node_record_table_ptr[ba_mp->index]); xassert(node_ptr->select_nodeinfo); nodeinfo = node_ptr->select_nodeinfo->data; xassert(nodeinfo); xassert(nodeinfo->subgrp_list); if (ba_mp->cnode_err_bitmap && (bit_count = 
bit_set_count(ba_mp->cnode_err_bitmap))) { subgrp = _find_subgrp(nodeinfo->subgrp_list, NODE_STATE_ERROR, g_bitmap_size); /* FIXME: the subgrp->bitmap isn't set here. */ subgrp->cnode_cnt += bit_count; } subgrp = _find_subgrp(nodeinfo->subgrp_list, NODE_STATE_ALLOCATED, g_bitmap_size); while ((job_ptr = list_next(itr))) { jobinfo = job_ptr->select_jobinfo->data; /* FIXME: the subgrp->bitmap isn't set here. */ subgrp->cnode_cnt += jobinfo->cnode_cnt; } list_iterator_destroy(itr); continue; } else if (bg_record->job_running == NO_JOB_RUNNING) continue; if (bg_record->state & BG_BLOCK_ERROR_FLAG) state = NODE_STATE_ERROR; else if (bg_record->job_running > NO_JOB_RUNNING) { /* we don't need to set the allocated here * since the whole midplane is allocated */ if (bg_record->conn_type[0] < SELECT_SMALL) continue; state = NODE_STATE_ALLOCATED; } else { error("not sure why we got here with block %s %s", bg_record->bg_block_id, bg_block_state_string(bg_record->state)); continue; } /* if ((cluster_flags & CLUSTER_FLAG_BGQ) */ /* && (state != NODE_STATE_ERROR)) */ /* bitmap = bg_record->cnodes_used_bitmap; */ /* else */ bitmap = bg_record->ionode_bitmap; itr2 = list_iterator_create(bg_record->ba_mp_list); while ((ba_mp = list_next(itr2))) { if (!ba_mp->used) continue; node_ptr = &(node_record_table_ptr[ba_mp->index]); xassert(node_ptr->select_nodeinfo); nodeinfo = node_ptr->select_nodeinfo->data; xassert(nodeinfo); xassert(nodeinfo->subgrp_list); if (ba_mp->cnode_err_bitmap && (state == NODE_STATE_ALLOCATED) && (bit_count = bit_set_count(ba_mp->cnode_err_bitmap))) { subgrp = _find_subgrp(nodeinfo->subgrp_list, NODE_STATE_ERROR, g_bitmap_size); /* FIXME: the subgrp->bitmap isn't set here. 
*/ subgrp->cnode_cnt += bit_count; } subgrp = _find_subgrp(nodeinfo->subgrp_list, state, g_bitmap_size); if (subgrp->cnode_cnt < bg_conf->mp_cnode_cnt) { /* if (cluster_flags & CLUSTER_FLAG_BGQ) { */ /* bit_or(subgrp->bitmap, bitmap); */ /* subgrp->cnode_cnt += */ /* bit_set_count(bitmap); */ /* } else */ if (bg_record->cnode_cnt < bg_conf->mp_cnode_cnt) { bit_or(subgrp->bitmap, bitmap); subgrp->cnode_cnt += bg_record->cnode_cnt; } else { bit_nset(subgrp->bitmap, 0, (g_bitmap_size-1)); subgrp->cnode_cnt = bg_conf->mp_cnode_cnt; } } } list_iterator_destroy(itr2); } list_iterator_destroy(itr); slurm_mutex_unlock(&block_state_mutex); return SLURM_SUCCESS; }
int main(int argc, char **argv) { int c; int count, waittime; int set_lun; int fd, retval; struct ctlstat_context ctx; /* default values */ retval = 0; waittime = 1; count = -1; memset(&ctx, 0, sizeof(ctx)); ctx.numdevs = 3; ctx.mode = CTLSTAT_MODE_STANDARD; ctx.flags |= CTLSTAT_FLAG_CPU; ctx.flags |= CTLSTAT_FLAG_FIRST_RUN; ctx.flags |= CTLSTAT_FLAG_HEADER; while ((c = getopt(argc, argv, ctlstat_opts)) != -1) { switch (c) { case 'C': ctx.flags &= ~CTLSTAT_FLAG_CPU; break; case 'c': count = atoi(optarg); break; case 'd': ctx.flags |= CTLSTAT_FLAG_DMA_TIME; break; case 'D': ctx.mode = CTLSTAT_MODE_DUMP; waittime = 30; break; case 'h': ctx.flags &= ~CTLSTAT_FLAG_HEADER; break; case 'j': ctx.mode = CTLSTAT_MODE_JSON; waittime = 30; break; case 'l': { int cur_lun; cur_lun = atoi(optarg); if (cur_lun > CTL_STAT_LUN_BITS) errx(1, "Invalid LUN number %d", cur_lun); bit_ffs(ctx.lun_mask, CTL_STAT_LUN_BITS, &set_lun); if (set_lun == -1) ctx.numdevs = 1; else ctx.numdevs++; bit_set(ctx.lun_mask, cur_lun); break; } case 'n': ctx.numdevs = atoi(optarg); break; case 't': ctx.flags |= CTLSTAT_FLAG_TOTALS; ctx.numdevs = 3; break; case 'w': waittime = atoi(optarg); break; default: retval = 1; usage(retval); exit(retval); break; } } bit_ffs(ctx.lun_mask, CTL_STAT_LUN_BITS, &set_lun); if ((F_TOTALS(&ctx)) && (set_lun != -1)) { errx(1, "Total Mode (-t) is incompatible with individual " "LUN mode (-l)"); } else if (set_lun == -1) { /* * Note that this just selects the first N LUNs to display, * but at this point we have no knoweledge of which LUN * numbers actually exist. So we may select LUNs that * aren't there. 
*/ bit_nset(ctx.lun_mask, 0, min(ctx.numdevs - 1, CTL_STAT_LUN_BITS - 1)); } if ((fd = open(CTL_DEFAULT_DEV, O_RDWR)) == -1) err(1, "cannot open %s", CTL_DEFAULT_DEV); for (;count != 0;) { ctx.tmp_lun_stats = ctx.prev_lun_stats; ctx.prev_lun_stats = ctx.cur_lun_stats; ctx.cur_lun_stats = ctx.tmp_lun_stats; ctx.prev_time = ctx.cur_time; ctx.prev_cpu = ctx.cur_cpu; if (getstats(fd, &ctx.num_luns, &ctx.cur_lun_stats, &ctx.cur_time, &ctx.flags) != 0) errx(1, "error returned from getstats()"); switch(ctx.mode) { case CTLSTAT_MODE_STANDARD: ctlstat_standard(&ctx); break; case CTLSTAT_MODE_DUMP: ctlstat_dump(&ctx); break; case CTLSTAT_MODE_JSON: ctlstat_json(&ctx); break; default: break; } fprintf(stdout, "\n"); ctx.flags &= ~CTLSTAT_FLAG_FIRST_RUN; if (count != 1) sleep(waittime); if (count > 0) count--; } exit (retval); }
/*
 * Handle a step-completion message arriving on fd.
 *
 * Reads the completed range (first, last) and the step return code from fd,
 * pulls the accounting data through the same descriptor, and folds all of it
 * into the global step_complete state under step_complete.lock.  A return
 * code / errno pair is written back to fd as the reply.
 *
 * fd  - descriptor the request is read from and the reply written to
 * job - step record, used here only for log messages
 * uid - uid of the requester; must pass _slurm_authorized_user()
 *
 * Returns SLURM_SUCCESS once a reply has been written (including the
 * "permission denied" and "timed out" replies), SLURM_FAILURE if reading or
 * writing fd failed.
 */
static int
_handle_completion(int fd, slurmd_job_t *job, uid_t uid)
{
	int rc = SLURM_SUCCESS;
	int errnum = 0;
	int first;
	int last;
	jobacctinfo_t *jobacct = NULL;
	int step_rc;
	int lock_held = 0;	/* set once step_complete.lock is taken */

	debug("_handle_completion for job %u.%u",
	      job->jobid, job->stepid);

	debug3(" uid = %d", uid);
	if (!_slurm_authorized_user(uid)) {
		debug("step completion message from uid %ld for job %u.%u ",
		      (long)uid, job->jobid, job->stepid);
		rc = -1;
		errnum = EPERM;
		/* Send the return code and errno */
		safe_write(fd, &rc, sizeof(int));
		safe_write(fd, &errnum, sizeof(int));
		return SLURM_SUCCESS;
	}

	safe_read(fd, &first, sizeof(int));
	safe_read(fd, &last, sizeof(int));
	safe_read(fd, &step_rc, sizeof(int));
	jobacct = jobacct_gather_g_create(NULL);
	jobacct_gather_g_getinfo(jobacct, JOBACCT_DATA_PIPE, &fd);

	/*
	 * Record the completed nodes
	 */
	pthread_mutex_lock(&step_complete.lock);
	lock_held = 1;
	if (! step_complete.wait_children) {
		rc = -1;
		errnum = ETIMEDOUT; /* not used anyway */
		goto timeout;
	}

	/* SlurmUser or root can craft a launch without a valid credential
	 * ("srun --no-alloc ...") and no tree information can be built
	 *  without the hostlist from the credential.
	 */
	if (step_complete.rank >= 0) {
#if 0
		char bits_string[128];
		debug2("Setting range %d (bit %d) through %d(bit %d)",
		       first, first-(step_complete.rank+1),
		       last, last-(step_complete.rank+1));
		bit_fmt(bits_string, sizeof(bits_string), step_complete.bits);
		debug2(" before bits: %s", bits_string);
#endif
		/* Bit positions are offsets relative to rank + 1. */
		bit_nset(step_complete.bits,
			 first - (step_complete.rank+1),
			 last - (step_complete.rank+1));
#if 0
		bit_fmt(bits_string, sizeof(bits_string), step_complete.bits);
		debug2(" after bits: %s", bits_string);
#endif
	}
	step_complete.step_rc = MAX(step_complete.step_rc, step_rc);

	/************* acct stuff ********************/
	jobacct_gather_g_aggregate(step_complete.jobacct, jobacct);
timeout:
	jobacct_gather_g_destroy(jobacct);
	/*********************************************/

	/* Send the return code and errno, we do this within the locked
	 * region to ensure that the stepd doesn't exit before we can
	 * perform this send. */
	safe_write(fd, &rc, sizeof(int));
	safe_write(fd, &errnum, sizeof(int));
	pthread_cond_signal(&step_complete.cond);
	pthread_mutex_unlock(&step_complete.lock);

	return SLURM_SUCCESS;

rwfail:
	/*
	 * safe_read()/safe_write() jump here on I/O failure.  The two reply
	 * writes above happen with step_complete.lock held, so release it
	 * (and wake any waiter, as the success path does) instead of
	 * returning with the mutex locked.
	 */
	if (lock_held) {
		pthread_cond_signal(&step_complete.cond);
		pthread_mutex_unlock(&step_complete.lock);
	}
	return SLURM_FAILURE;
}
/* return NULL if eof or syntax error occurs; * otherwise return a pointer to a new entry. */ entry * load_entry(FILE *file, void (*error_func)(const char *), struct passwd *pw, char **envp) { /* this function reads one crontab entry -- the next -- from a file. * it skips any leading blank lines, ignores comments, and returns * NULL if for any reason the entry can't be read and parsed. * * the entry is also parsed here. * * syntax: * user crontab: * minutes hours doms months dows cmd\n * system crontab (/etc/crontab): * minutes hours doms months dows USERNAME cmd\n */ ecode_e ecode = e_none; entry *e; int ch; char cmd[MAX_COMMAND]; char envstr[MAX_ENVSTR]; char **tenvp; Debug(DPARS, ("load_entry()...about to eat comments\n")); skip_comments(file); ch = get_char(file); if (ch == EOF) return (NULL); /* ch is now the first useful character of a useful line. * it may be an @special or it may be the first character * of a list of minutes. */ e = calloc(sizeof(*e), sizeof(char)); if (ch == '@') { /* all of these should be flagged and load-limited; i.e., * instead of @hourly meaning "0 * * * *" it should mean * "close to the front of every hour but not 'til the * system load is low". Problems are: how do you know * what "low" means? (save me from /etc/cron.conf!) and: * how to guarantee low variance (how low is low?), which * means how to we run roughly every hour -- seems like * we need to keep a history or let the first hour set * the schedule, which means we aren't load-limited * anymore. too much for my overloaded brain. 
(vix, jan90) * HINT */ ch = get_string(cmd, MAX_COMMAND, file, " \t\n"); if (!strcmp("reboot", cmd)) { e->flags |= WHEN_REBOOT; } else if (!strcmp("yearly", cmd) || !strcmp("annually", cmd)){ bit_set(e->minute, 0); bit_set(e->hour, 0); bit_set(e->dom, 0); bit_set(e->month, 0); bit_nset(e->dow, 0, (LAST_DOW-FIRST_DOW+1)); e->flags |= DOW_STAR; } else if (!strcmp("monthly", cmd)) { bit_set(e->minute, 0); bit_set(e->hour, 0); bit_set(e->dom, 0); bit_nset(e->month, 0, (LAST_MONTH-FIRST_MONTH+1)); bit_nset(e->dow, 0, (LAST_DOW-FIRST_DOW+1)); e->flags |= DOW_STAR; } else if (!strcmp("weekly", cmd)) { bit_set(e->minute, 0); bit_set(e->hour, 0); bit_nset(e->dom, 0, (LAST_DOM-FIRST_DOM+1)); bit_nset(e->month, 0, (LAST_MONTH-FIRST_MONTH+1)); bit_set(e->dow, 0); e->flags |= DOM_STAR; } else if (!strcmp("daily", cmd) || !strcmp("midnight", cmd)) { bit_set(e->minute, 0); bit_set(e->hour, 0); bit_nset(e->dom, 0, (LAST_DOM-FIRST_DOM+1)); bit_nset(e->month, 0, (LAST_MONTH-FIRST_MONTH+1)); bit_nset(e->dow, 0, (LAST_DOW-FIRST_DOW+1)); e->flags |= DOM_STAR | DOW_STAR; } else if (!strcmp("hourly", cmd)) { bit_set(e->minute, 0); bit_nset(e->hour, 0, (LAST_HOUR-FIRST_HOUR+1)); bit_nset(e->dom, 0, (LAST_DOM-FIRST_DOM+1)); bit_nset(e->month, 0, (LAST_MONTH-FIRST_MONTH+1)); bit_nset(e->dow, 0, (LAST_DOW-FIRST_DOW+1)); e->flags |= DOM_STAR | DOW_STAR; } else { ecode = e_timespec; goto eof; } /* Advance past whitespace between shortcut and * username/command. 
*/ Skip_Blanks(ch, file); if (ch == EOF || ch == '\n') { ecode = e_cmd; goto eof; } } else { Debug(DPARS, ("load_entry()...about to parse numerics\n")); if (ch == '*') e->flags |= MIN_STAR; ch = get_list(e->minute, FIRST_MINUTE, LAST_MINUTE, PPC_NULL, ch, file); if (ch == EOF) { ecode = e_minute; goto eof; } /* hours */ if (ch == '*') e->flags |= HR_STAR; ch = get_list(e->hour, FIRST_HOUR, LAST_HOUR, PPC_NULL, ch, file); if (ch == EOF) { ecode = e_hour; goto eof; } /* DOM (days of month) */ if (ch == '*') e->flags |= DOM_STAR; ch = get_list(e->dom, FIRST_DOM, LAST_DOM, PPC_NULL, ch, file); if (ch == EOF) { ecode = e_dom; goto eof; } /* month */ ch = get_list(e->month, FIRST_MONTH, LAST_MONTH, MonthNames, ch, file); if (ch == EOF) { ecode = e_month; goto eof; } /* DOW (days of week) */ if (ch == '*') e->flags |= DOW_STAR; ch = get_list(e->dow, FIRST_DOW, LAST_DOW, DowNames, ch, file); if (ch == EOF) { ecode = e_dow; goto eof; } } /* make sundays equivalent */ if (bit_test(e->dow, 0) || bit_test(e->dow, 7)) { bit_set(e->dow, 0); bit_set(e->dow, 7); } /* check for permature EOL and catch a common typo */ if (ch == '\n' || ch == '*') { ecode = e_cmd; goto eof; } /* ch is the first character of a command, or a username */ unget_char(ch, file); if (!pw) { char *username = cmd; /* temp buffer */ Debug(DPARS, ("load_entry()...about to parse username\n")); ch = get_string(username, MAX_COMMAND, file, " \t\n"); Debug(DPARS, ("load_entry()...got %s\n",username)); if (ch == EOF || ch == '\n' || ch == '*') { ecode = e_cmd; goto eof; } pw = getpwnam(username); if (pw == NULL) { ecode = e_username; goto eof; } Debug(DPARS, ("load_entry()...uid %ld, gid %ld\n", (long)pw->pw_uid, (long)pw->pw_gid)); } if ((e->pwd = pw_dup(pw)) == NULL) { ecode = e_memory; goto eof; } (void)memset(e->pwd->pw_passwd, 0, strlen(e->pwd->pw_passwd)); /* copy and fix up environment. some variables are just defaults and * others are overrides. 
*/ if ((e->envp = env_copy(envp)) == NULL) { ecode = e_memory; goto eof; } if (!env_get("SHELL", e->envp)) { if (glue_strings(envstr, sizeof envstr, "SHELL", _PATH_BSHELL, '=')) { if ((tenvp = env_set(e->envp, envstr)) == NULL) { ecode = e_memory; goto eof; } e->envp = tenvp; } else log_it("CRON", getpid(), "error", "can't set SHELL"); } if (!env_get("HOME", e->envp)) { if (glue_strings(envstr, sizeof envstr, "HOME", pw->pw_dir, '=')) { if ((tenvp = env_set(e->envp, envstr)) == NULL) { ecode = e_memory; goto eof; } e->envp = tenvp; } else log_it("CRON", getpid(), "error", "can't set HOME"); } /* If login.conf is in used we will get the default PATH later. */ if (!env_get("PATH", e->envp)) { if (glue_strings(envstr, sizeof envstr, "PATH", _PATH_DEFPATH, '=')) { if ((tenvp = env_set(e->envp, envstr)) == NULL) { ecode = e_memory; goto eof; } e->envp = tenvp; } else log_it("CRON", getpid(), "error", "can't set PATH"); } if (glue_strings(envstr, sizeof envstr, "LOGNAME", pw->pw_name, '=')) { if ((tenvp = env_set(e->envp, envstr)) == NULL) { ecode = e_memory; goto eof; } e->envp = tenvp; } else log_it("CRON", getpid(), "error", "can't set LOGNAME"); #if defined(BSD) || defined(__linux) if (glue_strings(envstr, sizeof envstr, "USER", pw->pw_name, '=')) { if ((tenvp = env_set(e->envp, envstr)) == NULL) { ecode = e_memory; goto eof; } e->envp = tenvp; } else log_it("CRON", getpid(), "error", "can't set USER"); #endif Debug(DPARS, ("load_entry()...about to parse command\n")); /* If the first character of the command is '-' it is a cron option. */ while ((ch = get_char(file)) == '-') { switch (ch = get_char(file)) { case 'q': e->flags |= DONT_LOG; Skip_Nonblanks(ch, file); break; default: ecode = e_option; goto eof; } Skip_Blanks(ch, file); if (ch == EOF || ch == '\n') { ecode = e_cmd; goto eof; } } unget_char(ch, file); /* Everything up to the next \n or EOF is part of the command... 
* too bad we don't know in advance how long it will be, since we * need to malloc a string for it... so, we limit it to MAX_COMMAND. */ ch = get_string(cmd, MAX_COMMAND, file, "\n"); /* a file without a \n before the EOF is rude, so we'll complain... */ if (ch == EOF) { ecode = e_cmd; goto eof; } /* got the command in the 'cmd' string; save it in *e. */ if ((e->cmd = strdup(cmd)) == NULL) { ecode = e_memory; goto eof; } Debug(DPARS, ("load_entry()...returning successfully\n")); /* success, fini, return pointer to the entry we just created... */ return (e); eof: if (e->envp) env_free(e->envp); if (e->pwd) free(e->pwd); if (e->cmd) free(e->cmd); free(e); while (ch != '\n' && !feof(file)) ch = get_char(file); if (ecode != e_none && error_func) (*error_func)(ecodes[(int)ecode]); return (NULL); }
/*
 * Handle a step-completion message arriving on fd.
 *
 * Reads a version word, the completed range (first, last), the step return
 * code and a packed accounting record from fd, then folds them into the
 * global step_complete state under step_complete.lock.  A return code /
 * errno pair is written back to fd as the reply.
 *
 * fd  - descriptor the request is read from and the reply written to
 * job - step record, used here only for log messages
 * uid - uid of the requester; must pass _slurm_authorized_user()
 *
 * Returns SLURM_SUCCESS once a reply has been written (including the
 * "permission denied" and "timed out" replies), SLURM_FAILURE if reading or
 * writing fd failed.
 */
static int
_handle_completion(int fd, stepd_step_rec_t *job, uid_t uid)
{
	int rc = SLURM_SUCCESS;
	int errnum = 0;
	int first;
	int last;
	jobacctinfo_t *jobacct = NULL;
	int step_rc;
	char* buf;
	int len;
	Buf buffer = NULL;
	int version;	/* For future use */
	bool lock_set = false;

	debug("_handle_completion for job %u.%u",
	      job->jobid, job->stepid);

	debug3(" uid = %d", uid);
	if (!_slurm_authorized_user(uid)) {
		debug("step completion message from uid %ld for job %u.%u ",
		      (long)uid, job->jobid, job->stepid);
		rc = -1;
		errnum = EPERM;
		/* Send the return code and errno */
		safe_write(fd, &rc, sizeof(int));
		safe_write(fd, &errnum, sizeof(int));
		return SLURM_SUCCESS;
	}

	safe_read(fd, &version, sizeof(int));
	safe_read(fd, &first, sizeof(int));
	safe_read(fd, &last, sizeof(int));
	safe_read(fd, &step_rc, sizeof(int));

	/*
	 * We must not use getinfo over a pipe with slurmd here
	 * Indeed, slurmstepd does a large use of setinfo over a pipe
	 * with slurmd and doing the reverse can result in a deadlock
	 * scenario with slurmd :
	 * slurmd(lockforread,write)/slurmstepd(write,lockforread)
	 * Do pack/unpack instead to be sure of independances of
	 * slurmd and slurmstepd
	 */
	safe_read(fd, &len, sizeof(int));
	buf = xmalloc(len);
	/*
	 * Wrap buf before filling it.  The only release path for buf in
	 * this function is free_buf(buffer), so creating the wrapper first
	 * lets the rwfail path release it when the read below fails.
	 */
	buffer = create_buf(buf, len);
	safe_read(fd, buf, len);
	jobacctinfo_unpack(&jobacct, SLURM_PROTOCOL_VERSION,
			   PROTOCOL_TYPE_SLURM, buffer, 1);
	free_buf(buffer);
	buffer = NULL;	/* already released; keep rwfail from freeing twice */

	/*
	 * Record the completed nodes
	 */
	pthread_mutex_lock(&step_complete.lock);
	lock_set = true;
	if (! step_complete.wait_children) {
		rc = -1;
		errnum = ETIMEDOUT; /* not used anyway */
		goto timeout;
	}

	/* SlurmUser or root can craft a launch without a valid credential
	 * ("srun --no-alloc ...") and no tree information can be built
	 *  without the hostlist from the credential.
	 */
	if (step_complete.rank >= 0) {
#if 0
		char bits_string[128];
		debug2("Setting range %d (bit %d) through %d(bit %d)",
		       first, first-(step_complete.rank+1),
		       last, last-(step_complete.rank+1));
		bit_fmt(bits_string, sizeof(bits_string), step_complete.bits);
		debug2(" before bits: %s", bits_string);
#endif
		/* Bit positions are offsets relative to rank + 1. */
		bit_nset(step_complete.bits,
			 first - (step_complete.rank+1),
			 last - (step_complete.rank+1));
#if 0
		bit_fmt(bits_string, sizeof(bits_string), step_complete.bits);
		debug2(" after bits: %s", bits_string);
#endif
	}
	step_complete.step_rc = MAX(step_complete.step_rc, step_rc);

	/************* acct stuff ********************/
	jobacctinfo_aggregate(step_complete.jobacct, jobacct);
timeout:
	jobacctinfo_destroy(jobacct);
	/*********************************************/

	/* Send the return code and errno, we do this within the locked
	 * region to ensure that the stepd doesn't exit before we can
	 * perform this send. */
	safe_write(fd, &rc, sizeof(int));
	safe_write(fd, &errnum, sizeof(int));
	pthread_cond_signal(&step_complete.cond);
	pthread_mutex_unlock(&step_complete.lock);

	return SLURM_SUCCESS;

rwfail:
	/* safe_read()/safe_write() jump here on I/O failure. */
	if (buffer)
		free_buf(buffer);	/* failed while reading the payload */
	if (lock_set) {
		pthread_cond_signal(&step_complete.cond);
		pthread_mutex_unlock(&step_complete.lock);
	}
	return SLURM_FAILURE;
}
extern int handle_small_record_request(List records, select_ba_request_t *blockreq, bg_record_t *bg_record, bitoff_t start) { bitstr_t *ionodes = bit_alloc(bg_conf->ionodes_per_mp); int i=0, ionode_cnt = 0; bg_record_t *found_record = NULL; xassert(records); xassert(blockreq); xassert(bg_record); xassert(start >= 0); xassert(start < bg_conf->ionodes_per_mp); #ifndef HAVE_BGL for(i=0; i<blockreq->small16; i++) { bit_nset(ionodes, start, start); found_record = create_small_record(bg_record, ionodes, 16); /* this needs to be an append so we keep things in the order we got them, they will be sorted later */ list_append(records, found_record); bit_nclear(ionodes, start, start); start++; } #endif if ((ionode_cnt = bg_conf->nodecard_ionode_cnt)) ionode_cnt--; for(i=0; i<blockreq->small32; i++) { bit_nset(ionodes, start, start+ionode_cnt); found_record = create_small_record(bg_record, ionodes, 32); /* this needs to be an append so we keep things in the order we got them, they will be sorted later */ list_append(records, found_record); bit_nclear(ionodes, start, start+ionode_cnt); start+=ionode_cnt+1; } #ifndef HAVE_BGL if ((ionode_cnt = bg_conf->nodecard_ionode_cnt * 2)) ionode_cnt--; for(i=0; i<blockreq->small64; i++) { bit_nset(ionodes, start, start+ionode_cnt); found_record = create_small_record(bg_record, ionodes, 64); /* this needs to be an append so we keep things in the order we got them, they will be sorted later */ list_append(records, found_record); bit_nclear(ionodes, start, start+ionode_cnt); start+=ionode_cnt+1; } #endif if ((ionode_cnt = bg_conf->quarter_ionode_cnt)) ionode_cnt--; for(i=0; i<blockreq->small128; i++) { bit_nset(ionodes, start, start+ionode_cnt); found_record = create_small_record(bg_record, ionodes, 128); /* this needs to be an append so we keep things in the order we got them, they will be sorted later */ list_append(records, found_record); bit_nclear(ionodes, start, start+ionode_cnt); start+=ionode_cnt+1; } #ifndef HAVE_BGL if ((ionode_cnt = 
bg_conf->quarter_ionode_cnt * 2)) ionode_cnt--; for(i=0; i<blockreq->small256; i++) { bit_nset(ionodes, start, start+ionode_cnt); found_record = create_small_record(bg_record, ionodes, 256); /* this needs to be an append so we keep things in the order we got them, they will be sorted later */ list_append(records, found_record); bit_nclear(ionodes, start, start+ionode_cnt); start+=ionode_cnt+1; } #endif FREE_NULL_BITMAP(ionodes); return SLURM_SUCCESS; }
extern int get_cpuinfo(uint16_t *p_cpus, uint16_t *p_boards, uint16_t *p_sockets, uint16_t *p_cores, uint16_t *p_threads, uint16_t *p_block_map_size, uint16_t **p_block_map, uint16_t **p_block_map_inv) { enum { SOCKET=0, CORE=1, PU=2, LAST_OBJ=3 }; hwloc_topology_t topology; hwloc_obj_t obj; hwloc_obj_type_t objtype[LAST_OBJ]; unsigned idx[LAST_OBJ]; int nobj[LAST_OBJ]; bitstr_t *used_socket = NULL; int *cores_per_socket; int actual_cpus; int macid; int absid; int actual_boards = 1, depth, sock_cnt, tot_socks = 0; int i, used_core_idx, used_sock_idx; debug2("hwloc_topology_init"); if (hwloc_topology_init(&topology)) { /* error in initialize hwloc library */ debug("hwloc_topology_init() failed."); return 1; } /* parse all system */ hwloc_topology_set_flags(topology, HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM); /* ignores cache, misc */ #if HWLOC_API_VERSION < 0x00020000 hwloc_topology_ignore_type(topology, HWLOC_OBJ_CACHE); hwloc_topology_ignore_type(topology, HWLOC_OBJ_MISC); #else hwloc_topology_set_type_filter(topology, HWLOC_OBJ_L1CACHE, HWLOC_TYPE_FILTER_KEEP_NONE); hwloc_topology_set_type_filter(topology, HWLOC_OBJ_L2CACHE, HWLOC_TYPE_FILTER_KEEP_NONE); hwloc_topology_set_type_filter(topology, HWLOC_OBJ_L3CACHE, HWLOC_TYPE_FILTER_KEEP_NONE); hwloc_topology_set_type_filter(topology, HWLOC_OBJ_L4CACHE, HWLOC_TYPE_FILTER_KEEP_NONE); hwloc_topology_set_type_filter(topology, HWLOC_OBJ_L5CACHE, HWLOC_TYPE_FILTER_KEEP_NONE); hwloc_topology_set_type_filter(topology, HWLOC_OBJ_MISC, HWLOC_TYPE_FILTER_KEEP_NONE); #endif /* load topology */ debug2("hwloc_topology_load"); if (hwloc_topology_load(topology)) { /* error in load hardware topology */ debug("hwloc_topology_load() failed."); hwloc_topology_destroy(topology); return 2; } #if _DEBUG _hwloc_children(topology, hwloc_get_root_obj(topology), 0); #endif /* * Some processors (e.g. AMD Opteron 6000 series) contain multiple * NUMA nodes per socket. 
This is a configuration which does not map * into the hardware entities that Slurm optimizes resource allocation * for (PU/thread, core, socket, baseboard, node and network switch). * In order to optimize resource allocations on such hardware, Slurm * will consider each NUMA node within the socket as a separate socket. * You can disable this configuring "SchedulerParameters=Ignore_NUMA", * in which case Slurm will report the correct socket count on the node, * but not be able to optimize resource allocations on the NUMA nodes. */ objtype[SOCKET] = HWLOC_OBJ_SOCKET; objtype[CORE] = HWLOC_OBJ_CORE; objtype[PU] = HWLOC_OBJ_PU; if (hwloc_get_type_depth(topology, HWLOC_OBJ_NODE) > hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET)) { char *sched_params = slurm_get_sched_params(); if (sched_params && strcasestr(sched_params, "Ignore_NUMA")) { info("Ignoring NUMA nodes within a socket"); } else { info("Considering each NUMA node as a socket"); objtype[SOCKET] = HWLOC_OBJ_NODE; } xfree(sched_params); } /* number of objects */ depth = hwloc_get_type_depth(topology, HWLOC_OBJ_GROUP); if (depth != HWLOC_TYPE_DEPTH_UNKNOWN) { actual_boards = MAX(hwloc_get_nbobjs_by_depth(topology, depth), 1); } /* * Count sockets/NUMA containing any cores. * KNL NUMA with no cores are NOT counted. 
*/ nobj[SOCKET] = 0; depth = hwloc_get_type_depth(topology, objtype[SOCKET]); used_socket = bit_alloc(_MAX_SOCKET_INX); cores_per_socket = xmalloc(sizeof(int) * _MAX_SOCKET_INX); sock_cnt = hwloc_get_nbobjs_by_depth(topology, depth); for (i = 0; i < sock_cnt; i++) { obj = hwloc_get_obj_by_depth(topology, depth, i); if (obj->type == objtype[SOCKET]) { cores_per_socket[i] = _core_child_count(topology, obj); if (cores_per_socket[i] > 0) { nobj[SOCKET]++; bit_set(used_socket, tot_socks); } if (++tot_socks >= _MAX_SOCKET_INX) { /* Bitmap size */ fatal("Socket count exceeds %d, expand data structure size", _MAX_SOCKET_INX); break; } } } nobj[CORE] = hwloc_get_nbobjs_by_type(topology, objtype[CORE]); /* * Workaround for hwloc bug, in some cases the topology "children" array * does not get populated, so _core_child_count() always returns 0 */ if (nobj[SOCKET] == 0) { nobj[SOCKET] = hwloc_get_nbobjs_by_type(topology, objtype[SOCKET]); if (nobj[SOCKET] == 0) { debug("get_cpuinfo() fudging nobj[SOCKET] from 0 to 1"); nobj[SOCKET] = 1; } if (nobj[SOCKET] >= _MAX_SOCKET_INX) { /* Bitmap size */ fatal("Socket count exceeds %d, expand data structure size", _MAX_SOCKET_INX); } bit_nset(used_socket, 0, nobj[SOCKET] - 1); } /* * Workaround for hwloc * hwloc_get_nbobjs_by_type() returns 0 on some architectures. 
*/ if ( nobj[CORE] == 0 ) { debug("get_cpuinfo() fudging nobj[CORE] from 0 to 1"); nobj[CORE] = 1; } if ( nobj[SOCKET] == -1 ) fatal("get_cpuinfo() can not handle nobj[SOCKET] = -1"); if ( nobj[CORE] == -1 ) fatal("get_cpuinfo() can not handle nobj[CORE] = -1"); actual_cpus = hwloc_get_nbobjs_by_type(topology, objtype[PU]); #if 0 /* Used to find workaround above */ info("CORE = %d SOCKET = %d actual_cpus = %d nobj[CORE] = %d", CORE, SOCKET, actual_cpus, nobj[CORE]); #endif if ((actual_cpus % nobj[CORE]) != 0) { error("Thread count (%d) not multiple of core count (%d)", actual_cpus, nobj[CORE]); } nobj[PU] = actual_cpus / nobj[CORE]; /* threads per core */ if ((nobj[CORE] % nobj[SOCKET]) != 0) { error("Core count (%d) not multiple of socket count (%d)", nobj[CORE], nobj[SOCKET]); } nobj[CORE] /= nobj[SOCKET]; /* cores per socket */ debug("CPUs:%d Boards:%d Sockets:%d CoresPerSocket:%d ThreadsPerCore:%d", actual_cpus, actual_boards, nobj[SOCKET], nobj[CORE], nobj[PU]); /* allocate block_map */ if (p_block_map_size) *p_block_map_size = (uint16_t)actual_cpus; if (p_block_map && p_block_map_inv) { *p_block_map = xmalloc(actual_cpus * sizeof(uint16_t)); *p_block_map_inv = xmalloc(actual_cpus * sizeof(uint16_t)); /* initialize default as linear mapping */ for (i = 0; i < actual_cpus; i++) { (*p_block_map)[i] = i; (*p_block_map_inv)[i] = i; } /* create map with hwloc */ used_sock_idx = -1; used_core_idx = -1; for (idx[SOCKET] = 0; (used_sock_idx + 1) < nobj[SOCKET]; idx[SOCKET]++) { if (!bit_test(used_socket, idx[SOCKET])) continue; used_sock_idx++; for (idx[CORE] = 0; idx[CORE] < cores_per_socket[idx[SOCKET]]; idx[CORE]++) { used_core_idx++; for (idx[PU]=0; idx[PU]<nobj[PU]; ++idx[PU]) { /* get hwloc_obj by indexes */ obj=hwloc_get_obj_below_array_by_type( topology, 3, objtype, idx); if (!obj) continue; macid = obj->os_index; absid = used_core_idx * nobj[PU] + idx[PU]; if ((macid >= actual_cpus) || (absid >= actual_cpus)) { /* physical or logical ID are * out of range */ 
continue; } debug4("CPU map[%d]=>%d S:C:T %d:%d:%d", absid, macid, used_sock_idx, idx[CORE], idx[PU]); (*p_block_map)[absid] = macid; (*p_block_map_inv)[macid] = absid; } } } } FREE_NULL_BITMAP(used_socket); xfree(cores_per_socket); hwloc_topology_destroy(topology); /* update output parameters */ *p_cpus = actual_cpus; *p_boards = actual_boards; *p_sockets = nobj[SOCKET]; *p_cores = nobj[CORE]; *p_threads = nobj[PU]; #if _DEBUG /*** Display raw data ***/ debug("CPUs:%u Boards:%u Sockets:%u CoresPerSocket:%u ThreadsPerCore:%u", *p_cpus, *p_boards, *p_sockets, *p_cores, *p_threads); /* Display the mapping tables */ if (p_block_map && p_block_map_inv) { debug("------"); debug("Abstract -> Machine logical CPU ID block mapping:"); debug("AbstractId PhysicalId Inverse"); for (i = 0; i < *p_cpus; i++) { debug3(" %4d %4u %4u", i, (*p_block_map)[i], (*p_block_map_inv)[i]); } debug("------"); } #endif return SLURM_SUCCESS; }
/* * Read and process the bluegene.conf configuration file so to interpret what * blocks are static/dynamic, torus/mesh, etc. */ extern int read_bg_conf(void) { int i; int count = 0; s_p_hashtbl_t *tbl = NULL; char *layout = NULL; select_ba_request_t **blockreq_array = NULL; image_t **image_array = NULL; image_t *image = NULL; static time_t last_config_update = (time_t) 0; struct stat config_stat; ListIterator itr = NULL; char* bg_conf_file = NULL; static int *dims = NULL; if (!dims) dims = select_g_ba_get_dims(); if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("Reading the bluegene.conf file"); /* check if config file has changed */ bg_conf_file = _get_bg_conf(); if (stat(bg_conf_file, &config_stat) < 0) fatal("can't stat bluegene.conf file %s: %m", bg_conf_file); if (last_config_update) { _reopen_bridge_log(); if (last_config_update == config_stat.st_mtime) { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("%s unchanged", bg_conf_file); } else { info("Restart slurmctld for %s changes " "to take effect", bg_conf_file); } last_config_update = config_stat.st_mtime; xfree(bg_conf_file); return SLURM_SUCCESS; } last_config_update = config_stat.st_mtime; /* initialization */ /* bg_conf defined in bg_node_alloc.h */ tbl = s_p_hashtbl_create(bg_conf_file_options); if (s_p_parse_file(tbl, NULL, bg_conf_file, false) == SLURM_ERROR) fatal("something wrong with opening/reading bluegene " "conf file"); xfree(bg_conf_file); #ifdef HAVE_BGL if (s_p_get_array((void ***)&image_array, &count, "AltBlrtsImage", tbl)) { for (i = 0; i < count; i++) { list_append(bg_conf->blrts_list, image_array[i]); image_array[i] = NULL; } } if (!s_p_get_string(&bg_conf->default_blrtsimage, "BlrtsImage", tbl)) { if (!list_count(bg_conf->blrts_list)) fatal("BlrtsImage not configured " "in bluegene.conf"); itr = list_iterator_create(bg_conf->blrts_list); image = list_next(itr); image->def = true; list_iterator_destroy(itr); bg_conf->default_blrtsimage = xstrdup(image->name); 
info("Warning: using %s as the default BlrtsImage. " "If this isn't correct please set BlrtsImage", bg_conf->default_blrtsimage); } else { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("default BlrtsImage %s", bg_conf->default_blrtsimage); image = xmalloc(sizeof(image_t)); image->name = xstrdup(bg_conf->default_blrtsimage); image->def = true; image->groups = NULL; /* we want it to be first */ list_push(bg_conf->blrts_list, image); } if (s_p_get_array((void ***)&image_array, &count, "AltLinuxImage", tbl)) { for (i = 0; i < count; i++) { list_append(bg_conf->linux_list, image_array[i]); image_array[i] = NULL; } } if (!s_p_get_string(&bg_conf->default_linuximage, "LinuxImage", tbl)) { if (!list_count(bg_conf->linux_list)) fatal("LinuxImage not configured " "in bluegene.conf"); itr = list_iterator_create(bg_conf->linux_list); image = list_next(itr); image->def = true; list_iterator_destroy(itr); bg_conf->default_linuximage = xstrdup(image->name); info("Warning: using %s as the default LinuxImage. 
" "If this isn't correct please set LinuxImage", bg_conf->default_linuximage); } else { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("default LinuxImage %s", bg_conf->default_linuximage); image = xmalloc(sizeof(image_t)); image->name = xstrdup(bg_conf->default_linuximage); image->def = true; image->groups = NULL; /* we want it to be first */ list_push(bg_conf->linux_list, image); } if (s_p_get_array((void ***)&image_array, &count, "AltRamDiskImage", tbl)) { for (i = 0; i < count; i++) { list_append(bg_conf->ramdisk_list, image_array[i]); image_array[i] = NULL; } } if (!s_p_get_string(&bg_conf->default_ramdiskimage, "RamDiskImage", tbl)) { if (!list_count(bg_conf->ramdisk_list)) fatal("RamDiskImage not configured " "in bluegene.conf"); itr = list_iterator_create(bg_conf->ramdisk_list); image = list_next(itr); image->def = true; list_iterator_destroy(itr); bg_conf->default_ramdiskimage = xstrdup(image->name); info("Warning: using %s as the default RamDiskImage. " "If this isn't correct please set RamDiskImage", bg_conf->default_ramdiskimage); } else { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("default RamDiskImage %s", bg_conf->default_ramdiskimage); image = xmalloc(sizeof(image_t)); image->name = xstrdup(bg_conf->default_ramdiskimage); image->def = true; image->groups = NULL; /* we want it to be first */ list_push(bg_conf->ramdisk_list, image); } #elif defined HAVE_BGP if (s_p_get_array((void ***)&image_array, &count, "AltCnloadImage", tbl)) { for (i = 0; i < count; i++) { list_append(bg_conf->linux_list, image_array[i]); image_array[i] = NULL; } } if (!s_p_get_string(&bg_conf->default_linuximage, "CnloadImage", tbl)) { if (!list_count(bg_conf->linux_list)) fatal("CnloadImage not configured " "in bluegene.conf"); itr = list_iterator_create(bg_conf->linux_list); image = list_next(itr); image->def = true; list_iterator_destroy(itr); bg_conf->default_linuximage = xstrdup(image->name); info("Warning: using %s as the default 
CnloadImage. " "If this isn't correct please set CnloadImage", bg_conf->default_linuximage); } else { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("default CnloadImage %s", bg_conf->default_linuximage); image = xmalloc(sizeof(image_t)); image->name = xstrdup(bg_conf->default_linuximage); image->def = true; image->groups = NULL; /* we want it to be first */ list_push(bg_conf->linux_list, image); } if (s_p_get_array((void ***)&image_array, &count, "AltIoloadImage", tbl)) { for (i = 0; i < count; i++) { list_append(bg_conf->ramdisk_list, image_array[i]); image_array[i] = NULL; } } if (!s_p_get_string(&bg_conf->default_ramdiskimage, "IoloadImage", tbl)) { if (!list_count(bg_conf->ramdisk_list)) fatal("IoloadImage not configured " "in bluegene.conf"); itr = list_iterator_create(bg_conf->ramdisk_list); image = list_next(itr); image->def = true; list_iterator_destroy(itr); bg_conf->default_ramdiskimage = xstrdup(image->name); info("Warning: using %s as the default IoloadImage. 
" "If this isn't correct please set IoloadImage", bg_conf->default_ramdiskimage); } else { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("default IoloadImage %s", bg_conf->default_ramdiskimage); image = xmalloc(sizeof(image_t)); image->name = xstrdup(bg_conf->default_ramdiskimage); image->def = true; image->groups = NULL; /* we want it to be first */ list_push(bg_conf->ramdisk_list, image); } #endif if (s_p_get_array((void ***)&image_array, &count, "AltMloaderImage", tbl)) { for (i = 0; i < count; i++) { list_append(bg_conf->mloader_list, image_array[i]); image_array[i] = NULL; } } if (!s_p_get_string(&bg_conf->default_mloaderimage, "MloaderImage", tbl)) { if (!list_count(bg_conf->mloader_list)) fatal("MloaderImage not configured " "in bluegene.conf"); itr = list_iterator_create(bg_conf->mloader_list); image = list_next(itr); image->def = true; list_iterator_destroy(itr); bg_conf->default_mloaderimage = xstrdup(image->name); info("Warning: using %s as the default MloaderImage. 
" "If this isn't correct please set MloaderImage", bg_conf->default_mloaderimage); } else { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("default MloaderImage %s", bg_conf->default_mloaderimage); image = xmalloc(sizeof(image_t)); image->name = xstrdup(bg_conf->default_mloaderimage); image->def = true; image->groups = NULL; /* we want it to be first */ list_push(bg_conf->mloader_list, image); } if (!s_p_get_uint16( &bg_conf->mp_cnode_cnt, "BasePartitionNodeCnt", tbl)) { error("BasePartitionNodeCnt not configured in bluegene.conf " "defaulting to 512 as BasePartitionNodeCnt"); bg_conf->mp_cnode_cnt = 512; bg_conf->quarter_cnode_cnt = 128; } else { if (bg_conf->mp_cnode_cnt <= 0) fatal("You should have more than 0 nodes " "per base partition"); bg_conf->quarter_cnode_cnt = bg_conf->mp_cnode_cnt/4; } /* bg_conf->cpus_per_mp should had already been set from the * node_init */ if (bg_conf->cpus_per_mp < bg_conf->mp_cnode_cnt) { fatal("For some reason we have only %u cpus per mp, but " "have %u cnodes per mp. You need at least the same " "number of cpus as you have cnodes per mp. " "Check the NodeName Procs= " "definition in the slurm.conf.", bg_conf->cpus_per_mp, bg_conf->mp_cnode_cnt); } bg_conf->cpu_ratio = bg_conf->cpus_per_mp/bg_conf->mp_cnode_cnt; if (!bg_conf->cpu_ratio) fatal("We appear to have less than 1 cpu on a cnode. 
" "You specified %u for BasePartitionNodeCnt " "in the blugene.conf and %u cpus " "for each node in the slurm.conf", bg_conf->mp_cnode_cnt, bg_conf->cpus_per_mp); num_unused_cpus = 1; for (i = 0; i<SYSTEM_DIMENSIONS; i++) num_unused_cpus *= dims[i]; num_unused_cpus *= bg_conf->cpus_per_mp; if (!s_p_get_uint16( &bg_conf->nodecard_cnode_cnt, "NodeCardNodeCnt", tbl)) { error("NodeCardNodeCnt not configured in bluegene.conf " "defaulting to 32 as NodeCardNodeCnt"); bg_conf->nodecard_cnode_cnt = 32; } if (bg_conf->nodecard_cnode_cnt<=0) fatal("You should have more than 0 nodes per nodecard"); bg_conf->mp_nodecard_cnt = bg_conf->mp_cnode_cnt / bg_conf->nodecard_cnode_cnt; if (!s_p_get_uint16(&bg_conf->ionodes_per_mp, "Numpsets", tbl)) fatal("Warning: Numpsets not configured in bluegene.conf"); if (!bg_conf->ionodes_per_mp) { if (!s_p_get_uint16(&bg_conf->ionodes_per_mp, "IONodesPerMP", tbl)) fatal("Warning: IONodesPerMP not configured " "in bluegene.conf"); } #ifdef HAVE_BGQ /* You can only have 16 ionodes per midplane */ if (bg_conf->ionodes_per_mp > bg_conf->mp_nodecard_cnt) bg_conf->ionodes_per_mp = bg_conf->mp_nodecard_cnt; #endif if (bg_conf->ionodes_per_mp) { bitstr_t *tmp_bitmap = NULL; int small_size = 1; /* THIS IS A HACK TO MAKE A 1 NODECARD SYSTEM WORK */ if (bg_conf->mp_cnode_cnt == bg_conf->nodecard_cnode_cnt) { bg_conf->quarter_ionode_cnt = 2; bg_conf->nodecard_ionode_cnt = 2; } else { bg_conf->quarter_ionode_cnt = bg_conf->ionodes_per_mp/4; bg_conf->nodecard_ionode_cnt = bg_conf->quarter_ionode_cnt/4; } /* How many nodecards per ionode */ bg_conf->nc_ratio = ((double)bg_conf->mp_cnode_cnt / (double)bg_conf->nodecard_cnode_cnt) / (double)bg_conf->ionodes_per_mp; /* How many ionodes per nodecard */ bg_conf->io_ratio = (double)bg_conf->ionodes_per_mp / ((double)bg_conf->mp_cnode_cnt / (double)bg_conf->nodecard_cnode_cnt); //info("got %f %f", bg_conf->nc_ratio, bg_conf->io_ratio); /* figure out the smallest block we can have on the system */ #ifdef HAVE_BGL if 
(bg_conf->io_ratio >= 1) bg_conf->smallest_block=32; else bg_conf->smallest_block=128; #else if (bg_conf->io_ratio >= 2) bg_conf->smallest_block=16; else if (bg_conf->io_ratio == 1) bg_conf->smallest_block=32; else if (bg_conf->io_ratio == .5) bg_conf->smallest_block=64; else if (bg_conf->io_ratio == .25) bg_conf->smallest_block=128; else if (bg_conf->io_ratio == .125) bg_conf->smallest_block=256; else { error("unknown ioratio %f. Can't figure out " "smallest block size, setting it to midplane", bg_conf->io_ratio); bg_conf->smallest_block = 512; } #endif if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("Smallest block possible on this system is %u", bg_conf->smallest_block); /* below we are creating all the possible bitmaps for * each size of small block */ if ((int)bg_conf->nodecard_ionode_cnt < 1) { bg_conf->nodecard_ionode_cnt = 0; } else { bg_lists->valid_small32 = list_create(_destroy_bitmap); if ((small_size = bg_conf->nodecard_ionode_cnt)) small_size--; i = 0; while (i<bg_conf->ionodes_per_mp) { tmp_bitmap = bit_alloc(bg_conf->ionodes_per_mp); bit_nset(tmp_bitmap, i, i+small_size); i += small_size+1; list_append(bg_lists->valid_small32, tmp_bitmap); } } /* If we only have 1 nodecard just jump to the end since this will never need to happen below. Pretty much a hack to avoid seg fault;). 
*/ if (bg_conf->mp_cnode_cnt == bg_conf->nodecard_cnode_cnt) goto no_calc; bg_lists->valid_small128 = list_create(_destroy_bitmap); if ((small_size = bg_conf->quarter_ionode_cnt)) small_size--; i = 0; while (i<bg_conf->ionodes_per_mp) { tmp_bitmap = bit_alloc(bg_conf->ionodes_per_mp); bit_nset(tmp_bitmap, i, i+small_size); i += small_size+1; list_append(bg_lists->valid_small128, tmp_bitmap); } #ifndef HAVE_BGL bg_lists->valid_small64 = list_create(_destroy_bitmap); if ((small_size = bg_conf->nodecard_ionode_cnt * 2)) small_size--; i = 0; while (i<bg_conf->ionodes_per_mp) { tmp_bitmap = bit_alloc(bg_conf->ionodes_per_mp); bit_nset(tmp_bitmap, i, i+small_size); i += small_size+1; list_append(bg_lists->valid_small64, tmp_bitmap); } bg_lists->valid_small256 = list_create(_destroy_bitmap); if ((small_size = bg_conf->quarter_ionode_cnt * 2)) small_size--; i = 0; while (i<bg_conf->ionodes_per_mp) { tmp_bitmap = bit_alloc(bg_conf->ionodes_per_mp); bit_nset(tmp_bitmap, i, i+small_size); i += small_size+1; list_append(bg_lists->valid_small256, tmp_bitmap); } #endif } else { fatal("your ionodes_per_mp is 0"); } no_calc: if (!s_p_get_uint16(&bg_conf->bridge_api_verb, "BridgeAPIVerbose", tbl)) info("Warning: BridgeAPIVerbose not configured " "in bluegene.conf"); if (!s_p_get_string(&bg_conf->bridge_api_file, "BridgeAPILogFile", tbl)) info("BridgeAPILogFile not configured in bluegene.conf"); else _reopen_bridge_log(); if (s_p_get_string(&layout, "DenyPassthrough", tbl)) { if (strstr(layout, "A")) ba_deny_pass |= PASS_DENY_A; if (strstr(layout, "X")) ba_deny_pass |= PASS_DENY_X; if (strstr(layout, "Y")) ba_deny_pass |= PASS_DENY_Y; if (strstr(layout, "Z")) ba_deny_pass |= PASS_DENY_Z; if (!strcasecmp(layout, "ALL")) ba_deny_pass |= PASS_DENY_ALL; bg_conf->deny_pass = ba_deny_pass; xfree(layout); } if (!s_p_get_string(&layout, "LayoutMode", tbl)) { info("Warning: LayoutMode was not specified in bluegene.conf " "defaulting to STATIC partitioning"); bg_conf->layout_mode = 
LAYOUT_STATIC; } else { if (!strcasecmp(layout,"STATIC")) bg_conf->layout_mode = LAYOUT_STATIC; else if (!strcasecmp(layout,"OVERLAP")) bg_conf->layout_mode = LAYOUT_OVERLAP; else if (!strcasecmp(layout,"DYNAMIC")) bg_conf->layout_mode = LAYOUT_DYNAMIC; else { fatal("I don't understand this LayoutMode = %s", layout); } xfree(layout); } /* add blocks defined in file */ if (bg_conf->layout_mode != LAYOUT_DYNAMIC) { if (!s_p_get_array((void ***)&blockreq_array, &count, "BPs", tbl)) { info("WARNING: no blocks defined in bluegene.conf, " "only making full system block"); /* create_full_system_block(NULL); */ } for (i = 0; i < count; i++) { add_bg_record(bg_lists->main, NULL, blockreq_array[i], 0, 0); } } s_p_hashtbl_destroy(tbl); return SLURM_SUCCESS; }
int main(int argc, char *argv[]) { note("Testing static decl"); { bitstr_t bit_decl(bs, 65); /*bitstr_t *bsp = bs;*/ bit_set(bs,9); bit_set(bs,14); TEST(bit_test(bs,9), "bit 9 set"); TEST(!bit_test(bs,12), "bit 12 not set"); TEST(bit_test(bs,14), "bit 14 set" ); /*bit_free(bsp);*/ /* triggers TEST in bit_free - OK */ } note("Testing basic vixie functions"); { bitstr_t *bs = bit_alloc(16), *bs2; /*bit_set(bs, 42);*/ /* triggers TEST in bit_set - OK */ bit_set(bs,9); bit_set(bs,14); TEST(bit_test(bs,9), "bit 9 set"); TEST(!bit_test(bs,12), "bit 12 not set" ); TEST(bit_test(bs,14), "bit 14 set"); bs2 = bit_copy(bs); bit_fill_gaps(bs2); TEST(bit_ffs(bs2) == 9, "first bit set = 9 "); TEST(bit_fls(bs2) == 14, "last bit set = 14"); TEST(bit_set_count(bs2) == 6, "bitstring"); TEST(bit_test(bs2,12), "bitstring"); TEST(bit_super_set(bs,bs2) == 1, "bitstring"); TEST(bit_super_set(bs2,bs) == 0, "bitstring"); bit_clear(bs,14); TEST(!bit_test(bs,14), "bitstring"); bit_nclear(bs,9,14); TEST(!bit_test(bs,9), "bitstring"); TEST(!bit_test(bs,12), "bitstring"); TEST(!bit_test(bs,14), "bitstring"); bit_nset(bs,9,14); TEST(bit_test(bs,9), "bitstring"); TEST(bit_test(bs,12), "bitstring"); TEST(bit_test(bs,14), "bitstring"); TEST(bit_ffs(bs) == 9, "ffs"); TEST(bit_ffc(bs) == 0, "ffc"); bit_nset(bs,0,8); TEST(bit_ffc(bs) == 15, "ffc"); bit_free(bs); /*bit_set(bs,9); */ /* triggers TEST in bit_set - OK */ } note("Testing and/or/not"); { bitstr_t *bs1 = bit_alloc(128); bitstr_t *bs2 = bit_alloc(128); bit_set(bs1, 100); bit_set(bs1, 104); bit_set(bs2, 100); bit_and(bs1, bs2); TEST(bit_test(bs1, 100), "and"); TEST(!bit_test(bs1, 104), "and"); bit_set(bs2, 110); bit_set(bs2, 111); bit_set(bs2, 112); bit_or(bs1, bs2); TEST(bit_test(bs1, 100), "or"); TEST(bit_test(bs1, 110), "or"); TEST(bit_test(bs1, 111), "or"); TEST(bit_test(bs1, 112), "or"); bit_not(bs1); TEST(!bit_test(bs1, 100), "not"); TEST(bit_test(bs1, 12), "not"); bit_free(bs1); bit_free(bs2); } note("testing bit selection"); { bitstr_t 
*bs1 = bit_alloc(128), *bs2; bit_set(bs1, 21); bit_set(bs1, 100); bit_fill_gaps(bs1); bs2 = bit_pick_cnt(bs1,20); if (bs2) { TEST(bit_set_count(bs2) == 20, "pick"); TEST(bit_ffs(bs2) == 21, "pick"); TEST(bit_fls(bs2) == 40, "pick"); bit_free(bs2); } else TEST(0, "alloc fail"); bit_free(bs1); } note("Testing realloc"); { bitstr_t *bs = bit_alloc(1); TEST(bit_ffs(bs) == -1, "bitstring"); bit_set(bs,0); /*bit_set(bs, 1000);*/ /* triggers TEST in bit_set - OK */ bs = bit_realloc(bs,1048576); bit_set(bs,1000); bit_set(bs,1048575); TEST(bit_test(bs, 0), "bitstring"); TEST(bit_test(bs, 1000), "bitstring"); TEST(bit_test(bs, 1048575), "bitstring"); TEST(bit_set_count(bs) == 3, "bitstring"); bit_clear(bs,0); bit_clear(bs,1000); TEST(bit_set_count(bs) == 1, "bitstring"); TEST(bit_ffs(bs) == 1048575, "bitstring"); bit_free(bs); } note("Testing bit_fmt"); { char tmpstr[1024]; bitstr_t *bs = bit_alloc(1024); TEST(!strcmp(bit_fmt(tmpstr,sizeof(tmpstr),bs), ""), "bitstring"); bit_set(bs,42); TEST(!strcmp(bit_fmt(tmpstr,sizeof(tmpstr),bs), "42"), "bitstring"); bit_set(bs,102); TEST(!strcmp(bit_fmt(tmpstr,sizeof(tmpstr),bs), "42,102"), "bitstring"); bit_nset(bs,9,14); TEST(!strcmp(bit_fmt(tmpstr,sizeof(tmpstr), bs), "9-14,42,102"), "bitstring"); } note("Testing bit_nffc/bit_nffs"); { bitstr_t *bs = bit_alloc(1024); bit_set(bs, 2); bit_set(bs, 6); bit_set(bs, 7); bit_nset(bs,12,1018); TEST(bit_nffc(bs, 2) == 0, "bitstring"); TEST(bit_nffc(bs, 3) == 3, "bitstring"); TEST(bit_nffc(bs, 4) == 8, "bitstring"); TEST(bit_nffc(bs, 5) == 1019, "bitstring"); TEST(bit_nffc(bs, 6) == -1, "bitstring"); TEST(bit_nffs(bs, 1) == 2, "bitstring"); TEST(bit_nffs(bs, 2) == 6, "bitstring"); TEST(bit_nffs(bs, 100) == 12, "bitstring"); TEST(bit_nffs(bs, 1023) == -1, "bitstring"); bit_free(bs); } note("Testing bit_unfmt"); { bitstr_t *bs = bit_alloc(1024); bitstr_t *bs2 = bit_alloc(1024); char tmpstr[4096]; bit_set(bs,1); bit_set(bs,3); bit_set(bs,30); bit_nset(bs,42,64); bit_nset(bs,97,1000); 
bit_fmt(tmpstr, sizeof(tmpstr), bs); TEST(bit_unfmt(bs2, tmpstr) != -1, "bitstring"); TEST(bit_equal(bs, bs2), "bitstring"); } totals(); return failed; }
/* * main - slurmctld main function, start various threads and process RPCs * test7.17.prog <TRES_PER_NODE> <CONFIG_DIR_HEAD> <CONFIG_SUB_DIR> <CPU_COUNT> * */ int main(int argc, char *argv[]) { log_options_t opts = LOG_OPTS_STDERR_ONLY; int rc; uint32_t cpu_count, cpu_alloc, job_id = 12345; char *node_name, *reason_down = NULL; char *orig_config, *new_config = NULL, *tres_per_node = NULL; Buf buffer; List job_gres_list = NULL, node_gres_list = NULL; bitstr_t *cpu_bitmap; char config_dir[10000], test[1000]; char slurm_conf[1000]; uint32_t num_tasks = 1; uint32_t min_nodes = 1; uint32_t max_nodes = 1; uint16_t ntasks_per_node = NO_VAL16; uint16_t ntasks_per_socket = NO_VAL16; uint16_t sockets_per_node = NO_VAL16; uint16_t cpus_per_task = NO_VAL16; int core_count, sock_count; /* Setup slurm.conf and gres.conf test paths */ strcpy(config_dir, argv[2]); strcpy(config_dir,strcat(config_dir, "/test7.17_configs")); strcpy(test, strcat(config_dir, argv[3])); strcpy(slurm_conf, strcat(test, "/slurm.conf")); /* Enable detailed logging for now */ opts.stderr_level = LOG_LEVEL_DEBUG; log_init(argv[0], opts, SYSLOG_FACILITY_USER, NULL); /* * Logic normally executed by slurmd daemon */ setenv("SLURM_CONF", slurm_conf, 1); rc = gres_plugin_init(); if (rc != SLURM_SUCCESS) { slurm_perror("failure: gres_plugin_init"); exit(1); } setenv("SLURM_CONFIG_DIR", config_dir, 1); cpu_count = strtol(argv[4], NULL, 10); node_name = "test_node"; rc = gres_plugin_node_config_load(cpu_count, node_name, NULL, NULL, NULL); if (rc != SLURM_SUCCESS) { slurm_perror("failure: gres_plugin_node_config_load"); exit(1); } buffer = init_buf(1024); rc = gres_plugin_node_config_pack(buffer); if (rc != SLURM_SUCCESS) { slurm_perror("failure: gres_plugin_node_config_pack"); exit(1); } /* * Logic normally executed by slurmctld daemon */ orig_config = "gpu:8"; rc = gres_plugin_init_node_config(node_name, orig_config, &node_gres_list); if (rc != SLURM_SUCCESS) { slurm_perror("failure: 
gres_plugin_init_node_config"); exit(1); } set_buf_offset(buffer, 0); rc = gres_plugin_node_config_unpack(buffer, node_name); if (rc != SLURM_SUCCESS) { slurm_perror("failure: gres_plugin_node_config_unpack"); exit(1); } core_count = cpu_count; sock_count = 1; rc = gres_plugin_node_config_validate(node_name, orig_config, &new_config, &node_gres_list, cpu_count, core_count, sock_count, 0, &reason_down); if (rc != SLURM_SUCCESS) { slurm_perror("failure: gres_plugin_node_config_validate"); exit(1); } if (argc > 2) tres_per_node = xstrdup(argv[1]); rc = gres_plugin_job_state_validate(NULL, /* cpus_per_tres */ NULL, /* tres_freq */ NULL, /* tres_per_job */ tres_per_node, NULL, /* tres_per_socket */ NULL, /* tres_per_task */ NULL, /* mem_per_tres */ &num_tasks, &min_nodes, &max_nodes, &ntasks_per_node, &ntasks_per_socket, &sockets_per_node, &cpus_per_task, &job_gres_list); if (rc != SLURM_SUCCESS) { slurm_seterrno(rc); slurm_perror("failure: gres_plugin_job_state_validate"); exit(1); } gres_plugin_node_state_log(node_gres_list, node_name); gres_plugin_job_state_log(job_gres_list, job_id); cpu_bitmap = bit_alloc(cpu_count); bit_nset(cpu_bitmap, 0, cpu_count - 1); cpu_alloc = gres_plugin_job_test(job_gres_list, node_gres_list, true, cpu_bitmap, 0, cpu_count - 1, job_id, node_name); if (cpu_alloc == NO_VAL) printf("cpu_alloc=ALL\n"); else printf("cpu_alloc=%u\n", cpu_alloc); rc = gres_plugin_fini(); if (rc != SLURM_SUCCESS) { slurm_perror("failure: gres_plugin_fini"); exit(1); } printf("Test %s ran to completion\n\n", argv[3]); exit(0); }
/*
 * down_nodecard - handle a nodecard going down on midplane mp_name.
 *
 * Builds a temporary record covering ionodes io_start..io_start+io_cnt of the
 * midplane, fails any job running on a block that overlaps it, and then
 * either marks a suitable existing block as being in an error state, drains
 * the whole midplane, or (dynamic layout only) creates small blocks to fill
 * the midplane and puts the one covering the bad nodecard in an error state.
 *
 * This could potentially lock the node lock in the slurmctld with
 * slurm_drain_node, or slurm_fail_job so if slurmctld_locked is called we
 * will call the functions without locking the locks again.
 *
 * IN mp_name          - name of the midplane (must not be NULL)
 * IN io_start         - first ionode of the failed nodecard
 * IN slurmctld_locked - true to use job_fail()/drain_nodes(), false to use
 *                       slurm_fail_job()/slurm_drain_nodes()
 * RET EINVAL if the midplane is unknown or the ionode range is outside the
 *     configured ionodes_per_mp, SLURM_NO_CHANGE_IN_DATA if the selected
 *     block is already flagged BG_BLOCK_ERROR_FLAG, SLURM_ERROR if the list
 *     of undersized dynamic blocks turned out empty, otherwise SLURM_SUCCESS
 *     or the result of put_block_in_error_state().
 *
 * NOTE: io_cnt and create_size are static and computed from bg_conf only on
 * the first call.
 */
extern int down_nodecard(char *mp_name, bitoff_t io_start,
			 bool slurmctld_locked)
{
	List requests = NULL;
	List delete_list = NULL;
	ListIterator itr = NULL;
	bg_record_t *bg_record = NULL, *found_record = NULL, tmp_record;
	bg_record_t *smallest_bg_record = NULL;
	struct node_record *node_ptr = NULL;
	int mp_bit = 0;
	static int io_cnt = NO_VAL;
	static int create_size = NO_VAL;
	static select_ba_request_t blockreq;
	int rc = SLURM_SUCCESS;
	char *reason = "select_bluegene: nodecard down";

	xassert(mp_name);

	if (io_cnt == NO_VAL) {
		io_cnt = 1;
		/* Translate 1 nodecard count to ionode count */
		if ((io_cnt *= bg_conf->io_ratio))
			io_cnt--;

		/* make sure we create something that is able to be
		   created */
		if (bg_conf->smallest_block < bg_conf->nodecard_cnode_cnt)
			create_size = bg_conf->nodecard_cnode_cnt;
		else
			create_size = bg_conf->smallest_block;
	}

	node_ptr = find_node_record(mp_name);
	if (!node_ptr) {
		error ("down_sub_node_blocks: invalid node specified '%s'",
		       mp_name);
		return EINVAL;
	}

	/* this is here for sanity check to make sure we don't core on
	   these bits when we set them below. */
	if (io_start >= bg_conf->ionodes_per_mp
	    || (io_start+io_cnt) >= bg_conf->ionodes_per_mp) {
		debug("io %d-%d not configured on this "
		      "system, only %d ionodes per midplane",
		      io_start, io_start+io_cnt, bg_conf->ionodes_per_mp);
		return EINVAL;
	}
	mp_bit = (node_ptr - node_record_table_ptr);

	memset(&blockreq, 0, sizeof(select_ba_request_t));

	blockreq.conn_type[0] = SELECT_SMALL;
	blockreq.save_name = mp_name;

	debug3("here setting node %d of %d and ionodes %d-%d of %d",
	       mp_bit, node_record_count, io_start,
	       io_start+io_cnt, bg_conf->ionodes_per_mp);

	/* temporary record describing just the failed nodecard; its bitmaps
	   are released at the cleanup label */
	memset(&tmp_record, 0, sizeof(bg_record_t));
	tmp_record.mp_count = 1;
	tmp_record.cnode_cnt = bg_conf->nodecard_cnode_cnt;
	tmp_record.mp_bitmap = bit_alloc(node_record_count);
	bit_set(tmp_record.mp_bitmap, mp_bit);

	tmp_record.ionode_bitmap = bit_alloc(bg_conf->ionodes_per_mp);
	bit_nset(tmp_record.ionode_bitmap, io_start, io_start+io_cnt);

	slurm_mutex_lock(&block_state_mutex);
	itr = list_iterator_create(bg_lists->main);
	while ((bg_record = list_next(itr))) {
		if (!bit_test(bg_record->mp_bitmap, mp_bit))
			continue;

		if (!blocks_overlap(bg_record, &tmp_record))
			continue;

		if (bg_record->job_running > NO_JOB_RUNNING) {
			if (slurmctld_locked)
				job_fail(bg_record->job_running);
			else
				slurm_fail_job(bg_record->job_running);
		}

		/* If Running Dynamic mode and the block is
		   smaller than the create size just continue on.
		*/
		if ((bg_conf->layout_mode == LAYOUT_DYNAMIC)
		    && (bg_record->cnode_cnt < create_size)) {
			if (!delete_list)
				delete_list = list_create(NULL);
			list_append(delete_list, bg_record);
			continue;
		}

		/* keep track of the smallest size that is at least
		   the size of create_size. */
		if (!smallest_bg_record ||
		    (smallest_bg_record->cnode_cnt > bg_record->cnode_cnt))
			smallest_bg_record = bg_record;
	}
	list_iterator_destroy(itr);
	slurm_mutex_unlock(&block_state_mutex);

	if (bg_conf->layout_mode != LAYOUT_DYNAMIC) {
		debug3("running non-dynamic mode");
		/* This should never happen, but just in case... */
		if (delete_list)
			list_destroy(delete_list);

		/* If we found a block that is smaller or equal to a
		   midplane we will just mark it in an error state as
		   opposed to draining the node.
		*/
		if (smallest_bg_record
		    && (smallest_bg_record->cnode_cnt < bg_conf->mp_cnode_cnt)){
			if (smallest_bg_record->state & BG_BLOCK_ERROR_FLAG) {
				rc = SLURM_NO_CHANGE_IN_DATA;
				goto cleanup;
			}

			rc = put_block_in_error_state(
				smallest_bg_record, reason);
			goto cleanup;
		}

		debug("No block under 1 midplane available for this nodecard. "
		      "Draining the whole node.");
		if (!node_already_down(mp_name)) {
			if (slurmctld_locked)
				drain_nodes(mp_name, reason,
					    slurm_get_slurm_user_id());
			else
				slurm_drain_nodes(mp_name, reason,
						  slurm_get_slurm_user_id());
		}
		rc = SLURM_SUCCESS;
		goto cleanup;
	}

	/* below is only for Dynamic mode */

	if (delete_list) {
		int cnt_set = 0;
		bitstr_t *iobitmap = bit_alloc(bg_conf->ionodes_per_mp);
		/* don't lock here since it is handled inside
		   the put_block_in_error_state
		*/
		itr = list_iterator_create(delete_list);
		while ((bg_record = list_next(itr))) {
			debug2("combining smaller than nodecard "
			       "dynamic block %s",
			       bg_record->bg_block_id);
			/* wait for the job failed above to leave the block */
			while (bg_record->job_running > NO_JOB_RUNNING)
				sleep(1);

			bit_or(iobitmap, bg_record->ionode_bitmap);
			cnt_set++;
		}
		list_iterator_destroy(itr);
		list_destroy(delete_list);
		if (!cnt_set) {
			FREE_NULL_BITMAP(iobitmap);
			rc = SLURM_ERROR;
			goto cleanup;
		}
		/* set the start to be the same as the start of the
		   ionode_bitmap. If no ionodes set (not a small
		   block) set io_start = 0. */
		if ((io_start = bit_ffs(iobitmap)) == -1) {
			io_start = 0;
			if (create_size > bg_conf->nodecard_cnode_cnt)
				blockreq.small128 = 4;
			else
				blockreq.small32 = 16;
		} else if (create_size <= bg_conf->nodecard_cnode_cnt)
			blockreq.small32 = 1;
		else
			/* this should never happen */
			blockreq.small128 = 1;

		FREE_NULL_BITMAP(iobitmap);
	} else if (smallest_bg_record) {
		debug2("smallest dynamic block is %s",
		       smallest_bg_record->bg_block_id);
		if (smallest_bg_record->state & BG_BLOCK_ERROR_FLAG) {
			rc = SLURM_NO_CHANGE_IN_DATA;
			goto cleanup;
		}

		while (smallest_bg_record->job_running > NO_JOB_RUNNING)
			sleep(1);

		if (smallest_bg_record->cnode_cnt == create_size) {
			rc = put_block_in_error_state(
				smallest_bg_record, reason);
			goto cleanup;
		}

		if (create_size > smallest_bg_record->cnode_cnt) {
			/* we should never get here. This means we
			 * have a create_size that is bigger than a
			 * block that is already made.
			 */
			rc = put_block_in_error_state(
				smallest_bg_record, reason);
			goto cleanup;
		}
		debug3("node count is %d", smallest_bg_record->cnode_cnt);
		/* how many 32-cnode blocks are needed to replace the
		   existing block */
		switch(smallest_bg_record->cnode_cnt) {
#ifndef HAVE_BGL
		case 64:
			blockreq.small32 = 2;
			break;
		case 256:
			blockreq.small32 = 8;
			break;
#endif
		case 128:
			blockreq.small32 = 4;
			break;
		case 512:
		default:
			blockreq.small32 = 16;
			break;
		}

		if (create_size != bg_conf->nodecard_cnode_cnt) {
			blockreq.small128 = blockreq.small32 / 4;
			blockreq.small32 = 0;
			io_start = 0;
		} else if ((io_start =
			    bit_ffs(smallest_bg_record->ionode_bitmap)) == -1)
			/* set the start to be the same as the start of the
			   ionode_bitmap. If no ionodes set (not a small
			   block) set io_start = 0. */
			io_start = 0;
	} else {
		/* no block exists on this midplane: fill it with blocks of
		   create_size */
		switch(create_size) {
#ifndef HAVE_BGL
		case 64:
			blockreq.small64 = 8;
			break;
		case 256:
			blockreq.small256 = 2;
			/* must not fall through: two 256-cnode blocks
			   already fill the midplane */
			break;
#endif
		case 32:
			blockreq.small32 = 16;
			break;
		case 128:
			blockreq.small128 = 4;
			break;
		case 512:
			if (!node_already_down(mp_name)) {
				if (slurmctld_locked)
					drain_nodes(mp_name, reason,
						    slurm_get_slurm_user_id());
				else
					slurm_drain_nodes(
						mp_name, reason,
						slurm_get_slurm_user_id());
			}
			rc = SLURM_SUCCESS;
			goto cleanup;
		default:
			error("Unknown create size of %d", create_size);
			break;
		}
		/* since we don't have a block in this midplane
		   we need to start at the beginning. */
		io_start = 0;
		/* we also need a bg_block to pretend to be the
		   smallest block that takes up the entire midplane. */
	}

	/* Here we need to add blocks that take up nodecards on this
	   midplane. Since Slurm only keeps track of midplanes
	   natively this is the only way to handle this case.
	*/
	requests = list_create(destroy_bg_record);
	add_bg_record(requests, NULL, &blockreq, 1, io_start);

	slurm_mutex_lock(&block_state_mutex);
	delete_list = list_create(NULL);
	while ((bg_record = list_pop(requests))) {
		/* move every existing block the new one overlaps onto the
		   delete list */
		itr = list_iterator_create(bg_lists->main);
		while ((found_record = list_next(itr))) {
			if (!blocks_overlap(bg_record, found_record))
				continue;
			list_push(delete_list, found_record);
			list_remove(itr);
		}
		list_iterator_destroy(itr);

		/* we need to add this record since it doesn't exist */
		if (bridge_block_create(bg_record) == SLURM_ERROR) {
			destroy_bg_record(bg_record);
			error("down_sub_node_blocks: "
			      "unable to configure block in api");
			continue;
		}

		debug("adding block %s to fill in small blocks "
		      "around bad nodecards",
		      bg_record->bg_block_id);
		print_bg_record(bg_record);
		list_append(bg_lists->main, bg_record);
		if (bit_overlap(bg_record->ionode_bitmap,
				tmp_record.ionode_bitmap)) {
			/* here we know the error block doesn't exist
			   so just set the state here */
			slurm_mutex_unlock(&block_state_mutex);
			rc = put_block_in_error_state(bg_record, reason);
			slurm_mutex_lock(&block_state_mutex);
		}
	}
	list_destroy(requests);

	if (delete_list) {
		slurm_mutex_unlock(&block_state_mutex);
		free_block_list(NO_VAL, delete_list, 0, 0);
		list_destroy(delete_list);
	}
	slurm_mutex_lock(&block_state_mutex);
	sort_bg_record_inc_size(bg_lists->main);
	slurm_mutex_unlock(&block_state_mutex);
	last_bg_update = time(NULL);

cleanup:
	FREE_NULL_BITMAP(tmp_record.mp_bitmap);
	FREE_NULL_BITMAP(tmp_record.ionode_bitmap);

	return rc;
}
/* To effectively deal with heterogeneous nodes, we fake a cyclic * distribution to figure out how many cpus are needed on each node. * * This routine is a slightly modified "version" of the routine * _task_layout_block in src/common/dist_tasks.c. We do not need to * assign tasks to job->hostid[] and job->tids[][] at this point so * the cpu allocation is the same for cyclic and block. * * For the consumable resources support we need to determine what * "node/CPU/Core/thread"-tuplets will be allocated for a given job. * In the past we assumed that we only allocated one task per CPU (at * that point the lowest level of logical processor) and didn't allow * the use of overcommit. We have changed this philosophy and are now * allowing people to overcommit their resources and expect the system * administrator to enable the task/affinity plug-in which will then * bind all of a job's tasks to its allocated resources thereby * avoiding interference between co-allocated running jobs. * * In the consumable resources environment we need to determine the * layout schema within slurmctld. * * We have a core_bitmap of all available cores. 
All we're doing here * is removing cores that are not needed based on the task count, and * the choice of cores to remove is based on the distribution: * - "cyclic" removes cores "evenly", starting from the last socket, * - "block" removes cores from the "last" socket(s) * - "plane" removes cores "in chunks" */ extern int cr_dist(struct job_record *job_ptr, const uint16_t cr_type) { int error_code, cr_cpu = 1; if (((job_ptr->job_resrcs->node_req == NODE_CR_RESERVED) || (job_ptr->details->whole_node != 0)) && (job_ptr->details->core_spec == 0)) { /* the job has been allocated an EXCLUSIVE set of nodes, * so it gets all of the bits in the core_bitmap and * all of the available CPUs in the cpus array */ int size = bit_size(job_ptr->job_resrcs->core_bitmap); bit_nset(job_ptr->job_resrcs->core_bitmap, 0, size-1); return SLURM_SUCCESS; } _log_select_maps("cr_dist/start", job_ptr->job_resrcs->node_bitmap, job_ptr->job_resrcs->core_bitmap); if (job_ptr->details->task_dist == SLURM_DIST_PLANE) { /* perform a plane distribution on the 'cpus' array */ error_code = _compute_plane_dist(job_ptr); if (error_code != SLURM_SUCCESS) { error("cons_res: cr_dist: Error in " "_compute_plane_dist"); return error_code; } } else { /* perform a cyclic distribution on the 'cpus' array */ error_code = _compute_c_b_task_dist(job_ptr); if (error_code != SLURM_SUCCESS) { error("cons_res: cr_dist: Error in " "_compute_c_b_task_dist"); return error_code; } } /* now sync up the core_bitmap with the allocated 'cpus' array * based on the given distribution AND resource setting */ if ((cr_type & CR_CORE) || (cr_type & CR_SOCKET)) cr_cpu = 0; if (cr_cpu) { _block_sync_core_bitmap(job_ptr, cr_type); return SLURM_SUCCESS; } /* * If SelectTypeParameters mentions to use a block distribution for * cores by default, use that kind of distribution if no particular * cores distribution specified. 
* Note : cyclic cores distribution, which is the default, is treated * by the next code block */ if ( slurmctld_conf.select_type_param & CR_CORE_DEFAULT_DIST_BLOCK ) { switch(job_ptr->details->task_dist) { case SLURM_DIST_ARBITRARY: case SLURM_DIST_BLOCK: case SLURM_DIST_CYCLIC: case SLURM_DIST_UNKNOWN: _block_sync_core_bitmap(job_ptr, cr_type); return SLURM_SUCCESS; } } /* Determine the number of logical processors per node needed * for this job. Make sure below matches the layouts in * lllp_distribution in plugins/task/affinity/dist_task.c (FIXME) */ switch(job_ptr->details->task_dist) { case SLURM_DIST_BLOCK_BLOCK: case SLURM_DIST_CYCLIC_BLOCK: case SLURM_DIST_PLANE: _block_sync_core_bitmap(job_ptr, cr_type); break; case SLURM_DIST_ARBITRARY: case SLURM_DIST_BLOCK: case SLURM_DIST_CYCLIC: case SLURM_DIST_BLOCK_CYCLIC: case SLURM_DIST_CYCLIC_CYCLIC: case SLURM_DIST_BLOCK_CFULL: case SLURM_DIST_CYCLIC_CFULL: case SLURM_DIST_UNKNOWN: error_code = _cyclic_sync_core_bitmap(job_ptr, cr_type); break; default: error("select/cons_res: invalid task_dist entry"); return SLURM_ERROR; } _log_select_maps("cr_dist/fini", job_ptr->job_resrcs->node_bitmap, job_ptr->job_resrcs->core_bitmap); return error_code; }
/* To effectively deal with heterogeneous nodes, we fake a cyclic * distribution to figure out how many cpus are needed on each node. * * This routine is a slightly modified "version" of the routine * _task_layout_block in src/common/dist_tasks.c. We do not need to * assign tasks to job->hostid[] and job->tids[][] at this point so * the cpu allocation is the same for cyclic and block. * * For the consumable resources support we need to determine what * "node/CPU/Core/thread"-tuplets will be allocated for a given job. * In the past we assumed that we only allocated one task per CPU (at * that point the lowest level of logical processor) and didn't allow * the use of overcommit. We have changed this philosophy and are now * allowing people to overcommit their resources and expect the system * administrator to enable the task/affinity plug-in which will then * bind all of a job's tasks to its allocated resources thereby * avoiding interference between co-allocated running jobs. * * In the consumable resources environment we need to determine the * layout schema within slurmctld. * * We have a core_bitmap of all available cores. All we're doing here * is removing cores that are not needed based on the task count, and * the choice of cores to remove is based on the distribution: * - "cyclic" removes cores "evenly", starting from the last socket, * - "block" removes cores from the "last" socket(s) * - "plane" removes cores "in chunks" * * IN job_ptr - job to be allocated resources * IN cr_type - allocation type (sockets, cores, etc.) * IN preempt_mode - true if testing with simulated preempted jobs */ extern int cr_dist(struct job_record *job_ptr, const uint16_t cr_type, bool preempt_mode) { int error_code, cr_cpu = 1; if (job_ptr->details->core_spec != (uint16_t) NO_VAL) { /* The job has been allocated all non-specialized cores, * so we don't need to select specific CPUs. 
*/ return SLURM_SUCCESS; } if ((job_ptr->job_resrcs->node_req == NODE_CR_RESERVED) || (job_ptr->details->whole_node == 1)) { int n, i; job_resources_t *job_res = job_ptr->job_resrcs; /* The job has been allocated an EXCLUSIVE set of nodes, * so it gets all of the bits in the core_bitmap and * all of the available CPUs in the cpus array. */ int size = bit_size(job_res->core_bitmap); bit_nset(job_res->core_bitmap, 0, size-1); /* Up to this point we might not have the job_res pointer have * the right cpu count. It is most likely a core count. We * will fix that so we can layout tasks correctly. */ size = bit_size(job_res->node_bitmap); for (i = 0, n = bit_ffs(job_res->node_bitmap); n < size; n++) { if (bit_test(job_res->node_bitmap, n) == 0) continue; job_res->cpus[i++] = select_node_record[n].cpus; } return SLURM_SUCCESS; } _log_select_maps("cr_dist/start", job_ptr->job_resrcs->node_bitmap, job_ptr->job_resrcs->core_bitmap); if ((job_ptr->details->task_dist & SLURM_DIST_STATE_BASE) == SLURM_DIST_PLANE) { /* perform a plane distribution on the 'cpus' array */ error_code = _compute_plane_dist(job_ptr); if (error_code != SLURM_SUCCESS) { error("cons_res: cr_dist: Error in " "_compute_plane_dist"); return error_code; } } else { /* perform a cyclic distribution on the 'cpus' array */ error_code = _compute_c_b_task_dist(job_ptr); if (error_code != SLURM_SUCCESS) { error("cons_res: cr_dist: Error in " "_compute_c_b_task_dist"); return error_code; } } /* now sync up the core_bitmap with the allocated 'cpus' array * based on the given distribution AND resource setting */ if ((cr_type & CR_CORE) || (cr_type & CR_SOCKET)) cr_cpu = 0; if (cr_cpu) { _block_sync_core_bitmap(job_ptr, cr_type); return SLURM_SUCCESS; } /* * If SelectTypeParameters mentions to use a block distribution for * cores by default, use that kind of distribution if no particular * cores distribution specified. 
* Note : cyclic cores distribution, which is the default, is treated * by the next code block */ if ( slurmctld_conf.select_type_param & CR_CORE_DEFAULT_DIST_BLOCK ) { switch(job_ptr->details->task_dist & SLURM_DIST_NODEMASK) { case SLURM_DIST_ARBITRARY: case SLURM_DIST_BLOCK: case SLURM_DIST_CYCLIC: case SLURM_DIST_UNKNOWN: _block_sync_core_bitmap(job_ptr, cr_type); return SLURM_SUCCESS; } } /* Determine the number of logical processors per node needed * for this job. Make sure below matches the layouts in * lllp_distribution in plugins/task/affinity/dist_task.c (FIXME) */ switch(job_ptr->details->task_dist & SLURM_DIST_NODESOCKMASK) { case SLURM_DIST_BLOCK_BLOCK: case SLURM_DIST_CYCLIC_BLOCK: case SLURM_DIST_PLANE: _block_sync_core_bitmap(job_ptr, cr_type); break; case SLURM_DIST_ARBITRARY: case SLURM_DIST_BLOCK: case SLURM_DIST_CYCLIC: case SLURM_DIST_BLOCK_CYCLIC: case SLURM_DIST_CYCLIC_CYCLIC: case SLURM_DIST_BLOCK_CFULL: case SLURM_DIST_CYCLIC_CFULL: case SLURM_DIST_UNKNOWN: error_code = _cyclic_sync_core_bitmap(job_ptr, cr_type, preempt_mode); break; default: error("select/cons_res: invalid task_dist entry"); return SLURM_ERROR; } _log_select_maps("cr_dist/fini", job_ptr->job_resrcs->node_bitmap, job_ptr->job_resrcs->core_bitmap); return error_code; }