/*
 * Synchronize BG block state to that of currently active jobs.
 * This can recover from slurmctld crashes when block usership
 * changes were queued.
 *
 * IN job_list - slurmctld's list of struct job_record
 * RET SLURM_SUCCESS or SLURM_ERROR (NULL job_list / no block_list)
 */
extern int sync_jobs(List job_list)
{
	ListIterator itr;
	struct job_record *job_ptr = NULL;
	List block_list = NULL, kill_list = NULL;
	static bool run_already = false;
	bg_record_t *bg_record = NULL;

	/* Execute only on initial startup. We don't support bgblock
	 * creation on demand today, so there is no need to re-sync data. */
	if (run_already)
		return SLURM_SUCCESS;
	run_already = true;

	if (!job_list) {
		error("sync_jobs: no job_list");
		return SLURM_ERROR;
	}

	slurm_mutex_lock(&block_state_mutex);
	/* Insure that all running jobs own the specified block */
	itr = list_iterator_create(job_list);
	while ((job_ptr = list_next(itr))) {
		bg_action_t *bg_action_ptr = NULL;

		/* Reset each iteration: bg_record is only assigned in the
		 * find_bg_record_in_list() branch below.  Without this, a
		 * job with a NULL bg_block_id or NULL nodes would reuse the
		 * stale record found for a previous job, pass the
		 * !bg_record check, and be synced to the wrong block. */
		bg_record = NULL;

		if (!IS_JOB_RUNNING(job_ptr) && !IS_JOB_COMPLETING(job_ptr))
			continue;

		bg_action_ptr = xmalloc(sizeof(bg_action_t));
		if (IS_JOB_COMPLETING(job_ptr))
			bg_action_ptr->op = TERM_OP;
		else
			bg_action_ptr->op = START_OP;
		bg_action_ptr->job_ptr = job_ptr;

		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_BLOCK_ID,
				   &(bg_action_ptr->bg_block_id));
#ifdef HAVE_BG_L_P
# ifdef HAVE_BGL
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_BLRTS_IMAGE,
				   &(bg_action_ptr->blrtsimage));
# else
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_CONN_TYPE,
				   &(bg_action_ptr->conn_type));
# endif
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_LINUX_IMAGE,
				   &(bg_action_ptr->linuximage));
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_RAMDISK_IMAGE,
				   &(bg_action_ptr->ramdiskimage));
#endif
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_MLOADER_IMAGE,
				   &(bg_action_ptr->mloaderimage));

		if (bg_action_ptr->bg_block_id == NULL) {
			error("Running job %u has bgblock==NULL",
			      job_ptr->job_id);
		} else if (job_ptr->nodes == NULL) {
			error("Running job %u has nodes==NULL",
			      job_ptr->job_id);
		} else if (!(bg_record = find_bg_record_in_list(
				     bg_lists->main,
				     bg_action_ptr->bg_block_id))) {
			error("Kill job %u belongs to defunct "
			      "bgblock %s",
			      job_ptr->job_id,
			      bg_action_ptr->bg_block_id);
		}

		if (!bg_record) {
			/* Can't fail it just now, we have locks in
			 * place. */
			bg_status_add_job_kill_list(job_ptr, &kill_list);
			_destroy_bg_action(bg_action_ptr);
			continue;
		}
		/* _sync_agent will destroy the bg_action_ptr */
		_sync_agent(bg_action_ptr, bg_record);
	}
	list_iterator_destroy(itr);

	/* Collect every block with no job so its users can be cleared. */
	block_list = list_create(destroy_bg_record);
	itr = list_iterator_create(bg_lists->main);
	while ((bg_record = list_next(itr))) {
		bg_record_t *rm_record;
		if (bg_record->job_ptr
		    || (bg_record->job_list
			&& list_count(bg_record->job_list)))
			continue;
		/* Copy only the fields term_jobs_on_block() needs so the
		 * real record stays under block_state_mutex protection. */
		rm_record = xmalloc(sizeof(bg_record_t));
		rm_record->magic = BLOCK_MAGIC;
		rm_record->bg_block_id = xstrdup(bg_record->bg_block_id);
		rm_record->mp_str = xstrdup(bg_record->mp_str);
		list_append(block_list, rm_record);
	}
	list_iterator_destroy(itr);
	slurm_mutex_unlock(&block_state_mutex);

	if (kill_list) {
		/* slurmctld is already locked up, so handle this right
		 * after the unlock of block_state_mutex. */
		bg_status_process_kill_job_list(kill_list, JOB_BOOT_FAIL, 1);
		FREE_NULL_LIST(kill_list);
	}

	/* Insure that all other blocks are free of users */
	if (block_list) {
		itr = list_iterator_create(block_list);
		while ((bg_record = list_next(itr))) {
			info("Queue clearing of users of BG block %s",
			     bg_record->bg_block_id);
			term_jobs_on_block(bg_record->bg_block_id);
		}
		list_iterator_destroy(itr);
		FREE_NULL_LIST(block_list);
	} else {
		/* this should never happen, vestigial logic */
		error("sync_jobs: no block_list");
		return SLURM_ERROR;
	}
	return SLURM_SUCCESS;
}
/* block_state_mutex should be unlocked before calling this */
/*
 * Queue the freeing of each block on track_list, killing any jobs still
 * attached to those blocks first.
 *
 * IN job_id     - id of the job triggering the free (stored for the agent)
 * IN track_list - blocks to free; its contents are TRANSFERRED (emptied)
 *                 into an internal list owned by _track_freeing_blocks()
 * IN destroy    - also mark each block for destruction
 * IN wait       - if true, free synchronously in this thread; otherwise a
 *                 detached agent thread does the work
 */
extern void free_block_list(uint32_t job_id, List track_list,
			    bool destroy, bool wait)
{
	bg_record_t *bg_record = NULL;
	int retries;
	ListIterator itr = NULL;
	bg_free_block_list_t *bg_free_list;
	pthread_attr_t attr_agent;
	pthread_t thread_agent;
	List kill_job_list = NULL;
	kill_job_struct_t *freeit;

	/* Nothing to do for an empty or missing list. */
	if (!track_list || !list_count(track_list))
		return;

	/* Ownership of bg_free_list passes to _track_freeing_blocks(),
	 * which frees it (directly below for wait, or in the agent thread).
	 * The list is created with no destructor: records remain owned by
	 * the main block lists. */
	bg_free_list = xmalloc(sizeof(bg_free_block_list_t));
	bg_free_list->track_list = list_create(NULL);
	bg_free_list->destroy = destroy;
	bg_free_list->job_id = job_id;

	slurm_mutex_lock(&block_state_mutex);
	/* Empties track_list into our own list while holding the lock. */
	list_transfer(bg_free_list->track_list, track_list);
	itr = list_iterator_create(bg_free_list->track_list);
	while ((bg_record = list_next(itr))) {
		if (bg_record->magic != BLOCK_MAGIC) {
			error("block was already destroyed %p", bg_record);
			continue;
		}
		bg_record->free_cnt++;

		/* just so we don't over write a different thread
		   that wants this block destroyed */
		if (destroy && !bg_record->destroy)
			bg_record->destroy = destroy;

		/* A block in error state must be resumed before it can be
		 * destroyed. */
		if (destroy && (bg_record->state & BG_BLOCK_ERROR_FLAG))
			resume_block(bg_record);

		/* This means we are wanting this block free so we can
		   run this job on it, so it is ok to have the job
		   remain here.  Only checking for jobs should go
		   below this.
		*/
		if (bg_record->modifying) {
			debug("free_block_list: Just FYI, we are "
			      "freeing a block (%s) that "
			      "has at least one pending job.",
			      bg_record->bg_block_id);
			continue;
		}

		if (bg_record->job_ptr
		    && !IS_JOB_FINISHED(bg_record->job_ptr)) {
			/* Single-job block: queue that job for kill. */
			info("We are freeing a block (%s) that "
			     "has job %u(%u).",
			     bg_record->bg_block_id,
			     bg_record->job_ptr->job_id,
			     bg_record->job_running);
			if (!kill_job_list)
				kill_job_list =
					bg_status_create_kill_job_list();
			freeit = xmalloc(sizeof(kill_job_struct_t));
			freeit->jobid = bg_record->job_ptr->job_id;
			list_push(kill_job_list, freeit);
		} else if (bg_record->job_list
			   && list_count(bg_record->job_list)) {
			/* Multi-job block: queue every unfinished job. */
			struct job_record *job_ptr;
			ListIterator itr;

			if (!kill_job_list)
				kill_job_list =
					bg_status_create_kill_job_list();
			info("We are freeing a block (%s) that has at "
			     "least 1 job.",
			     bg_record->bg_block_id);
			itr = list_iterator_create(bg_record->job_list);
			while ((job_ptr = list_next(itr))) {
				/* Skip stale/finished entries. */
				if ((job_ptr->magic != JOB_MAGIC)
				    || IS_JOB_FINISHED(job_ptr))
					continue;
				freeit = xmalloc(sizeof(kill_job_struct_t));
				freeit->jobid = job_ptr->job_id;
				list_push(kill_job_list, freeit);
			}
			list_iterator_destroy(itr);
		}
	}
	list_iterator_destroy(itr);
	slurm_mutex_unlock(&block_state_mutex);

	/* Kill the collected jobs only after dropping block_state_mutex. */
	if (kill_job_list) {
		bg_status_process_kill_job_list(kill_job_list, JOB_FAILED, 0);
		FREE_NULL_LIST(kill_job_list);
	}

	if (wait) {
		/* Track_freeing_blocks waits until the list is done
		   and frees the memory of bg_free_list.
		*/
		_track_freeing_blocks(bg_free_list);
		return;
	}

	/* _track_freeing_blocks handles cleanup */
	/* Asynchronous path: run the tracker in a detached thread,
	 * retrying creation up to MAX_PTHREAD_RETRIES times. */
	slurm_attr_init(&attr_agent);
	if (pthread_attr_setdetachstate(&attr_agent, PTHREAD_CREATE_DETACHED))
		error("pthread_attr_setdetachstate error %m");
	retries = 0;
	while (pthread_create(&thread_agent, &attr_agent,
			      _track_freeing_blocks, bg_free_list)) {
		error("pthread_create error %m");
		if (++retries > MAX_PTHREAD_RETRIES)
			fatal("Can't create pthread");
		/* sleep and retry */
		usleep(1000);
	}
	slurm_attr_destroy(&attr_agent);
	return;
}
/*
 * Poll the BG bridge API for each known block and fold any state changes
 * back into the local bg_record.
 *
 * RET 1 if any block changed, 0 if none, -1 if a bridge query failed
 *     (and nothing had been updated yet).
 *
 * NOTE(review): kill_job_list and job_read_lock are used here but declared
 * outside this view — presumably file-scope; confirm before modifying.
 * Without HAVE_BG_FILES this compiles to a stub returning 0.
 */
static int _do_block_poll(void)
{
	int updated = 0;
#if defined HAVE_BG_FILES
	int rc;
	rm_partition_t *block_ptr = NULL;
#ifdef HAVE_BGL
	rm_partition_mode_t node_use;
#endif
	rm_partition_state_t state;
	char *name = NULL;
	bg_record_t *bg_record = NULL;
	ListIterator itr = NULL;

	if (!bg_lists->main)
		return updated;

	/* Hold the job read lock and block state mutex for the whole scan. */
	lock_slurmctld(job_read_lock);
	slurm_mutex_lock(&block_state_mutex);
	itr = list_iterator_create(bg_lists->main);
	while ((bg_record = (bg_record_t *) list_next(itr)) != NULL) {
		if (bg_record->magic != BLOCK_MAGIC) {
			/* block is gone */
			list_remove(itr);
			continue;
		} else if (!bg_record->bg_block_id)
			continue;

		name = bg_record->bg_block_id;
		if ((rc = bridge_get_block_info(name, &block_ptr))
		    != SLURM_SUCCESS) {
			/* In dynamic layout mode some errors are expected
			 * and handled specially; others fall through to the
			 * generic error below. */
			if (bg_conf->layout_mode == LAYOUT_DYNAMIC) {
				switch(rc) {
				case BG_ERROR_INCONSISTENT_DATA:
					debug2("got inconsistent data when "
					       "querying block %s", name);
					continue;
					break;
				case BG_ERROR_BLOCK_NOT_FOUND:
					debug("block %s not found, removing "
					      "from slurm", name);
					list_remove(itr);
					destroy_bg_record(bg_record);
					continue;
					break;
				default:
					break;
				}
			}

			/* If the call was busy, just skip this
			   iteration.  It usually means something like
			   rm_get_BG was called which can be a very
			   long call */
			if (rc == EBUSY) {
				debug5("lock was busy, aborting");
				break;
			}

			error("bridge_get_block_info(%s): %s",
			      name, bg_err_str(rc));
			continue;
		}

#ifdef HAVE_BGL
		/* BGL: track coprocessor/virtual node mode changes. */
		if ((rc = bridge_get_data(block_ptr, RM_PartitionMode,
					  &node_use))
		    != SLURM_SUCCESS) {
			error("bridge_get_data(RM_PartitionMode): %s",
			      bg_err_str(rc));
			if (!updated)
				updated = -1;
			goto next_block;
		} else if (bg_record->node_use != node_use) {
			debug("node_use of Block %s was %d "
			      "and now is %d",
			      bg_record->bg_block_id,
			      bg_record->node_use,
			      node_use);
			bg_record->node_use = node_use;
			updated = 1;
		}
#else
		/* Non-BGL: for small blocks, map the partition options
		 * string's first character onto an HTC connection type. */
		if ((bg_record->cnode_cnt < bg_conf->mp_cnode_cnt)
		    || (bg_conf->mp_cnode_cnt
			== bg_conf->nodecard_cnode_cnt)) {
			char *mode = NULL;
			uint16_t conn_type = SELECT_SMALL;

			if ((rc = bridge_get_data(block_ptr,
						  RM_PartitionOptions,
						  &mode))
			    != SLURM_SUCCESS) {
				error("bridge_get_data(RM_PartitionOptions): "
				      "%s", bg_err_str(rc));
				if (!updated)
					updated = -1;
				goto next_block;
			} else if (mode) {
				switch(mode[0]) {
				case 's':
					conn_type = SELECT_HTC_S;
					break;
				case 'd':
					conn_type = SELECT_HTC_D;
					break;
				case 'v':
					conn_type = SELECT_HTC_V;
					break;
				case 'l':
					conn_type = SELECT_HTC_L;
					break;
				default:
					conn_type = SELECT_SMALL;
					break;
				}
				/* mode was allocated by the bridge API, so
				 * plain free(), not xfree(). */
				free(mode);
			}

			if (bg_record->conn_type[0] != conn_type) {
				debug("mode of small Block %s was %u "
				      "and now is %u",
				      bg_record->bg_block_id,
				      bg_record->conn_type[0],
				      conn_type);
				bg_record->conn_type[0] = conn_type;
				updated = 1;
			}
		}
#endif
		/* Finally sync the block's state; jobs on blocks that went
		 * bad are collected into kill_job_list. */
		if ((rc = bridge_get_data(block_ptr, RM_PartitionState,
					  &state))
		    != SLURM_SUCCESS) {
			error("bridge_get_data(RM_PartitionState): %s",
			      bg_err_str(rc));
			if (!updated)
				updated = -1;
			goto next_block;
		} else if (bg_status_update_block_state(
				   bg_record, state, kill_job_list) == 1)
			updated = 1;

	next_block:
		/* Always release the bridge handle for this block. */
		if ((rc = bridge_free_block(block_ptr))
		    != SLURM_SUCCESS) {
			error("bridge_free_block(): %s",
			      bg_err_str(rc));
		}
	}
	list_iterator_destroy(itr);
	slurm_mutex_unlock(&block_state_mutex);
	unlock_slurmctld(job_read_lock);

	/* Kill collected jobs only after both locks are released. */
	bg_status_process_kill_job_list(kill_job_list, JOB_FAILED, 0);
#endif
	return updated;
}