static void *_block_agent(void *args) { bg_action_t *bg_action_ptr = (bg_action_t *)args; if (bg_action_ptr->op == START_OP) _start_agent(bg_action_ptr); else if (bg_action_ptr->op == TERM_OP) bridge_block_post_job(bg_action_ptr->bg_block_id); else if (bg_action_ptr->op == SYNC_OP) _sync_agent(bg_action_ptr); _destroy_bg_action(bg_action_ptr); return NULL; }
/*
 * sync_jobs - Synchronize BG block state to that of currently active jobs.
 *
 * This can recover from slurmctld crashes when block usership changes
 * were queued. For every RUNNING/COMPLETING job, a bg_action_t is built
 * and handed to _sync_agent() (which takes ownership and destroys it);
 * jobs whose block can no longer be found are queued for kill. Any block
 * left with no job is then queued to have its users cleared.
 *
 * IN job_list - slurmctld's list of struct job_record; must be non-NULL.
 * RET SLURM_SUCCESS, or SLURM_ERROR on missing job_list/block_list.
 */
extern int sync_jobs(List job_list)
{
	ListIterator itr;
	struct job_record *job_ptr = NULL;
	List block_list = NULL, kill_list = NULL;
	static bool run_already = false;
	bg_record_t *bg_record = NULL;

	/* Execute only on initial startup. We don't support bgblock
	 * creation on demand today, so there is no need to re-sync data. */
	if (run_already)
		return SLURM_SUCCESS;
	run_already = true;

	if (!job_list) {
		error("sync_jobs: no job_list");
		return SLURM_ERROR;
	}

	slurm_mutex_lock(&block_state_mutex);
	/* Insure that all running jobs own the specified block */
	itr = list_iterator_create(job_list);
	while ((job_ptr = list_next(itr))) {
		bg_action_t *bg_action_ptr = NULL;

		/* Reset per job: previously this kept its value across
		 * iterations, so a job with a NULL bg_block_id (or NULL
		 * nodes) following a successful lookup would skip the
		 * !bg_record guard below and be synced against the wrong
		 * block. */
		bg_record = NULL;

		if (!IS_JOB_RUNNING(job_ptr) && !IS_JOB_COMPLETING(job_ptr))
			continue;

		bg_action_ptr = xmalloc(sizeof(bg_action_t));
		if (IS_JOB_COMPLETING(job_ptr))
			bg_action_ptr->op = TERM_OP;
		else
			bg_action_ptr->op = START_OP;
		bg_action_ptr->job_ptr = job_ptr;

		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_BLOCK_ID,
				   &(bg_action_ptr->bg_block_id));
#ifdef HAVE_BG_L_P
# ifdef HAVE_BGL
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_BLRTS_IMAGE,
				   &(bg_action_ptr->blrtsimage));
# else
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_CONN_TYPE,
				   &(bg_action_ptr->conn_type));
# endif
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_LINUX_IMAGE,
				   &(bg_action_ptr->linuximage));
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_RAMDISK_IMAGE,
				   &(bg_action_ptr->ramdiskimage));
#endif
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_MLOADER_IMAGE,
				   &(bg_action_ptr->mloaderimage));

		if (bg_action_ptr->bg_block_id == NULL) {
			error("Running job %u has bgblock==NULL",
			      job_ptr->job_id);
		} else if (job_ptr->nodes == NULL) {
			error("Running job %u has nodes==NULL",
			      job_ptr->job_id);
		} else if (!(bg_record = find_bg_record_in_list(
				     bg_lists->main,
				     bg_action_ptr->bg_block_id))) {
			error("Kill job %u belongs to defunct "
			      "bgblock %s",
			      job_ptr->job_id,
			      bg_action_ptr->bg_block_id);
		}

		if (!bg_record) {
			/* Can't fail it just now, we have locks in
			   place. */
			bg_status_add_job_kill_list(job_ptr, &kill_list);
			_destroy_bg_action(bg_action_ptr);
			continue;
		}
		/* _sync_agent will destroy the bg_action_ptr */
		_sync_agent(bg_action_ptr, bg_record);
	}
	list_iterator_destroy(itr);

	/* Collect lightweight copies of the blocks owned by no job so
	 * they can be cleaned up after block_state_mutex is released. */
	block_list = list_create(destroy_bg_record);
	itr = list_iterator_create(bg_lists->main);
	while ((bg_record = list_next(itr))) {
		bg_record_t *rm_record;

		if (bg_record->job_ptr
		    || (bg_record->job_list
			&& list_count(bg_record->job_list)))
			continue;
		rm_record = xmalloc(sizeof(bg_record_t));
		rm_record->magic = BLOCK_MAGIC;
		rm_record->bg_block_id = xstrdup(bg_record->bg_block_id);
		rm_record->mp_str = xstrdup(bg_record->mp_str);
		list_append(block_list, rm_record);
	}
	list_iterator_destroy(itr);
	slurm_mutex_unlock(&block_state_mutex);

	if (kill_list) {
		/* slurmctld is already locked up, so handle this right after
		 * the unlock of block_state_mutex. */
		bg_status_process_kill_job_list(kill_list, JOB_BOOT_FAIL, 1);
		FREE_NULL_LIST(kill_list);
	}

	/* Insure that all other blocks are free of users */
	if (block_list) {
		itr = list_iterator_create(block_list);
		while ((bg_record = list_next(itr))) {
			info("Queue clearing of users of BG block %s",
			     bg_record->bg_block_id);
			term_jobs_on_block(bg_record->bg_block_id);
		}
		list_iterator_destroy(itr);
		FREE_NULL_LIST(block_list);
	} else {
		/* this should never happen,
		 * vestigial logic */
		error("sync_jobs: no block_list");
		return SLURM_ERROR;
	}
	return SLURM_SUCCESS;
}