/* Update block user and reboot as needed.
 *
 * Re-attaches a job to the block it was already assigned to (e.g. after a
 * slurmctld restart): refreshes the block/job cross references, restores
 * the accounting of used cpus, and makes sure the block is on the
 * job_running and booted lists.  If the block is fully booted the user
 * sync happens immediately; otherwise the action is queued via
 * _block_op().
 *
 * bg_action_ptr IN - action describing the job/block pairing.  Ownership
 *	transfers here: it is destroyed either directly below or by the
 *	_block_op() path (see comment in the else branch).
 * bg_record IN - block record the job is being synced to.
 *
 * NOTE: block_state_mutex needs to be locked before coming in. */
static void _sync_agent(bg_action_t *bg_action_ptr, bg_record_t *bg_record)
{
	struct job_record *job_ptr = bg_action_ptr->job_ptr;

	debug3("Queue sync of job %u in BG block %s ending at %ld",
	       job_ptr->job_id, bg_action_ptr->bg_block_id,
	       job_ptr->end_time);

	last_bg_update = time(NULL);

	ba_sync_job_to_block(bg_record, job_ptr);

	set_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_BLOCK_PTR, bg_record);

	/* This job's cpus are in use again, so reduce the idle count. */
	num_unused_cpus -= job_ptr->total_cpus;

	if (!block_ptr_exist_in_list(bg_lists->job_running, bg_record))
		list_push(bg_lists->job_running, bg_record);
	if (!block_ptr_exist_in_list(bg_lists->booted, bg_record))
		list_push(bg_lists->booted, bg_record);

	if (bg_record->state == BG_BLOCK_INITED) {
		int sync_user_rc;
		/* Block is already booted, so the job is no longer
		   configuring. */
		job_ptr->job_state &= (~JOB_CONFIGURING);
		last_job_update = time(NULL);
		/* Just in case reset the boot flags */
		bg_record->boot_state = 0;
		bg_record->boot_count = 0;
		sync_user_rc = bridge_block_sync_users(bg_record);
		if (sync_user_rc == SLURM_ERROR) {
			/* slurm_fail_job() needs the block_state_mutex
			   released; re-acquire it afterwards since our
			   caller expects it held. */
			slurm_mutex_unlock(&block_state_mutex);
			(void) slurm_fail_job(job_ptr->job_id, JOB_BOOT_FAIL);
			slurm_mutex_lock(&block_state_mutex);
		}
		_destroy_bg_action(bg_action_ptr);
	} else {
		if (bg_record->state != BG_BLOCK_BOOTING) {
			error("Block %s isn't ready and isn't "
			      "being configured! Starting job again.",
			      bg_action_ptr->bg_block_id);
		} else {
			debug("Block %s is booting, job ok",
			      bg_action_ptr->bg_block_id);
		}
		/* the function _block_op calls will destroy the
		   bg_action_ptr */
		_block_op(bg_action_ptr);
	}
}
/* * Perform any setup required to initiate a job * job_ptr IN - pointer to the job being initiated * RET - SLURM_SUCCESS or an error code * * NOTE: This happens in parallel with srun and slurmd spawning * the job. A prolog script is expected to defer initiation of * the job script until the BG block is available for use. */ extern int start_job(struct job_record *job_ptr) { int rc = SLURM_SUCCESS; bg_record_t *bg_record = NULL; bg_action_t *bg_action_ptr = NULL; select_jobinfo_t *jobinfo = job_ptr->select_jobinfo->data; slurm_mutex_lock(&block_state_mutex); bg_record = jobinfo->bg_record; if (!bg_record || !block_ptr_exist_in_list(bg_lists->main, bg_record)) { slurm_mutex_unlock(&block_state_mutex); error("bg_record %s doesn't exist, requested for job (%d)", jobinfo->bg_block_id, job_ptr->job_id); return SLURM_ERROR; } if ((jobinfo->conn_type[0] != SELECT_NAV) && (jobinfo->conn_type[0] < SELECT_SMALL)) { int dim; for (dim=0; dim<SYSTEM_DIMENSIONS; dim++) jobinfo->conn_type[dim] = bg_record->conn_type[dim]; } /* If it isn't 0 then it was setup previous (sub-block) */ if (jobinfo->geometry[SYSTEM_DIMENSIONS] == 0) memcpy(jobinfo->geometry, bg_record->geo, sizeof(bg_record->geo)); if (bg_record->job_list) { /* Mark the ba_mp cnodes as used now. */ ba_mp_t *ba_mp = list_peek(bg_record->ba_mp_list); xassert(ba_mp); xassert(ba_mp->cnode_bitmap); bit_or(ba_mp->cnode_bitmap, jobinfo->units_avail); if (!find_job_in_bg_record(bg_record, job_ptr->job_id)) list_append(bg_record->job_list, job_ptr); } else { bg_record->job_running = job_ptr->job_id; bg_record->job_ptr = job_ptr; } job_ptr->job_state |= JOB_CONFIGURING; bg_action_ptr = xmalloc(sizeof(bg_action_t)); bg_action_ptr->op = START_OP; bg_action_ptr->job_ptr = job_ptr; /* FIXME: The below get_select_jobinfo calls could be avoided * by just using the jobinfo as we do above. 
*/ get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_BLOCK_ID, &(bg_action_ptr->bg_block_id)); get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_REBOOT, &(bg_action_ptr->reboot)); get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_CONN_TYPE, &(bg_action_ptr->conn_type)); get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_MLOADER_IMAGE, &(bg_action_ptr->mloaderimage)); #ifdef HAVE_BG_L_P # ifdef HAVE_BGL get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_BLRTS_IMAGE, &(bg_action_ptr->blrtsimage)); if (!bg_action_ptr->blrtsimage) { bg_action_ptr->blrtsimage = xstrdup(bg_conf->default_blrtsimage); set_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_BLRTS_IMAGE, bg_action_ptr->blrtsimage); } # elif defined HAVE_BGP get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_CONN_TYPE, &(bg_action_ptr->conn_type)); # endif get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_LINUX_IMAGE, &(bg_action_ptr->linuximage)); if (!bg_action_ptr->linuximage) { bg_action_ptr->linuximage = xstrdup(bg_conf->default_linuximage); set_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_LINUX_IMAGE, bg_action_ptr->linuximage); } get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_RAMDISK_IMAGE, &(bg_action_ptr->ramdiskimage)); if (!bg_action_ptr->ramdiskimage) { bg_action_ptr->ramdiskimage = xstrdup(bg_conf->default_ramdiskimage); set_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_RAMDISK_IMAGE, bg_action_ptr->ramdiskimage); } #endif if (!bg_action_ptr->mloaderimage) { bg_action_ptr->mloaderimage = xstrdup(bg_conf->default_mloaderimage); set_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_MLOADER_IMAGE, bg_action_ptr->mloaderimage); } num_unused_cpus -= job_ptr->total_cpus; if (!block_ptr_exist_in_list(bg_lists->job_running, bg_record)) list_push(bg_lists->job_running, bg_record); if (!block_ptr_exist_in_list(bg_lists->booted, 
bg_record)) list_push(bg_lists->booted, bg_record); /* Just in case something happens to free this block before we start the job we will make it so this job doesn't get blown away. */ bg_record->modifying = 1; last_bg_update = time(NULL); slurm_mutex_unlock(&block_state_mutex); info("Queue start of job %u in BG block %s", job_ptr->job_id, bg_action_ptr->bg_block_id); _block_op(bg_action_ptr); return rc; }
/* * Perform any setup required to initiate a job * job_ptr IN - pointer to the job being initiated * RET - SLURM_SUCCESS or an error code * * NOTE: This happens in parallel with srun and slurmd spawning * the job. A prolog script is expected to defer initiation of * the job script until the BG block is available for use. */ extern int start_job(struct job_record *job_ptr) { int rc = SLURM_SUCCESS; bg_record_t *bg_record = NULL; bg_action_t *bg_action_ptr = NULL; bg_action_ptr = xmalloc(sizeof(bg_action_t)); bg_action_ptr->op = START_OP; bg_action_ptr->job_ptr = job_ptr; get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_BLOCK_ID, &(bg_action_ptr->bg_block_id)); get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_REBOOT, &(bg_action_ptr->reboot)); get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_CONN_TYPE, &(bg_action_ptr->conn_type)); get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_MLOADER_IMAGE, &(bg_action_ptr->mloaderimage)); #ifdef HAVE_BG_L_P # ifdef HAVE_BGL get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_BLRTS_IMAGE, &(bg_action_ptr->blrtsimage)); if (!bg_action_ptr->blrtsimage) { bg_action_ptr->blrtsimage = xstrdup(bg_conf->default_blrtsimage); set_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_BLRTS_IMAGE, bg_action_ptr->blrtsimage); } # elif defined HAVE_BGP get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_CONN_TYPE, &(bg_action_ptr->conn_type)); # endif get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_LINUX_IMAGE, &(bg_action_ptr->linuximage)); if (!bg_action_ptr->linuximage) { bg_action_ptr->linuximage = xstrdup(bg_conf->default_linuximage); set_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_LINUX_IMAGE, bg_action_ptr->linuximage); } get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_RAMDISK_IMAGE, &(bg_action_ptr->ramdiskimage)); if (!bg_action_ptr->ramdiskimage) { bg_action_ptr->ramdiskimage = 
xstrdup(bg_conf->default_ramdiskimage); set_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_RAMDISK_IMAGE, bg_action_ptr->ramdiskimage); } #endif if (!bg_action_ptr->mloaderimage) { bg_action_ptr->mloaderimage = xstrdup(bg_conf->default_mloaderimage); set_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_MLOADER_IMAGE, bg_action_ptr->mloaderimage); } slurm_mutex_lock(&block_state_mutex); bg_record = find_bg_record_in_list(bg_lists->main, bg_action_ptr->bg_block_id); if (!bg_record) { slurm_mutex_unlock(&block_state_mutex); error("bg_record %s doesn't exist, requested for job (%d)", bg_action_ptr->bg_block_id, job_ptr->job_id); _destroy_bg_action(bg_action_ptr); return SLURM_ERROR; } last_bg_update = time(NULL); if (bg_record->job_list) { if (!find_job_in_bg_record(bg_record, job_ptr->job_id)) list_append(bg_record->job_list, job_ptr); } else { bg_record->job_running = bg_action_ptr->job_ptr->job_id; bg_record->job_ptr = bg_action_ptr->job_ptr; } num_unused_cpus -= job_ptr->total_cpus; if (!block_ptr_exist_in_list(bg_lists->job_running, bg_record)) list_push(bg_lists->job_running, bg_record); if (!block_ptr_exist_in_list(bg_lists->booted, bg_record)) list_push(bg_lists->booted, bg_record); /* Just incase something happens to free this block before we start the job we will make it so this job doesn't get blown away. */ bg_record->modifying = 1; slurm_mutex_unlock(&block_state_mutex); info("Queue start of job %u in BG block %s", job_ptr->job_id, bg_action_ptr->bg_block_id); _block_op(bg_action_ptr); return rc; }
/*
 * Try to find resources for a given job request
 * IN job_ptr - pointer to job record in slurmctld
 * IN/OUT bitmap - nodes available for assignment to job, clear those not to
 *	be used
 * IN min_nodes, max_nodes  - minimum and maximum number of nodes to allocate
 *	to this job (considers slurm block limits)
 * IN mode - SELECT_MODE_RUN_NOW: try to schedule job now
 *	SELECT_MODE_TEST_ONLY: test if job can ever run
 *	SELECT_MODE_WILL_RUN: determine when and where job can run
 * IN preemptee_candidates - List of pointers to jobs which can be preempted.
 * IN/OUT preemptee_job_list - Pointer to list of job pointers. These are the
 *	jobs to be preempted to initiate the pending job. Not set
 *	if mode=SELECT_MODE_TEST_ONLY or input pointer is NULL.
 * RET - SLURM_SUCCESS if job runnable now, error code otherwise
 */
extern int submit_job(struct job_record *job_ptr, bitstr_t *slurm_block_bitmap,
		      uint32_t min_nodes, uint32_t max_nodes,
		      uint32_t req_nodes, uint16_t mode,
		      List preemptee_candidates,
		      List *preemptee_job_list)
{
	int rc = SLURM_SUCCESS;
	bg_record_t* bg_record = NULL;
	char buf[256];
	uint16_t conn_type[SYSTEM_DIMENSIONS];
	List block_list = NULL;
	int blocks_added = 0;
	time_t starttime = time(NULL);
	uint16_t local_mode = mode;
	int avail_cpus = num_unused_cpus;
	int dim = 0;

	for (dim=0; dim<SYSTEM_DIMENSIONS; dim++)
		conn_type[dim] = (uint16_t)NO_VAL;

	/* With preemptees available we run in preempt mode, otherwise we
	   only check against the full (unpreempted) system. */
	if (preemptee_candidates && preemptee_job_list
	    && list_count(preemptee_candidates))
		local_mode |= SELECT_MODE_PREEMPT_FLAG;
	else
		local_mode |= SELECT_MODE_CHECK_FULL;

	if (bg_conf->layout_mode == LAYOUT_DYNAMIC)
		slurm_mutex_lock(&create_dynamic_mutex);

	/* Work on a copy of the block list so block_state_mutex can be
	   released while we search. */
	slurm_mutex_lock(&block_state_mutex);
	block_list = copy_bg_list(bg_lists->main);
	slurm_mutex_unlock(&block_state_mutex);

	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_CONN_TYPE, &conn_type);
	if (conn_type[0] == SELECT_NAV) {
		/* Resolve the "navigate" (pick for me) connection type
		   based on system geometry and the job's size. */
		if (bg_conf->mp_cnode_cnt == bg_conf->nodecard_cnode_cnt)
			conn_type[0] = SELECT_SMALL;
		else if (min_nodes > 1) {
			for (dim=0; dim<SYSTEM_DIMENSIONS; dim++)
				conn_type[dim] = SELECT_TORUS;
		} else if (job_ptr->details->min_cpus < bg_conf->cpus_per_mp)
			conn_type[0] = SELECT_SMALL;
		else {
			for (dim=1; dim<SYSTEM_DIMENSIONS; dim++)
				conn_type[dim] = SELECT_NAV;
		}

		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_CONN_TYPE,
				   &conn_type);
	}

	if (slurm_block_bitmap && !bit_set_count(slurm_block_bitmap)) {
		error("no nodes given to place job %u.", job_ptr->job_id);

		if (bg_conf->layout_mode == LAYOUT_DYNAMIC)
			slurm_mutex_unlock(&create_dynamic_mutex);

		return SLURM_ERROR;
	}

	sprint_select_jobinfo(job_ptr->select_jobinfo->data,
			      buf, sizeof(buf), SELECT_PRINT_MIXED);

	debug("bluegene:submit_job: %u mode=%d %s nodes=%u-%u-%u",
	      job_ptr->job_id, local_mode, buf,
	      min_nodes, req_nodes, max_nodes);

#ifdef HAVE_BG_L_P
# ifdef HAVE_BGL
	sprint_select_jobinfo(job_ptr->select_jobinfo->data,
			      buf, sizeof(buf), SELECT_PRINT_BLRTS_IMAGE);
	debug3("BlrtsImage=%s", buf);
# endif
	sprint_select_jobinfo(job_ptr->select_jobinfo->data,
			      buf, sizeof(buf), SELECT_PRINT_LINUX_IMAGE);
# ifdef HAVE_BGL
	debug3("LinuxImage=%s", buf);
# else
	debug3("ComputNodeImage=%s", buf);
# endif
	sprint_select_jobinfo(job_ptr->select_jobinfo->data,
			      buf, sizeof(buf), SELECT_PRINT_RAMDISK_IMAGE);
# ifdef HAVE_BGL
	debug3("RamDiskImage=%s", buf);
# else
	debug3("RamDiskIoLoadImage=%s", buf);
# endif
#endif
	sprint_select_jobinfo(job_ptr->select_jobinfo->data,
			      buf, sizeof(buf), SELECT_PRINT_MLOADER_IMAGE);
	debug3("MloaderImage=%s", buf);

	/* First look at the empty space, and then remove the
	   preemptable jobs and try again. */
	list_sort(block_list, (ListCmpF)bg_record_sort_aval_inc);

	rc = _find_best_block_match(block_list, &blocks_added,
				    job_ptr, slurm_block_bitmap, min_nodes,
				    max_nodes, req_nodes,
				    &bg_record, local_mode, avail_cpus);

	if (rc == SLURM_SUCCESS && SELECT_IS_PREEMPT_SET(local_mode)) {
		ListIterator itr;
		ListIterator job_itr;
		bg_record_t *found_record;
		struct job_record *preempt_job_ptr;

		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
			info("doing preemption");
		local_mode |= SELECT_MODE_CHECK_FULL;

		/* Remove preemptable jobs from the copied block list one
		   at a time and retry the match after each removal. */
		job_itr = list_iterator_create(preemptee_candidates);
		itr = list_iterator_create(block_list);
		while ((preempt_job_ptr = list_next(job_itr))) {
			while ((found_record = list_next(itr))) {
				if (found_record->job_ptr
				    == preempt_job_ptr) {
					/* info("removing job %u running on %s", */
					/*      preempt_job_ptr->job_id, */
					/*      found_record->bg_block_id); */
					found_record->job_ptr = NULL;
					found_record->job_running =
						NO_JOB_RUNNING;
					avail_cpus += found_record->cpu_cnt;
					break;
				}
			}
			if (!found_record) {
				list_iterator_reset(itr);
				error("Job %u wasn't found running anywhere, "
				      "can't preempt",
				      preempt_job_ptr->job_id);
				continue;
			} else if (job_ptr->details->min_cpus > avail_cpus)
				continue;

			list_sort(block_list,
				  (ListCmpF)bg_record_sort_aval_inc);
			if ((rc = _find_best_block_match(
				     block_list, &blocks_added,
				     job_ptr, slurm_block_bitmap,
				     min_nodes, max_nodes, req_nodes,
				     &bg_record, local_mode, avail_cpus))
			    == SLURM_SUCCESS)
				break;

			list_iterator_reset(itr);
		}
		list_iterator_destroy(itr);
		list_iterator_destroy(job_itr);
	}

	if (rc == SLURM_SUCCESS) {
		if (!bg_record)
			fatal("we got a success, but no block back");
		/* Here we see if there is a job running since
		 * some jobs take awhile to finish we need to
		 * make sure the time of the end is in the
		 * future.  If it isn't (meaning it is in the
		 * past or current time) we add 5 seconds to
		 * it so we don't use the block immediately.
		 */
		if (bg_record->job_ptr && bg_record->job_ptr->end_time) {
			if (bg_record->job_ptr->end_time <= starttime)
				starttime += 5;
			else
				starttime = bg_record->job_ptr->end_time;
		} else if (bg_record->job_running == BLOCK_ERROR_STATE)
			starttime = INFINITE;

		/* make sure the job is eligible to run */
		if (job_ptr->details->begin_time > starttime)
			starttime = job_ptr->details->begin_time;

		job_ptr->start_time = starttime;

		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_NODES,
				   bg_record->mp_str);
		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_IONODES,
				   bg_record->ionode_str);
		if (!bg_record->bg_block_id) {
			/* No real block id means this record was fabricated
			   for the test (see the cleanup further down). */
			debug("%d can start unassigned job %u "
			      "at %ld on %s",
			      local_mode, job_ptr->job_id, starttime,
			      bg_record->mp_str);
			set_select_jobinfo(job_ptr->select_jobinfo->data,
					   SELECT_JOBDATA_BLOCK_PTR,
					   NULL);
			set_select_jobinfo(job_ptr->select_jobinfo->data,
					   SELECT_JOBDATA_NODE_CNT,
					   &bg_record->cnode_cnt);
		} else {
			if ((bg_record->ionode_str)
			    && (job_ptr->part_ptr->max_share <= 1))
				error("Small block used in "
				      "non-shared partition");

			debug("%d(%d) can start job %u "
			      "at %ld on %s(%s) %d",
			      local_mode, mode, job_ptr->job_id, starttime,
			      bg_record->bg_block_id, bg_record->mp_str,
			      SELECT_IS_MODE_RUN_NOW(local_mode));

			if (SELECT_IS_MODE_RUN_NOW(local_mode)) {
				/* Set this up to be the
				   correct pointer since we
				   probably are working off a
				   copy.
				*/
				if (bg_record->original)
					bg_record = bg_record->original;
				set_select_jobinfo(
					job_ptr->select_jobinfo->data,
					SELECT_JOBDATA_BLOCK_PTR,
					bg_record);
				if (job_ptr) {
					bg_record->job_running =
						job_ptr->job_id;
					bg_record->job_ptr = job_ptr;

					job_ptr->job_state |= JOB_CONFIGURING;
					last_bg_update = time(NULL);
				}
			} else {
				set_select_jobinfo(
					job_ptr->select_jobinfo->data,
					SELECT_JOBDATA_BLOCK_PTR,
					NULL);
				/* Just to make sure we don't
				   end up using this on
				   another job, or we have to
				   wait until preemption is
				   done.
				*/
				bg_record->job_ptr = NULL;
				bg_record->job_running = NO_JOB_RUNNING;
			}

			set_select_jobinfo(job_ptr->select_jobinfo->data,
					   SELECT_JOBDATA_NODE_CNT,
					   &bg_record->cnode_cnt);
		}
		if (SELECT_IS_MODE_RUN_NOW(local_mode))
			_build_select_struct(job_ptr,
					     slurm_block_bitmap,
					     bg_record->cnode_cnt);
		/* set up the preempted job list */
		if (SELECT_IS_PREEMPT_SET(local_mode)) {
			if (*preemptee_job_list)
				list_destroy(*preemptee_job_list);
			*preemptee_job_list = _get_preemptables(
				local_mode, bg_record,
				preemptee_candidates);
		}
		if (!bg_record->bg_block_id) {
			/* This is a fake record so we need to
			 * destroy it after we get the info from
			 * it.  If it was just testing then
			 * we added this record to the
			 * block_list.  If this is the case
			 * it will be handled if we sync the
			 * lists.  But we don't want to do
			 * that so we will set blocks_added to
			 * 0 so it doesn't happen. */
			if (!blocks_added) {
				destroy_bg_record(bg_record);
				bg_record = NULL;
			}
			blocks_added = 0;
		}
		last_job_update = time(NULL);
	}

	if (bg_conf->layout_mode == LAYOUT_DYNAMIC) {
		slurm_mutex_lock(&block_state_mutex);
		if (blocks_added)
			_sync_block_lists(block_list, bg_lists->main);
		slurm_mutex_unlock(&block_state_mutex);
		slurm_mutex_unlock(&create_dynamic_mutex);
	}

	list_destroy(block_list);

	return rc;
}
/* Update block user and reboot as needed.
 *
 * Looks up the job's block by id and re-attaches the job to it:
 * refreshes cpu accounting, the job_running/booted lists, and — if the
 * block is booted — fixes the block's user to match the job's owner.
 * If the block cannot be found the job is requeued; if it is still
 * booting the action is handed to _start_agent().
 *
 * bg_action_ptr IN - action describing the job/block pairing.
 */
static void _sync_agent(bg_action_t *bg_action_ptr)
{
	bg_record_t * bg_record = NULL;

	slurm_mutex_lock(&block_state_mutex);
	bg_record = find_bg_record_in_list(bg_lists->main,
					   bg_action_ptr->bg_block_id);
	if (!bg_record) {
		slurm_mutex_unlock(&block_state_mutex);
		error("No block %s", bg_action_ptr->bg_block_id);
		bg_requeue_job(bg_action_ptr->job_ptr->job_id, 1);
		return;
	}

	last_bg_update = time(NULL);
	/* The job gets the whole block's cpus. */
	bg_action_ptr->job_ptr->total_cpus =
		bg_action_ptr->job_ptr->details->min_cpus =
		bg_record->cpu_cnt;
	bg_record->job_running = bg_action_ptr->job_ptr->job_id;
	bg_record->job_ptr = bg_action_ptr->job_ptr;

	set_select_jobinfo(bg_record->job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_BLOCK_PTR, bg_record);

	if (!block_ptr_exist_in_list(bg_lists->job_running, bg_record)) {
		list_push(bg_lists->job_running, bg_record);
		num_unused_cpus -= bg_record->cpu_cnt;
	}
	if (!block_ptr_exist_in_list(bg_lists->booted, bg_record))
		list_push(bg_lists->booted, bg_record);

	if (bg_record->state == BG_BLOCK_INITED) {
		if (bg_record->job_ptr) {
			/* Block is booted, job is no longer configuring. */
			bg_record->job_ptr->job_state &= (~JOB_CONFIGURING);
			last_job_update = time(NULL);
		}
		if (bg_record->user_uid != bg_action_ptr->job_ptr->user_id) {
			int set_user_rc = SLURM_SUCCESS;

			/* job_id is a uint32_t; use %u (matches the other
			   job_id messages in this file) instead of the old
			   %d, which is undefined behavior for an unsigned
			   argument. */
			debug("User isn't correct for job %u on %s, "
			      "fixing...",
			      bg_action_ptr->job_ptr->job_id,
			      bg_action_ptr->bg_block_id);
			xfree(bg_record->target_name);
			bg_record->target_name =
				uid_to_string(bg_action_ptr->job_ptr->user_id);

			set_user_rc = set_block_user(bg_record);
			slurm_mutex_unlock(&block_state_mutex);

			if (set_user_rc == SLURM_ERROR)
				(void) slurm_fail_job(bg_record->job_running);
		} else
			slurm_mutex_unlock(&block_state_mutex);
	} else {
		if (bg_record->state != BG_BLOCK_BOOTING) {
			error("Block %s isn't ready and isn't "
			      "being configured! Starting job again.",
			      bg_action_ptr->bg_block_id);
		} else {
			debug("Block %s is booting, job ok",
			      bg_action_ptr->bg_block_id);
		}
		slurm_mutex_unlock(&block_state_mutex);
		_start_agent(bg_action_ptr);
	}
}