/* Notify the gang scheduler that a job has been started */ extern int gs_job_start(struct job_record *job_ptr) { struct gs_part *p_ptr; uint16_t job_state; if (gs_debug_flags & DEBUG_FLAG_GANG) info("gang: entering gs_job_start for job %u", job_ptr->job_id); /* add job to partition */ pthread_mutex_lock(&data_mutex); p_ptr = list_find_first(gs_part_list, _find_gs_part, job_ptr->partition); if (p_ptr) { job_state = _add_job_to_part(p_ptr, job_ptr); /* if this job is running then check for preemption */ if (job_state == GS_RESUME) _update_all_active_rows(); } pthread_mutex_unlock(&data_mutex); if (!p_ptr) { /* No partition was found for this job, so let it run * uninterupted (what else can we do?) */ error("gang: could not find partition %s for job %u", job_ptr->partition, job_ptr->job_id); } _preempt_job_dequeue(); /* MUST BE OUTSIDE OF data_mutex lock */ if (gs_debug_flags & DEBUG_FLAG_GANG) info("gang: leaving gs_job_start"); return SLURM_SUCCESS; }
/* Notify the gang scheduler that a job has been resumed or started. * In either case, add the job to gang scheduling. */ extern void gs_job_start(struct job_record *job_ptr) { struct gs_part *p_ptr; uint16_t job_sig_state; char *part_name; if (!(slurmctld_conf.preempt_mode & PREEMPT_MODE_GANG)) return; if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) info("gang: entering gs_job_start for job %u", job_ptr->job_id); /* add job to partition */ if (job_ptr->part_ptr && job_ptr->part_ptr->name) part_name = job_ptr->part_ptr->name; else part_name = job_ptr->partition; slurm_mutex_lock(&data_mutex); p_ptr = list_find_first(gs_part_list, _find_gs_part, part_name); if (p_ptr) { job_sig_state = _add_job_to_part(p_ptr, job_ptr); /* if this job is running then check for preemption */ if (job_sig_state == GS_RESUME) _update_all_active_rows(); } slurm_mutex_unlock(&data_mutex); if (!p_ptr) { /* No partition was found for this job, so let it run * uninterupted (what else can we do?) */ error("gang: could not find partition %s for job %u", part_name, job_ptr->job_id); } _preempt_job_dequeue(); /* MUST BE OUTSIDE OF data_mutex lock */ if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) info("gang: leaving gs_job_start"); }
/* rebuild data structures from scratch * * A reconfigure can affect this plugin in these ways: * - partitions can be added or removed * - this affects the gs_part_list * - nodes can be removed from a partition, or added to a partition * - this affects the size of the active resmap * * Here's the plan: * 1. save a copy of the global structures, and then construct * new ones. * 2. load the new partition structures with existing jobs, * confirming the job exists and resizing their resmaps * (if necessary). * 3. make sure all partitions are accounted for. If a partition * was removed, make sure any jobs that were in the queue and * that were suspended are resumed. Conversely, if a partition * was added, check for existing jobs that may be contending * for resources that we could begin timeslicing. * 4. delete the old global structures and return. */ extern int gs_reconfig(void) { int i; ListIterator part_iterator; struct gs_part *p_ptr, *newp_ptr; List old_part_list; struct job_record *job_ptr; struct gs_job *j_ptr; if (!timeslicer_thread_id) { /* gs_init() will be called later from read_slurm_conf() * if we are enabling gang scheduling via reconfiguration */ return SLURM_SUCCESS; } if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) info("gang: entering gs_reconfig"); pthread_mutex_lock(&data_mutex); old_part_list = gs_part_list; gs_part_list = NULL; /* reset global data */ gs_fast_schedule = slurm_get_fast_schedule(); gr_type = _get_gr_type(); _load_phys_res_cnt(); _build_parts(); /* scan the old part list and add existing jobs to the new list */ part_iterator = list_iterator_create(old_part_list); while ((p_ptr = (struct gs_part *) list_next(part_iterator))) { newp_ptr = (struct gs_part *) list_find_first(gs_part_list, _find_gs_part, p_ptr->part_name); if (!newp_ptr) { /* this partition was removed, so resume * any jobs suspended by gang and continue */ for (i = 0; i < p_ptr->num_jobs; i++) { j_ptr = p_ptr->job_list[i]; if ((j_ptr->sig_state == GS_SUSPEND) && (j_ptr->job_ptr->priority != 0)) { info("resuming job in missing part %s", p_ptr->part_name); _resume_job(j_ptr->job_id); j_ptr->sig_state = GS_RESUME; } } continue; } if (p_ptr->num_jobs == 0) /* no jobs to transfer */ continue; /* we need to transfer the jobs from p_ptr to new_ptr and * adjust their resmaps (if necessary). then we need to create * the active resmap and adjust the state of each job (if * necessary). NOTE: there could be jobs that only overlap * on nodes that are no longer in the partition, but we're * not going to worry about those cases. * * add the jobs from p_ptr into new_ptr in their current order * to preserve the state of timeslicing. */ for (i = 0; i < p_ptr->num_jobs; i++) { job_ptr = find_job_record(p_ptr->job_list[i]->job_id); if (job_ptr == NULL) { /* job no longer exists in SLURM, so drop it */ continue; } /* resume any job that is suspended by us */ if (IS_JOB_SUSPENDED(job_ptr) && job_ptr->priority) { if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG){ info("resuming job %u apparently " "suspended by gang", job_ptr->job_id); } _resume_job(job_ptr->job_id); } /* transfer the job as long as it is still active */ if (IS_JOB_SUSPENDED(job_ptr) || IS_JOB_RUNNING(job_ptr)) { _add_job_to_part(newp_ptr, job_ptr); } } } list_iterator_destroy(part_iterator); /* confirm all jobs. Scan the master job_list and confirm that we * are tracking all jobs */ _scan_slurm_job_list(); FREE_NULL_LIST(old_part_list); pthread_mutex_unlock(&data_mutex); _preempt_job_dequeue(); /* MUST BE OUTSIDE OF data_mutex lock */ if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) info("gang: leaving gs_reconfig"); return SLURM_SUCCESS; }
/* ensure that all jobs running in SLURM are accounted for. * this procedure assumes that the gs data has already been * locked by the caller! */ static void _scan_slurm_job_list(void) { struct job_record *job_ptr; struct gs_part *p_ptr; int i; ListIterator job_iterator; char *part_name; if (!job_list) { /* no jobs */ if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) info("gang: _scan_slurm_job_list: job_list NULL"); return; } if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) info("gang: _scan_slurm_job_list: job_list exists..."); job_iterator = list_iterator_create(job_list); while ((job_ptr = (struct job_record *) list_next(job_iterator))) { if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) { info("gang: _scan_slurm_job_list: checking job %u", job_ptr->job_id); } if (IS_JOB_PENDING(job_ptr)) continue; if (IS_JOB_SUSPENDED(job_ptr) && (job_ptr->priority == 0)) continue; /* not suspended by us */ if (job_ptr->part_ptr && job_ptr->part_ptr->name) part_name = job_ptr->part_ptr->name; else part_name = job_ptr->partition; if (IS_JOB_SUSPENDED(job_ptr) || IS_JOB_RUNNING(job_ptr)) { /* are we tracking this job already? */ p_ptr = list_find_first(gs_part_list, _find_gs_part, part_name); if (!p_ptr) /* no partition */ continue; i = _find_job_index(p_ptr, job_ptr->job_id); if (i >= 0) /* we're tracking it, so continue */ continue; /* We're not tracking this job. Resume it if it's * suspended, and then add it to the job list. */ if (IS_JOB_SUSPENDED(job_ptr) && job_ptr->priority) { /* The likely scenario here is that the * failed over, and this is a job that gang * had previously suspended. It's not possible * to determine the previous order of jobs * without preserving gang state, which is not * worth the extra infrastructure. Just resume * the job and then add it to the job list. */ _resume_job(job_ptr->job_id); } _add_job_to_part(p_ptr, job_ptr); continue; } /* if the job is not pending, suspended, or running, then * it's completing or completed. Make sure we've released * this job */ p_ptr = list_find_first(gs_part_list, _find_gs_part, part_name); if (!p_ptr) /* no partition */ continue; _remove_job_from_part(job_ptr->job_id, p_ptr, false); } list_iterator_destroy(job_iterator); /* now that all of the old jobs have been flushed out, * update the active row of all partitions */ _update_all_active_rows(); return; }