int main(int argc, char *argv[])
{
	if (argc != 6) {
		printf("Usage: %s control_addr job_id1 job_id2 "
		       "sched_port is_bluegene\n", argv[0]);
		exit(1);
	}

	control_addr = argv[1];
	job_id1      = atoi(argv[2]);
	job_id2      = atoi(argv[3]);
	sched_port   = atoi(argv[4]);
	is_bluegene  = atoi(argv[5]);
	printf("control_addr=%s job_id=%ld,%ld sched_port=%d is_bluegene=%d\n",
	       control_addr, job_id1, job_id2, sched_port, is_bluegene);

	_get_jobs();
	_get_nodes();
	_modify_job(job_id1);
	_get_jobs();
	_start_job(job_id1);
	if (!is_bluegene) {
		_suspend_job(job_id1);
		_resume_job(job_id1);
	}
	_cancel_job(job_id2);
	sleep(5);
	_get_jobs();
	printf("SUCCESS\n");
	exit(0);
}
int main(int argc, char *argv[])
{
	/* Six arguments are consumed (argv[1] through argv[6]),
	 * so argc must be at least 7 */
	if (argc < 7) {
		printf("Usage: %s auth_key control_addr e_port "
		       "job_id sched_port is_bluegene\n", argv[0]);
		exit(1);
	}

	auth_key     = argv[1];
	control_addr = argv[2];
	e_port       = atoi(argv[3]);
	job_id       = atoi(argv[4]);
	sched_port   = atoi(argv[5]);
	is_bluegene  = atoi(argv[6]);
	printf("auth_key=%s control_addr=%s e_port=%d job_id=%d sched_port=%d "
	       "is_bluegene=%d\n",
	       auth_key, control_addr, e_port, job_id, sched_port,
	       is_bluegene);

#if _DEBUG
	_single_msg();
#else
	_initialize();
	_get_jobs();
	_get_nodes();
	_job_will_run(job_id);
	_modify_job(job_id);
	_get_jobs();
	_start_job(job_id);
	_get_jobs();
	if (!is_bluegene) {
		_suspend_job(job_id);
		_resume_job(job_id);
	}
	_notify_job(job_id);
	_signal_job(job_id);
	if (e_port)
		_event_mgr();
	else {
		printf("READY\n");
		sleep(3);
	}
	_cancel_job(job_id + 1);
	_job_requeue(job_id);	/* Put job back into HELD state */
	sleep(15);
	_start_job(job_id);
	_get_jobs();
#endif
	printf("SUCCESS\n");
	exit(0);
}
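/* Both test drivers parse their numeric arguments with atoi(), which
 * returns 0 on malformed input with no way to detect the error. A
 * minimal sketch of stricter parsing using only the C standard
 * library; the helper name _parse_int is hypothetical and not part
 * of the test code:
 */
#include <errno.h>
#include <limits.h>
#include <stdlib.h>

static int _parse_int(const char *arg, int *value)
{
	char *end = NULL;
	long tmp;

	errno = 0;
	tmp = strtol(arg, &end, 10);
	if (errno || (end == arg) || (*end != '\0') ||
	    (tmp < INT_MIN) || (tmp > INT_MAX))
		return -1;	/* not a clean, in-range integer */
	*value = (int) tmp;
	return 0;
}

/* e.g.: if (_parse_int(argv[4], &job_id) < 0) { print usage; exit(1); } */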
/* Gang scheduling has been disabled by a change in configuration,
 * resume any suspended jobs */
extern void gs_wake_jobs(void)
{
	struct job_record *job_ptr;
	ListIterator job_iterator;

	if (!job_list)	/* no jobs */
		return;

	job_iterator = list_iterator_create(job_list);
	while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
		if (IS_JOB_SUSPENDED(job_ptr) && (job_ptr->priority != 0)) {
			info("gang waking preempted job %u", job_ptr->job_id);
			_resume_job(job_ptr->job_id);
		}
	}
	list_iterator_destroy(job_iterator);
}
/* Remove the given job from the given partition
 * IN job_id - job to remove
 * IN p_ptr  - GS partition structure
 * IN fini   - true if the job is in a finished state
 *	       (i.e. not to be resumed)
 */
static void _remove_job_from_part(uint32_t job_id, struct gs_part *p_ptr,
				  bool fini)
{
	int i;
	struct gs_job *j_ptr;

	if (!job_id || !p_ptr)
		return;

	/* find the job in the job_list */
	i = _find_job_index(p_ptr, job_id);
	if (i < 0)	/* job not found */
		return;

	if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) {
		info("gang: _remove_job_from_part: removing job %u from %s",
		     job_id, p_ptr->part_name);
	}
	j_ptr = p_ptr->job_list[i];

	/* remove any shadow first */
	_clear_shadow(j_ptr);

	/* remove the job from the job_list by shifting everyone else down */
	p_ptr->num_jobs--;
	for (; i < p_ptr->num_jobs; i++) {
		p_ptr->job_list[i] = p_ptr->job_list[i+1];
	}
	p_ptr->job_list[i] = NULL;

	/* make sure the job is not left suspended by gang, then delete it */
	if (!fini && (j_ptr->sig_state == GS_SUSPEND) &&
	    j_ptr->job_ptr->priority) {
		if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) {
			info("gang: _remove_job_from_part: resuming "
			     "suspended job %u", j_ptr->job_id);
		}
		_resume_job(j_ptr->job_id);
	}
	j_ptr->job_ptr = NULL;
	xfree(j_ptr);
}
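/* The shift-down removal above is the standard way to delete from an
 * ordered array while preserving the order of the remaining entries.
 * A standalone sketch of the same pattern on a plain int array (the
 * names here are illustrative only, not part of the plugin):
 */
static void _remove_at(int *list, int *count, int idx)
{
	int i;

	(*count)--;
	for (i = idx; i < *count; i++)
		list[i] = list[i + 1];	/* shift the tail down one slot */
	list[*count] = 0;		/* clear the vacated last slot */
}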
/* Rebuild the active row BUT preserve the order of existing jobs.
 * This is called after one or more jobs have been removed from
 * the partition or if a higher priority "shadow" has been added,
 * which could preempt running jobs.
 */
static void _update_active_row(struct gs_part *p_ptr, int add_new_jobs)
{
	int i;
	struct gs_job *j_ptr;

	if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) {
		info("gang: update_active_row: rebuilding part %s...",
		     p_ptr->part_name);
	}

	/* rebuild the active row, starting with any shadows */
	p_ptr->jobs_active = 0;
	for (i = 0; p_ptr->shadow && p_ptr->shadow[i]; i++) {
		_add_job_to_active(p_ptr->shadow[i]->job_ptr, p_ptr);
	}

	/* attempt to add the existing 'active' jobs */
	for (i = 0; i < p_ptr->num_jobs; i++) {
		j_ptr = p_ptr->job_list[i];
		if (j_ptr->row_state != GS_ACTIVE)
			continue;
		if (_job_fits_in_active_row(j_ptr->job_ptr, p_ptr)) {
			_add_job_to_active(j_ptr->job_ptr, p_ptr);
			_cast_shadow(j_ptr, p_ptr->priority);
		} else {
			/* this job has been preempted by a shadow job.
			 * suspend it and preserve its job_list order */
			if (j_ptr->sig_state != GS_SUSPEND) {
				if (p_ptr->num_shadows &&
				    (slurm_job_preempt_mode(j_ptr->job_ptr) !=
				     PREEMPT_MODE_SUSPEND)) {
					_preempt_job_queue(j_ptr->job_id);
				} else
					_suspend_job(j_ptr->job_id);
				j_ptr->sig_state = GS_SUSPEND;
				_clear_shadow(j_ptr);
			}
			j_ptr->row_state = GS_NO_ACTIVE;
		}
	}

	/* attempt to add the existing 'filler' jobs */
	for (i = 0; i < p_ptr->num_jobs; i++) {
		j_ptr = p_ptr->job_list[i];
		if (j_ptr->row_state != GS_FILLER)
			continue;
		if (_job_fits_in_active_row(j_ptr->job_ptr, p_ptr)) {
			_add_job_to_active(j_ptr->job_ptr, p_ptr);
			_cast_shadow(j_ptr, p_ptr->priority);
		} else {
			/* this job has been preempted by a shadow job.
			 * suspend it and preserve its job_list order */
			if (j_ptr->sig_state != GS_SUSPEND) {
				if (p_ptr->num_shadows &&
				    (slurm_job_preempt_mode(j_ptr->job_ptr) !=
				     PREEMPT_MODE_SUSPEND)) {
					_preempt_job_queue(j_ptr->job_id);
				} else
					_suspend_job(j_ptr->job_id);
				j_ptr->sig_state = GS_SUSPEND;
				_clear_shadow(j_ptr);
			}
			j_ptr->row_state = GS_NO_ACTIVE;
		}
	}

	if (!add_new_jobs)
		return;

	/* attempt to add any new jobs */
	for (i = 0; i < p_ptr->num_jobs; i++) {
		j_ptr = p_ptr->job_list[i];
		if ((j_ptr->row_state != GS_NO_ACTIVE) ||
		    (j_ptr->job_ptr->priority == 0))
			continue;
		if (_job_fits_in_active_row(j_ptr->job_ptr, p_ptr)) {
			_add_job_to_active(j_ptr->job_ptr, p_ptr);
			_cast_shadow(j_ptr, p_ptr->priority);
			/* note that this job is a "filler" for this row,
			 * blocked by a higher priority job */
			j_ptr->row_state = GS_FILLER;
			/* resume the job */
			if (j_ptr->sig_state == GS_SUSPEND) {
				_resume_job(j_ptr->job_id);
				j_ptr->sig_state = GS_RESUME;
			}
		}
	}
}
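/* The GS_ACTIVE and GS_FILLER passes above repeat the same
 * preempt-or-suspend branch verbatim. A sketch of how that branch
 * could be factored into a shared helper; the name
 * _preempt_or_suspend is hypothetical and not part of the plugin:
 */
static void _preempt_or_suspend(struct gs_part *p_ptr, struct gs_job *j_ptr)
{
	if (j_ptr->sig_state != GS_SUSPEND) {
		if (p_ptr->num_shadows &&
		    (slurm_job_preempt_mode(j_ptr->job_ptr) !=
		     PREEMPT_MODE_SUSPEND)) {
			/* preemption mode is not SUSPEND:
			 * queue the job for preemption instead */
			_preempt_job_queue(j_ptr->job_id);
		} else
			_suspend_job(j_ptr->job_id);
		j_ptr->sig_state = GS_SUSPEND;
		_clear_shadow(j_ptr);
	}
	j_ptr->row_state = GS_NO_ACTIVE;
}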
/* _cycle_job_list
 *
 * This is the heart of the timeslicer. The algorithm works as follows:
 *
 * 1. Each new job is added to the end of the job list, so the earliest job
 *    is at the front of the list.
 * 2. Any "shadow" jobs are first applied to the active_resmap. Then the
 *    active_resmap is filled out by starting with the first job in the list,
 *    and adding to it any job that doesn't conflict with the resources.
 * 3. When the timeslice has passed, all jobs that were added to the active
 *    resmap are moved to the back of the list (preserving their order among
 *    each other).
 * 4. Loop back to step 2, starting with the new "first job in the list".
 */
static void _cycle_job_list(struct gs_part *p_ptr)
{
	int i, j;
	struct gs_job *j_ptr;

	if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
		info("gang: entering _cycle_job_list");

	/* re-prioritize the job_list and set all row_states to GS_NO_ACTIVE */
	for (i = 0; i < p_ptr->num_jobs; i++) {
		while (p_ptr->job_list[i]->row_state == GS_ACTIVE) {
			/* move this job to the back row and "deactivate" it */
			j_ptr = p_ptr->job_list[i];
			j_ptr->row_state = GS_NO_ACTIVE;
			for (j = i; j + 1 < p_ptr->num_jobs; j++) {
				p_ptr->job_list[j] = p_ptr->job_list[j+1];
			}
			p_ptr->job_list[j] = j_ptr;
		}
		if (p_ptr->job_list[i]->row_state == GS_FILLER)
			p_ptr->job_list[i]->row_state = GS_NO_ACTIVE;
	}
	if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
		info("gang: _cycle_job_list reordered job list:");

	/* Rebuild the active row. */
	_build_active_row(p_ptr);
	if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
		info("gang: _cycle_job_list new active job list:");
	_print_jobs(p_ptr);

	/* Suspend running jobs that are GS_NO_ACTIVE */
	for (i = 0; i < p_ptr->num_jobs; i++) {
		j_ptr = p_ptr->job_list[i];
		if ((j_ptr->row_state == GS_NO_ACTIVE) &&
		    (j_ptr->sig_state == GS_RESUME)) {
			if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) {
				info("gang: _cycle_job_list: suspending "
				     "job %u", j_ptr->job_id);
			}
			if (p_ptr->num_shadows &&
			    (slurm_job_preempt_mode(j_ptr->job_ptr) !=
			     PREEMPT_MODE_SUSPEND)) {
				_preempt_job_queue(j_ptr->job_id);
			} else
				_suspend_job(j_ptr->job_id);
			j_ptr->sig_state = GS_SUSPEND;
			_clear_shadow(j_ptr);
		}
	}

	/* Resume suspended jobs that are GS_ACTIVE */
	for (i = 0; i < p_ptr->num_jobs; i++) {
		j_ptr = p_ptr->job_list[i];
		if ((j_ptr->row_state == GS_ACTIVE) &&
		    (j_ptr->sig_state == GS_SUSPEND) &&
		    (j_ptr->job_ptr->priority != 0)) {	/* Redundant check */
			if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) {
				info("gang: _cycle_job_list: resuming job %u",
				     j_ptr->job_id);
			}
			_resume_job(j_ptr->job_id);
			j_ptr->sig_state = GS_RESUME;
			_cast_shadow(j_ptr, p_ptr->priority);
		}
	}
	if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
		info("gang: leaving _cycle_job_list");
}
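/* A self-contained sketch of the rotation in step 3 of the comment
 * above, on a plain int array: every element flagged "active" is
 * moved to the back while the relative order within both the moved
 * and the unmoved groups is preserved (names are illustrative only):
 */
static void _rotate_active_to_back(int *list, int *active, int n)
{
	int i, j, tmp;

	for (i = 0; i < n; i++) {
		/* keep moving jobs out of slot i while an active one
		 * occupies it */
		while (active[i]) {
			tmp = list[i];
			for (j = i; j + 1 < n; j++) {
				list[j]   = list[j + 1];
				active[j] = active[j + 1];
			}
			list[j]   = tmp;
			active[j] = 0;	/* "deactivate" the moved job */
		}
	}
}

/* e.g. list = {101,102,103,104,105} with active = {1,0,1,0,0}
 * yields list = {102,104,105,101,103} and active all zero */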
/* Rebuild data structures from scratch
 *
 * A reconfigure can affect this plugin in these ways:
 * - partitions can be added or removed
 *   - this affects the gs_part_list
 * - nodes can be removed from a partition, or added to a partition
 *   - this affects the size of the active resmap
 *
 * Here's the plan:
 * 1. save a copy of the global structures, and then construct
 *    new ones.
 * 2. load the new partition structures with existing jobs,
 *    confirming the job exists and resizing their resmaps
 *    (if necessary).
 * 3. make sure all partitions are accounted for. If a partition
 *    was removed, make sure any jobs that were in the queue and
 *    that were suspended are resumed. Conversely, if a partition
 *    was added, check for existing jobs that may be contending
 *    for resources that we could begin timeslicing.
 * 4. delete the old global structures and return.
 */
extern int gs_reconfig(void)
{
	int i;
	ListIterator part_iterator;
	struct gs_part *p_ptr, *newp_ptr;
	List old_part_list;
	struct job_record *job_ptr;
	struct gs_job *j_ptr;

	if (!timeslicer_thread_id) {
		/* gs_init() will be called later from read_slurm_conf()
		 * if we are enabling gang scheduling via reconfiguration */
		return SLURM_SUCCESS;
	}

	if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
		info("gang: entering gs_reconfig");
	pthread_mutex_lock(&data_mutex);

	old_part_list = gs_part_list;
	gs_part_list = NULL;

	/* reset global data */
	gs_fast_schedule = slurm_get_fast_schedule();
	gr_type = _get_gr_type();
	_load_phys_res_cnt();
	_build_parts();

	/* scan the old part list and add existing jobs to the new list */
	part_iterator = list_iterator_create(old_part_list);
	while ((p_ptr = (struct gs_part *) list_next(part_iterator))) {
		newp_ptr = (struct gs_part *) list_find_first(gs_part_list,
							      _find_gs_part,
							      p_ptr->part_name);
		if (!newp_ptr) {
			/* this partition was removed, so resume
			 * any jobs suspended by gang and continue */
			for (i = 0; i < p_ptr->num_jobs; i++) {
				j_ptr = p_ptr->job_list[i];
				if ((j_ptr->sig_state == GS_SUSPEND) &&
				    (j_ptr->job_ptr->priority != 0)) {
					info("resuming job in missing part %s",
					     p_ptr->part_name);
					_resume_job(j_ptr->job_id);
					j_ptr->sig_state = GS_RESUME;
				}
			}
			continue;
		}
		if (p_ptr->num_jobs == 0)	/* no jobs to transfer */
			continue;

		/* we need to transfer the jobs from p_ptr to newp_ptr and
		 * adjust their resmaps (if necessary). then we need to create
		 * the active resmap and adjust the state of each job (if
		 * necessary). NOTE: there could be jobs that only overlap
		 * on nodes that are no longer in the partition, but we're
		 * not going to worry about those cases.
		 *
		 * add the jobs from p_ptr into newp_ptr in their current
		 * order to preserve the state of timeslicing.
		 */
		for (i = 0; i < p_ptr->num_jobs; i++) {
			job_ptr = find_job_record(p_ptr->job_list[i]->job_id);
			if (job_ptr == NULL) {
				/* job no longer exists in SLURM,
				 * so drop it */
				continue;
			}
			/* resume any job that is suspended by us */
			if (IS_JOB_SUSPENDED(job_ptr) && job_ptr->priority) {
				if (slurmctld_conf.debug_flags &
				    DEBUG_FLAG_GANG) {
					info("resuming job %u apparently "
					     "suspended by gang",
					     job_ptr->job_id);
				}
				_resume_job(job_ptr->job_id);
			}

			/* transfer the job as long as it is still active */
			if (IS_JOB_SUSPENDED(job_ptr) ||
			    IS_JOB_RUNNING(job_ptr)) {
				_add_job_to_part(newp_ptr, job_ptr);
			}
		}
	}
	list_iterator_destroy(part_iterator);

	/* confirm all jobs: scan the master job_list and confirm that we
	 * are tracking all jobs */
	_scan_slurm_job_list();

	FREE_NULL_LIST(old_part_list);
	pthread_mutex_unlock(&data_mutex);

	_preempt_job_dequeue();	/* MUST BE OUTSIDE OF data_mutex lock */

	if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
		info("gang: leaving gs_reconfig");

	return SLURM_SUCCESS;
}
/* Ensure that all jobs running in SLURM are accounted for.
 * This procedure assumes that the gs data has already been
 * locked by the caller!
 */
static void _scan_slurm_job_list(void)
{
	struct job_record *job_ptr;
	struct gs_part *p_ptr;
	int i;
	ListIterator job_iterator;
	char *part_name;

	if (!job_list) {	/* no jobs */
		if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
			info("gang: _scan_slurm_job_list: job_list NULL");
		return;
	}
	if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
		info("gang: _scan_slurm_job_list: job_list exists...");
	job_iterator = list_iterator_create(job_list);
	while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
		if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) {
			info("gang: _scan_slurm_job_list: checking job %u",
			     job_ptr->job_id);
		}
		if (IS_JOB_PENDING(job_ptr))
			continue;
		if (IS_JOB_SUSPENDED(job_ptr) && (job_ptr->priority == 0))
			continue;	/* not suspended by us */

		if (job_ptr->part_ptr && job_ptr->part_ptr->name)
			part_name = job_ptr->part_ptr->name;
		else
			part_name = job_ptr->partition;

		if (IS_JOB_SUSPENDED(job_ptr) || IS_JOB_RUNNING(job_ptr)) {
			/* are we tracking this job already? */
			p_ptr = list_find_first(gs_part_list, _find_gs_part,
						part_name);
			if (!p_ptr)	/* no partition */
				continue;
			i = _find_job_index(p_ptr, job_ptr->job_id);
			if (i >= 0)	/* we're tracking it, so continue */
				continue;

			/* We're not tracking this job. Resume it if it's
			 * suspended, and then add it to the job list. */
			if (IS_JOB_SUSPENDED(job_ptr) && job_ptr->priority) {
				/* The likely scenario here is that the
				 * controller failed over, and this is a job
				 * that gang had previously suspended. It's
				 * not possible to determine the previous
				 * order of jobs without preserving gang
				 * state, which is not worth the extra
				 * infrastructure. Just resume the job and
				 * then add it to the job list. */
				_resume_job(job_ptr->job_id);
			}
			_add_job_to_part(p_ptr, job_ptr);
			continue;
		}

		/* if the job is not pending, suspended, or running, then
		 * it's completing or completed. Make sure we've released
		 * this job. */
		p_ptr = list_find_first(gs_part_list, _find_gs_part,
					part_name);
		if (!p_ptr)	/* no partition */
			continue;
		_remove_job_from_part(job_ptr->job_id, p_ptr, false);
	}
	list_iterator_destroy(job_iterator);

	/* now that all of the old jobs have been flushed out,
	 * update the active row of all partitions */
	_update_all_active_rows();
}