/*
 * Build a list describing the power allocated to, and currently used by,
 * every running job.
 *
 * job_list IN - list of struct job_record entries to scan
 * node_record_table_ptr IN - node table; indexed with the bit positions set
 *	in each job's node_bitmap
 * RET list of power_by_job_t records, one per running job. A job whose
 *	node_bitmap is NULL or empty still gets a record, with only job_id
 *	and start_time filled in here.
 *
 * NOTE: Job data structure must be locked on function entry
 * NOTE: Call list_delete() to free return value
 * NOTE: This function is currently unused.
 */
extern List get_job_power(List job_list,
			  struct node_record *node_record_table_ptr)
{
	struct node_record *node;
	struct job_record *job;
	ListIterator iter;
	power_by_job_t *rec;
	char id_str[64] = "";
	int inx, first_bit, last_bit;
	uint64_t dbg_flags = slurm_get_debug_flags();
	List result = list_create(_job_power_del);
	time_t now = time(NULL);

	iter = list_iterator_create(job_list);
	for (job = list_next(iter); job; job = list_next(iter)) {
		if (!IS_JOB_RUNNING(job))
			continue;

		/* The record is owned by the result list from here on */
		rec = xmalloc(sizeof(*rec));
		rec->job_id = job->job_id;
		rec->start_time = job->start_time;
		list_append(result, rec);

		if (job->node_bitmap == NULL) {
			error("%s: %s node_bitmap is NULL", __func__,
			      jobid2fmt(job, id_str, sizeof(id_str)));
			continue;
		}

		first_bit = bit_ffs(job->node_bitmap);
		if (first_bit < 0)
			continue;	/* no bits set, nothing to sum */
		last_bit = bit_fls(job->node_bitmap);

		/* Sum cap and current draw over every node in the bitmap */
		for (inx = first_bit; inx <= last_bit; inx++) {
			if (bit_test(job->node_bitmap, inx)) {
				node = &node_record_table_ptr[inx];
				if (node->power) {
					rec->alloc_watts +=
						node->power->cap_watts;
				}
				if (node->energy) {
					rec->used_watts +=
						node->energy->current_watts;
				}
			}
		}

		if (dbg_flags & DEBUG_FLAG_POWER) {
			info("%s: %s Age=%ld(sec) AllocWatts=%u UsedWatts=%u",
			     __func__,
			     jobid2fmt(job, id_str, sizeof(id_str)),
			     (long int) difftime(now, rec->start_time),
			     rec->alloc_watts, rec->used_watts);
		}
	}
	list_iterator_destroy(iter);

	return result;
}
/* * Trigger a job's burst buffer stage-out to begin * * Returns a SLURM errno. */ extern int bb_p_job_start_stage_out(struct job_record *job_ptr) { //FIXME: How to handle various job terminate states (e.g. requeue, failure), user script controlled? //FIXME: Test for memory leaks bb_alloc_t *bb_ptr; char **script_argv, *resp; int i, status = 0; char jobid_buf[32]; if (bb_state.bb_config.debug_flag) { info("%s: %s: %s", plugin_type, __func__, jobid2fmt(job_ptr, jobid_buf, sizeof(jobid_buf))); } if (!bb_state.bb_config.start_stage_out) return SLURM_ERROR; if ((job_ptr->burst_buffer == NULL) || (job_ptr->burst_buffer[0] == '\0') || (_get_bb_size(job_ptr) == 0)) return SLURM_SUCCESS; pthread_mutex_lock(&bb_state.bb_mutex); bb_ptr = bb_find_alloc_rec(&bb_state, job_ptr); if (!bb_ptr) { /* No job buffers. Assuming use of persistent buffers only */ debug("%s: %s bb_rec not found", __func__, jobid2fmt(job_ptr, jobid_buf, sizeof(jobid_buf))); } else { script_argv = _build_stage_args(bb_state.bb_config.start_stage_out, "start_stage_out", job_ptr, bb_ptr->size); if (script_argv) { bb_ptr->state = BB_STATE_STAGING_OUT; bb_ptr->state_time = time(NULL); resp = bb_run_script("StartStageOut", bb_state.bb_config.start_stage_out, script_argv, -1, &status); if (resp) { error("%s: StartStageOut: %s", __func__, resp); xfree(resp); } for (i = 0; script_argv[i]; i++) xfree(script_argv[i]); xfree(script_argv); } else { bb_ptr->state = BB_STATE_STAGED_OUT; bb_ptr->state_time = time(NULL); } } pthread_mutex_unlock(&bb_state.bb_mutex); return SLURM_SUCCESS; }
/*
 * Find the per-job burst buffer record belonging to a specific job.
 *
 * state_ptr IN - burst buffer state holding the allocation hash table
 * job_ptr IN - job to look up; its user_id selects the hash bucket
 * RET the record whose job_id and user_id both match, or NULL if none
 */
extern bb_alloc_t *bb_find_alloc_rec(bb_state_t *state_ptr,
				     struct job_record *job_ptr)
{
	bb_alloc_t *rec;
	char id_str[32];

	xassert(job_ptr);
	xassert(state_ptr);

	for (rec = state_ptr->bb_ahash[job_ptr->user_id % BB_HASH_SIZE];
	     rec != NULL; rec = rec->next) {
		if (rec->job_id != job_ptr->job_id)
			continue;

		if (rec->user_id == job_ptr->user_id) {
			xassert(rec->magic == BB_ALLOC_MAGIC);
			return rec;
		}

		/* Same job ID but different owner: log it and keep looking.
		 * This has been observed when slurmctld crashed and the job
		 * state recovered was missing some jobs which already had
		 * burst buffers configured. */
		error("%s: Slurm state inconsistent with burst "
		      "buffer. %s has UserID mismatch (%u != %u)",
		      __func__,
		      jobid2fmt(job_ptr, id_str, sizeof(id_str)),
		      rec->user_id, job_ptr->user_id);
	}

	return NULL;
}
/* * Determine if a job's burst buffer stage-out is complete * * RET: 0 - stage-out is underway * 1 - stage-out complete * -1 - fatal error */ extern int bb_p_job_test_stage_out(struct job_record *job_ptr) { bb_alloc_t *bb_ptr; int rc = -1; char jobid_buf[32]; if (bb_state.bb_config.debug_flag) { info("%s: %s: %s", plugin_type, __func__, jobid2fmt(job_ptr, jobid_buf, sizeof(jobid_buf))); } if ((job_ptr->burst_buffer == NULL) || (job_ptr->burst_buffer[0] == '\0') || (_get_bb_size(job_ptr) == 0)) return 1; pthread_mutex_lock(&bb_state.bb_mutex); bb_ptr = bb_find_alloc_rec(&bb_state, job_ptr); if (!bb_ptr) { /* No job buffers. Assuming use of persistent buffers only */ debug("%s: %s bb_rec not found", __func__, jobid2fmt(job_ptr, jobid_buf, sizeof(jobid_buf))); rc = 1; } else { if (bb_ptr->state < BB_STATE_STAGED_OUT) _load_state(job_ptr->job_id); if (bb_ptr->state == BB_STATE_STAGING_OUT) { rc = 0; } else if (bb_ptr->state == BB_STATE_STAGED_OUT) { if (bb_ptr->size != 0) { //FIXME: VESTIGIAL: Use bb_limit_rem // bb_remove_user_load(bb_ptr, &bb_state); bb_ptr->size = 0; } rc = 1; } else { error("%s: %s bb_state:%u", __func__, jobid2fmt(job_ptr, jobid_buf, sizeof(jobid_buf)), bb_ptr->state); rc = -1; } } pthread_mutex_unlock(&bb_state.bb_mutex); return rc; }
/* * Determine if a job's burst buffer stage-in is complete * job_ptr IN - Job to test * test_only IN - If false, then attempt to allocate burst buffer if possible * * RET: 0 - stage-in is underway * 1 - stage-in complete * -1 - stage-in not started or burst buffer in some unexpected state */ extern int bb_p_job_test_stage_in(struct job_record *job_ptr, bool test_only) { bb_alloc_t *bb_ptr; uint64_t bb_size = 0; int rc = 1; char jobid_buf[32]; if (bb_state.bb_config.debug_flag) { info("%s: %s: %s", plugin_type, __func__, jobid2fmt(job_ptr, jobid_buf, sizeof(jobid_buf))); } if ((job_ptr->burst_buffer == NULL) || (job_ptr->burst_buffer[0] == '\0') || ((bb_size = _get_bb_size(job_ptr)) == 0)) return rc; pthread_mutex_lock(&bb_state.bb_mutex); bb_ptr = bb_find_alloc_rec(&bb_state, job_ptr); if (!bb_ptr) { debug("%s: %s bb_rec not found", __func__, jobid2fmt(job_ptr, jobid_buf, sizeof(jobid_buf))); rc = -1; if ((test_only == false) && (_test_size_limit(job_ptr, bb_size) == 0)) _alloc_job_bb(job_ptr, bb_size); } else { if (bb_ptr->state < BB_STATE_STAGED_IN) _load_state(job_ptr->job_id); if (bb_ptr->state < BB_STATE_STAGED_IN) { rc = 0; } else if (bb_ptr->state == BB_STATE_STAGED_IN) { rc = 1; } else { error("%s: %s bb_state:%u", __func__, jobid2fmt(job_ptr, jobid_buf, sizeof(jobid_buf)), bb_ptr->state); rc = -1; } } pthread_mutex_unlock(&bb_state.bb_mutex); return rc; }
/* * Terminate any file staging and completely release burst buffer resources * * Returns a SLURM errno. */ extern int bb_p_job_cancel(struct job_record *job_ptr) { bb_alloc_t *bb_ptr; char **script_argv, *resp; int i, status = 0; char jobid_buf[32]; if (bb_state.bb_config.debug_flag) { info("%s: %s: %s", plugin_type, __func__, jobid2fmt(job_ptr, jobid_buf, sizeof(jobid_buf))); } if (!bb_state.bb_config.stop_stage_out) return SLURM_ERROR; if ((job_ptr->burst_buffer == NULL) || (job_ptr->burst_buffer[0] == '\0') || (_get_bb_size(job_ptr) == 0)) return SLURM_SUCCESS; pthread_mutex_lock(&bb_state.bb_mutex); bb_ptr = bb_find_alloc_rec(&bb_state, job_ptr); if (!bb_ptr) { _stop_stage_out(job_ptr->job_id); } else { script_argv = _build_stage_args(bb_state.bb_config.stop_stage_out, "stop_stage_out", job_ptr, 0); if (script_argv) { bb_ptr->state = BB_STATE_STAGED_OUT; bb_ptr->state_time = time(NULL); resp = bb_run_script("StopStageOut", bb_state.bb_config.stop_stage_out, script_argv, -1, &status); if (resp) { error("%s: StopStageOut: %s", __func__, resp); xfree(resp); } for (i = 0; script_argv[i]; i++) xfree(script_argv[i]); xfree(script_argv); } else { _stop_stage_out(job_ptr->job_id); bb_ptr->cancelled = true; bb_ptr->end_time = 0; bb_ptr->state = BB_STATE_STAGED_OUT; bb_ptr->state_time = time(NULL); } } pthread_mutex_unlock(&bb_state.bb_mutex); return SLURM_SUCCESS; }
static void _alloc_job_bb(struct job_record *job_ptr, uint64_t bb_size) { char **script_argv, *resp; bb_alloc_t *bb_ptr; int i, status = 0; bb_job_t *bb_spec; char jobid_buf[32]; bb_spec = xmalloc(sizeof(bb_job_t)); bb_spec->total_size = bb_size; bb_ptr = bb_alloc_job(&bb_state, job_ptr, bb_spec); xfree(bb_spec); if (bb_state.bb_config.debug_flag) { info("%s: start stage-in %s", __func__, jobid2fmt(job_ptr, jobid_buf, sizeof(jobid_buf))); } script_argv = _build_stage_args(bb_state.bb_config.start_stage_in, "start_stage_in", job_ptr, bb_size); if (script_argv) { bb_ptr->state = BB_STATE_STAGING_IN; bb_ptr->state_time = time(NULL); resp = bb_run_script("StartStageIn", bb_state.bb_config.start_stage_in, script_argv, -1, &status); if (resp) { error("%s: StartStageIn: %s", __func__, resp); xfree(resp); } for (i = 0; script_argv[i]; i++) xfree(script_argv[i]); xfree(script_argv); } else { bb_ptr->state = BB_STATE_STAGED_IN; bb_ptr->state_time = time(NULL); } }
/* * For a given job, return our best guess if when it might be able to start */ extern time_t bb_p_job_get_est_start(struct job_record *job_ptr) { bb_alloc_t *bb_ptr; time_t est_start = time(NULL); uint64_t bb_size; int rc; char jobid_buf[32]; if (bb_state.bb_config.debug_flag) { info("%s: %s: %s", plugin_type, __func__, jobid2fmt(job_ptr, jobid_buf, sizeof(jobid_buf))); } if ((job_ptr->burst_buffer == NULL) || (job_ptr->burst_buffer[0] == '\0') || ((bb_size = _get_bb_size(job_ptr)) == 0)) return est_start; pthread_mutex_lock(&bb_state.bb_mutex); bb_ptr = bb_find_alloc_rec(&bb_state, job_ptr); if (!bb_ptr) { rc = _test_size_limit(job_ptr, bb_size); if (rc == 0) { /* Could start now */ ; } else if (rc == 1) { /* Exceeds configured limits */ est_start += 365 * 24 * 60 * 60; } else { /* No space currently available */ est_start = MAX(est_start, bb_state.next_end_time); } } else if (bb_ptr->state < BB_STATE_STAGED_IN) { est_start++; } pthread_mutex_unlock(&bb_state.bb_mutex); return est_start; }
/* Test if a job can be allocated a burst buffer. * This may preempt currently active stage-in for higher priority jobs. * * RET 0: Job can be started now * 1: Job exceeds configured limits, continue testing with next job * 2: Job needs more resources than currently available can not start, * skip all remaining jobs */ static int _test_size_limit(struct job_record *job_ptr, uint64_t add_space) { burst_buffer_info_msg_t *resv_bb; struct preempt_bb_recs *preempt_ptr = NULL; List preempt_list; ListIterator preempt_iter; uint64_t resv_space = 0; int add_total_space_needed = 0, add_user_space_needed = 0; int add_total_space_avail = 0, add_user_space_avail = 0; time_t now = time(NULL), when; bb_alloc_t *bb_ptr = NULL; int i; char jobid_buf[32]; if (job_ptr->start_time <= now) when = now; else when = job_ptr->start_time; resv_bb = job_test_bb_resv(job_ptr, when); if (resv_bb) { burst_buffer_info_t *resv_bb_ptr; for (i = 0, resv_bb_ptr = resv_bb->burst_buffer_array; i < resv_bb->record_count; i++, resv_bb_ptr++) { if (resv_bb_ptr->name && strcmp(resv_bb_ptr->name, bb_state.name)) continue; resv_bb_ptr->used_space = bb_granularity(resv_bb_ptr->used_space, bb_state.bb_config.granularity); resv_space += resv_bb_ptr->used_space; } slurm_free_burst_buffer_info_msg(resv_bb); } add_total_space_needed = bb_state.used_space + add_space + resv_space - bb_state.total_space; if ((add_total_space_needed <= 0) && (add_user_space_needed <= 0)) return 0; /* Identify candidate burst buffers to revoke for higher priority job */ preempt_list = list_create(bb_job_queue_del); for (i = 0; i < BB_HASH_SIZE; i++) { bb_ptr = bb_state.bb_ahash[i]; while (bb_ptr) { if (bb_ptr->job_id && (bb_ptr->use_time > now) && (bb_ptr->use_time > job_ptr->start_time)) { preempt_ptr = xmalloc(sizeof( struct preempt_bb_recs)); preempt_ptr->bb_ptr = bb_ptr; preempt_ptr->job_id = bb_ptr->job_id; preempt_ptr->size = bb_ptr->size; preempt_ptr->use_time = bb_ptr->use_time; preempt_ptr->user_id = bb_ptr->user_id; 
list_push(preempt_list, preempt_ptr); add_total_space_avail += bb_ptr->size; if (bb_ptr->user_id == job_ptr->user_id) add_user_space_avail += bb_ptr->size; } bb_ptr = bb_ptr->next; } } if ((add_total_space_avail >= add_total_space_needed) && (add_user_space_avail >= add_user_space_needed)) { list_sort(preempt_list, bb_preempt_queue_sort); preempt_iter = list_iterator_create(preempt_list); while ((preempt_ptr = list_next(preempt_iter)) && (add_total_space_needed || add_user_space_needed)) { if (add_user_space_needed && (preempt_ptr->user_id == job_ptr->user_id)) { _stop_stage_in(preempt_ptr->job_id); preempt_ptr->bb_ptr->cancelled = true; preempt_ptr->bb_ptr->end_time = 0; if (bb_state.bb_config.debug_flag) { info("%s: %s: Preempting stage-in of " "job %u for %s", plugin_type, __func__, preempt_ptr->job_id, jobid2fmt(job_ptr, jobid_buf, sizeof(jobid_buf))); } add_user_space_needed -= preempt_ptr->size; add_total_space_needed -= preempt_ptr->size; } if ((add_total_space_needed > add_user_space_needed) && (preempt_ptr->user_id != job_ptr->user_id)) { _stop_stage_in(preempt_ptr->job_id); preempt_ptr->bb_ptr->cancelled = true; preempt_ptr->bb_ptr->end_time = 0; if (bb_state.bb_config.debug_flag) { info("%s: %s: Preempting stage-in of " "job %u for %s", plugin_type, __func__, preempt_ptr->job_id, jobid2fmt(job_ptr, jobid_buf, sizeof(jobid_buf))); } add_total_space_needed -= preempt_ptr->size; } } list_iterator_destroy(preempt_iter); } FREE_NULL_LIST(preempt_list); return 2; }