/*
 * Wait for the job tracked in this handle's context to complete.
 *
 * Installs sig_handler for SIGINT, registers waitjob_cb for JSC status
 * notifications, touches the optional "start" file to signal readiness,
 * handles the case where complete_job () already reports completion,
 * and then runs the reactor.
 *
 * Returns 0 once flux_reactor_run () returns successfully, or -1 if the
 * signal handler cannot be installed, the JSC callback cannot be
 * registered, or the reactor reports an error.
 */
static int wait_job_complete (flux_t h)
{
    int rc = -1;
    /* Publish the handle in the file-scope variable; presumably read by
     * sig_handler -- confirm against its definition. */
    sig_flux_h = h;
    wjctx_t *ctx = getctx (h);

    if (signal (SIGINT, sig_handler) == SIG_ERR) {
        flux_log (h, LOG_ERR, "failed to install SIGINT handler");
        goto done;
    }

    /* Without a registered callback nothing will be notified of job
     * state changes, so do not signal readiness or enter the reactor
     * in that case: fail instead. */
    if (jsc_notify_status (h, waitjob_cb, (void *)h) != 0) {
        flux_log (h, LOG_ERR, "failed to register a waitjob CB");
        goto done;
    }

    /* once jsc_notify_status is returned, all of JSC events
     * will be queued and delivered. It is safe to signal
     * readiness.
     */
    if (ctx->start)
        touch_outfile (ctx->start);

    /* complete_job () returned non-zero: record completion right away. */
    if (complete_job (ctx)) {
        if (ctx->complete)
            touch_outfile (ctx->complete);
        flux_log (ctx->h, LOG_INFO, "wait_job_complete: completion detected");
    }

    if (flux_reactor_run (flux_get_reactor (h), 0) < 0) {
        flux_log (h, LOG_ERR, "error in flux_reactor_run");
        goto done;
    }
    rc = 0;
done:
    return rc;
}
// Model io contention that occurred between previous event and the // curr sim time. Remove completed jobs from the list of running jobs static int advance_time (ctx_t *ctx, zhash_t *job_hash) { // TODO: Make this not static? (pass it in?, store it in ctx?) static double curr_time = 0; job_t *job = NULL; int num_jobs = -1; double next_event = -1, next_termination = -1, curr_progress = -1 #if SIMEXEC_IO ,io_penalty = 0, io_percentage = 0; double *job_min_bandwidth = NULL; #else ; #endif zlist_t *running_jobs = ctx->running_jobs; double sim_time = ctx->sim_state->sim_time; while (curr_time < sim_time) { num_jobs = zlist_size (running_jobs); if (num_jobs == 0) { curr_time = sim_time; break; } next_termination = determine_next_termination (ctx, curr_time, job_hash); next_event = ((sim_time < next_termination) || (next_termination < 0)) ? sim_time : next_termination; // min of the two while (num_jobs > 0) { job = zlist_pop (running_jobs); if (job->start_time <= curr_time) { #if SIMEXEC_IO // Get the minimum bandwidth between a resource in the job and // the pfs job_min_bandwidth = get_job_min_from_hash (job_hash, job->id); io_penalty = determine_io_penalty (job->io_rate, *job_min_bandwidth); io_percentage = (io_penalty / (io_penalty + 1)); job->io_time += (next_event - curr_time) * io_percentage; #endif curr_progress = calc_curr_progress (job, next_event); if (curr_progress < 1) zlist_append (running_jobs, job); else complete_job (ctx, job, next_event); } else { zlist_append (running_jobs, job); } num_jobs--; } curr_time = next_event; } return 0; }
// Remove completed jobs from the list of running jobs // Update sched timer as necessary (to trigger an event in sched) // Also change the state of the job in the KVS static int handle_completed_jobs (ctx_t *ctx) { double curr_progress; zlist_t *running_jobs = ctx->running_jobs; job_t *job = NULL; int num_jobs = zlist_size (running_jobs); double sim_time = ctx->sim_state->sim_time; // print_next_completing (running_jobs, ctx); while (num_jobs > 0) { job = zlist_pop (running_jobs); if (job->execution_time > 0) { curr_progress = calc_curr_progress (job, ctx->sim_state->sim_time); } else { curr_progress = 1; flux_log (ctx->h, LOG_DEBUG, "handle_completed_jobs found a job (%d) with execution " "time <= 0 (%f), setting progress = 1", job->id, job->execution_time); } if (curr_progress < 1) { zlist_append (running_jobs, job); } else { flux_log (ctx->h, LOG_DEBUG, "handle_completed_jobs found a completed job"); complete_job (ctx, job, sim_time); } num_jobs--; } return 0; }