extern int select_p_step_finish(struct step_record *step_ptr) { select_jobinfo_t *jobinfo = step_ptr->select_jobinfo->data; if (slurmctld_conf.select_type_param & CR_NHC_STEP_NO) { debug3("NHC_No_Steps set not running NHC on steps."); other_step_finish(step_ptr); /* free resources on the job */ post_job_step(step_ptr); return SLURM_SUCCESS; } /* The NHC needs to be ran after each step even if the job is about to run the NHC for the allocation. The NHC developers feel this is needed. If it ever changes just remove the below commented code. */ /* else if (IS_JOB_COMPLETING(step_ptr->job_ptr)) { */ /* debug3("step completion %u.%u was received after job " */ /* "allocation is already completing, no extra NHC needed.", */ /* step_ptr->job_ptr->job_id, step_ptr->step_id); */ /* other_step_finish(step_ptr); */ /* /\* free resources on the job *\/ */ /* post_job_step(step_ptr); */ /* return SLURM_SUCCESS; */ /* } */ jobinfo->cleaning = 1; _spawn_cleanup_thread(step_ptr, _step_fini); return SLURM_SUCCESS; }
static void *_step_fini(void *args) { struct step_record *step_ptr = (struct step_record *)args; select_jobinfo_t *jobinfo = NULL; nhc_info_t nhc_info; /* Locks: Write job, write node */ slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK }; slurmctld_lock_t job_read_lock = { NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK }; if (!step_ptr) { error("_step_fini: no step ptr given, " "this should never happen"); return NULL; } memset(&nhc_info, 0, sizeof(nhc_info_t)); nhc_info.step = 1; lock_slurmctld(job_read_lock); nhc_info.jobid = step_ptr->job_ptr->job_id; nhc_info.apid = SLURM_ID_HASH(step_ptr->job_ptr->job_id, step_ptr->step_id); nhc_info.exit_code = step_ptr->exit_code; nhc_info.user_id = step_ptr->job_ptr->user_id; if (!step_ptr->step_layout || !step_ptr->step_layout->node_list) { if (step_ptr->job_ptr) nhc_info.nodelist = xstrdup(step_ptr->job_ptr->nodes); } else nhc_info.nodelist = xstrdup(step_ptr->step_layout->node_list); unlock_slurmctld(job_read_lock); /* run NHC */ _run_nhc(&nhc_info); /***********/ xfree(nhc_info.nodelist); lock_slurmctld(job_write_lock); if (!step_ptr->job_ptr || !step_ptr->step_node_bitmap) { error("For some reason we don't have a step_node_bitmap or " "a job_ptr for %"PRIu64". This should never happen.", nhc_info.apid); } else { other_step_finish(step_ptr); jobinfo = step_ptr->select_jobinfo->data; jobinfo->cleaning = 0; /* free resources on the job */ post_job_step(step_ptr); } unlock_slurmctld(job_write_lock); return NULL; }
static void *_step_fini(void *args) { struct step_record *step_ptr = (struct step_record *)args; select_jobinfo_t *jobinfo = NULL; uint64_t apid = 0; char *node_list = NULL; /* Locks: Write job, write node */ slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK }; slurmctld_lock_t job_read_lock = { NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK }; if (!step_ptr) { error("_step_fini: no step ptr given, " "this should never happen"); return NULL; } lock_slurmctld(job_read_lock); apid = SLURM_ID_HASH(step_ptr->job_ptr->job_id, step_ptr->step_id); if (!step_ptr->step_layout || !step_ptr->step_layout->node_list) { if (step_ptr->job_ptr) node_list = xstrdup(step_ptr->job_ptr->nodes); } else node_list = xstrdup(step_ptr->step_layout->node_list); unlock_slurmctld(job_read_lock); /* run NHC */ _run_nhc(apid, node_list, 0); /***********/ xfree(node_list); lock_slurmctld(job_write_lock); if (!step_ptr->job_ptr || !step_ptr->step_node_bitmap) { error("For some reason we don't have a step_node_bitmap or " "a job_ptr for %"PRIu64". This should never happen.", apid); } else { other_step_finish(step_ptr); jobinfo = step_ptr->select_jobinfo->data; jobinfo->cleaning = 0; /* free resources on the job */ post_job_step(step_ptr); } unlock_slurmctld(job_write_lock); return NULL; }
extern int select_p_step_finish(struct step_record *step_ptr) { select_jobinfo_t *jobinfo = step_ptr->select_jobinfo->data; if (IS_JOB_COMPLETING(step_ptr->job_ptr)) { debug3("step completion %u.%u was received after job " "allocation is already completing, no extra NHC needed.", step_ptr->job_ptr->job_id, step_ptr->step_id); other_step_finish(step_ptr); /* free resources on the job */ post_job_step(step_ptr); return SLURM_SUCCESS; } jobinfo->cleaning = 1; _spawn_cleanup_thread(step_ptr, _step_fini); return SLURM_SUCCESS; }