void req_commit(

  struct batch_request *preq)  /* I */

  {
  unsigned int  momport = 0;
  int           rc;
  job          *pj = locate_new_job(preq->rq_conn, preq->rq_ind.rq_commit);

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL",
      "committing job");
    }

  if (pj == NULL)
    {
    req_reject(PBSE_UNKJOBID, 0, preq, NULL, NULL);

    return;
    }

  if (pj->ji_qs.ji_substate != JOB_SUBSTATE_TRANSICM)
    {
    log_err(errno, "req_commit", (char *)"cannot commit job in unexpected state");

    req_reject(PBSE_IVALREQ, 0, preq, NULL, NULL);

    return;
    }

  /* move job from new job list to "all" job list, set to running state */

  delete_link(&pj->ji_alljobs);

  append_link(&svr_alljobs, &pj->ji_alljobs, pj);

  /*
  ** Set JOB_SVFLG_HERE to indicate that this is Mother Superior.
  */

  pj->ji_qs.ji_svrflags |= JOB_SVFLG_HERE;

  pj->ji_qs.ji_state    = JOB_STATE_RUNNING;
  pj->ji_qs.ji_substate = JOB_SUBSTATE_PRERUN;

  pj->ji_qs.ji_un_type = JOB_UNION_TYPE_MOM;

  pj->ji_qs.ji_un.ji_momt.ji_svraddr  = get_connectaddr(preq->rq_conn, FALSE);
  pj->ji_qs.ji_un.ji_momt.ji_exitstat = 0;

  /* For MOM - start up the job (blocks) */

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL",
      "req_commit:starting job execution");
    }

  rc = start_exec(pj);

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pj->ji_qs.ji_jobid,
      "req_commit:job execution started");
    }

  /* if start request fails, reply with failure string */

  if (pj->ji_qs.ji_substate == JOB_SUBSTATE_EXITING)
    {
    char tmpLine[1024];

    if ((pj->ji_hosts != NULL) &&
        (pj->ji_nodekill >= 0) &&
        (pj->ji_hosts[pj->ji_nodekill].hn_host != NULL))
      {
      sprintf(tmpLine, "start failed on node %s",
        pj->ji_hosts[pj->ji_nodekill].hn_host);
      }
    else
      {
      sprintf(tmpLine, "start failed on unknown node");
      }

    if (LOGLEVEL >= 6)
      {
      log_record(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL",
        tmpLine);
      }

    reply_text(preq, rc, tmpLine);
    }
  else
    {
    reply_sid(preq, pj->ji_wattr[JOB_ATR_session_id].at_val.at_long, BATCH_REPLY_CHOICE_Text);
    }

  if (multi_mom)
    {
    momport = pbs_rm_port;
    }

  job_save(pj, SAVEJOB_FULL, momport);

#ifdef NVIDIA_GPUS
  /*
   * Does this job have a gpuid assigned?
   * if so, then update gpu status
   */

  if ((use_nvidia_gpu) &&
      ((pj->ji_wattr[JOB_ATR_exec_gpus].at_flags & ATR_VFLAG_SET) != 0) &&
      (pj->ji_wattr[JOB_ATR_exec_gpus].at_val.at_str != NULL))
    {
    send_update_soon();
    }
#endif  /* NVIDIA_GPUS */

  /* NOTE: we used to flag JOB_ATR_errpath, JOB_ATR_outpath,
   * JOB_ATR_session_id, and JOB_ATR_altid as modified at this point to make sure
   * pbs_server got these attr values.  This worked fine before TORQUE changed job
   * launch into an async process.  At 2.0.0p6, a new pbs_attribute "SEND" flag
   * was added to handle this process. */

  return;
  }  /* END req_commit() */
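The closing NOTE refers to a per-attribute "SEND" flag that replaced the old practice of marking JOB_ATR_errpath, JOB_ATR_outpath, JOB_ATR_session_id, and JOB_ATR_altid as modified inside req_commit(). A minimal sketch of that idea follows, assuming a flag named ATR_VFLAG_SEND on the pbs_attribute flags word; the exact flag name and the point at which it is set are assumptions, not taken from this listing.

  /* Sketch only: mark an attribute so a later (asynchronous) update ships its
   * value back to pbs_server.  ATR_VFLAG_SEND is an assumed flag name here. */
  pj->ji_wattr[JOB_ATR_session_id].at_flags |= ATR_VFLAG_SET | ATR_VFLAG_SEND;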
void mom_job_purge(

  job *pjob)  /* I (modified) */

  {
  job_file_delete_info *jfdi;

  jfdi = (job_file_delete_info *)calloc(1, sizeof(job_file_delete_info));

  if (jfdi == NULL)
    {
    log_err(ENOMEM, __func__, (char *)"No space to allocate info for job file deletion");

    return;
    }

#ifdef NVIDIA_GPUS
  /*
   * Did this job have a gpuid assigned?
   * if so, then update gpu status
   */

  if (((pjob->ji_wattr[JOB_ATR_exec_gpus].at_flags & ATR_VFLAG_SET) != 0) &&
      (pjob->ji_wattr[JOB_ATR_exec_gpus].at_val.at_str != NULL))
    {
    send_update_soon();
    }
#endif  /* NVIDIA_GPUS */

  /* initialize struct information */

  if (pjob->ji_flags & MOM_HAS_TMPDIR)
    {
    jfdi->has_temp_dir = TRUE;
    pjob->ji_flags &= ~MOM_HAS_TMPDIR;
    }
  else
    jfdi->has_temp_dir = FALSE;

  strcpy(jfdi->jobid, pjob->ji_qs.ji_jobid);
  strcpy(jfdi->prefix, pjob->ji_qs.ji_fileprefix);

  if ((pjob->ji_wattr[JOB_ATR_checkpoint_dir].at_flags & ATR_VFLAG_SET) &&
      (pjob->ji_wattr[JOB_ATR_checkpoint_name].at_flags & ATR_VFLAG_SET))
    jfdi->checkpoint_dir = strdup(pjob->ji_wattr[JOB_ATR_checkpoint_dir].at_val.at_str);

  jfdi->gid = pjob->ji_qs.ji_un.ji_momt.ji_exgid;
  jfdi->uid = pjob->ji_qs.ji_un.ji_momt.ji_exuid;

  if (thread_unlink_calls == TRUE)
    enqueue_threadpool_request(delete_job_files, jfdi);
  else
    delete_job_files(jfdi);

  /* remove this job from the global queue */

  delete_link(&pjob->ji_jobque);
  delete_link(&pjob->ji_alljobs);

  if (LOGLEVEL >= 6)
    {
    sprintf(log_buffer, "removing job");

    log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);
    }

#if IBM_SP2==2  /* IBM SP PSSP 3.1 */
  unload_sp_switch(pjob);
#endif  /* IBM SP */

  mom_job_free(pjob);

  /* if no jobs are left, check if MOM should be restarted */

  if (((job *)GET_NEXT(svr_alljobs)) == NULL)
    MOMCheckRestart();

  return;
  }  /* END mom_job_purge() */
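The fields populated above imply a deletion-request record roughly shaped as follows. This is a sketch inferred from the assignments in mom_job_purge(); the field order, array sizes, and exact types are assumptions, and the authoritative definition lives in the MOM headers.

  typedef struct job_file_delete_info
    {
    char   jobid[PBS_MAXSVRJOBID + 1];  /* copied from ji_qs.ji_jobid */
    char   prefix[PBS_JOBBASE + 1];     /* copied from ji_qs.ji_fileprefix */
    char  *checkpoint_dir;              /* strdup'd when checkpoint dir/name are both set */
    uid_t  uid;                         /* execution uid (ji_exuid) */
    gid_t  gid;                         /* execution gid (ji_exgid) */
    int    has_temp_dir;                /* TRUE when MOM_HAS_TMPDIR was set on the job */
    } job_file_delete_info;

Carrying copies rather than a pointer back to the job matters because delete_job_files() may run on a threadpool thread (thread_unlink_calls == TRUE), and the job structure itself is freed by mom_job_free() before that work is guaranteed to have run.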
void mom_job_purge(

  job *pjob)  /* I (modified) */

  {
  job_file_delete_info *jfdi;

  jfdi = (job_file_delete_info *)calloc(1, sizeof(job_file_delete_info));

  if (jfdi == NULL)
    {
    log_err(ENOMEM, __func__, (char *)"No space to allocate info for job file deletion");

    return;
    }

#ifdef NVIDIA_GPUS
  /*
   * Did this job have a gpuid assigned?
   * if so, then update gpu status
   */

  if (((pjob->ji_wattr[JOB_ATR_exec_gpus].at_flags & ATR_VFLAG_SET) != 0) &&
      (pjob->ji_wattr[JOB_ATR_exec_gpus].at_val.at_str != NULL))
    {
    send_update_soon();
    }
#endif  /* NVIDIA_GPUS */

  /* initialize struct information */

  if (pjob->ji_flags & MOM_HAS_TMPDIR)
    {
    jfdi->has_temp_dir = TRUE;
    pjob->ji_flags &= ~MOM_HAS_TMPDIR;
    }
  else
    jfdi->has_temp_dir = FALSE;

  strcpy(jfdi->jobid, pjob->ji_qs.ji_jobid);
  strcpy(jfdi->prefix, pjob->ji_qs.ji_fileprefix);

  if ((pjob->ji_wattr[JOB_ATR_checkpoint_dir].at_flags & ATR_VFLAG_SET) &&
      (pjob->ji_wattr[JOB_ATR_checkpoint_name].at_flags & ATR_VFLAG_SET))
    jfdi->checkpoint_dir = strdup(pjob->ji_wattr[JOB_ATR_checkpoint_dir].at_val.at_str);

  jfdi->gid = pjob->ji_qs.ji_un.ji_momt.ji_exgid;
  jfdi->uid = pjob->ji_qs.ji_un.ji_momt.ji_exuid;

  /* remove each pid in ji_job_pid_set from the global_job_sid_set */

  for (job_pid_set_t::const_iterator job_pid_set_iter = pjob->ji_job_pid_set->begin();
       job_pid_set_iter != pjob->ji_job_pid_set->end();
       job_pid_set_iter++)
    {
    /* get pid entry from ji_job_pid_set */
    pid_t job_pid = *job_pid_set_iter;

    /* see if job_pid exists in job_sid set */
    job_pid_set_t::const_iterator it = global_job_sid_set.find(job_pid);

    if (it != global_job_sid_set.end())
      {
      /* remove job_pid from the set */
      global_job_sid_set.erase(it);
      }
    }

  if (thread_unlink_calls == TRUE)
    enqueue_threadpool_request(delete_job_files, jfdi, request_pool);
  else
    delete_job_files(jfdi);

  /* remove this job from the global queue */

  delete_link(&pjob->ji_jobque);
  delete_link(&pjob->ji_alljobs);

  remove_from_exiting_list(pjob);

  if (LOGLEVEL >= 6)
    {
    sprintf(log_buffer, "removing job");

    log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);
    }

#if IBM_SP2==2  /* IBM SP PSSP 3.1 */
  unload_sp_switch(pjob);
#endif  /* IBM SP */

  // We had a request to change the frequency for the job and now that the job is done
  // we want to change the frequency back.
  resource *presc = find_resc_entry(&pjob->ji_wattr[JOB_ATR_resource],
                      find_resc_def(svr_resc_def, "cpuclock", svr_resc_size));

  if (presc != NULL)
    {
    std::string beforeFreq;

    nd_frequency.get_frequency_string(beforeFreq);

    if (!nd_frequency.restore_frequency())
      {
      std::string msg = "Failed to restore frequency.";
      log_ext(nd_frequency.get_last_error(), __func__, msg.c_str(), LOG_ERR);
      }
    else
      {
      std::string afterFreq;
      nd_frequency.get_frequency_string(afterFreq);
      std::string msg = "Restored frequency from " + beforeFreq + " to " + afterFreq;
      log_ext(PBSE_CHANGED_CPU_FREQUENCY, __func__, msg.c_str(), LOG_NOTICE);
      }
    }

  mom_job_free(pjob);

  /* if no jobs are left, check if MOM should be restarted */

  if (((job *)GET_NEXT(svr_alljobs)) == NULL)
    MOMCheckRestart();

  return;
  }  /* END mom_job_purge() */
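This second variant of mom_job_purge() additionally scrubs the job's tracked process ids out of global_job_sid_set, removes the job from the exiting-jobs list, and restores the node's CPU frequency when the job had requested a cpuclock resource. The pid bookkeeping uses only standard associative-container operations (begin/end/find/erase over pid_t values), so it is consistent with ji_job_pid_set and global_job_sid_set being plain std::set<pid_t> containers. A self-contained sketch of the same erase pattern under that assumption (the typedef and helper name below are illustrative, not from the listing):

  #include <set>
  #include <sys/types.h>

  typedef std::set<pid_t> job_pid_set_t;  /* assumed definition, inferred from usage above */

  /* Remove every pid tracked for one job from the global sid set,
   * mirroring the loop in mom_job_purge(). */
  static void scrub_job_pids(const job_pid_set_t &job_pids, job_pid_set_t &global_sids)
    {
    for (job_pid_set_t::const_iterator it = job_pids.begin(); it != job_pids.end(); ++it)
      {
      job_pid_set_t::iterator found = global_sids.find(*it);

      if (found != global_sids.end())
        global_sids.erase(found);
      }
    }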