/*
 * req_jobcredential - handle a job credential request.
 *
 * Looks up the new job associated with the request's connection (no job id
 * is supplied to the lookup).  When a job is found the request is simply
 * acknowledged; otherwise it is rejected with PBSE_IVALREQ.  The request
 * payload itself is not examined by this routine.
 */

void req_jobcredential(

  struct batch_request *preq)  /* I - the decoded request */

  {
  job *new_job = locate_new_job(preq->rq_conn, NULL);

  if (new_job != NULL)
    {
    /* a job is being received on this connection - acknowledge */

    reply_ack(preq);
    }
  else
    {
    req_reject(PBSE_IVALREQ, 0, preq, NULL, NULL);
    }
  }  /* END req_jobcredential() */
void req_commit( struct batch_request *preq) /* I */ { job *pj; pj = locate_new_job(preq->rq_conn, preq->rq_ind.rq_commit); if (LOGLEVEL >= 6) { log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL", "committing job"); } if (pj == NULL) { req_reject(PBSE_UNKJOBID, 0, preq, NULL, NULL); return; } if (pj->ji_qs.ji_substate != JOB_SUBSTATE_TRANSICM) { log_err(errno, "req_commit", "cannot commit job in unexpected state"); req_reject(PBSE_IVALREQ, 0, preq, NULL, NULL); return; } /* move job from new job list to "all" job list, set to running state */ delete_link(&pj->ji_alljobs); append_link(&svr_alljobs, &pj->ji_alljobs, pj); /* ** Set JOB_SVFLG_HERE to indicate that this is Mother Superior. */ pj->ji_qs.ji_svrflags |= JOB_SVFLG_HERE; pj->ji_qs.ji_state = JOB_STATE_RUNNING; pj->ji_qs.ji_substate = JOB_SUBSTATE_PRERUN; pj->ji_qs.ji_un_type = JOB_UNION_TYPE_MOM; pj->ji_qs.ji_un.ji_momt.ji_svraddr = get_connectaddr(preq->rq_conn); pj->ji_qs.ji_un.ji_momt.ji_exitstat = 0; /* For MOM - start up the job (blocks) */ if (LOGLEVEL >= 6) { log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL", "starting job execution"); } start_exec(pj); if (LOGLEVEL >= 6) { log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL", "job execution started"); } /* if start request fails, reply with failure string */ if (pj->ji_qs.ji_substate == JOB_SUBSTATE_EXITING) { char tmpLine[1024]; if ((pj->ji_hosts != NULL) && (pj->ji_nodekill >= 0) && (pj->ji_hosts[pj->ji_nodekill].hn_host != NULL)) { sprintf(tmpLine, "start failed on node %s", pj->ji_hosts[pj->ji_nodekill].hn_host); } else { sprintf(tmpLine, "start failed on unknown node"); } if (LOGLEVEL >= 6) { log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, (pj != NULL) ? 
pj->ji_qs.ji_jobid : "NULL", tmpLine); } reply_text(preq, 0, tmpLine); } else { reply_jobid(preq, pj->ji_qs.ji_jobid, BATCH_REPLY_CHOICE_Commit); } job_save(pj, SAVEJOB_FULL); /* NOTE: we used to flag JOB_ATR_errpath, JOB_ATR_outpath, * JOB_ATR_session_id, and JOB_ATR_altid as modified at this point to make sure * pbs_server got these attr values. This worked fine before TORQUE modified * job launched into an async process. At 2.0.0p6, a new attribute "SEND" flag * was added to handle this process. */ return; } /* END req_commit() */
void req_rdytocommit( struct batch_request *preq) /* I */ { job *pj; int sock = preq->rq_conn; int OrigState; int OrigSState; char OrigSChar; long OrigFlags; pj = locate_new_job(sock, preq->rq_ind.rq_rdytocommit); if (LOGLEVEL >= 6) { log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL", "ready to commit job"); } if (pj == NULL) { log_err(errno, "req_rdytocommit", "unknown job id"); req_reject(PBSE_UNKJOBID, 0, preq, NULL, NULL); /* FAILURE */ return; } if (pj->ji_qs.ji_substate != JOB_SUBSTATE_TRANSIN) { log_err(errno, "req_rdytocommit", "cannot commit job in unexpected state"); req_reject(PBSE_IVALREQ, 0, preq, NULL, NULL); /* FAILURE */ return; } OrigState = pj->ji_qs.ji_state; OrigSState = pj->ji_qs.ji_substate; OrigSChar = pj->ji_wattr[(int)JOB_ATR_state].at_val.at_char; OrigFlags = pj->ji_wattr[(int)JOB_ATR_state].at_flags; pj->ji_qs.ji_state = JOB_STATE_TRANSIT; pj->ji_qs.ji_substate = JOB_SUBSTATE_TRANSICM; pj->ji_wattr[(int)JOB_ATR_state].at_val.at_char = 'T'; pj->ji_wattr[(int)JOB_ATR_state].at_flags |= ATR_VFLAG_SET; if (job_save(pj, SAVEJOB_NEW) == -1) { char tmpLine[1024]; sprintf(tmpLine, "cannot save job - errno=%d - %s", errno, strerror(errno)); log_err(errno, "req_rdytocommit", tmpLine); /* commit failed, backoff state changes */ pj->ji_qs.ji_state = OrigState; pj->ji_qs.ji_substate = OrigSState; pj->ji_wattr[(int)JOB_ATR_state].at_val.at_char = OrigSChar; pj->ji_wattr[(int)JOB_ATR_state].at_flags = OrigFlags; req_reject(PBSE_SYSTEM, 0, preq, NULL, tmpLine); /* FAILURE */ return; } /* acknowledge the request with the job id */ if (reply_jobid(preq, pj->ji_qs.ji_jobid, BATCH_REPLY_CHOICE_RdytoCom) != 0) { /* reply failed, purge the job and close the connection */ sprintf(log_buffer, "cannot report jobid - errno=%d - %s", errno, strerror(errno)); log_err(errno, "req_rdytocommit", log_buffer); close_conn(sock); job_purge(pj); /* FAILURE */ return; } if (LOGLEVEL >= 6) { log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, 
(pj != NULL) ? pj->ji_qs.ji_jobid : "NULL", "ready to commit job completed"); } return; } /* END req_rdytocommit() */
void req_mvjobfile( struct batch_request *preq) /* I */ { int fds; enum job_file jft; int oflag; job *pj; struct passwd *pwd; jft = (enum job_file)preq->rq_ind.rq_jobfile.rq_type; if (preq->rq_ind.rq_jobfile.rq_sequence == 0) oflag = O_CREAT | O_WRONLY | O_TRUNC; else oflag = O_CREAT | O_WRONLY | O_APPEND; pj = locate_new_job(preq->rq_conn, NULL); if (pj == NULL) pj = find_job(preq->rq_ind.rq_jobfile.rq_jobid); if (pj == NULL) { snprintf(log_buffer, 1024, "cannot find job %s for move of %s file", preq->rq_ind.rq_jobfile.rq_jobid, TJobFileType[jft]); log_err(-1, "req_mvjobfile", log_buffer); req_reject(PBSE_UNKJOBID, 0, preq, NULL, NULL); return; } if ((pj->ji_grpcache == NULL) && (check_pwd(pj) == NULL)) { req_reject(PBSE_UNKJOBID, 0, preq, NULL, NULL); return; } if (((pwd = getpwnam(pj->ji_wattr[(int)JOB_ATR_euser].at_val.at_str)) == NULL) || ((fds = open_std_file(pj, jft, oflag, pwd->pw_gid)) < 0)) { /* FAILURE */ req_reject(PBSE_MOMREJECT, 0, preq, NULL, "password lookup failed"); return; } if (write( fds, preq->rq_ind.rq_jobfile.rq_data, preq->rq_ind.rq_jobfile.rq_size) != preq->rq_ind.rq_jobfile.rq_size) { req_reject(PBSE_SYSTEM, 0, preq, NULL, "cannot create file"); } else { reply_ack(preq); } close(fds); if (LOGLEVEL >= 6) { sprintf(log_buffer, "successfully moved %s file for job '%s'", TJobFileType[jft], preq->rq_ind.rq_jobfile.rq_jobid); log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL", log_buffer); } return; } /* END req_mvjobfile() */
void req_jobscript( struct batch_request *preq) /* ptr to the decoded request*/ { char *id = "req_jobscript"; int fds; char namebuf[MAXPATHLEN]; job *pj; int filemode = 0700; extern char mom_host[]; errno = 0; pj = locate_new_job(preq->rq_conn, preq->rq_ind.rq_jobfile.rq_jobid); if (pj == NULL) { log_err(errno, id, "cannot locate new job"); req_reject(PBSE_IVALREQ, 0, preq, NULL, NULL); return; } /* what is the difference between JOB_SUBSTATE_TRANSIN and TRANSICM? */ if (pj->ji_qs.ji_substate != JOB_SUBSTATE_TRANSIN) { if (errno == 0) { sprintf(log_buffer, "job %s in unexpected state '%s'", pj->ji_qs.ji_jobid, PJobSubState[pj->ji_qs.ji_substate]); } else { sprintf(log_buffer, "job %s in unexpected state '%s' (errno=%d - %s)", pj->ji_qs.ji_jobid, PJobSubState[pj->ji_qs.ji_substate], errno, strerror(errno)); } log_err(errno, id, log_buffer); req_reject(PBSE_IVALREQ, 0, preq, mom_host, log_buffer); return; } /* mom - if job has been checkpointed, discard script,already have it */ if (pj->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) { /* SUCCESS - do nothing, ignore script */ reply_ack(preq); return; } strcpy(namebuf, path_jobs); strcat(namebuf, pj->ji_qs.ji_fileprefix); strcat(namebuf, JOB_SCRIPT_SUFFIX); if (pj->ji_qs.ji_un.ji_newt.ji_scriptsz == 0) { /* NOTE: fail is job script already exists */ fds = open(namebuf, O_WRONLY | O_CREAT | O_EXCL | O_Sync, filemode); } else { fds = open(namebuf, O_WRONLY | O_APPEND | O_Sync, filemode); } if (fds < 0) { char tmpLine[1024]; snprintf(tmpLine, sizeof(tmpLine), "cannot open '%s' errno=%d - %s", namebuf, errno, strerror(errno)); /* FAILURE */ /* NOTE: log_err may modify errno */ log_err(errno, id, msg_script_open); req_reject(PBSE_INTERNAL, 0, preq, mom_host, tmpLine); return; } if (write( fds, preq->rq_ind.rq_jobfile.rq_data, (unsigned)preq->rq_ind.rq_jobfile.rq_size) != preq->rq_ind.rq_jobfile.rq_size) { /* FAILURE */ log_err(errno, id, msg_script_write); req_reject(PBSE_INTERNAL, 0, preq, mom_host, "cannot write job 
command file"); close(fds); return; } close(fds); pj->ji_qs.ji_un.ji_newt.ji_scriptsz += preq->rq_ind.rq_jobfile.rq_size; /* job has a script file */ pj->ji_qs.ji_svrflags = (pj->ji_qs.ji_svrflags & ~JOB_SVFLG_CHECKPOINT_FILE) | JOB_SVFLG_SCRIPT; /* SUCCESS */ reply_ack(preq); return; } /* END req_jobscript() */
void req_commit( struct batch_request *preq) /* I */ { unsigned int momport = 0; int rc; job *pj = locate_new_job(preq->rq_conn, preq->rq_ind.rq_commit); if (LOGLEVEL >= 6) { log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL", "committing job"); } if (pj == NULL) { req_reject(PBSE_UNKJOBID, 0, preq, NULL, NULL); return; } if (pj->ji_qs.ji_substate != JOB_SUBSTATE_TRANSICM) { log_err(errno, "req_commit", (char *)"cannot commit job in unexpected state"); req_reject(PBSE_IVALREQ, 0, preq, NULL, NULL); return; } /* move job from new job list to "all" job list, set to running state */ delete_link(&pj->ji_alljobs); alljobs_list.push_back(pj); /* ** Set JOB_SVFLG_HERE to indicate that this is Mother Superior. */ pj->ji_qs.ji_svrflags |= JOB_SVFLG_HERE; pj->ji_qs.ji_state = JOB_STATE_RUNNING; pj->ji_qs.ji_substate = JOB_SUBSTATE_PRERUN; pj->ji_qs.ji_un_type = JOB_UNION_TYPE_MOM; pj->ji_qs.ji_un.ji_momt.ji_svraddr = get_connectaddr(preq->rq_conn,FALSE); pj->ji_qs.ji_un.ji_momt.ji_exitstat = 0; /* For MOM - start up the job (blocks) */ if (LOGLEVEL >= 6) log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pj->ji_qs.ji_jobid, "req_commit:starting job execution"); rc = start_exec(pj); if (LOGLEVEL >= 6) { log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pj->ji_qs.ji_jobid, "req_commit:job execution started"); } /* if start request fails, reply with failure string */ if (pj->ji_qs.ji_substate == JOB_SUBSTATE_EXITING) { char tmpLine[1024]; if ((pj->ji_hosts != NULL) && (pj->ji_nodekill >= 0) && (pj->ji_hosts[pj->ji_nodekill].hn_host != NULL)) { sprintf(tmpLine, "start failed on node %s", pj->ji_hosts[pj->ji_nodekill].hn_host); } else { sprintf(tmpLine, "start failed on unknown node"); } if (LOGLEVEL >= 6) { log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pj->ji_qs.ji_jobid, tmpLine); } reply_text(preq, rc, tmpLine); } else { reply_sid(preq, pj->ji_wattr[JOB_ATR_session_id].at_val.at_long,BATCH_REPLY_CHOICE_Text); } if (multi_mom) { momport = pbs_rm_port; 
} job_save(pj, SAVEJOB_FULL, momport); #ifdef NVIDIA_GPUS /* * Does this job have a gpuid assigned? * if so, then update gpu status */ if ((use_nvidia_gpu) && ((pj->ji_wattr[JOB_ATR_exec_gpus].at_flags & ATR_VFLAG_SET) != 0) && (pj->ji_wattr[JOB_ATR_exec_gpus].at_val.at_str != NULL)) { send_update_soon(); } #endif /* NVIDIA_GPUS */ /* NOTE: we used to flag JOB_ATR_errpath, JOB_ATR_outpath, * JOB_ATR_session_id, and JOB_ATR_altid as modified at this point to make sure * pbs_server got these attr values. This worked fine before TORQUE modified * job launched into an async process. At 2.0.0p6, a new pbs_attribute "SEND" flag * was added to handle this process. */ return; } /* END req_commit() */
void req_mvjobfile( struct batch_request *preq) /* I */ { int fds; enum job_file jft; int oflag; job *pj; struct passwd *pwd; char *buf = NULL; jft = (enum job_file)preq->rq_ind.rq_jobfile.rq_type; if (preq->rq_ind.rq_jobfile.rq_sequence == 0) oflag = O_CREAT | O_WRONLY | O_TRUNC; else oflag = O_CREAT | O_WRONLY | O_APPEND; pj = locate_new_job(preq->rq_conn, NULL); if (pj == NULL) pj = mom_find_job(preq->rq_ind.rq_jobfile.rq_jobid); if (pj == NULL) { snprintf(log_buffer, 1024, "cannot find job %s for move of %s file", preq->rq_ind.rq_jobfile.rq_jobid, TJobFileType[jft]); log_err(-1, __func__, log_buffer); req_reject(PBSE_UNKJOBID, 0, preq, NULL, NULL); return; } bool good; good = check_pwd(pj); if ((pj->ji_grpcache == NULL) && (good == false)) { req_reject(PBSE_UNKJOBID, 0, preq, NULL, NULL); return; } /* check_pwd allocated pwd and getpwnam_ext is going to allocate another one. Free pwd first */ if ((pwd = getpwnam_ext(&buf, pj->ji_wattr[JOB_ATR_euser].at_val.at_str)) == NULL) { /* FAILURE */ req_reject(PBSE_MOMREJECT, 0, preq, NULL, "password lookup failed"); return; } if ((fds = open_std_file(pj, jft, oflag, pwd->pw_gid)) < 0) { int keeping = 1; char *path = std_file_name(pj, jft, &keeping); snprintf(log_buffer,sizeof(log_buffer), "Cannot create file %s", path); req_reject(PBSE_SYSTEM, 0, preq, NULL, log_buffer); if (pwd) { free_pwnam(pwd, buf); } return; } if (pwd) { free_pwnam(pwd, buf); } if (write_ac_socket( fds, preq->rq_ind.rq_jobfile.rq_data, preq->rq_ind.rq_jobfile.rq_size) != preq->rq_ind.rq_jobfile.rq_size) { req_reject(PBSE_SYSTEM, 0, preq, NULL, "cannot create file"); } else { if (LOGLEVEL >= 6) { sprintf(log_buffer, "successfully moved %s file for job '%s'", TJobFileType[jft], preq->rq_ind.rq_jobfile.rq_jobid); log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pj->ji_qs.ji_jobid, log_buffer); } reply_ack(preq); } close(fds); return; } /* END req_mvjobfile() */