void req_commit( struct batch_request *preq) /* I */ { job *pj; pj = locate_new_job(preq->rq_conn, preq->rq_ind.rq_commit); if (LOGLEVEL >= 6) { log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL", "committing job"); } if (pj == NULL) { req_reject(PBSE_UNKJOBID, 0, preq, NULL, NULL); return; } if (pj->ji_qs.ji_substate != JOB_SUBSTATE_TRANSICM) { log_err(errno, "req_commit", "cannot commit job in unexpected state"); req_reject(PBSE_IVALREQ, 0, preq, NULL, NULL); return; } /* move job from new job list to "all" job list, set to running state */ delete_link(&pj->ji_alljobs); append_link(&svr_alljobs, &pj->ji_alljobs, pj); /* ** Set JOB_SVFLG_HERE to indicate that this is Mother Superior. */ pj->ji_qs.ji_svrflags |= JOB_SVFLG_HERE; pj->ji_qs.ji_state = JOB_STATE_RUNNING; pj->ji_qs.ji_substate = JOB_SUBSTATE_PRERUN; pj->ji_qs.ji_un_type = JOB_UNION_TYPE_MOM; pj->ji_qs.ji_un.ji_momt.ji_svraddr = get_connectaddr(preq->rq_conn); pj->ji_qs.ji_un.ji_momt.ji_exitstat = 0; /* For MOM - start up the job (blocks) */ if (LOGLEVEL >= 6) { log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL", "starting job execution"); } start_exec(pj); if (LOGLEVEL >= 6) { log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL", "job execution started"); } /* if start request fails, reply with failure string */ if (pj->ji_qs.ji_substate == JOB_SUBSTATE_EXITING) { char tmpLine[1024]; if ((pj->ji_hosts != NULL) && (pj->ji_nodekill >= 0) && (pj->ji_hosts[pj->ji_nodekill].hn_host != NULL)) { sprintf(tmpLine, "start failed on node %s", pj->ji_hosts[pj->ji_nodekill].hn_host); } else { sprintf(tmpLine, "start failed on unknown node"); } if (LOGLEVEL >= 6) { log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL", tmpLine); } reply_text(preq, 0, tmpLine); } else { reply_jobid(preq, pj->ji_qs.ji_jobid, BATCH_REPLY_CHOICE_Commit); } job_save(pj, SAVEJOB_FULL); /* NOTE: we used to flag JOB_ATR_errpath, JOB_ATR_outpath, * JOB_ATR_session_id, and JOB_ATR_altid as modified at this point to make sure * pbs_server got these attr values. This worked fine before TORQUE modified * job launched into an async process. At 2.0.0p6, a new attribute "SEND" flag * was added to handle this process. */ return; } /* END req_commit() */
void req_quejob( struct batch_request *preq) /* ptr to the decoded request */ { char *id = "req_quejob"; char basename[PBS_JOBBASE + 1]; int created_here = 0; int index; char *jid; attribute_def *pdef; job *pj; svrattrl *psatl; int rc; int sock = preq->rq_conn; int IsCheckpoint = 0; /* set basic (user) level access permission */ resc_access_perm = ATR_DFLAG_USWR | ATR_DFLAG_Creat; if (PBSNodeCheckProlog) { check_state(1); mom_server_all_update_stat(); if (internal_state & INUSE_DOWN) { req_reject(PBSE_MOMREJECT,0,preq,NULL,NULL); return; } } if (preq->rq_fromsvr) { /* from another server - accept the extra attributes */ resc_access_perm |= ATR_DFLAG_MGWR | ATR_DFLAG_SvWR | ATR_DFLAG_MOM; jid = preq->rq_ind.rq_queuejob.rq_jid; } else { /* request must be from server */ log_err(errno, id, "request not from server"); req_reject(PBSE_IVALREQ, 0, preq, NULL, "request not received from server"); return; } /* does job already exist, check both old and new jobs */ if ((pj = find_job(jid)) == NULL) { pj = (job *)GET_NEXT(svr_newjobs); while (pj != NULL) { if (!strcmp(pj->ji_qs.ji_jobid, jid)) break; pj = (job *)GET_NEXT(pj->ji_alljobs); } } /* * New job ... * * for MOM - rather than make up a hashname, we use the name sent * to us by the server as an attribute. */ psatl = (svrattrl *)GET_NEXT(preq->rq_ind.rq_queuejob.rq_attr); while (psatl != NULL) { if (!strcmp(psatl->al_name,ATTR_hashname)) { strcpy(basename,psatl->al_value); break; } psatl = (svrattrl *)GET_NEXT(psatl->al_link); } if (pj != NULL) { /* newly queued job already exists */ if (pj->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING) { /* FAILURE - job exists and is running */ log_err(errno,id,"cannot queue new job, job exists and is running"); req_reject(PBSE_JOBEXIST,0,preq,NULL,"job is running"); return; } /* if checkpointed, then keep old and skip rest of process */ if (pj->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) { IsCheckpoint = 1; } /* END if (pj->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) */ else { /* unlink job from svr_alljobs since it will be placed on newjobs */ delete_link(&pj->ji_alljobs); } } /* END if (pj != NULL) */ else { /* if not already here, allocate job struct */ if ((pj = job_alloc()) == NULL) { /* FAILURE */ req_reject(PBSE_SYSTEM, 0, preq, NULL, "cannot allocate new job structure"); return; } } /* END else (pj != NULL) */ if (IsCheckpoint == 0) { strcpy(pj->ji_qs.ji_jobid,jid); strcpy(pj->ji_qs.ji_fileprefix,basename); pj->ji_modified = 1; pj->ji_qs.ji_svrflags = created_here; pj->ji_qs.ji_un_type = JOB_UNION_TYPE_NEW; } /* decode attributes from request into job structure */ psatl = (svrattrl *)GET_NEXT(preq->rq_ind.rq_queuejob.rq_attr); while (psatl != NULL) { if (IsCheckpoint == 1) { if (strcmp(psatl->al_name,ATTR_checkpoint_name) && strcmp(psatl->al_name,ATTR_v)) { psatl = (svrattrl *)GET_NEXT(psatl->al_link); continue; } } /* identify the attribute by name */ index = find_attr(job_attr_def,psatl->al_name,JOB_ATR_LAST); if (index < 0) { /* FAILURE */ /* didn`t recognize the name */ job_purge(pj); /* CRI - 12/20/2004 */ reply_badattr(PBSE_NOATTR,1,psatl,preq); return; } pdef = &job_attr_def[index]; /* Is attribute not writeable by manager or by a server? */ if ((pdef->at_flags & resc_access_perm) == 0) { /* FAILURE */ job_purge(pj); reply_badattr(PBSE_ATTRRO,1,psatl,preq); return; } /* decode attribute */ if (!strcmp(psatl->al_name,ATTR_v)) { rc = decode_arst_merge( &pj->ji_wattr[index], psatl->al_name, psatl->al_resc, psatl->al_value); } else { rc = pdef->at_decode( &pj->ji_wattr[index], psatl->al_name, psatl->al_resc, psatl->al_value); } if (rc != 0) { /* FAILURE */ /* all errors are fatal for MOM */ job_purge(pj); reply_badattr(rc,1,psatl,preq); return; } if (psatl->al_op == DFLT) { if (psatl->al_resc) { resource *presc; resource_def *prdef; prdef = find_resc_def(svr_resc_def,psatl->al_resc,svr_resc_size); if (prdef == NULL) { job_purge(pj); reply_badattr(rc,1,psatl, preq); return; } presc = find_resc_entry(&pj->ji_wattr[index],prdef); if (presc != NULL) presc->rs_value.at_flags |= ATR_VFLAG_DEFLT; } else { pj->ji_wattr[index].at_flags |= ATR_VFLAG_DEFLT; } } /* END if (psatl->al_op == DFLT) */ psatl = (svrattrl *)GET_NEXT(psatl->al_link); } /* END while (psatl != NULL) */ if (IsCheckpoint == 1) { pj->ji_qs.ji_substate = JOB_SUBSTATE_TRANSIN; if (reply_jobid(preq,pj->ji_qs.ji_jobid,BATCH_REPLY_CHOICE_Queue) == 0) { delete_link(&pj->ji_alljobs); append_link(&svr_newjobs,&pj->ji_alljobs,pj); pj->ji_qs.ji_un_type = JOB_UNION_TYPE_NEW; pj->ji_qs.ji_un.ji_newt.ji_fromsock = sock; pj->ji_qs.ji_un.ji_newt.ji_fromaddr = get_connectaddr(sock); pj->ji_qs.ji_un.ji_newt.ji_scriptsz = 0; /* Per Eric R., req_mvjobfile was giving error in open_std_file, showed up as fishy error message */ if (pj->ji_grpcache != NULL) { free(pj->ji_grpcache); pj->ji_grpcache = NULL; } } else { close_conn(sock); } /* SUCCESS */ return; } /* set remaining job structure elements */ pj->ji_qs.ji_state = JOB_STATE_TRANSIT; pj->ji_qs.ji_substate = JOB_SUBSTATE_TRANSIN; pj->ji_wattr[(int)JOB_ATR_mtime].at_val.at_long = (long)time_now; pj->ji_wattr[(int)JOB_ATR_mtime].at_flags |= ATR_VFLAG_SET; pj->ji_qs.ji_un_type = JOB_UNION_TYPE_NEW; pj->ji_qs.ji_un.ji_newt.ji_fromsock = sock; pj->ji_qs.ji_un.ji_newt.ji_fromaddr = get_connectaddr(sock); pj->ji_qs.ji_un.ji_newt.ji_scriptsz = 0; /* acknowledge the request with the job id */ if (reply_jobid(preq, pj->ji_qs.ji_jobid, BATCH_REPLY_CHOICE_Queue) != 0) { /* reply failed, purge the job and close the connection */ close_conn(sock); job_purge(pj); return; } /* link job into server's new jobs list request */ append_link(&svr_newjobs, &pj->ji_alljobs, pj); return; } /* END req_quejob() */
void req_rdytocommit( struct batch_request *preq) /* I */ { job *pj; int sock = preq->rq_conn; int OrigState; int OrigSState; char OrigSChar; long OrigFlags; pj = locate_new_job(sock, preq->rq_ind.rq_rdytocommit); if (LOGLEVEL >= 6) { log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL", "ready to commit job"); } if (pj == NULL) { log_err(errno, "req_rdytocommit", "unknown job id"); req_reject(PBSE_UNKJOBID, 0, preq, NULL, NULL); /* FAILURE */ return; } if (pj->ji_qs.ji_substate != JOB_SUBSTATE_TRANSIN) { log_err(errno, "req_rdytocommit", "cannot commit job in unexpected state"); req_reject(PBSE_IVALREQ, 0, preq, NULL, NULL); /* FAILURE */ return; } OrigState = pj->ji_qs.ji_state; OrigSState = pj->ji_qs.ji_substate; OrigSChar = pj->ji_wattr[(int)JOB_ATR_state].at_val.at_char; OrigFlags = pj->ji_wattr[(int)JOB_ATR_state].at_flags; pj->ji_qs.ji_state = JOB_STATE_TRANSIT; pj->ji_qs.ji_substate = JOB_SUBSTATE_TRANSICM; pj->ji_wattr[(int)JOB_ATR_state].at_val.at_char = 'T'; pj->ji_wattr[(int)JOB_ATR_state].at_flags |= ATR_VFLAG_SET; if (job_save(pj, SAVEJOB_NEW) == -1) { char tmpLine[1024]; sprintf(tmpLine, "cannot save job - errno=%d - %s", errno, strerror(errno)); log_err(errno, "req_rdytocommit", tmpLine); /* commit failed, backoff state changes */ pj->ji_qs.ji_state = OrigState; pj->ji_qs.ji_substate = OrigSState; pj->ji_wattr[(int)JOB_ATR_state].at_val.at_char = OrigSChar; pj->ji_wattr[(int)JOB_ATR_state].at_flags = OrigFlags; req_reject(PBSE_SYSTEM, 0, preq, NULL, tmpLine); /* FAILURE */ return; } /* acknowledge the request with the job id */ if (reply_jobid(preq, pj->ji_qs.ji_jobid, BATCH_REPLY_CHOICE_RdytoCom) != 0) { /* reply failed, purge the job and close the connection */ sprintf(log_buffer, "cannot report jobid - errno=%d - %s", errno, strerror(errno)); log_err(errno, "req_rdytocommit", log_buffer); close_conn(sock); job_purge(pj); /* FAILURE */ return; } if (LOGLEVEL >= 6) { log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL", "ready to commit job completed"); } return; } /* END req_rdytocommit() */
void mom_req_quejob( batch_request *preq) /* ptr to the decoded request */ { char basename[PBS_JOBBASE + 1]; int created_here = 0; int index; char *jid; attribute_def *pdef; job *pj; svrattrl *psatl; int rc; int sock = preq->rq_conn; int IsCheckpoint = 0; /* set basic (user) level access permission */ int resc_access_perm = ATR_DFLAG_USWR | ATR_DFLAG_Creat; memset(basename, 0, sizeof(basename)); if (PBSNodeCheckProlog) { check_state(1); if (internal_state & INUSE_DOWN) { req_reject(PBSE_BADMOMSTATE, 0, preq, NULL, NULL); return; } } if (reject_job_submit == TRUE) { req_reject(-1, 0, preq, NULL, "This mom is configured not to run jobs"); return; } if (preq->rq_fromsvr) { /* from another server - accept the extra attributes */ resc_access_perm |= ATR_DFLAG_MGWR | ATR_DFLAG_SvWR | ATR_DFLAG_MOM; jid = preq->rq_ind.rq_queuejob.rq_jid; } else { /* request must be from server */ log_err(errno, __func__, (char *)"request not from server"); req_reject(PBSE_IVALREQ, 0, preq, NULL, "request not received from server"); return; } /* does job already exist, check both old and new jobs */ if ((pj = mom_find_job(jid)) == NULL) { pj = (job *)GET_NEXT(svr_newjobs); while (pj != NULL) { if (!strcmp(pj->ji_qs.ji_jobid, jid)) break; pj = (job *)GET_NEXT(pj->ji_alljobs); } } /* * New job ... * * for MOM - rather than make up a hashname, we use the name sent * to us by the server as an pbs_attribute. */ psatl = (svrattrl *)GET_NEXT(preq->rq_ind.rq_queuejob.rq_attr); while (psatl != NULL) { if (!strcmp(psatl->al_name,ATTR_hashname)) { snprintf(basename, sizeof(basename), "%s", psatl->al_value); break; } psatl = (svrattrl *)GET_NEXT(psatl->al_link); } if (basename[0] == '\0') snprintf(basename, sizeof(basename), "%s", jid); if (pj != NULL) { /* newly queued job already exists */ if (pj->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING) { /* FAILURE - job exists and is running */ log_err(errno, __func__, (char *)"cannot queue new job, job exists and is running"); req_reject(PBSE_JOBEXIST, 0, preq, NULL, "job is running"); return; } /* if checkpointed, then keep old and skip rest of process */ if (pj->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) { IsCheckpoint = 1; } /* END if (pj->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) */ else { /* reject the job. It is already working here. */ sprintf(log_buffer, "Job already exists. State: %d substate: %d", pj->ji_qs.ji_state, pj->ji_qs.ji_substate); log_err(-1, __func__, log_buffer); sprintf(log_buffer, "Job %s already on mom", pj->ji_qs.ji_jobid); req_reject(PBSE_JOBEXIST, 0, preq, NULL, log_buffer); return; } } /* END if (pj != NULL) */ else { /* if not already here, allocate job struct */ if ((pj = job_alloc()) == NULL) { /* FAILURE */ req_reject(PBSE_MEM_MALLOC, 0, preq, NULL, "cannot allocate new job structure"); return; } } /* END else (pj != NULL) */ if (IsCheckpoint == 0) { strcpy(pj->ji_qs.ji_jobid,jid); strcpy(pj->ji_qs.ji_fileprefix,basename); pj->ji_modified = 1; pj->ji_qs.ji_svrflags = created_here; pj->ji_qs.ji_un_type = JOB_UNION_TYPE_NEW; /* changing the union type overwrites the euid for the job, and if * ji_grpcache is set this potentially allows jobs to run as root. Unsetting * ji_grpcache fixes this problem --dbeer */ if (pj->ji_grpcache != NULL) { free(pj->ji_grpcache); pj->ji_grpcache = NULL; } } /* decode attributes from request into job structure */ psatl = (svrattrl *)GET_NEXT(preq->rq_ind.rq_queuejob.rq_attr); while (psatl != NULL) { if (IsCheckpoint == 1) { if (strcmp(psatl->al_name,ATTR_checkpoint_name) && strcmp(psatl->al_name,ATTR_v)) { psatl = (svrattrl *)GET_NEXT(psatl->al_link); continue; } } /* identify the pbs_attribute by name */ index = find_attr(job_attr_def,psatl->al_name,JOB_ATR_LAST); if (index < 0) { /* FAILURE */ /* didn`t recognize the name */ mom_job_purge(pj); /* CRI - 12/20/2004 */ reply_badattr(PBSE_NOATTR, 1, psatl, preq); return; } pdef = &job_attr_def[index]; /* Is pbs_attribute not writeable by manager or by a server? */ if ((pdef->at_flags & resc_access_perm) == 0) { /* FAILURE */ mom_job_purge(pj); reply_badattr(PBSE_ATTRRO, 1, psatl, preq); return; } /* decode pbs_attribute */ if (!strcmp(psatl->al_name,ATTR_v)) { rc = decode_arst_merge( &pj->ji_wattr[index], psatl->al_name, psatl->al_resc, psatl->al_value); } else { rc = pdef->at_decode( &pj->ji_wattr[index], psatl->al_name, psatl->al_resc, psatl->al_value, resc_access_perm); } if (rc != 0) { /* FAILURE */ /* all errors are fatal for MOM */ mom_job_purge(pj); reply_badattr(rc, 1, psatl, preq); return; } if (psatl->al_op == DFLT) { if (psatl->al_resc) { resource *presc; resource_def *prdef; prdef = find_resc_def(svr_resc_def,psatl->al_resc,svr_resc_size); if (prdef == NULL) { mom_job_purge(pj); reply_badattr(rc, 1, psatl, preq); return; } presc = find_resc_entry(&pj->ji_wattr[index],prdef); if (presc != NULL) presc->rs_value.at_flags |= ATR_VFLAG_DEFLT; } else { pj->ji_wattr[index].at_flags |= ATR_VFLAG_DEFLT; } } /* END if (psatl->al_op == DFLT) */ psatl = (svrattrl *)GET_NEXT(psatl->al_link); } /* END while (psatl != NULL) */ if (IsCheckpoint == 1) { pj->ji_qs.ji_substate = JOB_SUBSTATE_TRANSIN; if (reply_jobid(preq,pj->ji_qs.ji_jobid,BATCH_REPLY_CHOICE_Queue) == 0) { remove_from_job_list(pj); append_link(&svr_newjobs,&pj->ji_alljobs,pj); if (pj->ji_grpcache != NULL) { free(pj->ji_grpcache); pj->ji_grpcache = NULL; } pj->ji_qs.ji_un_type = JOB_UNION_TYPE_NEW; pj->ji_qs.ji_un.ji_newt.ji_fromsock = sock; pj->ji_qs.ji_un.ji_newt.ji_fromaddr = get_connectaddr(sock,FALSE); pj->ji_qs.ji_un.ji_newt.ji_scriptsz = 0; /* Per Eric R., req_mvjobfile was giving error in open_std_file, showed up as fishy error message */ if (pj->ji_grpcache != NULL) { free(pj->ji_grpcache); pj->ji_grpcache = NULL; } } else { close_conn(sock, FALSE); } /* SUCCESS */ return; } /* set remaining job structure elements */ pj->ji_qs.ji_state = JOB_STATE_TRANSIT; pj->ji_qs.ji_substate = JOB_SUBSTATE_TRANSIN; pj->ji_wattr[JOB_ATR_mtime].at_val.at_long = (long)time_now; pj->ji_wattr[JOB_ATR_mtime].at_flags |= ATR_VFLAG_SET; if (pj->ji_grpcache != NULL) { free(pj->ji_grpcache); pj->ji_grpcache = NULL; } pj->ji_qs.ji_un_type = JOB_UNION_TYPE_NEW; pj->ji_qs.ji_un.ji_newt.ji_fromsock = sock; pj->ji_qs.ji_un.ji_newt.ji_fromaddr = get_connectaddr(sock,FALSE); pj->ji_qs.ji_un.ji_newt.ji_scriptsz = 0; /* acknowledge the request with the job id */ if (reply_jobid(preq, pj->ji_qs.ji_jobid, BATCH_REPLY_CHOICE_Queue) != 0) { /* reply failed, purge the job and close the connection */ // call mom_job_purge first so that double-frees don't happen // when the on_close function is called mom_job_purge(pj); close_conn(sock, FALSE); return; } /* link job into server's new jobs list request */ append_link(&svr_newjobs, &pj->ji_alljobs, pj); return; } /* END mom_req_quejob() */