void ensure_deleted( struct work_task *ptask) /* I */ { struct batch_request *preq; job *pjob; preq = ptask->wt_parm1; if ((pjob = find_job(preq->rq_ind.rq_delete.rq_objname)) == NULL) { /* job doesn't exist, we're done */ return; } sprintf(log_buffer, "purging job without checking MOM"); log_event( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer); free_nodes(pjob); if (pjob->ji_qhdr->qu_qs.qu_type == QTYPE_Execution) { set_resc_assigned(pjob, DECR); } job_purge(pjob); } /* END ensure_deleted() */
static void close_quejob( int sfds) { job *pjob; job *npjob; pjob = (job *)GET_NEXT(svr_newjobs); while (pjob != NULL) { npjob = GET_NEXT(pjob->ji_alljobs); if (pjob->ji_qs.ji_un.ji_newt.ji_fromsock == sfds) { if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_TRANSICM) { #ifndef PBS_MOM if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_HERE) { /* * the job was being created here for the first time * go ahead and enqueue it as QUEUED; otherwise, hold * it here as TRANSICM until we hear from the sending * server again to commit. */ delete_link(&pjob->ji_alljobs); pjob->ji_qs.ji_state = JOB_STATE_QUEUED; pjob->ji_qs.ji_substate = JOB_SUBSTATE_QUEUED; if (svr_enquejob(pjob)) job_abt(&pjob, msg_err_noqueue); } #endif /* PBS_MOM */ } else { /* else delete the job */ delete_link(&pjob->ji_alljobs); job_purge(pjob); } break; } /* END if (..) */ pjob = npjob; } return; } /* END close_quejob() */
/*
 * array_delete - delete a job array struct from memory and disk.
 *
 * This is used when the number of jobs that belong to the array becomes zero.
 * The array is unlinked from the server's list of all arrays, its on-disk
 * copy is removed, its request tokens and job pointer table are freed, the
 * template job (if any) is purged, and finally the struct itself is freed.
 * The caller must not touch pa after this returns.
 *
 * A failure to remove the on-disk file is logged and otherwise ignored.
 *
 * returns zero (always)
 */
int array_delete(job_array *pa)
{
  char                path[MAXPATHLEN + 1];
  array_request_node *rn;
  int                 len;

  /* first thing to do is take this out of the servers list of all arrays */
  delete_link(&pa->all_arrays);

  /* delete the on disk copy of the struct; the path is built with a bounded
   * format so an over-long prefix cannot overrun the stack buffer */
  len = snprintf(path, sizeof(path), "%s%s%s",
                 path_arrays,
                 pa->ai_qs.fileprefix,
                 ARRAY_FILE_SUFFIX);

  if ((len < 0) || ((size_t)len >= sizeof(path))) {
    log_err(-1, "array_delete", "array file path too long, on disk copy not deleted");
  } else if (unlink(path)) {
    /* capture errno before any other library call can overwrite it */
    int unlink_errno = errno;

    sprintf(log_buffer, "unable to delete %s", path);
    log_err(unlink_errno, "array_delete", log_buffer);
  }

  /* clear array request linked list; always re-read the head because the
   * node just examined has been unlinked and freed */
  for (rn = (array_request_node *)GET_NEXT(pa->request_tokens);
       rn != NULL;
       rn = (array_request_node *)GET_NEXT(pa->request_tokens)) {
    delete_link(&rn->request_tokens_link);
    free(rn);
  }

  /* free the memory for the job pointers */
  free(pa->jobs);

  /* purge the "template" job, this also deletes the shared script file for the array*/
  if (pa->template_job) {
    job_purge(pa->template_job);
  }

  /* free the memory allocated for the struct */
  free(pa);

  return 0;
}
/**
 * @brief
 *	force_reque - requeue (rerun) a job
 *
 * @par
 *	Marks the job modified, invalidates its MOM handle and protocol,
 *	releases its resources and writes a rerun accounting record.  A subjob
 *	is then put in substate RERUN3 and purged.  Any other job has its
 *	suspend/stage-in/checkpoint flags and execution-related attributes
 *	cleared and its state re-evaluated.
 *
 * @param[in,out]	pjob - job which needs to be rerun
 */
void
force_reque(job *pjob)
{
	/* attributes that no longer apply once the job is back in the queue, */
	/* released in this order; job dir has no meaning for re-queued jobs  */
	const int stale_attrs[] = {
		(int)JOB_ATR_exec_host,
		(int)JOB_ATR_exec_host2,
		(int)JOB_ATR_exec_vnode,
		(int)JOB_ATR_pset,
		(int)JOB_ATR_jobdir
	};
	size_t	k;
	int	state_out;
	int	substate_out;

	pjob->ji_modified = 1;
	pjob->ji_momhandle = -1;
	pjob->ji_mom_prot = PROT_INVALID;

	/* simulate rerun: free nodes, clear checkpoint flag, and */
	/* clear exec_vnode string				  */
	rel_resc(pjob);

	/* note in accounting file */
	account_jobend(pjob, pjob->ji_acctrec, PBS_ACCT_RERUN);

	/* if a subjob, we set substate to RERUN3 to cause trktbl entry */
	/* to be reset to Queued, and then blow away the job struct	*/
	if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_SubJob) {
		pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN3;
		job_purge(pjob);
		return;
	}

	/*
	 * Clear any JOB_SVFLG_Actsuspd flag too, as the job is no longer
	 * suspended (User busy).  A suspended job is rerun in case of a
	 * MOM failure after the workstation becomes active(busy).
	 */
	pjob->ji_qs.ji_svrflags &=
		~(JOB_SVFLG_Actsuspd | JOB_SVFLG_StagedIn | JOB_SVFLG_CHKPT);

	for (k = 0; k < sizeof(stale_attrs) / sizeof(stale_attrs[0]); k++) {
		int which = stale_attrs[k];

		job_attr_def[which].at_free(&pjob->ji_wattr[which]);
	}

	svr_evaljobstate(pjob, &state_out, &substate_out, 1);
	(void)svr_setjobstate(pjob, state_out, substate_out);
}
void req_rdytocommit( struct batch_request *preq) /* I */ { job *pj; int sock = preq->rq_conn; int OrigState; int OrigSState; char OrigSChar; long OrigFlags; pj = locate_new_job(sock, preq->rq_ind.rq_rdytocommit); if (LOGLEVEL >= 6) { log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL", "ready to commit job"); } if (pj == NULL) { log_err(errno, "req_rdytocommit", "unknown job id"); req_reject(PBSE_UNKJOBID, 0, preq, NULL, NULL); /* FAILURE */ return; } if (pj->ji_qs.ji_substate != JOB_SUBSTATE_TRANSIN) { log_err(errno, "req_rdytocommit", "cannot commit job in unexpected state"); req_reject(PBSE_IVALREQ, 0, preq, NULL, NULL); /* FAILURE */ return; } OrigState = pj->ji_qs.ji_state; OrigSState = pj->ji_qs.ji_substate; OrigSChar = pj->ji_wattr[(int)JOB_ATR_state].at_val.at_char; OrigFlags = pj->ji_wattr[(int)JOB_ATR_state].at_flags; pj->ji_qs.ji_state = JOB_STATE_TRANSIT; pj->ji_qs.ji_substate = JOB_SUBSTATE_TRANSICM; pj->ji_wattr[(int)JOB_ATR_state].at_val.at_char = 'T'; pj->ji_wattr[(int)JOB_ATR_state].at_flags |= ATR_VFLAG_SET; if (job_save(pj, SAVEJOB_NEW) == -1) { char tmpLine[1024]; sprintf(tmpLine, "cannot save job - errno=%d - %s", errno, strerror(errno)); log_err(errno, "req_rdytocommit", tmpLine); /* commit failed, backoff state changes */ pj->ji_qs.ji_state = OrigState; pj->ji_qs.ji_substate = OrigSState; pj->ji_wattr[(int)JOB_ATR_state].at_val.at_char = OrigSChar; pj->ji_wattr[(int)JOB_ATR_state].at_flags = OrigFlags; req_reject(PBSE_SYSTEM, 0, preq, NULL, tmpLine); /* FAILURE */ return; } /* acknowledge the request with the job id */ if (reply_jobid(preq, pj->ji_qs.ji_jobid, BATCH_REPLY_CHOICE_RdytoCom) != 0) { /* reply failed, purge the job and close the connection */ sprintf(log_buffer, "cannot report jobid - errno=%d - %s", errno, strerror(errno)); log_err(errno, "req_rdytocommit", log_buffer); close_conn(sock); job_purge(pj); /* FAILURE */ return; } if (LOGLEVEL >= 6) { log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, 
(pj != NULL) ? pj->ji_qs.ji_jobid : "NULL", "ready to commit job completed"); } return; } /* END req_rdytocommit() */
void req_quejob( struct batch_request *preq) /* ptr to the decoded request */ { char *id = "req_quejob"; char basename[PBS_JOBBASE + 1]; int created_here = 0; int index; char *jid; attribute_def *pdef; job *pj; svrattrl *psatl; int rc; int sock = preq->rq_conn; int IsCheckpoint = 0; /* set basic (user) level access permission */ resc_access_perm = ATR_DFLAG_USWR | ATR_DFLAG_Creat; if (PBSNodeCheckProlog) { check_state(1); mom_server_all_update_stat(); if (internal_state & INUSE_DOWN) { req_reject(PBSE_MOMREJECT,0,preq,NULL,NULL); return; } } if (preq->rq_fromsvr) { /* from another server - accept the extra attributes */ resc_access_perm |= ATR_DFLAG_MGWR | ATR_DFLAG_SvWR | ATR_DFLAG_MOM; jid = preq->rq_ind.rq_queuejob.rq_jid; } else { /* request must be from server */ log_err(errno, id, "request not from server"); req_reject(PBSE_IVALREQ, 0, preq, NULL, "request not received from server"); return; } /* does job already exist, check both old and new jobs */ if ((pj = find_job(jid)) == NULL) { pj = (job *)GET_NEXT(svr_newjobs); while (pj != NULL) { if (!strcmp(pj->ji_qs.ji_jobid, jid)) break; pj = (job *)GET_NEXT(pj->ji_alljobs); } } /* * New job ... * * for MOM - rather than make up a hashname, we use the name sent * to us by the server as an attribute. 
*/ psatl = (svrattrl *)GET_NEXT(preq->rq_ind.rq_queuejob.rq_attr); while (psatl != NULL) { if (!strcmp(psatl->al_name,ATTR_hashname)) { strcpy(basename,psatl->al_value); break; } psatl = (svrattrl *)GET_NEXT(psatl->al_link); } if (pj != NULL) { /* newly queued job already exists */ if (pj->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING) { /* FAILURE - job exists and is running */ log_err(errno,id,"cannot queue new job, job exists and is running"); req_reject(PBSE_JOBEXIST,0,preq,NULL,"job is running"); return; } /* if checkpointed, then keep old and skip rest of process */ if (pj->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) { IsCheckpoint = 1; } /* END if (pj->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) */ else { /* unlink job from svr_alljobs since it will be placed on newjobs */ delete_link(&pj->ji_alljobs); } } /* END if (pj != NULL) */ else { /* if not already here, allocate job struct */ if ((pj = job_alloc()) == NULL) { /* FAILURE */ req_reject(PBSE_SYSTEM, 0, preq, NULL, "cannot allocate new job structure"); return; } } /* END else (pj != NULL) */ if (IsCheckpoint == 0) { strcpy(pj->ji_qs.ji_jobid,jid); strcpy(pj->ji_qs.ji_fileprefix,basename); pj->ji_modified = 1; pj->ji_qs.ji_svrflags = created_here; pj->ji_qs.ji_un_type = JOB_UNION_TYPE_NEW; } /* decode attributes from request into job structure */ psatl = (svrattrl *)GET_NEXT(preq->rq_ind.rq_queuejob.rq_attr); while (psatl != NULL) { if (IsCheckpoint == 1) { if (strcmp(psatl->al_name,ATTR_checkpoint_name) && strcmp(psatl->al_name,ATTR_v)) { psatl = (svrattrl *)GET_NEXT(psatl->al_link); continue; } } /* identify the attribute by name */ index = find_attr(job_attr_def,psatl->al_name,JOB_ATR_LAST); if (index < 0) { /* FAILURE */ /* didn`t recognize the name */ job_purge(pj); /* CRI - 12/20/2004 */ reply_badattr(PBSE_NOATTR,1,psatl,preq); return; } pdef = &job_attr_def[index]; /* Is attribute not writeable by manager or by a server? 
*/ if ((pdef->at_flags & resc_access_perm) == 0) { /* FAILURE */ job_purge(pj); reply_badattr(PBSE_ATTRRO,1,psatl,preq); return; } /* decode attribute */ if (!strcmp(psatl->al_name,ATTR_v)) { rc = decode_arst_merge( &pj->ji_wattr[index], psatl->al_name, psatl->al_resc, psatl->al_value); } else { rc = pdef->at_decode( &pj->ji_wattr[index], psatl->al_name, psatl->al_resc, psatl->al_value); } if (rc != 0) { /* FAILURE */ /* all errors are fatal for MOM */ job_purge(pj); reply_badattr(rc,1,psatl,preq); return; } if (psatl->al_op == DFLT) { if (psatl->al_resc) { resource *presc; resource_def *prdef; prdef = find_resc_def(svr_resc_def,psatl->al_resc,svr_resc_size); if (prdef == NULL) { job_purge(pj); reply_badattr(rc,1,psatl, preq); return; } presc = find_resc_entry(&pj->ji_wattr[index],prdef); if (presc != NULL) presc->rs_value.at_flags |= ATR_VFLAG_DEFLT; } else { pj->ji_wattr[index].at_flags |= ATR_VFLAG_DEFLT; } } /* END if (psatl->al_op == DFLT) */ psatl = (svrattrl *)GET_NEXT(psatl->al_link); } /* END while (psatl != NULL) */ if (IsCheckpoint == 1) { pj->ji_qs.ji_substate = JOB_SUBSTATE_TRANSIN; if (reply_jobid(preq,pj->ji_qs.ji_jobid,BATCH_REPLY_CHOICE_Queue) == 0) { delete_link(&pj->ji_alljobs); append_link(&svr_newjobs,&pj->ji_alljobs,pj); pj->ji_qs.ji_un_type = JOB_UNION_TYPE_NEW; pj->ji_qs.ji_un.ji_newt.ji_fromsock = sock; pj->ji_qs.ji_un.ji_newt.ji_fromaddr = get_connectaddr(sock); pj->ji_qs.ji_un.ji_newt.ji_scriptsz = 0; /* Per Eric R., req_mvjobfile was giving error in open_std_file, showed up as fishy error message */ if (pj->ji_grpcache != NULL) { free(pj->ji_grpcache); pj->ji_grpcache = NULL; } } else { close_conn(sock); } /* SUCCESS */ return; } /* set remaining job structure elements */ pj->ji_qs.ji_state = JOB_STATE_TRANSIT; pj->ji_qs.ji_substate = JOB_SUBSTATE_TRANSIN; pj->ji_wattr[(int)JOB_ATR_mtime].at_val.at_long = (long)time_now; pj->ji_wattr[(int)JOB_ATR_mtime].at_flags |= ATR_VFLAG_SET; pj->ji_qs.ji_un_type = JOB_UNION_TYPE_NEW; 
pj->ji_qs.ji_un.ji_newt.ji_fromsock = sock; pj->ji_qs.ji_un.ji_newt.ji_fromaddr = get_connectaddr(sock); pj->ji_qs.ji_un.ji_newt.ji_scriptsz = 0; /* acknowledge the request with the job id */ if (reply_jobid(preq, pj->ji_qs.ji_jobid, BATCH_REPLY_CHOICE_Queue) != 0) { /* reply failed, purge the job and close the connection */ close_conn(sock); job_purge(pj); return; } /* link job into server's new jobs list request */ append_link(&svr_newjobs, &pj->ji_alljobs, pj); return; } /* END req_quejob() */
/*
 * setup_array_struct - build the job_array bookkeeping for a template job.
 *
 * Allocates a job_array for pjob, links it into svr_jobarrays, saves the
 * template job, applies the slot limit, parses the array request into
 * request tokens, sizes and allocates the table of job pointers and saves
 * the array.
 *
 * Every failure path disposes of pjob (directly, or through array_delete()
 * purging the array's template job), so the caller must not use pjob after a
 * non-zero return.
 *
 * returns 0                   on success
 *         1                   if the job could not be saved or memory could
 *                             not be allocated
 *         2                   if the array request contained bad tokens
 *         INVALID_SLOT_LIMIT  if set_slot_limit() rejected the request
 *         ARRAY_TOO_LARGE     if the number of jobs exceeds the server's
 *                             max array size
 */
int setup_array_struct(job *pjob)
{
  job_array          *pa;
  array_request_node *rn;

  int                 bad_token_count;
  int                 array_size;
  int                 rc;

  /* setup a link to this job array in the servers all_arrays list */
  pa = (job_array *)calloc(1,sizeof(job_array));

  if (pa == NULL) {
    /* log while the job id is still valid, then drop the job like the */
    /* other failure paths do                                          */
    log_event(PBSEVENT_SYSTEM,
              PBS_EVENTCLASS_JOB,
              pjob->ji_qs.ji_jobid,
              "cannot allocate memory for job array");

    job_purge(pjob);

    return 1;
  }

  pa->ai_qs.struct_version = ARRAY_QS_STRUCT_VERSION;

  pa->template_job = pjob;

  strcpy(pa->ai_qs.parent_id, pjob->ji_qs.ji_jobid);
  strcpy(pa->ai_qs.fileprefix, pjob->ji_qs.ji_fileprefix);

  /* NOTE(review): strncpy does not terminate on truncation; this relies on */
  /* the calloc'ed destination being larger than the count - confirm sizes  */
  strncpy(pa->ai_qs.owner,
          pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str,
          PBS_MAXUSER + PBS_MAXSERVERNAME + 2);
  strncpy(pa->ai_qs.submit_host,
          get_variable(pjob, pbs_o_host),
          PBS_MAXSERVERNAME);

  pa->ai_qs.num_cloned = 0;

  CLEAR_LINK(pa->all_arrays);
  CLEAR_HEAD(pa->request_tokens);

  append_link(&svr_jobarrays, &pa->all_arrays, (void*)pa);

  if (job_save(pjob, SAVEJOB_FULL, 0) != 0) {
    /* the job id must be read before the job is purged */
    if (LOGLEVEL >= 6) {
      log_record(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        "cannot save job");
    }

    job_purge(pjob);

    /* nothing else hangs off pa yet: take it off the server list and  */
    /* release it so no entry with a dangling template_job is left     */
    delete_link(&pa->all_arrays);
    free(pa);

    return 1;
  }

  if ((rc = set_slot_limit(pjob->ji_wattr[JOB_ATR_job_array_request].at_val.at_str, pa))) {
    /* array_delete() frees pa, so report the problem first */
    snprintf(log_buffer,sizeof(log_buffer),
             "Array %s requested a slot limit above the max limit %ld, rejecting\n",
             pa->ai_qs.parent_id,
             server.sv_attr[SRV_ATR_MaxSlotLimit].at_val.at_long);

    log_event(PBSEVENT_SYSTEM,
              PBS_EVENTCLASS_JOB,
              pa->ai_qs.parent_id,
              log_buffer);

    array_delete(pa);

    return(INVALID_SLOT_LIMIT);
  }

  pa->ai_qs.jobs_running   = 0;
  pa->ai_qs.num_started    = 0;
  pa->ai_qs.num_failed     = 0;
  pa->ai_qs.num_successful = 0;

  bad_token_count = parse_array_request(
                      pjob->ji_wattr[JOB_ATR_job_array_request].at_val.at_str,
                      &(pa->request_tokens));

  /* get the number of elements that should be allocated in the array */
  rn = (array_request_node *)GET_NEXT(pa->request_tokens);

  array_size = 0;

  pa->ai_qs.num_jobs = 0;

  while (rn != NULL) {
    if (rn->end > array_size)
      array_size = rn->end;

    /* calculate the actual number of jobs (different from array size) */
    pa->ai_qs.num_jobs += rn->end - rn->start + 1;

    rn = (array_request_node *)GET_NEXT(rn->request_tokens_link);
  }

  /* size of array is the biggest index + 1 */
  array_size++;

  if (server.sv_attr[SRV_ATR_MaxArraySize].at_flags & ATR_VFLAG_SET) {
    int max_array_size = server.sv_attr[SRV_ATR_MaxArraySize].at_val.at_long;

    if (max_array_size < pa->ai_qs.num_jobs) {
      array_delete(pa);

      return(ARRAY_TOO_LARGE);
    }
  }

  /* initialize the array; calloc zeroes the table and checks the size */
  /* multiplication                                                    */
  pa->jobs = calloc((size_t)array_size, sizeof(job *));

  if (pa->jobs == NULL) {
    array_delete(pa);

    return 1;
  }

  /* remember array_size */
  pa->ai_qs.array_size = array_size;

  CLEAR_HEAD(pa->ai_qs.deps);

  array_save(pa);

  if (bad_token_count > 0) {
    array_delete(pa);

    return 2;
  }

  return 0;
}
/**
 * @brief
 *	create_subjob - create a Subjob from the parent Array Job
 *	Certain attributes are changed or left out
 *
 * @par
 *	The fixed save area of the parent is copied, the attributes listed in
 *	attrs_to_copy are duplicated by encoding and decoding them, the array
 *	id / array index attributes are set, the subjob is put in the Queued
 *	state and enqueued.
 *
 * @param[in]	parent - pointer to parent Job
 * @param[in]	newjid - new job id
 * @param[out]	rc     - return code: PBSE_NONE on success, otherwise
 *			 PBSE_IVALREQ (parent is not an array job, newjid has
 *			 no index, or enqueueing failed), PBSE_UNKJOBID (index
 *			 not part of the array), PBSE_BADSTATE (subjob is not
 *			 Queued in the tracking table) or PBSE_SYSTEM (no
 *			 job structure could be allocated)
 *
 * @return	pointer to new job
 * @retval	NULL - error, see *rc
 */
job *
create_subjob(job *parent, char *newjid, int *rc)
{
	pbs_list_head  attrl;
	int	   i;
	int	   j;
	int	   indx;
	char	  *index;
	attribute_def *pdef;
	attribute	  *ppar;
	attribute	  *psub;
	svrattrl  *psatl;
	job	  *subj;
	long	   eligibletime;
	long	   time_msec;
#ifdef WIN32
	struct	_timeb	tval;
#else
	struct timeval	tval;
#endif

	if ((parent->ji_qs.ji_svrflags & JOB_SVFLG_ArrayJob) == 0) {
		*rc = PBSE_IVALREQ;
		return NULL;	/* parent not an array job */
	}

	/* find and copy the index */

	if ((index = get_index_from_jid(newjid)) == NULL) {
		*rc = PBSE_IVALREQ;
		return NULL;
	}
	if ((indx = subjob_index_to_offset(parent, index)) == -1) {
		*rc = PBSE_UNKJOBID;
		return NULL;
	}
	if (parent->ji_ajtrk->tkm_tbl[indx].trk_status != JOB_STATE_QUEUED) {
		*rc = PBSE_BADSTATE;
		return NULL;
	}

	/*
	 * allocate and clear basic structure
	 * cannot copy job attributes because cannot share strings and other
	 * malloc-ed data,  so copy ji_qs as a whole and then copy the
	 * non-saved items before ji_qs.
	 */

	subj = job_alloc();
	if (subj == NULL) {
		/* nothing has been built yet, so just report the failure */
		*rc = PBSE_SYSTEM;
		return NULL;
	}
	subj->ji_qs = parent->ji_qs;	/* copy the fixed save area */

#ifdef PBS_CRED_GRIDPROXY
	subj->ji_gsscontext  = parent->ji_gsscontext;
#endif
	subj->ji_qhdr     = parent->ji_qhdr;
	subj->ji_resvp    = parent->ji_resvp;
	subj->ji_myResv   = parent->ji_myResv;
	subj->ji_parentaj = parent;
	strcpy(subj->ji_qs.ji_jobid, newjid);	/* replace job id */
	*subj->ji_qs.ji_fileprefix = '\0';
	subj->ji_subjindx = indx;

	/*
	 * now that is all done, copy the required attributes by
	 * encoding and then decoding into the new array.  Then add the
	 * subjob specific attributes.
	 */

	resc_access_perm = ATR_DFLAG_ACCESS;
	CLEAR_HEAD(attrl);
	for (i = 0; attrs_to_copy[i] != JOB_ATR_LAST; i++) {
		j    = (int)attrs_to_copy[i];
		ppar = &parent->ji_wattr[j];
		psub = &subj->ji_wattr[j];
		pdef = &job_attr_def[j];

		if (pdef->at_encode(ppar, &attrl, pdef->at_name, NULL,
			ATR_ENCODE_MOM, &psatl) > 0) {
			for (psatl = (svrattrl *)GET_NEXT(attrl); psatl;
				psatl = ((svrattrl *)GET_NEXT(psatl->al_link))) {
				pdef->at_decode(psub, psatl->al_name, psatl->al_resc,
					psatl->al_value);
			}
			/* carry forward the default bit if set */
			psub->at_flags |= (ppar->at_flags & ATR_VFLAG_DEFLT);
			free_attrlist(&attrl);
		}
	}

	psub = &subj->ji_wattr[(int)JOB_ATR_array_id];
	job_attr_def[(int)JOB_ATR_array_id].at_decode(psub, NULL, NULL,
		parent->ji_qs.ji_jobid);

	psub = &subj->ji_wattr[(int)JOB_ATR_array_index];
	job_attr_def[(int)JOB_ATR_array_index].at_decode(psub, NULL, NULL, index);

	/* Lastly, set or clear a few flags and link in the structure */

	subj->ji_qs.ji_svrflags &= ~JOB_SVFLG_ArrayJob;
	subj->ji_qs.ji_svrflags |=  JOB_SVFLG_SubJob;
	subj->ji_modified = 1;	/* ** will likely take this out ** */

	subj->ji_qs.ji_substate = JOB_SUBSTATE_TRANSICM;
	(void)svr_setjobstate(subj, JOB_STATE_QUEUED, JOB_SUBSTATE_QUEUED);
	subj->ji_wattr[(int)JOB_ATR_state].at_flags    |= ATR_VFLAG_SET;
	subj->ji_wattr[(int)JOB_ATR_substate].at_flags |= ATR_VFLAG_SET;

	/* subjob needs to borrow eligible time from parent job array.
	 * expecting only to accrue eligible_time and nothing else.
	 */
	if (server.sv_attr[(int)SRV_ATR_EligibleTimeEnable].at_val.at_long == 1) {

		eligibletime = parent->ji_wattr[(int)JOB_ATR_eligible_time].at_val.at_long;

		if (parent->ji_wattr[(int)JOB_ATR_accrue_type].at_val.at_long == JOB_ELIGIBLE)
			eligibletime += subj->ji_wattr[(int)JOB_ATR_sample_starttime].at_val.at_long - parent->ji_wattr[(int)JOB_ATR_sample_starttime].at_val.at_long;

		subj->ji_wattr[(int)JOB_ATR_eligible_time].at_val.at_long = eligibletime;
		subj->ji_wattr[(int)JOB_ATR_eligible_time].at_flags |= ATR_VFLAG_MODIFY | ATR_VFLAG_MODCACHE;

	}

	/* current time in milliseconds is used as the queue rank */
#ifdef WIN32
	_ftime_s(&tval);
	time_msec = (tval.time * 1000L) + tval.millitm;
#else
	gettimeofday(&tval, NULL);
	time_msec = (tval.tv_sec * 1000L) + (tval.tv_usec/1000L);
#endif
	/* set the queue rank attribute */
	subj->ji_wattr[(int)JOB_ATR_qrank].at_val.at_long = time_msec;
	subj->ji_wattr[(int)JOB_ATR_qrank].at_flags |= ATR_VFLAG_SET|ATR_VFLAG_MODCACHE;
	if (svr_enquejob(subj) != 0) {
		job_purge(subj);
		*rc = PBSE_IVALREQ;
		return NULL;
	}
	*rc = PBSE_NONE;
	return subj;
}
static int forced_jobpurge( struct batch_request *preq) { job *pjob; if ((pjob = find_job(preq->rq_ind.rq_delete.rq_objname)) == NULL) { log_event( PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, preq->rq_ind.rq_delete.rq_objname, pbse_to_txt(PBSE_UNKJOBID)); req_reject(PBSE_UNKJOBID, 0, preq, NULL, NULL); return(-1); } /* check about possibly purging the job */ if (preq->rq_extend != NULL) { if (!strncmp(preq->rq_extend, delpurgestr, strlen(delpurgestr))) { if (((preq->rq_perm & (ATR_DFLAG_OPRD | ATR_DFLAG_OPWR | ATR_DFLAG_MGRD | ATR_DFLAG_MGWR)) != 0) || ((svr_chk_owner(preq, pjob) == 0) && (server.sv_attr[SRV_ATR_OwnerPurge].at_val.at_long))) { sprintf(log_buffer, "purging job without checking MOM"); log_event( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer); reply_ack(preq); free_nodes(pjob); if (pjob->ji_qhdr->qu_qs.qu_type == QTYPE_Execution) { set_resc_assigned(pjob, DECR); } job_purge(pjob); return(1); } else { /* FAILURE */ req_reject(PBSE_PERM, 0, preq, NULL, NULL); return(-1); } } } return(0); } /* END forced_jobpurge() */
/** * @brief * post_movejob - clean up action for child started in net_move/send_job * to "move" a job to another server * @par * If move was successfull, delete server's copy of thejob structure, * and reply to request. * @par * If route didn't work, reject the request. * * @param[in] pwt - work task structure * * @return none. */ static void post_movejob(struct work_task *pwt) { char *id = "post_movejob"; struct batch_request *req; int newstate; int newsub; int stat; int r; job *jobp; req = (struct batch_request *)pwt->wt_parm1; stat = pwt->wt_aux; pbs_errno = PBSE_NONE; if (req->rq_type != PBS_BATCH_MoveJob) { sprintf(log_buffer, "bad request type %d", req->rq_type); log_err(-1, __func__, log_buffer); return; } jobp = find_job(req->rq_ind.rq_move.rq_jid); if ((jobp == NULL) || (jobp != (job *)pwt->wt_parm2)) { sprintf(log_buffer, "job %s not found", req->rq_ind.rq_move.rq_jid); log_err(-1, __func__, log_buffer); } if (WIFEXITED(stat)) { r = WEXITSTATUS(stat); if (r == SEND_JOB_OK) { /* purge server's job structure */ if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) remove_stagein(jobp); (void)strcpy(log_buffer, msg_movejob); (void)sprintf(log_buffer+strlen(log_buffer), msg_manager, req->rq_ind.rq_move.rq_destin, req->rq_user, req->rq_host); /* * If server is configured to keep job history info and * the job is created here, then keep the job struture * for history purpose without purging. No need to check * for sub-jobs as sub jobs can't be moved. 
*/ if (svr_chk_history_conf()) svr_setjob_histinfo(jobp, T_MOV_JOB); else job_purge(jobp); } else r = PBSE_ROUTEREJ; } else { r = PBSE_SYSTEM; (void)sprintf(log_buffer, msg_badexit, stat); (void)strcat(log_buffer, __func__); log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, LOG_NOTICE, jobp->ji_qs.ji_jobid, log_buffer); } if (r) { if (jobp) { /* force re-eval of job state out of Transit */ svr_evaljobstate(jobp, &newstate, &newsub, 1); svr_setjobstate(jobp, newstate, newsub); } req_reject(r, 0, req); } else reply_ack(req); return; }
static void post_job_delete_nanny( struct work_task *pwt) { struct batch_request *preq_sig; /* signal request to MOM */ int rc; job *pjob; preq_sig = pwt->wt_parm1; rc = preq_sig->rq_reply.brp_code; if (!server.sv_attr[SRV_ATR_JobNanny].at_val.at_long) { /* the admin disabled nanny within the last minute or so */ release_req(pwt); return; } /* extract job id from task */ pjob = find_job(preq_sig->rq_ind.rq_signal.rq_jid); if (pjob == NULL) { sprintf(log_buffer, "job delete nanny: the job disappeared (this is a BUG!)"); LOG_EVENT( PBSEVENT_ERROR, PBS_EVENTCLASS_JOB, preq_sig->rq_ind.rq_signal.rq_jid, log_buffer); } else if (rc == PBSE_UNKJOBID) { sprintf(log_buffer, "job delete nanny returned, but does not exist on mom"); LOG_EVENT( PBSEVENT_ERROR, PBS_EVENTCLASS_JOB, preq_sig->rq_ind.rq_signal.rq_jid, log_buffer); free_nodes(pjob); set_resc_assigned(pjob, DECR); job_purge(pjob); } /* free task */ release_req(pwt); return; } /* END post_job_delete_nanny() */
job * chk_job_request(char *jobid, struct batch_request *preq, int *rc) { int t; int histerr = 0; job *pjob; int deletehist = 0; char *p1; char *p2; if (preq->rq_extend && strstr(preq->rq_extend, DELETEHISTORY)) deletehist = 1; t = is_job_array(jobid); if ((t == IS_ARRAY_NO) || (t == IS_ARRAY_ArrayJob)) pjob = find_job(jobid); /* regular or ArrayJob itself */ else pjob = find_arrayparent(jobid); /* subjob(s) */ *rc = t; if (pjob == NULL) { log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_INFO, jobid, msg_unkjobid); req_reject(PBSE_UNKJOBID, 0, preq); return NULL; } else { histerr = svr_chk_histjob(pjob); if (histerr && deletehist == 0) { req_reject(histerr, 0, preq); return NULL; } if (deletehist ==1&& pjob->ji_qs.ji_state == JOB_STATE_MOVED && pjob->ji_qs.ji_substate != JOB_SUBSTATE_FINISHED) { job_purge(pjob); req_reject(PBSE_UNKJOBID, 0, preq); return NULL; } } /* * The job was found using the job ID in the request, but it may not * match exactly (i.e. FQDN vs. unqualified hostname). Overwrite the * host portion of the job ID in the request with the host portion of * the one from the server job structure. Do not modify anything * before the first dot in the job ID because it may be an array job. * This will allow find_job() to look for an exact match when the * request is serviced by MoM. 
*/ p1 = strchr(pjob->ji_qs.ji_jobid, '.'); if (p1) { p2 = strchr(jobid, '.'); if (p2) *p2 = '\0'; strncat(jobid, p1, PBS_MAXSVRJOBID-1); } if (svr_authorize_jobreq(preq, pjob) == -1) { (void)sprintf(log_buffer, msg_permlog, preq->rq_type, "Job", pjob->ji_qs.ji_jobid, preq->rq_user, preq->rq_host); log_event(PBSEVENT_SECURITY, PBS_EVENTCLASS_JOB, LOG_INFO, pjob->ji_qs.ji_jobid, log_buffer); req_reject(PBSE_PERM, 0, preq); return NULL; } if ((t == IS_ARRAY_NO) && (pjob->ji_qs.ji_state == JOB_STATE_EXITING)) { /* special case Deletejob with "force" */ if ((preq->rq_type == PBS_BATCH_DeleteJob) && (preq->rq_extend != NULL) && (strcmp(preq->rq_extend, "force") == 0)) { return (pjob); } (void)sprintf(log_buffer, "%s, state=%d", msg_badstate, pjob->ji_qs.ji_state); log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_INFO, pjob->ji_qs.ji_jobid, log_buffer); req_reject(PBSE_BADSTATE, 0, preq); return NULL; } return (pjob); }
/*
 * job_route - decide whether a job may be routed now and, if so, route it.
 *
 * Jobs already in transit are ignored, complete jobs are purged, and jobs in
 * an unexpected state are logged and ignored.  Held and waiting jobs are only
 * routed when the queue's corresponding route attribute is non-zero.  Nothing
 * is routed while the queue is not started or while the queue's max-run limit
 * of jobs in transit has been reached.
 *
 * returns 0 when nothing is routed, PBSE_ROUTEEXPD when the job has exceeded
 * the queue's route lifetime, otherwise whatever the router
 * (default_router() or site_alt_router()) returns.
 */
int job_route(

  job *jobp)  /* job to route */

{
  int               not_routable = 0;
  char             *id = "job_route";
  time_t            expires;
  struct pbs_queue *queue;
  long              next_retry;

  /* see if the job is able to be routed */

  switch (jobp->ji_qs.ji_state) {
    case JOB_STATE_TRANSIT:

      /* already going, ignore it */
      return(0);

    case JOB_STATE_QUEUED:

      /* ok to try */
      break;

    case JOB_STATE_HELD:

      /* job may be acceptable */
      not_routable = !jobp->ji_qhdr->qu_attr[QR_ATR_RouteHeld].at_val.at_long;

      break;

    case JOB_STATE_WAITING:

      /* job may be acceptable */
      not_routable = !jobp->ji_qhdr->qu_attr[QR_ATR_RouteWaiting].at_val.at_long;

      break;

    case JOB_STATE_COMPLETE:

      /* job has been deleted */
      job_purge(jobp);

      return(0);

    default:

      sprintf(log_buffer, "%s %d",
              pbse_to_txt(PBSE_BADSTATE),
              jobp->ji_qs.ji_state);

      strcat(log_buffer, id);

      log_event(
        PBSEVENT_DEBUG,
        PBS_EVENTCLASS_JOB,
        jobp->ji_qs.ji_jobid,
        log_buffer);

      return(0);
  }

  /* check the queue limits, can we route any (more) */

  queue = jobp->ji_qhdr;

  if (queue->qu_attr[(int)QA_ATR_Started].at_val.at_long == 0)
    return(0);  /* queue not started - no routing */

  if ((queue->qu_attr[(int)QA_ATR_MaxRun].at_flags & ATR_VFLAG_SET) &&
      (queue->qu_attr[(int)QA_ATR_MaxRun].at_val.at_long <=
       queue->qu_njstate[JOB_STATE_TRANSIT]))
    return(0);  /* max number of jobs being routed */

  /* what is the retry time and life time of a job in this queue */

  next_retry = (long)time_now;

  if (queue->qu_attr[(int)QR_ATR_RouteRetryTime].at_flags & ATR_VFLAG_SET)
    next_retry += queue->qu_attr[(int)QR_ATR_RouteRetryTime].at_val.at_long;
  else
    next_retry += PBS_NET_RETRY_TIME;

  expires = 0;  /* forever */

  if (queue->qu_attr[(int)QR_ATR_RouteLifeTime].at_flags & ATR_VFLAG_SET)
    expires = jobp->ji_qs.ji_un.ji_routet.ji_quetime +
              queue->qu_attr[(int)QR_ATR_RouteLifeTime].at_val.at_long;

  /* the lifetime is checked before the held/waiting restriction */

  if (expires && (expires < time_now)) {
    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      jobp->ji_qs.ji_jobid,
      msg_routexceed);

    /* job too long in queue */

    return(PBSE_ROUTEEXPD);
  }

  if (not_routable)
    return(0);  /* not currently routing this job */

  if (queue->qu_attr[(int)QR_ATR_AltRouter].at_val.at_long != 0)
    return(site_alt_router(jobp, queue, next_retry));

  return(default_router(jobp, queue, next_retry));
}  /* END job_route() */
static void post_routejob( struct work_task *pwt) { int newstate; int newsub; int r; int stat = pwt->wt_aux; char *id = "post_routejob"; job *jobp = (job *)pwt->wt_parm1; if (WIFEXITED(stat)) { r = WEXITSTATUS(stat); } else { r = 2; sprintf(log_buffer, msg_badexit, stat); strcat(log_buffer, id); log_event( PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, jobp->ji_qs.ji_jobid, log_buffer); } switch (r) { case 0: /* normal return, job was routed */ if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) remove_stagein(jobp); if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_COPIED) remove_checkpoint(jobp); job_purge(jobp); /* need to remove server job struct */ return; /*NOTREACHED*/ break; case 1: /* permanent rejection (or signal) */ if (jobp->ji_qs.ji_substate == JOB_SUBSTATE_ABORT) { /* job delete in progress, just set to queued status */ svr_setjobstate(jobp, JOB_STATE_QUEUED, JOB_SUBSTATE_ABORT); return; } add_dest(jobp); /* else mark destination as bad */ /* fall through */ default : /* try routing again */ /* force re-eval of job state out of Transit */ svr_evaljobstate(jobp, &newstate, &newsub, 1); svr_setjobstate(jobp, newstate, newsub); if ((r = job_route(jobp)) == PBSE_ROUTEREJ) job_abt(&jobp, pbse_to_txt(PBSE_ROUTEREJ)); else if (r != 0) job_abt(&jobp, msg_routexceed); break; } /* END switch (r) */ return; } /* END post_routejob() */
static void post_delete_mom1( struct work_task *pwt) { int delay = 0; int dellen = strlen(deldelaystr); job *pjob; struct work_task *pwtnew; pbs_queue *pque; struct batch_request *preq_sig; /* signal request to MOM */ struct batch_request *preq_clt; /* original client request */ int rc; preq_sig = pwt->wt_parm1; rc = preq_sig->rq_reply.brp_code; preq_clt = preq_sig->rq_extra; release_req(pwt); pjob = find_job(preq_clt->rq_ind.rq_delete.rq_objname); if (pjob == NULL) { /* job has gone away */ req_reject(PBSE_UNKJOBID, 0, preq_clt, NULL, NULL); return; } if (rc) { /* mom rejected request */ if (rc == PBSE_UNKJOBID) { /* MOM claims no knowledge, so just purge it */ log_event( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, "MOM rejected signal during delete"); /* removed the resources assigned to job */ free_nodes(pjob); set_resc_assigned(pjob, DECR); job_purge(pjob); reply_ack(preq_clt); } else { req_reject(rc, 0, preq_clt, NULL, NULL); } return; } if (preq_clt->rq_extend) { if (strncmp(preq_clt->rq_extend, deldelaystr, dellen) == 0) { delay = atoi(preq_clt->rq_extend + dellen); } } reply_ack(preq_clt); /* dont need it, reply now */ /* * if no delay specified in original request, see if kill_delay * queue attribute is set. */ if (delay == 0) { pque = pjob->ji_qhdr; delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay], &server.sv_attr[SRV_ATR_KillDelay], 2); } pwtnew = set_task(WORK_Timed, delay + time_now, post_delete_mom2, pjob); if (pwtnew) { /* insure that work task will be removed if job goes away */ append_link(&pjob->ji_svrtask, &pwtnew->wt_linkobj, pwtnew); } /* * Since the first signal has succeeded, let's reschedule the * nanny to be 1 minute after the second phase. */ apply_job_delete_nanny(pjob, time_now + delay + 60); return; } /* END post_delete_mom1() */
static void post_movejob( struct work_task *pwt) { char *id = "post_movejob"; struct batch_request *req; int newstate; int newsub; int stat; int r; job *jobp; req = (struct batch_request *)pwt->wt_parm2; stat = pwt->wt_aux; pbs_errno = PBSE_NONE; if (req->rq_type != PBS_BATCH_MoveJob) { sprintf(log_buffer, "bad request type %d\n", req->rq_type); log_err(-1, id, log_buffer); return; } jobp = find_job(req->rq_ind.rq_move.rq_jid); if ((jobp == NULL) || (jobp != (job *)pwt->wt_parm1)) { sprintf(log_buffer, "job %s not found\n", req->rq_ind.rq_move.rq_jid); log_err(-1, id, log_buffer); } if (WIFEXITED(stat)) { r = WEXITSTATUS(stat); if (r == 0) { /* purge server's job structure */ if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) remove_stagein(jobp); if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_COPIED) remove_checkpoint(jobp); strcpy(log_buffer, msg_movejob); sprintf(log_buffer + strlen(log_buffer), msg_manager, req->rq_ind.rq_move.rq_destin, req->rq_user, req->rq_host); job_purge(jobp); } else { r = PBSE_ROUTEREJ; } } else { r = PBSE_SYSTEM; sprintf(log_buffer, msg_badexit, stat); strcat(log_buffer, id); log_event( PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, jobp->ji_qs.ji_jobid, log_buffer); } if (r) { if (jobp != NULL) { /* force re-eval of job state out of Transit */ svr_evaljobstate(jobp, &newstate, &newsub, 1); svr_setjobstate(jobp, newstate, newsub); } req_reject(r, 0, req, NULL, NULL); } else { reply_ack(req); } return; } /* END post_movejob() */
/** * @brief * post_routejob - clean up action for child started in net_move/send_job * to "route" a job to another server * @par * If route was successfull, delete job. * @par * If route didn't work, mark destination not to be tried again for this * job and call route again. * * @param[in] pwt - work task structure * * @return none. */ static void post_routejob(struct work_task *pwt) { int newstate; int newsub; int r; int stat = pwt->wt_aux; job *jobp = (job *)pwt->wt_parm2; if (jobp == NULL) { log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_JOB, LOG_INFO, "", "post_routejob failed, jobp NULL"); return; } if (WIFEXITED(stat)) { r = WEXITSTATUS(stat); } else { r = SEND_JOB_FATAL; (void)sprintf(log_buffer, msg_badexit, stat); (void)strcat(log_buffer, __func__); log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, LOG_NOTICE, jobp->ji_qs.ji_jobid, log_buffer); } switch (r) { case SEND_JOB_OK: /* normal return, job was routed */ if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) remove_stagein(jobp); /* * If the server is configured to keep job history and the job * is created here, do not purge the job structure but save * it for history purpose. No need to check for sub-jobs as * sub jobs can not be routed. 
*/ if (svr_chk_history_conf()) svr_setjob_histinfo(jobp, T_MOV_JOB); else job_purge(jobp); /* need to remove server job struct */ return; case SEND_JOB_FATAL: /* permanent rejection (or signal) */ if (jobp->ji_qs.ji_substate == JOB_SUBSTATE_ABORT) { /* Job Delete in progress, just set to queued status */ (void)svr_setjobstate(jobp, JOB_STATE_QUEUED, JOB_SUBSTATE_ABORT); return; } add_dest(jobp); /* else mark destination as bad */ /* fall through */ default : /* try routing again */ /* force re-eval of job state out of Transit */ svr_evaljobstate(jobp, &newstate, &newsub, 1); (void)svr_setjobstate(jobp, newstate, newsub); jobp->ji_retryok = 1; if ((r = job_route(jobp)) == PBSE_ROUTEREJ) (void)job_abt(jobp, msg_routebad); else if (r != 0) (void)job_abt(jobp, msg_routexceed); break; } return; }