/**
 * @brief Append a custom "Walltime.Remaining" entry to the status list for
 *        a running job.  This is hand-encoded because walltime remaining is
 *        a derived value, not a stored attribute.
 *
 * @param[in]     index - index of the start-time attribute in pattr[]
 * @param[in]     pattr - base of the job's attribute array
 * @param[in,out] phead - status list to append the entry to
 *
 * @return PBSE_NONE always (non-running jobs are silently skipped)
 */
int add_walltime_remaining(

  int         index,
  attribute  *pattr,
  tlist_head *phead)

  {
  int            len;
  char           buf[MAXPATHLEN];
  char          *pname;
  svrattrl      *pal;
  resource      *pres;
  int            found = 0;
  unsigned long  remaining = 0;
  unsigned long  upperBound = 0;

  /* only for running jobs, do nothing otherwise */
  if ((pattr + JOB_ATR_state)->at_val.at_char != 'R')
    {
    return(PBSE_NONE);
    }

  if (((pattr + JOB_ATR_resource)->at_val.at_list.ll_next != NULL) &&
      ((pattr + JOB_ATR_resource)->at_flags & ATR_VFLAG_SET))
    {
    pres = (resource *)GET_NEXT((pattr + JOB_ATR_resource)->at_val.at_list);

    /* end of the interval: completion time if recorded, otherwise "now" */
    if ((pattr + JOB_ATR_comp_time)->at_flags & ATR_VFLAG_SET)
      upperBound = (pattr + JOB_ATR_comp_time)->at_val.at_long;
    else
      upperBound = (unsigned long)time_now;

    /* find the walltime resource */
    for (; pres != NULL; pres = (resource *)GET_NEXT(pres->rs_link))
      {
      pname = pres->rs_defin->rs_name;

      if (strcmp(pname, "walltime") == 0)
        {
        /* found walltime */
        unsigned long value = (unsigned long)pres->rs_value.at_val.at_long;
        unsigned long used  = upperBound - (unsigned long)(pattr + index)->at_val.at_long;

        /* BUGFIX: upperBound was computed but never used (time_now was
         * substituted), and the unsigned subtraction could wrap to a huge
         * value for a job that has exceeded its walltime — clamp to 0. */
        remaining = (used < value) ? (value - used) : 0;

        found = TRUE;

        break;
        }
      }
    }

  if (found == TRUE)
    {
    snprintf(buf, sizeof(buf), "%lu", remaining);

    len = strlen(buf) + 1;

    pal = attrlist_create("Walltime", "Remaining", len);

    if (pal != NULL)
      {
      memcpy(pal->al_value, buf, len);

      pal->al_flags = ATR_VFLAG_SET;

      append_link(phead, &pal->al_link, pal);
      }
    }

  return(PBSE_NONE);
  }  /* END add_walltime_remaining() */
/*
 * encode_svrstate - encode the server-state attribute as a display string
 * appended to 'phead'.  A running server is further qualified as idle
 * (scheduling disabled) or actively scheduled (scheduler connected).
 *
 * Returns 1 on success, 0 when the value should not be encoded (save mode
 * or out-of-range state), -1 on bad input or allocation failure.
 */
int encode_svrstate(

  pbs_attribute *pattr,   /* ptr to pbs_attribute */
  tlist_head    *phead,   /* head of attrlist list */
  const char    *atname,  /* pbs_attribute name */
  const char    *rsname,  /* null */
  int            mode,    /* encode mode */
  int            perm)    /* only used for resources */

  {
  svrattrl   *entry;
  const char *state_txt;
  long        state_val;

  if (pattr == NULL)
    {
    /* FAILURE */
    return(-1);
    }

  state_val = pattr->at_val.at_long;

  /* skip encoding when saving, or when the state is outside the range
   * of names we can render */
  if ((mode == ATR_ENCODE_SAVE) ||
      (state_val <= SV_STATE_DOWN) ||
      (state_val > SV_STATE_SHUTSIG))
    {
    /* SUCCESS - nothing to encode */
    return(0);
    }

  state_txt = svr_state_names[state_val];

  if (state_val == SV_STATE_RUN)
    {
    /* refine "running" into idle vs. scheduled */
    pthread_mutex_lock(server.sv_attr_mutex);

    if (server.sv_attr[SRV_ATR_scheduling].at_val.at_long == 0)
      {
      state_txt = svr_idle;
      }
    else
      {
      pthread_mutex_lock(scheduler_sock_jobct_mutex);

      if (scheduler_sock != -1)
        state_txt = svr_sched;

      pthread_mutex_unlock(scheduler_sock_jobct_mutex);
      }

    pthread_mutex_unlock(server.sv_attr_mutex);
    }

  entry = attrlist_create(atname, rsname, strlen(state_txt) + 1);

  if (entry == NULL)
    {
    /* FAILURE */
    return(-1);
    }

  strcpy(entry->al_value, state_txt);
  entry->al_flags = pattr->at_flags;

  append_link(phead, &entry->al_link, entry);

  /* SUCCESS */
  return(1);
  }  /* END encode_svrstate() */
/*
 * decode_DIS_svrattrl - decode a DIS-encoded list of svrattrl structures
 * from 'sock' and append them to 'phead'.
 *
 * Each attribute arrives as: string-data length, name, has-resource flag,
 * [resource name], value string, operator.  Each svrattrl is one malloc'd
 * chunk: the struct followed by the name/resource/value strings packed
 * back-to-back, each NUL-terminated.
 *
 * Returns 0 on success or a DIS_* error code.  On error the partially
 * decoded entry is freed; entries already appended to 'phead' remain and
 * are owned by the caller.
 */
int decode_DIS_svrattrl(int sock, tlist_head *phead)
  {
  unsigned int i;
  unsigned int hasresc;
  size_t       ls;
  unsigned int data_len;
  unsigned int numattr;
  svrattrl    *psvrat = NULL;
  int          rc;
  size_t       tsize;

  numattr = disrui(sock, &rc); /* number of attributes in set */

  if (rc) return rc;

  for (i = 0; i < numattr; ++i)
    {
    data_len = disrui(sock, &rc); /* total bytes of string data to follow */

    if (rc) return rc;

    /* one allocation carries the struct plus all the string data */
    tsize = sizeof(svrattrl) + data_len;

    if ((psvrat = (svrattrl *)malloc(tsize)) == 0)
      return DIS_NOMALLOC;

    CLEAR_LINK(psvrat->al_link);

    psvrat->al_atopl.next = 0;
    psvrat->al_tsize = tsize;
    psvrat->al_name = (char *)psvrat + sizeof(svrattrl);
    psvrat->al_resc = 0;
    psvrat->al_value = 0;
    psvrat->al_nameln = 0;
    psvrat->al_rescln = 0;
    psvrat->al_valln = 0;
    psvrat->al_flags = 0;

    /* attribute name */
    if ((rc = disrfcs(sock, &ls, data_len, psvrat->al_name)))
      break;

    *(psvrat->al_name + ls++) = '\0';

    psvrat->al_nameln = (int)ls;

    data_len -= ls; /* shrink the remaining-space bound for later reads */

    hasresc = disrui(sock, &rc);

    if (rc)
      break;

    if (hasresc)
      {
      /* optional resource name, stored immediately after the name */
      psvrat->al_resc = psvrat->al_name + ls;

      if ((rc = disrfcs(sock, &ls, data_len, psvrat->al_resc)))
        break;

      *(psvrat->al_resc + ls++) = '\0';

      psvrat->al_rescln = (int)ls;

      data_len -= ls;
      }

    /* value string follows name (and resource, if present) */
    psvrat->al_value = psvrat->al_name + psvrat->al_nameln + psvrat->al_rescln;

    if ((rc = disrfcs(sock, &ls, data_len, psvrat->al_value)))
      break;

    *(psvrat->al_value + ls++) = '\0';

    psvrat->al_valln = (int)ls;

    psvrat->al_op = (enum batch_op)disrui(sock, &rc);

    if (rc)
      break;

    /* entry fully decoded - link it in; list now owns the memory */
    append_link(phead, &psvrat->al_link, psvrat);
    }

  if (rc)
    {
    /* every break above fires before append_link, so psvrat is unlinked */
    (void)free(psvrat);
    }

  return (rc);
  }
/**
 * @brief Build a status reply entry for one queue and append it to the
 *        caller's status list.
 *
 * Refreshes the queue's total-jobs and jobs-by-state counters from qu_qs
 * before encoding the requested attributes.
 *
 * @param[in]     pque    - ptr to queue to status
 * @param[in]     preq    - the status batch request (permissions, attr list)
 * @param[in,out] pstathd - head of list to append status to
 *
 * @return PBSE_NONE on success, PBSE_PERM if the requester lacks read
 *         access, PBSE_SYSTEM on allocation failure, or the index of the
 *         first bad attribute from status_attrib().
 */
static int status_que(

  pbs_queue            *pque,    /* ptr to que to status */
  struct batch_request *preq,
  tlist_head           *pstathd) /* head of list to append status to */

  {
  struct brp_status *pstat;
  svrattrl          *pal;
  int                bad = 0;

  if ((preq->rq_perm & ATR_DFLAG_RDACC) == 0)
    {
    return(PBSE_PERM);
    }

  /* ok going to do status, update count and state counts from qu_qs */

  pque->qu_attr[QA_ATR_TotalJobs].at_val.at_long = pque->qu_numjobs;
  pque->qu_attr[QA_ATR_TotalJobs].at_flags |= ATR_VFLAG_SET;

  update_state_ct(
    &pque->qu_attr[QA_ATR_JobsByState],
    pque->qu_njstate,
    pque->qu_jobstbuf);

  /* allocate status sub-structure and fill in header portion */

  /* NOTE: calloc() already zero-fills; the redundant memset() that
   * followed it has been removed. */
  pstat = (struct brp_status *)calloc(1, sizeof(struct brp_status));

  if (pstat == NULL)
    {
    return(PBSE_SYSTEM);
    }

  pstat->brp_objtype = MGR_OBJ_QUEUE;

  strcpy(pstat->brp_objname, pque->qu_qs.qu_name);

  CLEAR_LINK(pstat->brp_stlink);
  CLEAR_HEAD(pstat->brp_attr);

  /* link in first; the reply machinery owns and frees pstat from here */
  append_link(pstathd, &pstat->brp_stlink, pstat);

  /* add attributes to the status reply */

  pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

  if (status_attrib(
        pal,
        que_attr_def,
        pque->qu_attr,
        QA_ATR_LAST,
        preq->rq_perm,
        &pstat->brp_attr,
        &bad,
        1) != 0)  /* IsOwner == TRUE */
    {
    return(bad);
    }

  return(PBSE_NONE);
  }  /* END status_que() */
int req_stat_svr( struct batch_request *preq) /* ptr to the decoded request */ { svrattrl *pal; struct batch_reply *preply; struct brp_status *pstat; int bad = 0; char nc_buf[128]; int numjobs; int netrates[3]; memset(netrates, 0, sizeof(netrates)); /* update count and state counts from sv_numjobs and sv_jobstates */ lock_sv_qs_mutex(server.sv_qs_mutex, __func__); numjobs = server.sv_qs.sv_numjobs; unlock_sv_qs_mutex(server.sv_qs_mutex, __func__); pthread_mutex_lock(server.sv_attr_mutex); server.sv_attr[SRV_ATR_TotalJobs].at_val.at_long = numjobs; server.sv_attr[SRV_ATR_TotalJobs].at_flags |= ATR_VFLAG_SET; pthread_mutex_lock(server.sv_jobstates_mutex); update_state_ct( &server.sv_attr[SRV_ATR_JobsByState], server.sv_jobstates, server.sv_jobstbuf); pthread_mutex_unlock(server.sv_jobstates_mutex); netcounter_get(netrates); snprintf(nc_buf, 127, "%d %d %d", netrates[0], netrates[1], netrates[2]); if (server.sv_attr[SRV_ATR_NetCounter].at_val.at_str != NULL) free(server.sv_attr[SRV_ATR_NetCounter].at_val.at_str); server.sv_attr[SRV_ATR_NetCounter].at_val.at_str = strdup(nc_buf); if (server.sv_attr[SRV_ATR_NetCounter].at_val.at_str != NULL) server.sv_attr[SRV_ATR_NetCounter].at_flags |= ATR_VFLAG_SET; pthread_mutex_unlock(server.sv_attr_mutex); /* allocate a reply structure and a status sub-structure */ preply = &preq->rq_reply; preply->brp_choice = BATCH_REPLY_CHOICE_Status; CLEAR_HEAD(preply->brp_un.brp_status); pstat = (struct brp_status *)calloc(1, sizeof(struct brp_status)); if (pstat == NULL) { reply_free(preply); req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL); pthread_mutex_unlock(server.sv_attr_mutex); return(PBSE_SYSTEM); } CLEAR_LINK(pstat->brp_stlink); strcpy(pstat->brp_objname, server_name); pstat->brp_objtype = MGR_OBJ_SERVER; CLEAR_HEAD(pstat->brp_attr); append_link(&preply->brp_un.brp_status, &pstat->brp_stlink, pstat); /* add attributes to the status reply */ pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr); if (status_attrib( pal, svr_attr_def, 
server.sv_attr, SRV_ATR_LAST, preq->rq_perm, &pstat->brp_attr, &bad, 1)) /* IsOwner == TRUE */ { reply_badattr(PBSE_NOATTR, bad, pal, preq); } else { reply_send_svr(preq); } return(PBSE_NONE); } /* END req_stat_svr() */
/*
 * req_commit - commit (finalize) a newly queued job on this MOM and start
 * its execution.
 *
 * The job must currently be in substate TRANSICM (mid-commit).  On success
 * the job is moved to the "all jobs" list, marked as Mother Superior,
 * placed in RUNNING/PRERUN, and handed to start_exec() (which blocks).
 * The request is answered with the job id on success, or a descriptive
 * failure string if the start attempt left the job EXITING.
 */
void req_commit(

  struct batch_request *preq)  /* I */

  {
  job *pj;

  pj = locate_new_job(preq->rq_conn, preq->rq_ind.rq_commit);

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL",
      "committing job");
    }

  if (pj == NULL)
    {
    req_reject(PBSE_UNKJOBID, 0, preq, NULL, NULL);

    return;
    }

  if (pj->ji_qs.ji_substate != JOB_SUBSTATE_TRANSICM)
    {
    log_err(errno, "req_commit", "cannot commit job in unexpected state");

    req_reject(PBSE_IVALREQ, 0, preq, NULL, NULL);

    return;
    }

  /* move job from new job list to "all" job list, set to running state */

  delete_link(&pj->ji_alljobs);

  append_link(&svr_alljobs, &pj->ji_alljobs, pj);

  /*
  ** Set JOB_SVFLG_HERE to indicate that this is Mother Superior.
  */

  pj->ji_qs.ji_svrflags |= JOB_SVFLG_HERE;

  pj->ji_qs.ji_state = JOB_STATE_RUNNING;
  pj->ji_qs.ji_substate = JOB_SUBSTATE_PRERUN;
  pj->ji_qs.ji_un_type = JOB_UNION_TYPE_MOM;

  /* remember which server committed us, for later status/obit traffic */
  pj->ji_qs.ji_un.ji_momt.ji_svraddr = get_connectaddr(preq->rq_conn);
  pj->ji_qs.ji_un.ji_momt.ji_exitstat = 0;

  /* For MOM - start up the job (blocks) */

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL",
      "starting job execution");
    }

  start_exec(pj);

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL",
      "job execution started");
    }

  /* if start request fails, reply with failure string */

  if (pj->ji_qs.ji_substate == JOB_SUBSTATE_EXITING)
    {
    char tmpLine[1024];

    /* identify the failing node when the host list records one */
    if ((pj->ji_hosts != NULL) &&
        (pj->ji_nodekill >= 0) &&
        (pj->ji_hosts[pj->ji_nodekill].hn_host != NULL))
      {
      sprintf(tmpLine, "start failed on node %s",
        pj->ji_hosts[pj->ji_nodekill].hn_host);
      }
    else
      {
      sprintf(tmpLine, "start failed on unknown node");
      }

    if (LOGLEVEL >= 6)
      {
      log_record(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL",
        tmpLine);
      }

    reply_text(preq, 0, tmpLine);
    }
  else
    {
    reply_jobid(preq, pj->ji_qs.ji_jobid, BATCH_REPLY_CHOICE_Commit);
    }

  job_save(pj, SAVEJOB_FULL);

  /* NOTE:  we used to flag JOB_ATR_errpath, JOB_ATR_outpath,
   * JOB_ATR_session_id, and JOB_ATR_altid as modified at this point to make
   * sure pbs_server got these attr values.  This worked fine before TORQUE
   * modified job launched into an async process.  At 2.0.0p6, a new
   * attribute "SEND" flag was added to handle this process. */

  return;
  }  /* END req_commit() */
/*
 * encode_state - encode a node-state attribute into a comma-separated
 * state-name string ("free" when no state bits are set) and append the
 * resulting svrattrl entry to the list headed by 'ph'.
 *
 * Returns 0 on success, -(PBSE_INTERNAL) on NULL input,
 * -(PBSE_SYSTEM) on allocation failure.
 */
int encode_state(

  pbs_attribute *pattr, /* struct pbs_attribute being encoded */
  tlist_head    *ph,    /* head of a list of "svrattrl" structs which are to be returned */
  char          *aname, /* pbs_attribute's name */
  char          *rname, /* resource's name (null if none) */
  int            mode,  /* mode code, unused here */
  int            perm)  /* only used for resources */

  {
  int       idx;
  svrattrl *attr_entry;
  short     masked;
  char      buf[MAX_ENCODE_BFR];

  if (!pattr)
    return -(PBSE_INTERNAL);

  if (!(pattr->at_flags & ATR_VFLAG_SET))
    {
    /* SUCCESS - pbs_attribute not set */
    return(0);
    }

  /* drop any bits outside the reportable node-state mask */
  masked = pattr->at_val.at_short & (INUSE_SUBNODE_MASK | INUSE_UNKNOWN);

  if (!masked)
    {
    strcpy(buf, ND_free);
    }
  else
    {
    /* join the name of every set state bit with commas */
    const char *sep = "";

    buf[0] = '\0';

    for (idx = 0; ns[idx].name; idx++)
      {
      if (masked & ns[idx].bit)
        {
        strcat(buf, sep);
        strcat(buf, ns[idx].name);

        sep = ",";
        }
      }
    }

  attr_entry = attrlist_create(aname, rname, (int)strlen(buf) + 1);

  if (attr_entry == NULL)
    return -(PBSE_SYSTEM);

  strcpy(attr_entry->al_value, buf);
  attr_entry->al_flags = ATR_VFLAG_SET;

  append_link(ph, &attr_entry->al_link, attr_entry);

  /* SUCCESS */
  return(0);
  }  /* END encode_state */
/*
 * status_job - build a status reply entry for one job and append it to
 * 'pstathd'.
 *
 * For array jobs, first refreshes the cached "array_indices_remaining"
 * string if it was modified.  Eligible time is presented on the fly:
 * while eligible-time accounting is enabled and the job is accruing
 * eligible time, the attribute is temporarily advanced to "now" for the
 * encode and restored afterwards; when accounting is disabled, the
 * eligible-time/accrue-type SET flags are temporarily cleared so those
 * attributes do not appear, and restored afterwards.
 *
 * Returns 0 on success, PBSE_PERM if the client may not status this job,
 * PBSE_SYSTEM on allocation failure, PBSE_NOATTR with *bad set to the
 * index of the first bad attribute.
 */
int status_job(

  job                  *pjob,    /* ptr to job to status */
  struct batch_request *preq,
  svrattrl             *pal,     /* specific attributes to status */
  pbs_list_head        *pstathd, /* RETURN: head of list to append status to */
  int                  *bad)     /* RETURN: index of first bad attribute */

  {
  struct brp_status *pstat;
  time_t             tm;
  long               oldtime = 0;
  int                old_elig_flags = 0;
  int                old_atyp_flags = 0;

  /* see if the client is authorized to status this job */

  if (! server.sv_attr[(int)SRV_ATR_query_others].at_val.at_long)
    if (svr_authorize_jobreq(preq, pjob))
      return (PBSE_PERM);

  if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_ArrayJob)
    {
    /* for Array Job, if array_indices_remaining is modified */
    /* then need to recalculate the string value */

    char      *pnewstr;
    attribute *premain;

    premain = &pjob->ji_wattr[(int)JOB_ATR_array_indices_remaining];

    if (premain->at_flags & ATR_VFLAG_MODCACHE)
      {
      pnewstr = cvt_range(pjob->ji_ajtrk, JOB_STATE_QUEUED);

      if (pnewstr == NULL)
        pnewstr = "-";

      job_attr_def[JOB_ATR_array_indices_remaining].at_free(premain);
      job_attr_def[JOB_ATR_array_indices_remaining].at_decode(premain, 0, 0, pnewstr);

      /* also update value of attribute "array_state_count" */
      update_subjob_state_ct(pjob);
      }
    }

  /* calc eligible time on the fly and return, don't save. */

  if (server.sv_attr[(int)SRV_ATR_EligibleTimeEnable].at_val.at_long != 0)
    {
    if (pjob->ji_wattr[(int)JOB_ATR_accrue_type].at_val.at_long == JOB_ELIGIBLE)
      {
      time(&tm);

      /* remember the stored value so it can be restored after encoding */
      oldtime = pjob->ji_wattr[(int)JOB_ATR_eligible_time].at_val.at_long;

      pjob->ji_wattr[(int)JOB_ATR_eligible_time].at_val.at_long +=
        ((long)tm - pjob->ji_wattr[(int)JOB_ATR_sample_starttime].at_val.at_long);

      pjob->ji_wattr[(int)JOB_ATR_eligible_time].at_flags |= ATR_VFLAG_MODCACHE;

      /* Note: ATR_VFLAG_MODCACHE must be set because of svr_cached() does */
      /* not correctly check ATR_VFLAG_SET */
      }
    }
  else
    {
    /* eligible_time_enable is off so, */
    /* clear set flag so that eligible_time and accrue type dont show */

    old_elig_flags = pjob->ji_wattr[(int)JOB_ATR_eligible_time].at_flags;
    pjob->ji_wattr[(int)JOB_ATR_eligible_time].at_flags &= ~ATR_VFLAG_SET;
    pjob->ji_wattr[(int)JOB_ATR_eligible_time].at_flags |= (ATR_VFLAG_MODCACHE);

    old_atyp_flags = pjob->ji_wattr[(int)JOB_ATR_accrue_type].at_flags;
    pjob->ji_wattr[(int)JOB_ATR_accrue_type].at_flags &= ~ATR_VFLAG_SET;
    pjob->ji_wattr[(int)JOB_ATR_accrue_type].at_flags |= (ATR_VFLAG_MODCACHE);

    /* Note: ATR_VFLAG_MODCACHE must be set because of svr_cached() does */
    /* not correctly check ATR_VFLAG_SET */
    }

  /* allocate reply structure and fill in header portion */

  pstat = (struct brp_status *)malloc(sizeof(struct brp_status));

  if (pstat == (struct brp_status *)0)
    return (PBSE_SYSTEM);

  CLEAR_LINK(pstat->brp_stlink);

  pstat->brp_objtype = MGR_OBJ_JOB;

  (void)strcpy(pstat->brp_objname, pjob->ji_qs.ji_jobid);

  CLEAR_HEAD(pstat->brp_attr);

  append_link(pstathd, &pstat->brp_stlink, pstat);

  /* add attributes to the status reply */

  *bad = 0;

  if (status_attrib(pal, job_attr_def, pjob->ji_wattr, JOB_ATR_LAST,
                    preq->rq_perm, &pstat->brp_attr, bad))
    return (PBSE_NOATTR);

  /* reset eligible time, it was calctd on the fly, real calctn only when
   * accrue_type changes */

  if (server.sv_attr[(int)SRV_ATR_EligibleTimeEnable].at_val.at_long != 0)
    {
    if (pjob->ji_wattr[(int)JOB_ATR_accrue_type].at_val.at_long == JOB_ELIGIBLE)
      {
      pjob->ji_wattr[(int)JOB_ATR_eligible_time].at_val.at_long = oldtime;
      pjob->ji_wattr[(int)JOB_ATR_eligible_time].at_flags |= ATR_VFLAG_MODCACHE;

      /* Note: ATR_VFLAG_MODCACHE must be set because of svr_cached() does */
      /* not correctly check ATR_VFLAG_SET */
      }
    }
  else
    {
    /* reset the set flags */
    pjob->ji_wattr[(int)JOB_ATR_eligible_time].at_flags = old_elig_flags;
    pjob->ji_wattr[(int)JOB_ATR_accrue_type].at_flags = old_atyp_flags;
    }

  return (0);
  }
/**
 * @brief
 *  status_subjob - status a single subjob (of an Array Job)
 *  Works by statusing the parent unless the subjob is actually running,
 *  in which case the real job structure is statused via status_job().
 *  For the faked case, the parent's state char (and, for finished
 *  subjobs, its comment) are temporarily overwritten with the subjob's
 *  values, encoded, then restored.
 *
 * @param[in,out] pjob - ptr to parent Array
 * @param[in] preq - request structure
 * @param[in] pal - specific attributes to status
 * @param[in] subj - if not = -1 then include subjob [n]
 * @param[in,out] pstathd - RETURN: head of list to append status to
 * @param[out] bad - RETURN: index of first bad attribute
 *
 * @return int
 * @retval 0 : success
 * @retval PBSE_PERM : client is not authorized to status the job
 * @retval PBSE_SYSTEM : memory allocation error
 * @retval PBSE_IVALREQ : something wrong with the flags
 */
int status_subjob(

  job                  *pjob,
  struct batch_request *preq,
  svrattrl             *pal,
  int                   subj,
  pbs_list_head        *pstathd,
  int                  *bad)

  {
  int                limit = (int)JOB_ATR_LAST;
  struct brp_status *pstat;
  job               *psubjob;  /* ptr to job to status */
  char               realstate;
  int                rc = 0;
  int                oldeligflags = 0;
  int                oldatypflags = 0;
  int                subjob_state = -1;
  char              *old_subjob_comment = NULL;

  /* see if the client is authorized to status this job */

  if (! server.sv_attr[(int)SRV_ATR_query_others].at_val.at_long)
    if (svr_authorize_jobreq(preq, pjob))
      return (PBSE_PERM);

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_ArrayJob) == 0)
    return PBSE_IVALREQ;

  /* if subjob is running, use real job structure */

  if (get_subjob_state(pjob, subj) == JOB_STATE_RUNNING)
    {
    psubjob = find_job(mk_subjob_id(pjob, subj));

    if (psubjob)
      status_job(psubjob, preq, pal, pstathd, bad);

    return 0;
    }

  /* otherwise we fake it with info from the parent */

  /* allocate reply structure and fill in header portion */

  /* for the general case, we don't want to include the parent's */
  /* array related attrbutes as they belong only to the Array */

  if (pal == NULL)
    limit = JOB_ATR_array;

  pstat = (struct brp_status *)malloc(sizeof(struct brp_status));

  if (pstat == (struct brp_status *)0)
    return (PBSE_SYSTEM);

  CLEAR_LINK(pstat->brp_stlink);

  pstat->brp_objtype = MGR_OBJ_JOB;

  (void)strcpy(pstat->brp_objname, mk_subjob_id(pjob, subj));

  CLEAR_HEAD(pstat->brp_attr);

  append_link(pstathd, &pstat->brp_stlink, pstat);

  /* add attributes to the status reply */

  *bad = 0;

  /*
   * fake the job state and comment by setting the parent job's state
   * and comment to that of the subjob
   */

  subjob_state = get_subjob_state(pjob, subj);

  realstate = pjob->ji_wattr[(int)JOB_ATR_state].at_val.at_char;
  pjob->ji_wattr[(int)JOB_ATR_state].at_val.at_char = statechars[subjob_state];
  pjob->ji_wattr[(int)JOB_ATR_state].at_flags |= ATR_VFLAG_MODCACHE;

  if (subjob_state == JOB_STATE_EXPIRED || subjob_state == JOB_STATE_FINISHED)
    {
    /* substitute a substate-specific comment; the parent's original
     * comment (if any) is saved in old_subjob_comment for restoration */

    if (pjob->ji_ajtrk->tkm_tbl[subj].trk_substate == JOB_SUBSTATE_FINISHED)
      {
      if (pjob->ji_wattr[(int)JOB_ATR_Comment].at_flags & ATR_VFLAG_SET)
        {
        old_subjob_comment = strdup(pjob->ji_wattr[(int)JOB_ATR_Comment].at_val.at_str);

        if (old_subjob_comment == (char *)0)
          return (PBSE_SYSTEM);
        }

      if (job_attr_def[(int)JOB_ATR_Comment].at_decode(&pjob->ji_wattr[(int)JOB_ATR_Comment],
          (char *)0, (char *)0, "Subjob finished") == PBSE_SYSTEM)
        {
        free(old_subjob_comment);
        return (PBSE_SYSTEM);
        }
      }
    else if (pjob->ji_ajtrk->tkm_tbl[subj].trk_substate == JOB_SUBSTATE_FAILED)
      {
      if (pjob->ji_wattr[(int)JOB_ATR_Comment].at_flags & ATR_VFLAG_SET)
        {
        old_subjob_comment = strdup(pjob->ji_wattr[(int)JOB_ATR_Comment].at_val.at_str);

        if (old_subjob_comment == (char *)0)
          return (PBSE_SYSTEM);
        }

      if (job_attr_def[(int)JOB_ATR_Comment].at_decode(&pjob->ji_wattr[(int)JOB_ATR_Comment],
          (char *)0, (char *)0, "Subjob failed") == PBSE_SYSTEM)
        {
        free(old_subjob_comment);
        return (PBSE_SYSTEM);
        }
      }
    else if (pjob->ji_ajtrk->tkm_tbl[subj].trk_substate == JOB_SUBSTATE_TERMINATED)
      {
      if (pjob->ji_wattr[(int)JOB_ATR_Comment].at_flags & ATR_VFLAG_SET)
        {
        old_subjob_comment = strdup(pjob->ji_wattr[(int)JOB_ATR_Comment].at_val.at_str);

        if (old_subjob_comment == (char *)0)
          return (PBSE_SYSTEM);
        }

      if (job_attr_def[(int)JOB_ATR_Comment].at_decode(&pjob->ji_wattr[(int)JOB_ATR_Comment],
          (char *)0, (char *)0, "Subjob terminated") == PBSE_SYSTEM)
        {
        free(old_subjob_comment);
        return (PBSE_SYSTEM);
        }
      }
    }

  /* when eligible_time_enable is off, */
  /* clear the set flag so that eligible_time and accrue_type dont show */

  if (server.sv_attr[(int)SRV_ATR_EligibleTimeEnable].at_val.at_long == 0)
    {
    oldeligflags = pjob->ji_wattr[(int)JOB_ATR_eligible_time].at_flags;
    pjob->ji_wattr[(int)JOB_ATR_eligible_time].at_flags &= ~ATR_VFLAG_SET;
    pjob->ji_wattr[(int)JOB_ATR_eligible_time].at_flags |= ATR_VFLAG_MODCACHE;

    oldatypflags = pjob->ji_wattr[(int)JOB_ATR_accrue_type].at_flags;
    pjob->ji_wattr[(int)JOB_ATR_accrue_type].at_flags &= ~ATR_VFLAG_SET;
    pjob->ji_wattr[(int)JOB_ATR_accrue_type].at_flags |= ATR_VFLAG_MODCACHE;

    /* Note: ATR_VFLAG_MODCACHE must be set because of svr_cached() does */
    /* not correctly check ATR_VFLAG_SET */
    }

  if (status_attrib(pal, job_attr_def, pjob->ji_wattr, limit,
                    preq->rq_perm, &pstat->brp_attr, bad))
    rc = PBSE_NOATTR;

  /* Set the parent state back to what it really is */

  pjob->ji_wattr[(int)JOB_ATR_state].at_val.at_char = realstate;
  pjob->ji_wattr[(int)JOB_ATR_state].at_flags |= ATR_VFLAG_MODCACHE;

  /* Set the parent comment back to what it really is */

  if (old_subjob_comment != NULL)
    {
    if (job_attr_def[(int)JOB_ATR_Comment].at_decode(&pjob->ji_wattr[(int)JOB_ATR_Comment],
        (char *)0, (char *)0, old_subjob_comment) == PBSE_SYSTEM)
      {
      free(old_subjob_comment);
      return (PBSE_SYSTEM);
      }

    free(old_subjob_comment);
    }

  /* reset the flags */

  if (server.sv_attr[(int)SRV_ATR_EligibleTimeEnable].at_val.at_long == 0)
    {
    pjob->ji_wattr[(int)JOB_ATR_eligible_time].at_flags = oldeligflags;
    pjob->ji_wattr[(int)JOB_ATR_accrue_type].at_flags = oldatypflags;
    }

  return (rc);
  }
/*
 * job_recov - recover (read in) a job from its saved disk file.
 *
 * Reads the quick-save area, upgrades it if it was written by an older
 * version, validates that the file name matches the job's internal file
 * prefix (ghost-file detection), then recovers the working attributes.
 * On the MOM side also recovers TM sockets, the root task, and job flags;
 * on the server side re-links array subjobs to their job_array struct.
 *
 * Returns a pointer to the recovered job, or NULL on any failure (the
 * partially built job is released before returning).
 */
job *job_recov(

  char *filename)  /* I */ /* pathname to job save file */

  {
  int  fds;
  job *pj;
  char *pn;
  char namebuf[MAXPATHLEN];
  int  qs_upgrade;
#ifndef PBS_MOM
  char parent_id[PBS_MAXSVRJOBID + 1];
  job_array *pa;
#endif

  qs_upgrade = FALSE;

  pj = job_alloc(); /* allocate & initialize job structure space */

  if (pj == NULL)
    {
    /* FAILURE - cannot alloc memory */
    return(NULL);
    }

  strcpy(namebuf, path_jobs); /* job directory path */
  strcat(namebuf, filename);

  fds = open(namebuf, O_RDONLY, 0);

  if (fds < 0)
    {
    sprintf(log_buffer, "unable to open %s", namebuf);

    log_err(errno, "job_recov", log_buffer);

    free((char *)pj);

    /* FAILURE - cannot open job file */
    return(NULL);
    }

  /* read in job quick save sub-structure */

  /* NOTE(review): a short read is tolerated here when qs_version differs
   * from PBS_QS_VERSION — the old-version layout may be smaller and is
   * handled by job_qs_upgrade() below.  Only a short read of a
   * current-version file is fatal. */

  if (read(fds, (char *)&pj->ji_qs, quicksize) != (ssize_t)quicksize &&
      pj->ji_qs.qs_version == PBS_QS_VERSION)
    {
    sprintf(log_buffer, "Unable to read %s", namebuf);

    log_err(errno, "job_recov", log_buffer);

    free((char *)pj);

    close(fds);

    return(NULL);
    }

  /* is ji_qs the version we expect? */

  if (pj->ji_qs.qs_version != PBS_QS_VERSION)
    {
    /* ji_qs is older version */
    sprintf(log_buffer,
      "%s appears to be from an old version. Attempting to convert.\n",
      namebuf);

    log_err(-1, "job_recov", log_buffer);

    if (job_qs_upgrade(pj, fds, namebuf, pj->ji_qs.qs_version) != 0)
      {
      sprintf(log_buffer, "unable to upgrade %s\n", namebuf);

      log_err(-1, "job_recov", log_buffer);

      free((char *)pj);

      close(fds);

      return(NULL);
      }

    qs_upgrade = TRUE;
    }  /* END if (pj->ji_qs.qs_version != PBS_QS_VERSION) */

  /* Does file name match the internal name? */
  /* This detects ghost files */

  pn = strrchr(namebuf, (int)'/') + 1;

  if (strncmp(pn, pj->ji_qs.ji_fileprefix, strlen(pj->ji_qs.ji_fileprefix)) != 0)
    {
    /* mismatch, discard job */

    sprintf(log_buffer, "Job Id %s does not match file name for %s",
      pj->ji_qs.ji_jobid,
      namebuf);

    log_err(-1, "job_recov", log_buffer);

    free((char *)pj);

    close(fds);

    return(NULL);
    }

  /* read in working attributes */

  if (recov_attr(
        fds,
        pj,
        job_attr_def,
        pj->ji_wattr,
        (int)JOB_ATR_LAST,
        (int)JOB_ATR_UNKN,
        TRUE) != 0)
    {
    sprintf(log_buffer, "unable to recover %s (file is likely corrupted)",
      namebuf);

    log_err(-1, "job_recov", log_buffer);

    /* job_free (not free) here: attributes may be partially recovered */
    job_free(pj);

    close(fds);

    return(NULL);
    }

#ifdef PBS_MOM

  /* read in tm sockets and ips */

  if (recov_tmsock(fds, pj) != 0)
    {
    sprintf(log_buffer,
      "warning: tmsockets not recovered from %s (written by an older pbs_mom?)",
      namebuf);

    log_err(-1, "job_recov", log_buffer);
    }

  if (recov_roottask(fds, pj) != 0)
    {
    sprintf(log_buffer,
      "warning: root task not recovered from %s (written by an older pbs_mom?)",
      namebuf);

    log_err(-1, "job_recov", log_buffer);
    }

  if (recov_jobflags(fds, pj) != 0)
    {
    sprintf(log_buffer,
      "warning: job flags not recovered from %s (written by an older pbs_mom?)",
      namebuf);

    log_err(-1, "job_recov", log_buffer);
    }

#else /* PBS_MOM */

  if (pj->ji_wattr[(int)JOB_ATR_job_array_request].at_flags & ATR_VFLAG_SET)
    {
    /* job is part of an array.  We need to put a link back to the server
       job array struct for this array.  We also have to link this job
       into the linked list of jobs belonging to the array. */

    array_get_parent_id(pj->ji_qs.ji_jobid, parent_id);

    pa = get_array(parent_id);

    if (strcmp(parent_id, pj->ji_qs.ji_jobid) == 0)
      {
      pj->ji_isparent = TRUE;
      }
    else
      {
      if (pa == NULL)
        {
        /* couldn't find array struct, it must not have been recovered,
           treat job as indepentent job?  perhaps we should delete the job
           XXX_JOB_ARRAY: should I unset this? */
        pj->ji_wattr[(int)JOB_ATR_job_array_request].at_flags &= ~ATR_VFLAG_SET;
        }
      else
        {
        CLEAR_LINK(pj->ji_arrayjobs);

        append_link(&pa->array_alljobs, &pj->ji_arrayjobs, (void*)pj);

        pj->ji_arraystruct = pa;

        pa->jobs_recovered++;
        }
      }
    }

#endif

  close(fds);

  /* all done recovering the job */

  if (qs_upgrade == TRUE)
    {
    job_save(pj, SAVEJOB_FULL);
    }

  return(pj);
  }  /* END job_recov() */
/*
 * encode_time - encode an attribute holding a time value (seconds, in
 * attr->at_val.at_long) into "hh:mm:ss" form.
 *
 * If 'phead' is non-NULL, a svrattrl entry is created and appended to it.
 * Otherwise the formatted string is copied into 'atname' (legacy in-place
 * mode — assumes the caller's buffer is large enough; preserved for
 * compatibility).
 *
 * Returns 1 on success, 0 if the attribute is not set, -1 on failure.
 */
int encode_time(

  attribute  *attr,   /* ptr to attribute (value in attr->at_val.at_long) */
  tlist_head *phead,  /* head of attrlist list (optional) */
  char       *atname, /* attribute name */
  char       *rsname, /* resource name (optional) */
  int         mode)   /* encode mode (not used) */

  {
  size_t    ct;
  char      cvnbuf[CVNBUFSZ];
  int       hr;
  int       min;
  long      n;
  svrattrl *pal;
  int       sec;

  if (attr == NULL)
    {
    /* FAILURE */
    return(-1);
    }

  if (!(attr->at_flags & ATR_VFLAG_SET))
    {
    return(0);
    }

  /* split the total seconds into hh:mm:ss components */

  n   = attr->at_val.at_long;
  hr  = n / 3600;
  n   = n % 3600;
  min = n / 60;
  sec = n % 60;

  /* snprintf bounds the write; the dead 'pv' cursor and its unused
   * post-increment have been removed */
  snprintf(cvnbuf, sizeof(cvnbuf), "%02d:%02d:%02d", hr, min, sec);

  ct = strlen(cvnbuf) + 1;

  if (phead != NULL)
    {
    pal = attrlist_create(atname, rsname, ct);

    if (pal == NULL)
      {
      /* FAILURE */
      return(-1);
      }

    memcpy(pal->al_value, cvnbuf, ct);

    pal->al_flags = attr->at_flags;

    append_link(phead, &pal->al_link, pal);
    }
  else
    {
    /* legacy mode: return the formatted time through 'atname' */
    strcpy(atname, cvnbuf);
    }

  /* SUCCESS */
  return(1);
  }
/** * attempt_delete() * deletes a job differently depending on the job's state * * @return TRUE if the job was deleted, FALSE if skipped * @param pjob - a pointer to the job being handled */ int attempt_delete( void *j) /* I */ { int skipped = FALSE; struct work_task *pwtold; struct work_task *pwtnew; job *pjob; /* job considered deleted if null */ if (j == NULL) return(TRUE); pjob = (job *)j; if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT) { /* * Find pid of router from existing work task entry, * then establish another work task on same child. * Next, signal the router and wait for its completion; */ pwtold = (struct work_task *)GET_NEXT(pjob->ji_svrtask); while (pwtold != NULL) { if ((pwtold->wt_type == WORK_Deferred_Child) || (pwtold->wt_type == WORK_Deferred_Cmp)) { kill((pid_t)pwtold->wt_event, SIGTERM); pjob->ji_qs.ji_substate = JOB_SUBSTATE_ABORT; } pwtold = (struct work_task *)GET_NEXT(pwtold->wt_linkobj); } skipped = TRUE; return(!skipped); } /* END if (pjob->ji_qs.ji_state == JOB_SUBSTATE_TRANSIT) */ else if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) { /* we'll wait for the mom to get this job, then delete it */ skipped = TRUE; } /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */ else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) { /* set up nanny */ if (!has_job_delete_nanny(pjob)) { apply_job_delete_nanny(pjob, time_now + 60); /* need to issue a signal to the mom, but we don't want to sent an ack to the * client when the mom replies */ issue_signal(pjob, "SIGTERM", post_delete, NULL); } if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0) { /* job has restart file at mom, change restart comment if failed */ change_restart_comment_if_needed(pjob); } return(!skipped); } /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */ if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0) { /* job has restart file at mom, change restart comment if failed */ change_restart_comment_if_needed(pjob); /* job has restart file at 
mom, do end job processing */ svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING); pjob->ji_momhandle = -1; /* force new connection */ pwtnew = set_task(WORK_Immed, 0, on_job_exit, (void *)pjob); if (pwtnew) { append_link(&pjob->ji_svrtask, &pwtnew->wt_linkobj, pwtnew); } } else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0) { /* job has staged-in file, should remove them */ remove_stagein(pjob); job_abt(&pjob, NULL); } else { /* * the job is not transitting (though it may have been) and * is not running, so put in into a complete state. */ struct work_task *ptask; struct pbs_queue *pque; int KeepSeconds = 0; svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE); if ((pque = pjob->ji_qhdr) && (pque != NULL)) { pque->qu_numcompleted++; } KeepSeconds = attr_ifelse_long( &pque->qu_attr[QE_ATR_KeepCompleted], &server.sv_attr[SRV_ATR_KeepCompleted], 0); ptask = set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit, pjob); if (ptask != NULL) { append_link(&pjob->ji_svrtask, &ptask->wt_linkobj, ptask); } } return(!skipped); } /* END attempt_delete() */
/*
 * array_delete_wt - work task that retries deletion of a job array.
 *
 * If the array is gone, the pending delete request is acked.  Otherwise
 * the function tracks (in file-local statics last_id/last_check) how long
 * the same array has been retried; after more than 10 seconds, subjobs
 * stuck in PRERUN (mom never picked them up) are force-deleted, and if
 * every remaining job is such a stuck one the request is acked.  In all
 * other cases the delete is re-driven via req_deletearray().
 *
 * NOTE(review): the static state means only one array delete can be
 * tracked at a time; interleaved deletes of different arrays reset the
 * timer — appears intentional (best-effort retry), confirm with callers.
 */
void array_delete_wt(struct work_task *ptask)
  {
  struct batch_request *preq;
  job_array            *pa;

  /*struct work_task *pnew_task;*/

  struct work_task *pwtnew;

  int  i;

  /* persists across invocations to detect a stuck delete of one array */
  static int   last_check = 0;
  static char *last_id = NULL;

  preq = ptask->wt_parm1;

  pa = get_array(preq->rq_ind.rq_delete.rq_objname);

  if (pa == NULL)
    {
    /* jobs must have exited already */

    reply_ack(preq);

    last_check = 0;
    free(last_id);
    last_id = NULL;

    return;
    }

  if (last_id == NULL)
    {
    /* first time we've seen this (or any) array - start the clock */
    last_id = strdup(preq->rq_ind.rq_delete.rq_objname);
    last_check = time_now;
    }
  else if (strcmp(last_id, preq->rq_ind.rq_delete.rq_objname) != 0)
    {
    /* different array than last time - restart the clock */
    last_check = time_now;
    free(last_id);
    last_id = strdup(preq->rq_ind.rq_delete.rq_objname);
    }
  else if (time_now - last_check > 10)
    {
    /* same array has been pending > 10s - force out PRERUN stragglers */

    int num_jobs;
    int num_prerun;
    job *pjob;

    num_jobs = 0;
    num_prerun = 0;

    for (i = 0; i < pa->ai_qs.array_size; i++)
      {
      if (pa->jobs[i] == NULL)
        continue;

      pjob = (job *)pa->jobs[i];

      num_jobs++;

      if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN)
        {
        num_prerun++;

        /* mom still hasn't gotten job?? delete anyway */

        if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
          {
          /* job has restart file at mom, do end job processing */

          change_restart_comment_if_needed(pjob);

          svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING);

          pjob->ji_momhandle = -1;  /* force new connection */

          pwtnew = set_task(WORK_Immed, 0, on_job_exit, (void *)pjob);

          if (pwtnew)
            {
            append_link(&pjob->ji_svrtask, &pwtnew->wt_linkobj, pwtnew);
            }
          }
        else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
          {
          /* job has staged-in file, should remove them */

          remove_stagein(pjob);

          job_abt(&pjob, NULL);
          }
        else
          {
          job_abt(&pjob, NULL);
          }
        }
      }

    if (num_jobs == num_prerun)
      {
      /* every remaining job was a stuck PRERUN - we're done */

      reply_ack(preq);

      free(last_id);
      last_id = NULL;

      return;
      }
    }

  req_deletearray(preq);
  }
/*
 * encode_arst - encode a pbs_attribute of type ATR_TYPE_ARST into an
 * attrlist entry.
 *
 * The array-of-strings value is stored as NUL-separated substrings in a
 * single buffer.  This routine flattens it into one string: each internal
 * NUL becomes a separator character (newline for ATR_ENCODE_SAVE, comma
 * otherwise) and any embedded quote, comma, backslash, or newline is
 * escaped with a backslash.
 *
 * Returns <0 on error, 0 if there is nothing to encode (value unset or
 * empty), and 1 on success (entry appended to phead).
 */
int encode_arst(

  pbs_attribute *attr,    /* I  ptr to pbs_attribute to encode */
  tlist_head    *phead,   /* O  ptr to head of attrlist list */
  char          *atname,  /* I  pbs_attribute name */
  char          *rsname,  /* I  resource name or NULL (optional) */
  int            mode,    /* I  encode mode */
  int            perm)    /* only used for resources */

  {
  svrattrl *entry;
  char     *src;
  char     *dst;
  char     *stop;
  char      sep;
  int       raw_len;
  int       need;

  if (attr == NULL)
    {
    /* FAILURE - invalid parameters */
    return(-2);
    }

  if (((attr->at_flags & ATR_VFLAG_SET) == 0) ||
      !attr->at_val.at_arst ||
      !attr->at_val.at_arst->as_usedptr)
    {
    /* SUCCESS - nothing set, nothing to emit */
    return(0);
    }

  /* total bytes in the packed value, including internal NULs */
  raw_len = (int)(attr->at_val.at_arst->as_next - attr->at_val.at_arst->as_buf);

  if (mode == ATR_ENCODE_SAVE)
    {
    sep  = '\n';          /* new-line for encode_acl */
    need = raw_len + 1;   /* one extra byte for the final new-line */
    }
  else
    {
    sep  = ',';           /* normally a comma is the separator */
    need = raw_len;
    }

  /* each special character costs one extra byte for its escaping backslash */
  stop = attr->at_val.at_arst->as_next;

  for (src = attr->at_val.at_arst->as_buf; src < stop; ++src)
    {
    if ((*src == '"') || (*src == '\'') || (*src == ',') ||
        (*src == '\\') || (*src == '\n'))
      ++need;
    }

  entry = attrlist_create(atname, rsname, need + 1);

  if (entry == NULL)
    {
    return(-1);
    }

  entry->al_flags = attr->at_flags;

  /* copy, turning internal NULs into separators and escaping specials */
  dst = entry->al_value;

  for (src = attr->at_val.at_arst->as_buf; src < stop; ++src)
    {
    char c = *src;

    if (c == '\0')
      {
      *dst++ = sep;
      }
    else if ((c == '"') || (c == '\'') || (c == ',') ||
             (c == '\\') || (c == '\n'))
      {
      *dst++ = '\\';
      *dst++ = c;
      }
    else
      {
      *dst++ = c;
      }
    }

  /* terminate: keep the trailing separator only in new-line (SAVE) mode */
  if (mode == ATR_ENCODE_SAVE)
    *dst = '\0';
  else
    *(dst - 1) = '\0';

  append_link(phead, &entry->al_link, entry);

  return(1);
  }  /* END encode_arst() */
/**
 * @brief
 * main - entry point of the stand-alone job-send helper (pbs_send_job).
 *
 * Reads "name=value" parameters from stdin (job file, destination address
 * and port, move type, server identity, paths, etc.), recovers the job
 * from its on-disk file, encodes its attributes, and pushes the job to
 * the destination server or MOM with up to RETRY connect attempts.
 * Exits with one of the SEND_JOB_* status codes consumed by the parent
 * server process.
 */
int main(int argc, char *argv[])
  {
  char jobfile[MAXPATHLEN+1];
  char jobfile_full[MAXPATHLEN+1];
  pbs_net_t hostaddr = 0;
  int port = -1;
  int move_type = -1;
  pbs_list_head attrl;
  enum conn_type cntype = ToServerDIS;
  int con = -1;
  char *destin;
  int encode_type;
  int i;
  job *jobp;
  char job_id[PBS_MAXSVRJOBID+1];
  attribute *pattr;
  struct attropl *pqjatr; /* list (single) of attropl for quejob */
  /* NOTE(review): script_name is only set if a "script_name" parameter
   * arrives on stdin; the exit paths below unlink() it regardless —
   * confirm the parent always supplies it */
  char script_name[MAXPATHLEN+1];
  int in_server = -1;
  char *param_name, *param_val;
  char buf[4096];
  struct hostent *hp;
  struct in_addr addr;
  char *credbuf = NULL;
  size_t credlen = 0;
  int prot = PROT_TCP;

  /*the real deal or output version and exit?*/
  execution_mode(argc, argv);

  /* If we are not run with real and effective uid of 0, forget it */
  pbs_loadconf(0);
  if (!isAdminPrivilege(getlogin()))
    {
    fprintf(stderr, "%s: Must be run by root\n", argv[0]);
    exit(SEND_JOB_FATAL);
    }

  /* initialize the pointers in the resource_def array */
  for (i = 0; i < (svr_resc_size - 1); ++i)
    svr_resc_def[i].rs_next = &svr_resc_def[i+1];
  /* last entry is left with null pointer */

  /* set single threaded mode */
  pbs_client_thread_set_single_threaded_mode();

  /* disable attribute verification */
  set_no_attribute_verification();

  /* initialize the thread context */
  if (pbs_client_thread_init_thread_context() != 0)
    {
    fprintf(stderr, "%s: Unable to initialize thread context\n", argv[0]);
    exit(SEND_JOB_FATAL);
    }

  if(set_msgdaemonname("PBS_send_job"))
    {
    fprintf(stderr, "Out of memory\n");
    return 1;
    }

  winsock_init();
  connection_init();

  /* read "name=value" parameters from stdin, one per line, until EOF or a
   * malformed/unknown line */
  while (fgets(buf, sizeof(buf), stdin) != NULL)
    {
    /* NOTE(review): if an empty line (strlen == 0) were ever read this
     * would index buf[-1]; fgets normally guarantees at least one char,
     * but a line longer than the buffer would also clip mid-line here */
    buf[strlen(buf)-1] = '\0'; /* gets rid of newline */

    param_name = buf;
    param_val = strchr(buf, '=');
    if (param_val)
      {
      *param_val = '\0';
      param_val++;
      }
    else
      {
      /* bad param_val -- skipping */
      break;
      }

    if (strcmp(param_name, "jobfile") == 0)
      {
      jobfile[0] = '\0';
      strncpy(jobfile, param_val, MAXPATHLEN);
      }
    else if (strcmp(param_name, "destaddr") == 0)
      {
      hostaddr = atol(param_val);
      }
    else if (strcmp(param_name, "destport") == 0)
      {
      port = atoi(param_val);
      }
    else if (strcmp(param_name, "move_type") == 0)
      {
      move_type = atoi(param_val);
      }
    else if (strcmp(param_name, "in_server") == 0)
      {
      in_server = atoi(param_val);
      }
    else if (strcmp(param_name, "server_name") == 0)
      {
      server_name[0] = '\0';
      strncpy(server_name, param_val, PBS_MAXSERVERNAME);
      }
    else if (strcmp(param_name, "server_host") == 0)
      {
      server_host[0] = '\0';
      strncpy(server_host, param_val, (sizeof(server_host) - 1));
      }
    else if (strcmp(param_name, "server_addr") == 0)
      {
      pbs_server_addr = atol(param_val);
      }
    else if (strcmp(param_name, "server_port") == 0)
      {
      pbs_server_port_dis = atoi(param_val);
      }
    else if (strcmp(param_name, "log_file") == 0)
      {
      log_file = strdup(param_val);
      }
    else if (strcmp(param_name, "path_log") == 0)
      {
      path_log[0] = '\0';
      strncpy(path_log, param_val, MAXPATHLEN);
      }
    else if (strcmp(param_name, "path_jobs") == 0)
      {
      path_jobs = strdup(param_val);
      }
    else if (strcmp(param_name, "path_spool") == 0)
      {
      path_spool = strdup(param_val);
      }
    else if (strcmp(param_name, "path_rescdef") == 0)
      {
      path_rescdef = strdup(param_val);
      }
    else if (strcmp(param_name, "path_users") == 0)
      {
      path_users = strdup(param_val);
      }
    else if (strcmp(param_name, "path_hooks_workdir") == 0)
      {
      path_hooks_workdir = strdup(param_val);
      if (path_hooks_workdir == NULL)
        exit(SEND_JOB_FATAL);
      }
    else if (strcmp(param_name, "svr_history_enable") == 0)
      {
      svr_history_enable = atol(param_val);
      }
    else if (strcmp(param_name, "svr_history_duration") == 0)
      {
      svr_history_duration = atol(param_val);
      }
    else if (strcmp(param_name, "single_signon_password_enable") == 0)
      {
      if (decode_b(&server.sv_attr[(int)SRV_ATR_ssignon_enable], NULL, NULL, param_val) != 0)
        {
        fprintf(stderr, "%s: failed to set ssignon_password_enable\n", argv[0]);
        exit(SEND_JOB_FATAL);
        }
      }
    else if (strcmp(param_name, "script_name") == 0)
      {
      strncpy(script_name, param_val, MAXPATHLEN + 1);
      }
    else
      break;   /* unknown parameter ends the header section */
    }

  time(&time_now);

  (void)log_open_main(log_file, path_log, 1); /* silent open */

  if (setup_resc(1) == -1)
    {
    /* log_buffer set in setup_resc */
    log_err(-1, "pbsd_send_job(setup_resc)", log_buffer);
    return (-1);
    }

  /* validate that every mandatory parameter arrived on stdin.
   * NOTE(review): port is initialized to -1 but tested against 0 here, so
   * a missing "destport" parameter slips through — verify intent */
  if( strlen(jobfile) == 0 || hostaddr == 0 || port == 0 || move_type == -1 ||
      in_server == -1 || strlen(server_name) == 0 || strlen(server_host) == 0 ||
      pbs_server_addr == 0 || pbs_server_port_dis == 0 ||
      strlen(path_log) == 0 || path_jobs == NULL ||
      path_spool == NULL || path_users == NULL )
    {
    log_err(-1, "pbs_send_job", "error on one of the parameters");
    log_close(0); /* silent close */
    exit(SEND_JOB_FATAL);
    }

  /* initialize the server's global list heads */
  CLEAR_HEAD(task_list_immed);
  CLEAR_HEAD(task_list_timed);
  CLEAR_HEAD(task_list_event);
  CLEAR_HEAD(svr_queues);
  CLEAR_HEAD(svr_alljobs);
  CLEAR_HEAD(svr_newjobs);
  CLEAR_HEAD(svr_allresvs);
  CLEAR_HEAD(svr_newresvs);
  CLEAR_HEAD(svr_deferred_req);
  CLEAR_HEAD(svr_unlicensedjobs);

  strcpy(jobfile_full, path_jobs);
  strcat(jobfile_full, jobfile);

  if (chk_save_file(jobfile_full) != 0)
    {
    sprintf(log_buffer, "Error opening jobfile=%s", jobfile);
    log_err(-1, __func__, log_buffer);
    goto fatal_exit;
    }

  if ((jobp=job_recov_fs(jobfile, RECOV_SUBJOB)) == NULL)
    {
    sprintf(log_buffer, "Failed to recreate job in jobfile=%s", jobfile);
    log_err(-1, __func__, log_buffer);
    goto fatal_exit;
    }

  /* now delete the temp job file that was created by job_save_fs in server code
   * jobs are in database now, no need to keep in filesystem */
  unlink(jobfile_full);

  if (in_server)
    append_link(&svr_alljobs, &jobp->ji_alljobs, jobp);

  /* select attributes/resources to send based on move type */
  if (move_type == MOVE_TYPE_Exec)
    {
    /* moving to a MOM: encode only MOM-visible attributes */
    resc_access_perm = ATR_DFLAG_MOM;
    encode_type = ATR_ENCODE_MOM;
    cntype = ToServerDIS;
    }
  else
    {
    /* moving to another server: encode the full server view */
    resc_access_perm = ATR_DFLAG_USWR | ATR_DFLAG_OPWR | ATR_DFLAG_MGWR | ATR_DFLAG_SvRD;
    encode_type = ATR_ENCODE_SVR;
    svr_dequejob(jobp);
    }

  /* encode every attribute visible under resc_access_perm into attrl */
  CLEAR_HEAD(attrl);
  pattr = jobp->ji_wattr;
  for (i=0; i < (int)JOB_ATR_LAST; i++)
    {
    if ((job_attr_def+i)->at_flags & resc_access_perm)
      {
      (void)(job_attr_def+i)->at_encode(pattr+i, &attrl, (job_attr_def+i)->at_name, NULL, encode_type, NULL);
      }
    }
  attrl_fixlink(&attrl);

  /* script name is passed from parent */

  /* get host name */
  pbs_loadconf(0);
  addr.s_addr = htonl(hostaddr);
  hp = gethostbyaddr((void *)&addr, sizeof(struct in_addr), AF_INET);
  if (hp == NULL)
    {
    sprintf(log_buffer, "%s: h_errno=%d", inet_ntoa(addr), h_errno);
    log_err(-1, __func__, log_buffer);
    }
  else
    {
    /* read any credential file */
    (void)get_credential(hp->h_name, jobp, PBS_GC_BATREQ, &credbuf, &credlen);
    }

  /* save the job id for when after we purge the job */
  (void)strcpy(job_id, jobp->ji_qs.ji_jobid);

  con = -1;

  DIS_tcparray_init();

  /* try to deliver the job, backing off exponentially between attempts */
  for (i=0; i<RETRY; i++)
    {
    pbs_errno = 0;

    /* connect to receiving server with retries */
    if (i > 0)
      {
      /* recycle after an error */
      if (con >= 0)
        svr_disconnect(con);
      if (should_retry_route(pbs_errno) == -1)
        {
        goto fatal_exit; /* fatal error, don't retry */
        }
      sleep(1<<i);
      }

    if ((con = svr_connect(hostaddr, port, 0, cntype, prot)) == PBS_NET_RC_FATAL)
      {
      (void)sprintf(log_buffer, "send_job failed to %lx port %d", hostaddr, port);
      log_err(pbs_errno, __func__, log_buffer);
      goto fatal_exit;
      }
    else if (con == PBS_NET_RC_RETRY)
      {
      pbs_errno = WSAECONNREFUSED; /* should retry */
      continue;
      }

    /*
     * if the job is substate JOB_SUBSTATE_TRNOUTCM which means
     * we are recovering after being down or a late failure, we
     * just want to send the "read-to-commit/commit"
     */
    if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUTCM)
      {
      if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUT)
        {
        jobp->ji_qs.ji_substate = JOB_SUBSTATE_TRNOUT;
        }

      pqjatr = &((svrattrl *)GET_NEXT(attrl))->al_atopl;
      destin = jobp->ji_qs.ji_destin;

      if (PBSD_queuejob(con, jobp->ji_qs.ji_jobid, destin, pqjatr, NULL, prot, NULL)== 0)
        {
        if (pbs_errno == PBSE_JOBEXIST && move_type == MOVE_TYPE_Exec)
          {
          /* already running, mark it so */
          log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_JOB, LOG_INFO, jobp->ji_qs.ji_jobid, "Mom reports job already running");
          goto ok_exit;
          }
        else if ((pbs_errno == PBSE_HOOKERROR) ||
                 (pbs_errno == PBSE_HOOK_REJECT) ||
                 (pbs_errno == PBSE_HOOK_REJECT_RERUNJOB) ||
                 (pbs_errno == PBSE_HOOK_REJECT_DELETEJOB))
          {
          /* a hook on the receiving side rejected the job: persist the
           * hook's reject message to a file for the parent, then exit
           * with a code that identifies the specific rejection */
          char name_buf[MAXPATHLEN+1];
          int rfd;
          int len;
          char *reject_msg;
          int err;

          err = pbs_errno;
          reject_msg = pbs_geterrmsg(con);

          (void)snprintf(log_buffer, sizeof(log_buffer), "send of job to %s failed error = %d reject_msg=%s", destin, err, reject_msg?reject_msg:"");
          log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, jobp->ji_qs.ji_jobid, log_buffer);

          (void)strcpy(name_buf, path_hooks_workdir);
          (void)strcat(name_buf, jobp->ji_qs.ji_jobid);
          (void)strcat(name_buf, HOOK_REJECT_SUFFIX);

          if ((reject_msg != NULL) && (reject_msg[0] != '\0'))
            {
            if ((rfd = open(name_buf, O_RDWR|O_CREAT|O_TRUNC, 0600)) == -1)
              {
              snprintf(log_buffer, sizeof(log_buffer), "open of reject file %s failed: errno %d", name_buf, errno);
              log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, jobp->ji_qs.ji_jobid, log_buffer);
              }
            else
              {
              secure_file(name_buf, "Administrators", READS_MASK|WRITES_MASK|STANDARD_RIGHTS_REQUIRED);
              setmode(rfd, O_BINARY);
              len = strlen(reject_msg)+1; /* write also trailing null char */
              if (write(rfd, reject_msg, len) != len)
                {
                snprintf(log_buffer, sizeof(log_buffer), "write to file %s incomplete: errno %d", name_buf, errno);
                log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, jobp->ji_qs.ji_jobid, log_buffer);
                }
              close(rfd);
              }
            }

          if (err == PBSE_HOOKERROR)
            exit(SEND_JOB_HOOKERR);
          if (err == PBSE_HOOK_REJECT)
            exit(SEND_JOB_HOOK_REJECT);
          if (err == PBSE_HOOK_REJECT_RERUNJOB)
            exit(SEND_JOB_HOOK_REJECT_RERUNJOB);
          if (err == PBSE_HOOK_REJECT_DELETEJOB)
            exit(SEND_JOB_HOOK_REJECT_DELETEJOB);
          }
        else
          {
          (void)sprintf(log_buffer, "send of job to %s failed error = %d", destin, pbs_errno);
          log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, jobp->ji_qs.ji_jobid, log_buffer);
          continue;
          }
        }

      if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT)
        {
        if (PBSD_jscript(con, script_name, prot, NULL) != 0)
          continue;
        }

      if (credlen > 0)
        {
        int ret;

        ret = PBSD_jcred(con, jobp->ji_extended.ji_ext.ji_credtype, credbuf, credlen, prot, NULL);

        if ((ret == 0) || (i == (RETRY - 1)))
          free(credbuf); /* free credbuf if credbuf is sent successfully OR */
                         /* at the end of all retry attempts */
        if (ret != 0)
          continue;
        }

      if ((move_type == MOVE_TYPE_Exec) &&
          (jobp->ji_qs.ji_svrflags & JOB_SVFLG_HASRUN) &&
          (hostaddr != pbs_server_addr))
        {
        /* send files created on prior run */
        if ((move_job_file(con, jobp, StdOut, prot) != 0) ||
            (move_job_file(con, jobp, StdErr, prot) != 0) ||
            (move_job_file(con, jobp, Chkpt, prot) != 0))
          continue;
        }

      jobp->ji_qs.ji_substate = JOB_SUBSTATE_TRNOUTCM;
      }

    if (PBSD_rdytocmt(con, job_id, prot, NULL) != 0)
      continue;

    if (PBSD_commit(con, job_id, prot, NULL) != 0)
      goto fatal_exit;

    goto ok_exit; /* This child process is all done */
    }

  /* fell out of the retry loop without success */
  if (con >= 0)
    svr_disconnect(con);

  /*
   * If connection is actively refused by the execution node(or mother superior) OR
   * the execution node(or mother superior) is rejecting request with error
   * PBSE_BADHOST(failing to authorize server host), the node should be marked down.
   */
  if ((move_type == MOVE_TYPE_Exec) &&
      (pbs_errno == WSAECONNREFUSED || pbs_errno == PBSE_BADHOST))
    {
    i = SEND_JOB_NODEDW;
    }
  else if (should_retry_route(pbs_errno) == -1)
    {
    i = SEND_JOB_FATAL;
    }
  else
    {
    i = SEND_JOB_RETRY;
    }

  (void)sprintf(log_buffer, "send_job failed with error %d", pbs_errno);
  log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_NOTICE, jobp->ji_qs.ji_jobid, log_buffer);
  log_close(0);
  net_close(-1);
  unlink(script_name);
  exit(i);

fatal_exit:
  if (con >= 0)
    svr_disconnect(con);
  log_close(0);
  net_close(-1);
  unlink(script_name);
  exit(SEND_JOB_FATAL);

ok_exit:
  if (con >= 0)
    svr_disconnect(con);
  log_close(0);
  net_close(-1);
  unlink(script_name);
  exit(SEND_JOB_OK);
  }
/*
 * svrcached - link the cached svrattrl encoding of an attribute into phead,
 * (re)building the cache first if it is missing or stale.
 *
 * Two caches exist per attribute: at_priv_encoded (privileged readers) and
 * at_user_encoded; resc_access_perm & PRIV_READ selects which one applies.
 * If ATR_VFLAG_MODCACHE is set the old cache is freed and, when the value
 * is set, re-encoded.  An unshared cache (refct < 2) is linked in directly;
 * a shared one is shallow-copied entry by entry so the same nodes are never
 * on two lists at once.
 *
 * Bug fix: the copy loop previously looped forever when malloc() failed,
 * because `working` only advanced inside the success branch.  We now stop
 * copying on allocation failure (remaining entries are simply not linked,
 * matching the prior loop's per-entry best-effort intent).
 */
static void svrcached(attribute *pat, pbs_list_head *phead, attribute_def *pdef)
  {
  svrattrl *working = NULL;
  svrattrl *wcopy;
  svrattrl *encoded;

  /* pick the cache that matches the caller's read privilege */
  if (resc_access_perm & PRIV_READ)
    encoded = pat->at_priv_encoded;
  else
    encoded = pat->at_user_encoded;

  if (pat->at_flags & ATR_VFLAG_MODCACHE)
    /* free old cache value if the value has changed */
    free_svrcache(pat);

  if ((encoded == NULL) || (pat->at_flags & ATR_VFLAG_MODCACHE))
    {
    if (pat->at_flags & ATR_VFLAG_SET)
      {
      /* encode and cache new svrattrl structure */
      (void)pdef->at_encode(pat, phead, pdef->at_name,
                            (char *)0, ATR_ENCODE_CLIENT, &working);

      if (resc_access_perm & PRIV_READ)
        pat->at_priv_encoded = working;
      else
        pat->at_user_encoded = working;

      pat->at_flags &= ~ATR_VFLAG_MODCACHE;

      /* the list is now referenced by both phead and the cache */
      while (working)
        {
        working->al_refct++; /* incr ref count */
        working = working->al_sister;
        }
      }
    }
  else
    {
    /* can use the existing cached svrattrl structure */
    working = encoded;

    if (working->al_refct < 2)
      {
      /* cache-only reference: safe to link the nodes in directly */
      while (working)
        {
        CLEAR_LINK(working->al_link);
        append_link(phead, &working->al_link, working);
        working->al_refct++; /* incr ref count */
        working = working->al_sister;
        }
      }
    else
      {
      /*
       * already linked in, must make a copy to link
       * NOTE: the copy points to the original's data
       * so it should be freed by itself, hence the
       * ref count is set to 1 and the sisters are not
       * linked in
       */
      while (working)
        {
        wcopy = malloc(sizeof(struct svrattrl));

        if (wcopy == NULL)
          break; /* allocation failed: stop copying (was an infinite loop) */

        *wcopy = *working;
        working = working->al_sister;
        CLEAR_LINK(wcopy->al_link);
        append_link(phead, &wcopy->al_link, wcopy);
        wcopy->al_refct = 1;
        wcopy->al_sister = NULL;
        }
      }
    }
  }
void req_quejob( struct batch_request *preq) /* ptr to the decoded request */ { char *id = "req_quejob"; char basename[PBS_JOBBASE + 1]; int created_here = 0; int index; char *jid; attribute_def *pdef; job *pj; svrattrl *psatl; int rc; int sock = preq->rq_conn; int IsCheckpoint = 0; /* set basic (user) level access permission */ resc_access_perm = ATR_DFLAG_USWR | ATR_DFLAG_Creat; if (PBSNodeCheckProlog) { check_state(1); mom_server_all_update_stat(); if (internal_state & INUSE_DOWN) { req_reject(PBSE_MOMREJECT,0,preq,NULL,NULL); return; } } if (preq->rq_fromsvr) { /* from another server - accept the extra attributes */ resc_access_perm |= ATR_DFLAG_MGWR | ATR_DFLAG_SvWR | ATR_DFLAG_MOM; jid = preq->rq_ind.rq_queuejob.rq_jid; } else { /* request must be from server */ log_err(errno, id, "request not from server"); req_reject(PBSE_IVALREQ, 0, preq, NULL, "request not received from server"); return; } /* does job already exist, check both old and new jobs */ if ((pj = find_job(jid)) == NULL) { pj = (job *)GET_NEXT(svr_newjobs); while (pj != NULL) { if (!strcmp(pj->ji_qs.ji_jobid, jid)) break; pj = (job *)GET_NEXT(pj->ji_alljobs); } } /* * New job ... * * for MOM - rather than make up a hashname, we use the name sent * to us by the server as an attribute. 
*/ psatl = (svrattrl *)GET_NEXT(preq->rq_ind.rq_queuejob.rq_attr); while (psatl != NULL) { if (!strcmp(psatl->al_name,ATTR_hashname)) { strcpy(basename,psatl->al_value); break; } psatl = (svrattrl *)GET_NEXT(psatl->al_link); } if (pj != NULL) { /* newly queued job already exists */ if (pj->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING) { /* FAILURE - job exists and is running */ log_err(errno,id,"cannot queue new job, job exists and is running"); req_reject(PBSE_JOBEXIST,0,preq,NULL,"job is running"); return; } /* if checkpointed, then keep old and skip rest of process */ if (pj->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) { IsCheckpoint = 1; } /* END if (pj->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) */ else { /* unlink job from svr_alljobs since it will be placed on newjobs */ delete_link(&pj->ji_alljobs); } } /* END if (pj != NULL) */ else { /* if not already here, allocate job struct */ if ((pj = job_alloc()) == NULL) { /* FAILURE */ req_reject(PBSE_SYSTEM, 0, preq, NULL, "cannot allocate new job structure"); return; } } /* END else (pj != NULL) */ if (IsCheckpoint == 0) { strcpy(pj->ji_qs.ji_jobid,jid); strcpy(pj->ji_qs.ji_fileprefix,basename); pj->ji_modified = 1; pj->ji_qs.ji_svrflags = created_here; pj->ji_qs.ji_un_type = JOB_UNION_TYPE_NEW; } /* decode attributes from request into job structure */ psatl = (svrattrl *)GET_NEXT(preq->rq_ind.rq_queuejob.rq_attr); while (psatl != NULL) { if (IsCheckpoint == 1) { if (strcmp(psatl->al_name,ATTR_checkpoint_name) && strcmp(psatl->al_name,ATTR_v)) { psatl = (svrattrl *)GET_NEXT(psatl->al_link); continue; } } /* identify the attribute by name */ index = find_attr(job_attr_def,psatl->al_name,JOB_ATR_LAST); if (index < 0) { /* FAILURE */ /* didn`t recognize the name */ job_purge(pj); /* CRI - 12/20/2004 */ reply_badattr(PBSE_NOATTR,1,psatl,preq); return; } pdef = &job_attr_def[index]; /* Is attribute not writeable by manager or by a server? 
*/ if ((pdef->at_flags & resc_access_perm) == 0) { /* FAILURE */ job_purge(pj); reply_badattr(PBSE_ATTRRO,1,psatl,preq); return; } /* decode attribute */ if (!strcmp(psatl->al_name,ATTR_v)) { rc = decode_arst_merge( &pj->ji_wattr[index], psatl->al_name, psatl->al_resc, psatl->al_value); } else { rc = pdef->at_decode( &pj->ji_wattr[index], psatl->al_name, psatl->al_resc, psatl->al_value); } if (rc != 0) { /* FAILURE */ /* all errors are fatal for MOM */ job_purge(pj); reply_badattr(rc,1,psatl,preq); return; } if (psatl->al_op == DFLT) { if (psatl->al_resc) { resource *presc; resource_def *prdef; prdef = find_resc_def(svr_resc_def,psatl->al_resc,svr_resc_size); if (prdef == NULL) { job_purge(pj); reply_badattr(rc,1,psatl, preq); return; } presc = find_resc_entry(&pj->ji_wattr[index],prdef); if (presc != NULL) presc->rs_value.at_flags |= ATR_VFLAG_DEFLT; } else { pj->ji_wattr[index].at_flags |= ATR_VFLAG_DEFLT; } } /* END if (psatl->al_op == DFLT) */ psatl = (svrattrl *)GET_NEXT(psatl->al_link); } /* END while (psatl != NULL) */ if (IsCheckpoint == 1) { pj->ji_qs.ji_substate = JOB_SUBSTATE_TRANSIN; if (reply_jobid(preq,pj->ji_qs.ji_jobid,BATCH_REPLY_CHOICE_Queue) == 0) { delete_link(&pj->ji_alljobs); append_link(&svr_newjobs,&pj->ji_alljobs,pj); pj->ji_qs.ji_un_type = JOB_UNION_TYPE_NEW; pj->ji_qs.ji_un.ji_newt.ji_fromsock = sock; pj->ji_qs.ji_un.ji_newt.ji_fromaddr = get_connectaddr(sock); pj->ji_qs.ji_un.ji_newt.ji_scriptsz = 0; /* Per Eric R., req_mvjobfile was giving error in open_std_file, showed up as fishy error message */ if (pj->ji_grpcache != NULL) { free(pj->ji_grpcache); pj->ji_grpcache = NULL; } } else { close_conn(sock); } /* SUCCESS */ return; } /* set remaining job structure elements */ pj->ji_qs.ji_state = JOB_STATE_TRANSIT; pj->ji_qs.ji_substate = JOB_SUBSTATE_TRANSIN; pj->ji_wattr[(int)JOB_ATR_mtime].at_val.at_long = (long)time_now; pj->ji_wattr[(int)JOB_ATR_mtime].at_flags |= ATR_VFLAG_SET; pj->ji_qs.ji_un_type = JOB_UNION_TYPE_NEW; 
pj->ji_qs.ji_un.ji_newt.ji_fromsock = sock; pj->ji_qs.ji_un.ji_newt.ji_fromaddr = get_connectaddr(sock); pj->ji_qs.ji_un.ji_newt.ji_scriptsz = 0; /* acknowledge the request with the job id */ if (reply_jobid(preq, pj->ji_qs.ji_jobid, BATCH_REPLY_CHOICE_Queue) != 0) { /* reply failed, purge the job and close the connection */ close_conn(sock); job_purge(pj); return; } /* link job into server's new jobs list request */ append_link(&svr_newjobs, &pj->ji_alljobs, pj); return; } /* END req_quejob() */
/*
 * send_job - fork a child to deliver a job to another server or to a MOM.
 *
 * The parent registers a WORK_Deferred_Child task (post_func fires when
 * the child exits) and returns 2 immediately.  The child encodes the
 * job's attributes according to move_type, then runs the queue-job /
 * script / credentials-file / ready-to-commit / commit protocol against
 * the destination, retrying up to RETRY times with exponential backoff.
 *
 * Child exit codes: 0 success, 1 permanent failure, 2 retryable failure,
 * 10 commit timed out (caller marks node down - see post_sendmom()).
 * Parent returns: 2 on successful fork, -1 on fork/signal/task failure.
 *
 * SIGCHLD is blocked across the fork so the work task is registered
 * before any child-exit can be delivered to the parent.
 */
int send_job(

  job       *jobp,
  pbs_net_t  hostaddr,   /* host address, host byte order */
  int        port,       /* service port, host byte order */
  int        move_type,  /* move, route, or execute */
  void (*post_func)(struct work_task *),  /* after move */
  void      *data)       /* ptr to optional batch_request to be put */
                         /* in the work task structure */

  {
  tlist_head attrl;

  enum conn_type cntype = ToServerDIS;

  int con;
  char *destin = jobp->ji_qs.ji_destin;
  int encode_type;
  int i;
  int NumRetries;

  char *id = "send_job";

  attribute *pattr;

  pid_t pid;

  struct attropl *pqjatr;      /* list (single) of attropl for quejob */

  char *safail = "sigaction failed\n";
  char *spfail = "sigprocmask failed\n";
  char script_name[MAXPATHLEN + 1];
  sigset_t child_set, all_set;

  struct sigaction child_action;

  struct work_task *ptask;

  mbool_t Timeout = FALSE;

  char *pc;

  sigemptyset(&child_set);
  sigaddset(&child_set, SIGCHLD);
  sigfillset(&all_set);

  /* block SIGCHLD until work task is established */
  if (sigprocmask(SIG_BLOCK, &child_set, NULL) == -1)
    {
    log_err(errno,id,spfail);

    pbs_errno = PBSE_SYSTEM;

    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      jobp->ji_qs.ji_jobid,
      "cannot set signal mask");

    return(-1);
    }

  if (LOGLEVEL >= 6)
    {
    sprintf(log_buffer,"about to send job - type=%d",
            move_type);

    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      jobp->ji_qs.ji_jobid,
      "forking in send_job");
    }

  pid = fork();

  if (pid == -1)
    {
    /* error on fork */

    log_err(errno, id, "fork failed\n");

    if (sigprocmask(SIG_UNBLOCK, &child_set, NULL) == -1)
      log_err(errno, id, spfail);

    pbs_errno = PBSE_SYSTEM;

    return(-1);
    }

  if (pid != 0)
    {
    /* The parent (main server) */

    /* create task to monitor job startup */

    /* CRI: need way to report to scheduler job is starting, not started */

    ptask = set_task(WORK_Deferred_Child, pid, post_func, jobp);

    if (ptask == NULL)
      {
      log_err(errno, id, msg_err_malloc);

      return(-1);
      }

    ptask->wt_parm2 = data;

    append_link(
      &((job *)jobp)->ji_svrtask,
      &ptask->wt_linkobj,
      ptask);

    /* now can unblock SIGCHLD */

    if (sigprocmask(SIG_UNBLOCK, &child_set, NULL) == -1)
      log_err(errno, id, spfail);

    if (LOGLEVEL >= 1)
      {
      extern long DispatchTime[];
      extern job *DispatchJob[];
      extern char *DispatchNode[];

      extern time_t time_now;

      struct pbsnode *NP;

      /* record job dispatch time in the first free table slot */

      int jindex;

      for (jindex = 0;jindex < 20;jindex++)
        {
        if (DispatchJob[jindex] == NULL)
          {
          DispatchTime[jindex] = time_now;

          DispatchJob[jindex] = jobp;

          if ((NP = PGetNodeFromAddr(hostaddr)) != NULL)
            DispatchNode[jindex] = NP->nd_name;
          else
            DispatchNode[jindex] = NULL;

          break;
          }
        }
      }

    /* SUCCESS */

    return(2);
    }  /* END if (pid != 0) */

  /*
   * the child process
   *
   * set up signal catcher for error return
   */

  rpp_terminate();

  child_action.sa_handler = net_move_die;

  sigfillset(&child_action.sa_mask);

  child_action.sa_flags = 0;

  if (sigaction(SIGHUP, &child_action, NULL))
    log_err(errno, id, safail);

  if (sigaction(SIGINT, &child_action, NULL))
    log_err(errno, id, safail);

  if (sigaction(SIGQUIT, &child_action, NULL))
    log_err(errno, id, safail);

  /* signal handling is set, now unblock */

  if (sigprocmask(SIG_UNBLOCK, &child_set, NULL) == -1)
    log_err(errno, id, spfail);

  /* encode job attributes to be moved */

  CLEAR_HEAD(attrl);

  /* select attributes/resources to send based on move type */

  if (move_type == MOVE_TYPE_Exec)
    {
    /* moving job to MOM - ie job start */

    resc_access_perm = ATR_DFLAG_MOM;
    encode_type = ATR_ENCODE_MOM;
    cntype = ToServerDIS;
    }
  else
    {
    /* moving job to alternate server? */

    resc_access_perm =
      ATR_DFLAG_USWR |
      ATR_DFLAG_OPWR |
      ATR_DFLAG_MGWR |
      ATR_DFLAG_SvRD;

    encode_type = ATR_ENCODE_SVR;

    /* clear default resource settings */

    svr_dequejob(jobp);
    }

  pattr = jobp->ji_wattr;

  /* encode every attribute the destination may see; session_id is also
   * sent when the job has a checkpoint name set */
  for (i = 0;i < JOB_ATR_LAST;i++)
    {
    if (((job_attr_def + i)->at_flags & resc_access_perm) ||
        ((strncmp((job_attr_def + i)->at_name,"session_id",10) == 0) &&
         (jobp->ji_wattr[JOB_ATR_checkpoint_name].at_flags & ATR_VFLAG_SET)))
      {
      (job_attr_def + i)->at_encode(
        pattr + i,
        &attrl,
        (job_attr_def + i)->at_name,
        NULL,
        encode_type);
      }
    }    /* END for (i) */

  attrl_fixlink(&attrl);

  /* put together the job script file name */

  strcpy(script_name, path_jobs);

  if (jobp->ji_wattr[JOB_ATR_job_array_request].at_flags & ATR_VFLAG_SET)
    {
    strcat(script_name, jobp->ji_arraystruct->ai_qs.fileprefix);
    }
  else
    {
    strcat(script_name, jobp->ji_qs.ji_fileprefix);
    }

  strcat(script_name, JOB_SCRIPT_SUFFIX);

  pbs_errno = 0;
  con = -1;

  for (NumRetries = 0;NumRetries < RETRY;NumRetries++)
    {
    int rc;

    /* connect to receiving server with retries */

    if (NumRetries > 0)
      {
      /* recycle after an error */

      if (con >= 0)
        svr_disconnect(con);

      /* check pbs_errno from previous attempt */

      if (should_retry_route(pbs_errno) == -1)
        {
        sprintf(log_buffer, "child failed in previous commit request for job %s",
                jobp->ji_qs.ji_jobid);

        log_err(pbs_errno, id, log_buffer);

        exit(1); /* fatal error, don't retry */
        }

      sleep(1 << NumRetries);
      }

    /* NOTE: on node hangs, svr_connect is successful */

    if ((con = svr_connect(hostaddr, port, 0, cntype)) == PBS_NET_RC_FATAL)
      {
      sprintf(log_buffer, "send_job failed to %lx port %d",
              hostaddr,
              port);

      log_err(pbs_errno, id, log_buffer);

      exit(1);
      }

    if (con == PBS_NET_RC_RETRY)
      {
      pbs_errno = 0; /* should retry */

      continue;
      }

    /*
     * if the job is substate JOB_SUBSTATE_TRNOUTCM which means
     * we are recovering after being down or a late failure, we
     * just want to send the "ready-to-commit/commit"
     */

    if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUTCM)
      {
      if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUT)
        {
        jobp->ji_qs.ji_substate = JOB_SUBSTATE_TRNOUT;

        job_save(jobp, SAVEJOB_QUICK, 0);
        }

      pqjatr = &((svrattrl *)GET_NEXT(attrl))->al_atopl;

      if ((pc = PBSD_queuejob(
                  con,
                  jobp->ji_qs.ji_jobid,
                  destin,
                  pqjatr,
                  NULL)) == NULL)
        {
        if ((pbs_errno == PBSE_EXPIRED) || (pbs_errno == PBSE_READ_REPLY_TIMEOUT))
          {
          /* queue job timeout based on pbs_tcp_timeout */

          Timeout = TRUE;
          }

        if ((pbs_errno == PBSE_JOBEXIST) && (move_type == MOVE_TYPE_Exec))
          {
          /* already running, mark it so */

          log_event(
            PBSEVENT_ERROR,
            PBS_EVENTCLASS_JOB,
            jobp->ji_qs.ji_jobid,
            "MOM reports job already running");

          exit(0);
          }

        sprintf(log_buffer, "send of job to %s failed error = %d",
                destin,
                pbs_errno);

        log_event(
          PBSEVENT_JOB,
          PBS_EVENTCLASS_JOB,
          jobp->ji_qs.ji_jobid,
          log_buffer);

        continue;
        }  /* END if ((pc = PBSD_queuejob() == NULL) */

      free(pc);

      if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT)
        {
        if (PBSD_jscript(con, script_name, jobp->ji_qs.ji_jobid) != 0)
          continue;
        }

      /* XXX may need to change the logic below, if we are sending the job to
         a mom on the same host and the mom and server are not sharing the same
         spool directory, then we still need to move the file */

      if ((move_type == MOVE_TYPE_Exec) &&
          (jobp->ji_qs.ji_svrflags & JOB_SVFLG_HASRUN) &&
          (hostaddr != pbs_server_addr))
        {
        /* send files created on prior run */

        if ((move_job_file(con,jobp,StdOut) != 0) ||
            (move_job_file(con,jobp,StdErr) != 0) ||
            (move_job_file(con,jobp,Checkpoint) != 0))
          {
          continue;
          }
        }

      /* ignore signals */

      if (sigprocmask(SIG_BLOCK, &all_set, NULL) == -1)
        log_err(errno, id, "sigprocmask\n");

      jobp->ji_qs.ji_substate = JOB_SUBSTATE_TRNOUTCM;

      job_save(jobp, SAVEJOB_QUICK, 0);
      }
    else
      {
      /* ignore signals */

      if (sigprocmask(SIG_BLOCK, &all_set, NULL) == -1)
        log_err(errno, id, "sigprocmask\n");
      }

    if (PBSD_rdytocmt(con, jobp->ji_qs.ji_jobid) != 0)
      {
      if (sigprocmask(SIG_UNBLOCK, &all_set, NULL) == -1)
        log_err(errno, id, "sigprocmask\n");

      continue;
      }

    if ((rc = PBSD_commit(con, jobp->ji_qs.ji_jobid)) != 0)
      {
      int errno2;

      /* NOTE: errno is modified by log_err */

      errno2 = errno;

      sprintf(log_buffer, "send_job commit failed, rc=%d (%s)",
              rc,
              (connection[con].ch_errtxt != NULL) ? connection[con].ch_errtxt : "N/A");

      log_ext(errno2, id, log_buffer, LOG_WARNING);

      /* if failure occurs, pbs_mom should purge job and pbs_server should set
       * job state to idle w/error msg */

      if (errno2 == EINPROGRESS)
        {
        /* request is still being processed */

        /* increase tcp_timeout in qmgr? */

        Timeout = TRUE;

        /* do we need a continue here? */

        sprintf(log_buffer, "child commit request timed-out for job %s, increase tcp_timeout?",
                jobp->ji_qs.ji_jobid);

        log_ext(errno2, id, log_buffer, LOG_WARNING);

        /* don't retry on timeout--break out and report error! */

        break;
        }
      else
        {
        sprintf(log_buffer, "child failed in commit request for job %s",
                jobp->ji_qs.ji_jobid);

        log_ext(errno2, id, log_buffer, LOG_CRIT);

        /* FAILURE */

        exit(1);
        }
      }  /* END if ((rc = PBSD_commit(con,jobp->ji_qs.ji_jobid)) != 0) */

    svr_disconnect(con);

    /* child process is done */

    /* SUCCESS */

    exit(0);
    }  /* END for (NumRetries) */

  if (con >= 0)
    svr_disconnect(con);

  if (Timeout == TRUE)
    {
    /* 10 indicates that job migrate timed out, server will mark node down
     * and abort the job - see post_sendmom() */

    sprintf(log_buffer, "child timed-out attempting to start job %s",
            jobp->ji_qs.ji_jobid);

    log_ext(pbs_errno, id, log_buffer, LOG_WARNING);

    exit(10);
    }

  if (should_retry_route(pbs_errno) == -1)
    {
    sprintf(log_buffer, "child failed and will not retry job %s",
            jobp->ji_qs.ji_jobid);

    log_err(pbs_errno, id, log_buffer);

    exit(1);
    }

  exit(2);

  /*NOTREACHED*/

  return(0);
  }  /* END send_job() */
/*
 * req_register - handle a Register Dependent batch request from another
 * server.
 *
 * Operates on the "parent" job named in the request.  rq_op selects the
 * action: REGISTER records a dependency on the parent (with ownership
 * checks for before-* types), RELEASE removes a satisfied before-*
 * dependency and may release the job's hold, READY/DELETE/UNREG handle
 * synccount acknowledgement, dependency-triggered abort, and
 * unregistration respectively.  On success the job is saved if modified;
 * on any failure rc is set and the request is rejected.
 *
 * NOTE(review): the #ifdef NAS (localmod 109) sections intentionally
 * split brace structure across preprocessor branches in the RELEASE
 * case - both NAS and non-NAS expansions balance, but edit with care.
 */
void req_register(struct batch_request *preq)
  {
  int    made;
  attribute  *pattr;

  struct depend  *pdep;

  struct depend_job *pdj;
  job    *pjob;
  char    *ps;

  struct work_task *ptask;
  int    rc = 0;
  int    revtype;
  int    type;
  int    savetype = SAVEJOB_FULL;

  /*  make sure request is from a server */

  if (!preq->rq_fromsvr)
    {
#ifdef NAS /* localmod 109 */
    sprintf(log_buffer, "Dependency request not from server");
    log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_INFO,
              preq->rq_ind.rq_register.rq_parent, log_buffer);
#endif /* localmod 109 */
    req_reject(PBSE_IVALREQ, 0, preq);
    return;
    }

  /* find the "parent" job specified in the request */

  if ((pjob = find_job(preq->rq_ind.rq_register.rq_parent)) == NULL)
    {
    /*
     * job not found... if server is initializing, it may not
     * yet recovered, that is not an error.
     */

    if (server.sv_attr[(int)SRV_ATR_State].at_val.at_long != SV_STATE_INIT)
      {
      log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_INFO,
                preq->rq_ind.rq_register.rq_parent, msg_unkjobid);
      req_reject(PBSE_UNKJOBID, 0, preq);
      }
    else
      {
      reply_ack(preq);
      }

    return;
    }

  pattr = &pjob->ji_wattr[(int)JOB_ATR_depend];

  type = preq->rq_ind.rq_register.rq_dependtype;

  pjob->ji_modified = 1;

  /* more of the server:port fix kludge: split "child@server:port" into
   * the child id and the originating server name */

  ps = strchr(preq->rq_ind.rq_register.rq_child, (int)'@');

  if (ps != NULL)
    {
    (void)strcpy(preq->rq_ind.rq_register.rq_svr, ps+1);
    *ps = '\0';
    }
  else
    {
    (void)strcpy(preq->rq_ind.rq_register.rq_svr, preq->rq_host);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_MOVED)
    {
    snprintf(log_buffer, sizeof(log_buffer), "Parent %s%s", msg_movejob,
             pjob->ji_qs.ji_destin);
    log_event(PBSEVENT_DEBUG|PBSEVENT_SYSTEM|PBSEVENT_ERROR,
              PBS_EVENTCLASS_REQUEST, LOG_INFO,
              preq->rq_ind.rq_register.rq_child, log_buffer);
    req_reject(PBSE_JOB_MOVED, 0, preq);
    return;
    }

  switch (preq->rq_ind.rq_register.rq_op)
    {

      /*
       * Register a dependency
       */

    case JOB_DEPEND_OP_REGISTER:

      switch (type)
        {

        case JOB_DEPEND_TYPE_AFTERSTART:

          if (pjob->ji_qs.ji_substate >= JOB_SUBSTATE_RUNNING)
            {
            /* job already running, setup task to send */
            /* release back to child and continue with */
            /* registration process   */

            ptask = set_task(WORK_Immed, 0, post_run_depend, (void *)pjob);

            if (ptask)
              append_link(&pjob->ji_svrtask, &ptask->wt_linkobj, ptask);
            }

          /* fall through to complete registration */

        case JOB_DEPEND_TYPE_AFTERANY:
        case JOB_DEPEND_TYPE_AFTEROK:
        case JOB_DEPEND_TYPE_AFTERNOTOK:

          rc = register_dep(pattr, preq, type, &made);

          break;

        case JOB_DEPEND_TYPE_BEFORESTART:
        case JOB_DEPEND_TYPE_BEFOREANY:
        case JOB_DEPEND_TYPE_BEFOREOK:
        case JOB_DEPEND_TYPE_BEFORENOTOK:

          /*
           * Check job owner for permission, use the real
           * job owner, not the sending server's name.
           */

          (void)strcpy(preq->rq_user, preq->rq_ind.rq_register.rq_owner);

          if (svr_chk_owner(preq, pjob))
            {
            rc = PBSE_PERM;  /* not same user */
            }
          else
            {
            /* ok owner, see if job has "on" */

            pdep = find_depend(JOB_DEPEND_TYPE_ON, pattr);

            if (pdep == 0)
              {
              /* on "on", see if child already registered */

              revtype = type ^(JOB_DEPEND_TYPE_BEFORESTART - JOB_DEPEND_TYPE_AFTERSTART);

              pdep = find_depend(revtype, pattr);

              if (pdep == 0)
                {
                /* no "on" and no prior - return error */

                rc = PBSE_BADDEPEND;
                }
              else
                {
                pdj = find_dependjob(pdep, preq->rq_ind.rq_register.rq_child);

                if (pdj)
                  {
                  /* has prior register, update it */

                  (void)strcpy(pdj->dc_svr, preq->rq_ind.rq_register.rq_svr);
                  }
                }
              }
            else if ((rc=register_dep(pattr, preq, type, &made)) == 0)
              {
              if (made)
                {
                /* first time registered */

                if (--pdep->dp_numexp <= 0)
                  del_depend(pdep);
                }
              }
            }

          break;

        default:

#ifdef NAS /* localmod 109 */
          sprintf(log_buffer, "Unknown dep. op: %d",
                  preq->rq_ind.rq_register.rq_op);
          log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_INFO,
                    preq->rq_ind.rq_register.rq_parent, log_buffer);
#endif /* localmod 109 */
          rc = PBSE_IVALREQ;

          break;
        }

      break;

      /*
       * Release a dependency so job might run
       */

    case JOB_DEPEND_OP_RELEASE:

      switch (type)
        {

        case JOB_DEPEND_TYPE_BEFORESTART:
        case JOB_DEPEND_TYPE_BEFOREANY:
        case JOB_DEPEND_TYPE_BEFOREOK:
        case JOB_DEPEND_TYPE_BEFORENOTOK:

          /* predecessor sent release-reduce "on", */
          /* see if this job can now run   */

          type ^= (JOB_DEPEND_TYPE_BEFORESTART - JOB_DEPEND_TYPE_AFTERSTART);

          if ((pdep = find_depend(type, pattr)) != NULL)
            {
            pdj = find_dependjob(pdep, preq->rq_ind.rq_register.rq_child);

            if (pdj)
              {
              del_depend_job(pdj);
              pattr->at_flags |= ATR_VFLAG_MODIFY | ATR_VFLAG_MODCACHE;

              /* "before" deps must be freshly saved to disk */
              savetype = SAVEJOB_FULLFORCE;

              (void)sprintf(log_buffer, msg_registerrel,
                            preq->rq_ind.rq_register.rq_child);
              log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO,
                        pjob->ji_qs.ji_jobid, log_buffer);

              if (GET_NEXT(pdep->dp_jobs) == 0)
                {
                /* no more dependencies of this type */

                del_depend(pdep);

                set_depend_hold(pjob, pattr);
                }

              break;
              }
#ifdef NAS /* localmod 109 */
            sprintf(log_buffer, "Dep.rls. job not found: %d/%s",
                    type, preq->rq_ind.rq_register.rq_child);
            }
          else
            {
            sprintf(log_buffer, "Dep.rls. type not found: %d", type);
#endif /* localmod 109 */
            }

#ifdef NAS /* localmod 109 */
          log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_INFO,
                    preq->rq_ind.rq_register.rq_parent, log_buffer);
#endif /* localmod 109 */
          rc = PBSE_IVALREQ;

          break;
        }

      break;

    case JOB_DEPEND_OP_READY:

      rc = PBSE_NOSYNCMSTR;

      break;

    case JOB_DEPEND_OP_DELETE:

      /* the child failed its dependency: abort this job */

      (void)sprintf(log_buffer, msg_registerdel,
                    preq->rq_ind.rq_register.rq_child);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO,
                pjob->ji_qs.ji_jobid, log_buffer);

      job_abt(pjob, log_buffer);

      break;

    case JOB_DEPEND_OP_UNREG:

      unregister_dep(pattr, preq);

      set_depend_hold(pjob, pattr);

      break;

    default:

      sprintf(log_buffer, msg_illregister,
              preq->rq_ind.rq_register.rq_parent);
      log_event(PBSEVENT_DEBUG|PBSEVENT_SYSTEM|PBSEVENT_ERROR,
                PBS_EVENTCLASS_REQUEST, LOG_INFO,
                preq->rq_host, log_buffer);

      rc = PBSE_IVALREQ;

      break;;
    }

  if (rc)
    {
    pjob->ji_modified = 0;

    req_reject(rc, 0, preq);
    }
  else
    {
    /* If this is an array job, forcibly save it to ensure
     * dependencies are recorded. */

    if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_ArrayJob)
      savetype = SAVEJOB_FULLFORCE;

    if (pjob->ji_modified)
      (void)job_save(pjob, savetype);

    reply_ack(preq);
    }

  return;
  }
/*
 * reply_send - dispatch a batch reply to its requester, local or remote.
 *
 * If the request came over the pseudo-connection PBS_LOCAL_CONNECTION,
 * the matching WORK_Deferred_Local work task is moved to the immediate
 * task list so the local waiter is dispatched; no wire traffic occurs.
 * Otherwise the reply is DIS-encoded onto the remote client's socket.
 *
 * The request is freed here (see the "I (freed)" annotation) except, on
 * the server side, for PBS_BATCH_AsyModifyJob requests that still expect
 * a reply - those stay alive for the async completion path.
 *
 * Returns 0 on success (including the local hand-off case), PBSE_SYSTEM
 * if no matching local work task was found, or the dis_reply_write()
 * result for remote replies.
 */
int reply_send(

  struct batch_request *request) /* I (freed) */

  {
  int rc = 0;
  int sfds = request->rq_conn;  /* socket */

#ifndef PBS_MOM
  static char *id = "reply_send";

  struct work_task *ptask;
#endif /* PBS_MOM */

  /* determine where the reply should go, remote or local */

  if (sfds == PBS_LOCAL_CONNECTION)
    {
#ifndef PBS_MOM
    /*
     * reply stays local, find work task and move it to
     * the immediate list for dispatching.
     */

    ptask = (struct work_task *)GET_NEXT(task_list_event);

    while (ptask != NULL)
      {
      if ((ptask->wt_type == WORK_Deferred_Local) &&
          (ptask->wt_parm1 == (void *)request))
        {
        /* found the waiter - promote it and hand off ownership of the
         * request to the dispatched task (do NOT free it here) */
        delete_link(&ptask->wt_linkall);

        append_link(&task_list_immed, &ptask->wt_linkall, ptask);

        return(0);
        }

      ptask = (struct work_task *)GET_NEXT(ptask->wt_linkall);
      }

    /* should have found a task and didn't */

    log_err(-1, id, "did not find work task for local request");
#endif /* PBS_MOM */

    rc = PBSE_SYSTEM;
    }
  else if (sfds >= 0)
    {
    /* Otherwise, the reply is to be sent to a remote client */

#ifndef PBS_MOM
    if (request->rq_noreply != TRUE)
      {
#endif
      rc = dis_reply_write(sfds, &request->rq_reply);

#ifndef PBS_MOM
      if (LOGLEVEL >= 7)
        {
        sprintf(log_buffer, "Reply sent for request type %s on socket %d",
          reqtype_to_txt(request->rq_type),
          sfds);

        log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, id, log_buffer);
        }

      /* NOTE(review): this inner #ifndef is nested inside an already-open
       * #ifndef PBS_MOM region, so it is redundant; the brace it guards
       * closes the rq_noreply block above - confirm before simplifying */
#ifndef PBS_MOM
      }
#endif
#endif
    }

#ifndef PBS_MOM
  /* async-modify requests that still owe a reply must stay allocated */
  if ((request->rq_type != PBS_BATCH_AsyModifyJob) ||
      (request->rq_noreply == TRUE))
    {
#endif
    free_br(request);
#ifndef PBS_MOM
    }
#endif

  return(rc);
  }  /* END reply_send() */
/*
 * encode_jobs - encode the jobs executing on a node as a single string
 * attribute of comma-separated "<subnode-index>/<jobid>" entries.
 *
 * Returns -1 if pattr is NULL, 0 on success or when there is nothing to
 * report, and -(PBSE_SYSTEM) on allocation failure.
 */
int encode_jobs(

  pbs_attribute *pattr, /* struct pbs_attribute being encoded */
  tlist_head    *ph,    /* head of a list of "svrattrl" structs which are to be returned */
  char          *aname, /* pbs_attribute's name */
  char          *rname, /* resource's name (null if none) */
  int            mode,  /* mode code, unused here */
  int            perm)  /* only used for resources */

  {
  svrattrl       *pal;

  struct jobinfo *jip;

  struct pbsnode *pnode;

  struct pbssubn *psubn;
  int             i;
  int             jobcnt;       /* number of jobs using the node */
  int             strsize;      /* computed string size */
  char           *job_str;      /* holds comma separated list of jobs */
  int             job_str_len = 0;

  if (pattr == NULL)
    {
    /* FAILURE */
    return(-1);
    }

  if (!(pattr->at_flags & ATR_VFLAG_SET) || !pattr->at_val.at_jinfo)
    {
    return(0);  /* nothing to report back */
    }

  /* count the jobs and estimate the string buffer size required */

  jobcnt = 0;
  strsize = 1;  /* allow for terminating null char */

  pnode = pattr->at_val.at_jinfo;

  for (psubn = pnode->nd_psn; psubn != NULL; psubn = psubn->next)
    {
    for (jip = psubn->jobs; jip != NULL; jip = jip->next)
      {
      jobcnt++;

      strsize += strlen(jip->jobid) + PCONST_ENCOVERHEAD;
      }
    }  /* END for (psubn) */

  if (jobcnt == 0)
    {
    /* no jobs currently on this node */
    return(0);
    }

  if ((job_str = (char *)calloc(1, strsize + 1)) == NULL)
    {
    return(-(PBSE_SYSTEM));
    }

  i = 0;

  for (psubn = pnode->nd_psn; psubn != NULL; psubn = psubn->next)
    {
    for (jip = psubn->jobs; jip != NULL; jip = jip->next)
      {
      /* separate entries after the first one */
      if (i != 0)
        strcat(job_str, ", ");
      else
        i++;

      sprintf(job_str + strlen(job_str), "%d/%s",
        psubn->index,
        jip->jobid);
      }
    }  /* END for (psubn) */

  job_str_len = strlen(job_str);

  pal = attrlist_create(aname, rname, job_str_len + 1);

  if (pal == NULL)
    {
    free(job_str);

    return(-(PBSE_SYSTEM));
    }

  /* copy the terminating NUL as well - the previous strncpy() of only
   * job_str_len bytes left the value unterminated unless the target
   * memory happened to be zeroed */
  memcpy(pal->al_value, job_str, job_str_len + 1);

  pal->al_flags = ATR_VFLAG_SET;

  free(job_str);

  append_link(ph, &pal->al_link, pal);

  return(0);  /* success */
  }  /* END encode_jobs() */
/*
 * encode_time - encode a long-valued time attribute as "hh:mm:ss" and
 * append the result to the attrlist.
 *
 * Returns -1 on bad arguments or allocation failure, 0 if the attribute
 * is not set (nothing encoded), and 1 on success.
 */
int encode_time(

  pbs_attribute *attr,   /* ptr to pbs_attribute (value in attr->at_val.at_long) */
  tlist_head    *phead,  /* head of attrlist list (optional) */
  const char    *atname, /* pbs_attribute name */
  const char    *rsname, /* resource name (optional) */
  int            mode,   /* encode mode (not used) */
  int            perm)   /* only used for resources */

  {
  size_t    ct;
  char      cvnbuf[ENCODE_TIME_SIZE];
  int       hr;
  int       min;
  long      n;
  svrattrl *pal;
  int       sec;

  if ((attr == NULL) || (phead == NULL))
    {
    /* FAILURE */
    return(-1);
    }

  if (!(attr->at_flags & ATR_VFLAG_SET))
    {
    return(0);
    }

  /* break the second count into hours, minutes and seconds */

  n   = attr->at_val.at_long;
  hr  = n / 3600;
  n   = n % 3600;
  min = n / 60;
  n   = n % 60;
  sec = n;

  /* snprintf bounds the write; the hour field may exceed two digits for
   * very long times and must not overrun the buffer */
  snprintf(cvnbuf, sizeof(cvnbuf), "%02d:%02d:%02d",
    hr,
    min,
    sec);

  ct = strlen(cvnbuf);

  pal = attrlist_create(atname, rsname, ct + 1);

  if (pal == NULL)
    {
    /* FAILURE */
    return(-1);
    }

  /* copy ct+1 bytes so the value carries its own NUL terminator
   * (previously only ct bytes were copied) */
  memcpy(pal->al_value, cvnbuf, ct + 1);

  pal->al_flags = attr->at_flags;

  append_link(phead, &pal->al_link, pal);

  /* SUCCESS */

  return(1);
  }  /* END encode_time() */
/*
 * status_node - build the status reply entry for a single node.
 *
 * Allocates a brp_status header, links it onto the caller's reply list,
 * and fills it with the node attributes the client asked for.
 *
 * Returns PBSE_PERM if the requester lacks read access, PBSE_SYSTEM on
 * allocation failure, otherwise the status_nodeattrib() result.
 */
int status_node(

  struct pbsnode       *pnode,   /* ptr to node receiving status query */
  struct batch_request *preq,
  int                  *bad,     /* O */
  tlist_head           *pstathd) /* head of list to append status to */

  {
  struct brp_status *node_status;
  svrattrl          *attr_request;

  /* the requester must hold read access */

  if ((preq->rq_perm & ATR_DFLAG_RDACC) == 0)
    return(PBSE_PERM);

  /* allocate the status header for this node */

  node_status = (struct brp_status *)calloc(1, sizeof(struct brp_status));

  if (node_status == NULL)
    return(PBSE_SYSTEM);

  node_status->brp_objtype = MGR_OBJ_NODE;

  strncpy(node_status->brp_objname, pnode->nd_name, sizeof(node_status->brp_objname) - 1);

  CLEAR_LINK(node_status->brp_stlink);
  CLEAR_HEAD(node_status->brp_attr);

  /* chain this new brp_status structure onto the reply list */

  append_link(pstathd, &node_status->brp_stlink, node_status);

  *bad = 0;

  /* pick up the client's requested attribute list, if one was given;
   * the per-attribute status results hang off brp_attr */

  attr_request = NULL;

  if (preq->rq_ind.rq_status.rq_attr.ll_struct != NULL)
    attr_request = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

  return(status_nodeattrib(
           attr_request,
           node_attr_def,
           pnode,
           ND_ATR_LAST,
           preq->rq_perm,
           &node_status->brp_attr,
           bad));
  }  /* END status_node() */
/* array_recov reads in an array struct saved to disk and inserts it into
   the server's list of arrays.

   Returns PBSE_NONE on success, PBSE_SYSTEM on open/read/allocation
   failure, or the array_upgrade() error code. On success *new_pa receives
   the recovered array, which has been linked into svr_jobarrays. */
int array_recov(

  char       *path,    /* I - path of the saved array file */
  job_array **new_pa)  /* O - receives the recovered array */

  {
  extern tlist_head svr_jobarrays;

  job_array          *pa;
  array_request_node *rn;
  int                 fd;
  int                 old_version;
  int                 num_tokens;
  int                 i;
  int                 len;
  int                 rc;

  old_version = ARRAY_QS_STRUCT_VERSION;

  /* allocate the storage for the struct */

  pa = (job_array *)calloc(1, sizeof(job_array));

  if (pa == NULL)
    {
    return PBSE_SYSTEM;
    }

  /* initialize the linked list nodes */

  CLEAR_LINK(pa->all_arrays);
  CLEAR_HEAD(pa->request_tokens);

  fd = open(path, O_RDONLY, 0);

  /* the descriptor was previously used unchecked; a missing or
   * unreadable file made the read() below fail obscurely */
  if (fd < 0)
    {
    sprintf(log_buffer, "cannot open %s", path);
    log_err(errno, "array_recov", log_buffer);

    free(pa);

    return PBSE_SYSTEM;
    }

  if (array_259_upgrade)
    {
    rc = read_and_convert_259_array(fd, pa, path);

    if (rc != PBSE_NONE)
      {
      free(pa);
      close(fd);

      return rc;
      }
    }
  else
    {
    /* read the file into the struct previously allocated. */

    len = read(fd, &(pa->ai_qs), sizeof(pa->ai_qs));

    if ((len < 0) ||
        ((len < (int)sizeof(pa->ai_qs)) &&
         (pa->ai_qs.struct_version == ARRAY_QS_STRUCT_VERSION)))
      {
      sprintf(log_buffer, "error reading %s", path);
      log_err(errno, "array_recov", log_buffer);

      free(pa);
      close(fd);

      return PBSE_SYSTEM;
      }

    if (pa->ai_qs.struct_version != ARRAY_QS_STRUCT_VERSION)
      {
      rc = array_upgrade(pa, fd, pa->ai_qs.struct_version, &old_version);

      if (rc)
        {
        sprintf(log_buffer, "Cannot upgrade array version %d to %d",
          pa->ai_qs.struct_version,
          ARRAY_QS_STRUCT_VERSION);
        log_err(errno, "array_recov", log_buffer);

        free(pa);
        close(fd);

        return rc;
        }
      }
    }

  /* calloc zero-fills the job pointer table and, unlike the original
   * unchecked malloc/memset pair, the result is verified */

  pa->jobs = (job **)calloc(pa->ai_qs.array_size, sizeof(job *));

  if (pa->jobs == NULL)
    {
    free(pa);
    close(fd);

    return PBSE_SYSTEM;
    }

  /* check to see if there is any additional info saved in the array file */
  /* check if there are any array request tokens that haven't been fully
     processed */

  if (old_version > 1)
    {
    if (read(fd, &num_tokens, sizeof(int)) != sizeof(int))
      {
      sprintf(log_buffer, "error reading token count from %s", path);
      log_err(errno, "array_recov", log_buffer);

      free(pa->jobs);
      free(pa);
      close(fd);

      return PBSE_SYSTEM;
      }

    for (i = 0; i < num_tokens; i++)
      {
      rn = (array_request_node *)malloc(sizeof(array_request_node));

      if ((rn == NULL) ||
          (read(fd, rn, sizeof(array_request_node)) != sizeof(array_request_node)))
        {
        sprintf(log_buffer, "error reading array_request_node from %s", path);
        log_err(errno, "array_recov", log_buffer);

        free(rn);  /* free(NULL) is a no-op */

        /* drain any tokens already queued */

        for (rn = (array_request_node *)GET_NEXT(pa->request_tokens);
             rn != NULL;
             rn = (array_request_node *)GET_NEXT(pa->request_tokens))
          {
          delete_link(&rn->request_tokens_link);
          free(rn);
          }

        free(pa->jobs);
        free(pa);
        close(fd);

        return PBSE_SYSTEM;
        }

      CLEAR_LINK(rn->request_tokens_link);

      append_link(&pa->request_tokens, &rn->request_tokens_link, (void *)rn);
      }
    }

  close(fd);

  CLEAR_HEAD(pa->ai_qs.deps);

  if (old_version != ARRAY_QS_STRUCT_VERSION)
    {
    /* resave the array struct if the version on disk is older than the
       current */
    array_save(pa);
    }

  /* link the struct into the server's list of job arrays */

  append_link(&svr_jobarrays, &pa->all_arrays, (void *)pa);

  *new_pa = pa;

  return PBSE_NONE;
  }  /* END array_recov() */
/*
 * req_stat_job_step2 - second phase of job status handling.
 *
 * Depending on cntl->sc_type this statuses one job, a queue's jobs, a
 * job array's members, array summaries, or all jobs. When the server
 * attribute poll_jobs is unset, running jobs whose MOM data is older
 * than JobStatRate are first refreshed via stat_to_mom() (this function
 * then returns and is re-entered when the MOM replies). Finally it walks
 * the selected jobs, appends a status entry per job to the reply, and
 * sends the reply. The cntl structure is freed on return (see I/O note).
 */
static void req_stat_job_step2(

  struct stat_cntl *cntl) /* I/O (free'd on return) */

  {
  svrattrl             *pal;
  job                  *pjob = NULL;

  struct batch_request *preq;

  struct batch_reply   *preply;
  int                   rc = 0;

  enum TJobStatTypeEnum type;
  pbs_queue            *pque = NULL;
  int                   exec_only = 0;

  int                   bad = 0;
  long                  DTime;  /* delta time - only report full pbs_attribute list if J->MTime > DTime */
  /* dpal persists across calls: a cached "condensed" attribute list */
  static svrattrl      *dpal = NULL;
  int                   job_array_index = 0;
  job_array            *pa = NULL;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  int                   iter;
  time_t                time_now = time(NULL);
  long                  poll_jobs = 0;
  char                  job_id[PBS_MAXSVRJOBID+1];
  int                   job_substate = -1;
  time_t                job_momstattime = -1;

  preq = cntl->sc_origrq;
  type = (enum TJobStatTypeEnum)cntl->sc_type;
  preply = &preq->rq_reply;

  /* See pbs_server_attributes(1B) for details on "poll_jobs" behaviour */

  if (dpal == NULL)
    {
    /* build 'delta' pbs_attribute list */

    svrattrl  *tpal;
    tlist_head dalist;
    int        aindex;

    int atrlist[] = { JOB_ATR_jobname, JOB_ATR_resc_used, JOB_ATR_LAST };

    CLEAR_LINK(dalist);

    for (aindex = 0;atrlist[aindex] != JOB_ATR_LAST;aindex++)
      {
      if ((tpal = attrlist_create("", "", 23)) == NULL)
        {
        return;
        }

      tpal->al_valln = atrlist[aindex];

      /* dpal keeps the head of the list for later use */
      if (dpal == NULL)
        dpal = tpal;

      append_link(&dalist, &tpal->al_link, tpal);
      }
    }  /* END if (dpal == NULL) */

  if (type == tjstArray)
    {
    pa = get_array(preq->rq_ind.rq_status.rq_id);

    if (pa == NULL)
      {
      req_reject(PBSE_UNKARRAYID, 0, preq, NULL, "unable to find array");
      return;
      }
    }

  iter = -1;

  get_svr_attr_l(SRV_ATR_PollJobs, &poll_jobs);

  if (!poll_jobs)
    {
    /* polljobs not set - indicates we may need to obtain fresh data from
       MOM */

    /* sc_jobid records where a previous pass left off, so a MOM reply
     * re-enters here and resumes from that job */
    if (cntl->sc_jobid[0] == '\0')
      pjob = NULL;
    else
      pjob = svr_find_job(cntl->sc_jobid, FALSE);

    while (1)
      {
      if (pjob == NULL)
        {
        /* start from the first job */

        if (type == tjstJob)
          {
          pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE);
          }
        else if (type == tjstQueue)
          {
          pjob = next_job(cntl->sc_pque->qu_jobs,&iter);
          }
        else if (type == tjstArray)
          {
          job_array_index = 0;

          /* increment job_array_index until we find a non-null pointer or hit the end */
          while (job_array_index < pa->ai_qs.array_size)
            {
            if (pa->job_ids[job_array_index] != NULL)
              {
              if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
                {
                unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
                break;
                }
              }

            job_array_index++;
            }
          }
        else
          {
          pjob = next_job(&alljobs,&iter);
          }
        } /* END if (pjob == NULL) */
      else
        {
        /* advance from the current job to the next one of this type */

        strcpy(job_id, pjob->ji_qs.ji_jobid);
        unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

        if (type == tjstJob)
          break;

        if (type == tjstQueue)
          pjob = next_job(cntl->sc_pque->qu_jobs,&iter);
        else if (type == tjstArray)
          {
          pjob = NULL;

          /* increment job_array_index until we find a non-null pointer or hit the end */
          while (++job_array_index < pa->ai_qs.array_size)
            {
            if (pa->job_ids[job_array_index] != NULL)
              {
              if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
                {
                unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
                break;
                }
              }
            }
          }
        else
          pjob = next_job(&alljobs,&iter);
        }

      if (pjob == NULL)
        break;

      /* snapshot the fields we need, then drop the job mutex before any
       * potentially blocking MOM communication */

      strcpy(job_id, pjob->ji_qs.ji_jobid);
      job_substate = pjob->ji_qs.ji_substate;
      job_momstattime = pjob->ji_momstat;
      strcpy(cntl->sc_jobid, job_id);
      unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
      pjob = NULL;

      /* PBS_RESTAT_JOB defaults to 30 seconds */
      if ((job_substate == JOB_SUBSTATE_RUNNING) &&
          ((time_now - job_momstattime) > JobStatRate))
        {
        /* go to MOM for status */
        if ((rc = stat_to_mom(job_id, cntl)) == PBSE_MEM_MALLOC)
          break;

        if (rc != 0)
          {
          pjob = svr_find_job(job_id, FALSE);

          rc = 0;

          continue;
          }

        if (pa != NULL)
          unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

        return; /* will pick up after mom replies */
        }
      }    /* END while(1) */

    if (rc != 0)
      {
      if (pa != NULL)
        unlock_ai_mutex(pa, __func__, "2", LOGLEVEL);

      reply_free(preply);

      req_reject(rc, 0, preq, NULL, "cannot get update from mom");

      return;
      }
    }    /* END if (!server.sv_attr[SRV_ATR_PollJobs].at_val.at_long) */

  /*
   * now ready for part 3, building the status reply,
   * loop through again
   */

  if ((type == tjstSummarizeArraysQueue) ||
      (type == tjstSummarizeArraysServer))
    {
    /* No array can be owned for these options */
    update_array_statuses();
    }

  /* select the first job of the requested scope */

  if (type == tjstJob)
    pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE);
  else if (type == tjstQueue)
    pjob = next_job(cntl->sc_pque->qu_jobs,&iter);
  else if (type == tjstSummarizeArraysQueue)
    pjob = next_job(cntl->sc_pque->qu_jobs_array_sum,&iter);
  else if (type == tjstSummarizeArraysServer)
    pjob = next_job(&array_summary,&iter);
  else if (type == tjstArray)
    {
    job_array_index = -1;
    pjob = NULL;

    /* increment job_array_index until we find a non-null pointer or hit the end */
    while (++job_array_index < pa->ai_qs.array_size)
      {
      if (pa->job_ids[job_array_index] != NULL)
        {
        if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
          {
          break;
          }
        }
      }
    }
  else
    pjob = next_job(&alljobs,&iter);

  DTime = 0;

  if (preq->rq_extend != NULL)
    {
    char *ptr;

    /* FORMAT: { EXECQONLY | DELTA:<EPOCHTIME> } */

    if (strstr(preq->rq_extend, EXECQUEONLY))
      exec_only = 1;

    ptr = strstr(preq->rq_extend, "DELTA:");

    if (ptr != NULL)
      {
      ptr += strlen("delta:");

      DTime = strtol(ptr, NULL, 10);
      }
    }

  if ((type == tjstTruncatedServer) ||
      (type == tjstTruncatedQueue))
    {
    long sentJobCounter;
    long qjcounter;
    long qmaxreport;
    /* NOTE(review): this inner 'iter' shadows the outer one declared at
     * function scope - the outer value is unused past this point */
    int  iter = -1;

    /* loop through all queues */

    while ((pque = next_queue(&svr_queues,&iter)) != NULL)
      {
      qjcounter = 0;

      if ((exec_only == 1) &&
          (pque->qu_qs.qu_type != QTYPE_Execution))
        {
        /* ignore routing queues */
        unlock_queue(pque, __func__, "ignore queue", LOGLEVEL);
        continue;
        }

      /* per-queue cap on how many queued jobs are reported */

      if (((pque->qu_attr[QA_ATR_MaxReport].at_flags & ATR_VFLAG_SET) != 0) &&
          (pque->qu_attr[QA_ATR_MaxReport].at_val.at_long >= 0))
        {
        qmaxreport = pque->qu_attr[QA_ATR_MaxReport].at_val.at_long;
        }
      else
        {
        qmaxreport = TMAX_JOB;
        }

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buf,"giving scheduler up to %ld idle jobs in queue %s\n",
          qmaxreport,
          pque->qu_qs.qu_name);

        log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_QUEUE,pque->qu_qs.qu_name,log_buf);
        }

      sentJobCounter = 0;

      /* loop through jobs in queue */

      if (pjob != NULL)
        unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);

      iter = -1;

      while ((pjob = next_job(pque->qu_jobs,&iter)) != NULL)
        {
        if ((qjcounter >= qmaxreport) &&
            (pjob->ji_qs.ji_state == JOB_STATE_QUEUED))
          {
          /* max_report of queued jobs reached for queue */
          unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL);

          continue;
          }

        pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

        /* recently modified jobs get the full attribute list; others get
         * the cached delta list dpal */
        rc = status_job(
               pjob,
               preq,
               (pjob->ji_wattr[JOB_ATR_mtime].at_val.at_long >= DTime) ? pal : dpal,
               &preply->brp_un.brp_status,
               &bad);

        if ((rc != 0) && (rc != PBSE_PERM))
          {
          req_reject(rc, bad, preq, NULL, NULL);

          if (pa != NULL)
            {
            unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
            }

          unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL);
          unlock_queue(pque, __func__, "perm", LOGLEVEL);
          return;
          }

        sentJobCounter++;

        if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED)
          qjcounter++;

        unlock_ji_mutex(pjob, __func__, "8", LOGLEVEL);
        } /* END foreach (pjob from pque) */

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buf,"sent scheduler %ld total jobs for queue %s\n",
          sentJobCounter,
          pque->qu_qs.qu_name);

        log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_QUEUE,pque->qu_qs.qu_name,log_buf);
        }

      unlock_queue(pque, __func__, "end while", LOGLEVEL);
      } /* END for (pque) */

    if (pa != NULL)
      unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

    reply_send_svr(preq);

    return;
    } /* END if ((type == tjstTruncatedServer) || ...) */

  while (pjob != NULL)
    {
    /* go ahead and build the status reply for this job */

    if (exec_only)
      {
      if (cntl->sc_pque != NULL)
        {
        if (cntl->sc_pque->qu_qs.qu_type != QTYPE_Execution)
          goto nextjob;
        }
      else
        {
        /* drop the array lock around get_jobs_queue() to respect the
         * queue-before-array lock ordering */
        if (pa != NULL)
          pthread_mutex_unlock(pa->ai_mutex);

        pque = get_jobs_queue(&pjob);

        if (pa != NULL)
          pthread_mutex_lock(pa->ai_mutex);

        if ((pjob == NULL) ||
            (pque == NULL))
          goto nextjob;

        if (pque->qu_qs.qu_type != QTYPE_Execution)
          {
          unlock_queue(pque, __func__, "not exec", LOGLEVEL);
          goto nextjob;
          }

        unlock_queue(pque, __func__, "exec", LOGLEVEL);
        }
      }

    pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

    rc = status_job(
           pjob,
           preq,
           pal,
           &preply->brp_un.brp_status,
           &bad);

    if ((rc != 0) && (rc != PBSE_PERM))
      {
      if (pa != NULL)
        {
        unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
        }

      unlock_ji_mutex(pjob, __func__, "9", LOGLEVEL);

      req_reject(rc, bad, preq, NULL, NULL);

      return;
      }

    /* get next job */

nextjob:

    if (pjob != NULL)
      unlock_ji_mutex(pjob, __func__, "10", LOGLEVEL);

    if (type == tjstJob)
      break;

    if (type == tjstQueue)
      pjob = next_job(cntl->sc_pque->qu_jobs,&iter);
    else if (type == tjstSummarizeArraysQueue)
      pjob = next_job(cntl->sc_pque->qu_jobs_array_sum,&iter);
    else if (type == tjstSummarizeArraysServer)
      pjob = next_job(&array_summary,&iter);
    else if (type == tjstArray)
      {
      pjob = NULL;

      /* increment job_array_index until we find a non-null pointer or hit the end */
      while (++job_array_index < pa->ai_qs.array_size)
        {
        if (pa->job_ids[job_array_index] != NULL)
          {
          if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
            {
            break;
            }
          }
        }
      }
    else
      pjob = next_job(&alljobs,&iter);

    rc = 0;
    }  /* END while (pjob != NULL) */

  if (pa != NULL)
    {
    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
    }

  reply_send_svr(preq);

  if (LOGLEVEL >= 7)
    {
    log_event(PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      "req_statjob",
      "Successfully returned the status of queued jobs\n");
    }

  return;
  }  /* END req_stat_job_step2() */
/*
 * setup_array_struct - create and initialize the job_array structure for
 * an array job, parse its range request, enforce slot/size limits, and
 * link it into the server's list of arrays.
 *
 * Returns 0 on success, 1 on save/allocation failure, 2 when the range
 * request contained bad tokens, INVALID_SLOT_LIMIT or ARRAY_TOO_LARGE on
 * the respective limit violations.
 */
int setup_array_struct(job *pjob)

  {
  job_array          *pa;

  /* struct work_task *wt; */
  array_request_node *rn;
  char               *host;
  int                 bad_token_count;
  int                 array_size;
  int                 rc;

  /* setup a link to this job array in the servers all_arrays list */

  pa = (job_array *)calloc(1, sizeof(job_array));

  if (pa == NULL)
    {
    /* the allocation was previously used unchecked */
    log_err(errno, "setup_array_struct", "cannot allocate job array");

    return 1;
    }

  pa->ai_qs.struct_version = ARRAY_QS_STRUCT_VERSION;

  pa->template_job = pjob;

  /*pa->ai_qs.array_size = pjob->ji_wattr[(int)JOB_ATR_job_array_size].at_val.at_long;*/

  strcpy(pa->ai_qs.parent_id, pjob->ji_qs.ji_jobid);
  strcpy(pa->ai_qs.fileprefix, pjob->ji_qs.ji_fileprefix);
  strncpy(pa->ai_qs.owner, pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str, PBS_MAXUSER + PBS_MAXSERVERNAME + 2);

  /* get_variable() can return NULL when PBS_O_HOST was not set; passing
   * NULL to strncpy would crash */
  host = get_variable(pjob, pbs_o_host);

  if (host != NULL)
    strncpy(pa->ai_qs.submit_host, host, PBS_MAXSERVERNAME);

  pa->ai_qs.num_cloned = 0;

  CLEAR_LINK(pa->all_arrays);
  CLEAR_HEAD(pa->request_tokens);

  append_link(&svr_jobarrays, &pa->all_arrays, (void *)pa);

  if (job_save(pjob, SAVEJOB_FULL, 0) != 0)
    {
    job_purge(pjob);

    if (LOGLEVEL >= 6)
      {
      log_record(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        (pjob != NULL) ? pjob->ji_qs.ji_jobid : "NULL",
        "cannot save job");
      }

    return 1;
    }

  if ((rc = set_slot_limit(pjob->ji_wattr[JOB_ATR_job_array_request].at_val.at_str, pa)))
    {
    /* log BEFORE deleting - the original code read pa->ai_qs.parent_id
     * after array_delete(pa) had freed it (use-after-free) */
    snprintf(log_buffer, sizeof(log_buffer),
      "Array %s requested a slot limit above the max limit %ld, rejecting\n",
      pa->ai_qs.parent_id,
      server.sv_attr[SRV_ATR_MaxSlotLimit].at_val.at_long);

    log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, pa->ai_qs.parent_id, log_buffer);

    array_delete(pa);

    return(INVALID_SLOT_LIMIT);
    }

  pa->ai_qs.jobs_running   = 0;
  pa->ai_qs.num_started    = 0;
  pa->ai_qs.num_failed     = 0;
  pa->ai_qs.num_successful = 0;

  bad_token_count = parse_array_request(
                      pjob->ji_wattr[JOB_ATR_job_array_request].at_val.at_str,
                      &(pa->request_tokens));

  /* get the number of elements that should be allocated in the array */

  rn = (array_request_node *)GET_NEXT(pa->request_tokens);

  array_size = 0;
  pa->ai_qs.num_jobs = 0;

  while (rn != NULL)
    {
    if (rn->end > array_size)
      array_size = rn->end;

    /* calculate the actual number of jobs (different from array size) */
    pa->ai_qs.num_jobs += rn->end - rn->start + 1;

    rn = (array_request_node *)GET_NEXT(rn->request_tokens_link);
    }

  /* size of array is the biggest index + 1 */

  array_size++;

  if (server.sv_attr[SRV_ATR_MaxArraySize].at_flags & ATR_VFLAG_SET)
    {
    int max_array_size = server.sv_attr[SRV_ATR_MaxArraySize].at_val.at_long;

    if (max_array_size < pa->ai_qs.num_jobs)
      {
      array_delete(pa);

      return(ARRAY_TOO_LARGE);
      }
    }

  /* initialize the array; calloc zero-fills the job pointer table and,
   * unlike the original unchecked malloc/memset pair, is verified */

  pa->jobs = (job **)calloc(array_size, sizeof(job *));

  if (pa->jobs == NULL)
    {
    array_delete(pa);

    return 1;
    }

  /* remember array_size */
  pa->ai_qs.array_size = array_size;

  CLEAR_HEAD(pa->ai_qs.deps);

  array_save(pa);

  if (bad_token_count > 0)
    {
    array_delete(pa);

    return 2;
    }

  return 0;
  }  /* END setup_array_struct() */
int copy_batchrequest( struct batch_request **newreq, struct batch_request *preq, int type, int jobid) { struct batch_request *request; svrattrl *pal = NULL; svrattrl *newpal = NULL; tlist_head *phead = NULL; char *ptr1; char *ptr2; char newjobname[PBS_MAXSVRJOBID+1]; request = alloc_br(type); if (request) { request->rq_type = preq->rq_type; request->rq_perm = preq->rq_perm; request->rq_fromsvr = preq->rq_fromsvr; request->rq_conn = preq->rq_conn; request->rq_orgconn = preq->rq_orgconn; request->rq_extsz = preq->rq_extsz; request->rq_time = preq->rq_time; strcpy(request->rq_user, preq->rq_user); strcpy(request->rq_host, preq->rq_host); request->rq_reply.brp_choice = preq->rq_reply.brp_choice; request->rq_noreply = preq->rq_noreply; /* we need to copy rq_extend if there is any data */ if (preq->rq_extend) { request->rq_extend = (char *)calloc(1, strlen(preq->rq_extend) + 1); if (request->rq_extend == NULL) { free_br(request); return(PBSE_SYSTEM); } strcpy(request->rq_extend, preq->rq_extend); } /* remember the batch_request we copied */ request->rq_extra = (void *)preq; switch(preq->rq_type) { /* This function was created for a modify arracy request (PBS_BATCH_ModifyJob) the preq->rq_ind structure was allocated in dis_request_read. If other BATCH types are needed refer to that function to see how the rq_ind structure was allocated and then copy it here. */ case PBS_BATCH_DeleteJob: case PBS_BATCH_HoldJob: case PBS_BATCH_CheckpointJob: case PBS_BATCH_ModifyJob: case PBS_BATCH_AsyModifyJob: /* based on how decode_DIS_Manage allocates data */ CLEAR_HEAD(request->rq_ind.rq_manager.rq_attr); phead = &request->rq_ind.rq_manager.rq_attr; request->rq_ind.rq_manager.rq_cmd = preq->rq_ind.rq_manager.rq_cmd; request->rq_ind.rq_manager.rq_objtype = preq->rq_ind.rq_manager.rq_objtype; /* If this is a job array it is possible we only have the array name and not the individual job. 
We need to find out what we have and modify the name if needed */ ptr1 = strstr(preq->rq_ind.rq_manager.rq_objname, "[]"); if ((ptr1) && (jobid != -1)) { ptr1++; strcpy(newjobname, preq->rq_ind.rq_manager.rq_objname); ptr2 = strstr(newjobname, "[]"); ptr2++; *ptr2 = 0; sprintf(request->rq_ind.rq_manager.rq_objname,"%s%d%s", newjobname, jobid, ptr1); } else strcpy(request->rq_ind.rq_manager.rq_objname, preq->rq_ind.rq_manager.rq_objname); /* copy the attribute list */ pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_manager.rq_attr); while(pal != NULL) { newpal = (svrattrl *)calloc(1, pal->al_tsize + 1); if (!newpal) { free_br(request); return(PBSE_SYSTEM); } CLEAR_LINK(newpal->al_link); newpal->al_atopl.next = 0; newpal->al_tsize = pal->al_tsize + 1; newpal->al_nameln = pal->al_nameln; newpal->al_flags = pal->al_flags; newpal->al_atopl.name = (char *)newpal + sizeof(svrattrl); strcpy(newpal->al_atopl.name, pal->al_atopl.name); newpal->al_nameln = pal->al_nameln; newpal->al_atopl.resource = newpal->al_atopl.name + newpal->al_nameln; if (pal->al_atopl.resource != NULL) strcpy(newpal->al_atopl.resource, pal->al_atopl.resource); newpal->al_rescln = pal->al_rescln; newpal->al_atopl.value = newpal->al_atopl.name + newpal->al_nameln + newpal->al_rescln; strcpy(newpal->al_atopl.value, pal->al_atopl.value); newpal->al_valln = pal->al_valln; newpal->al_atopl.op = pal->al_atopl.op; pal = (struct svrattrl *)GET_NEXT(pal->al_link); } break; case PBS_BATCH_SignalJob: strcpy(request->rq_ind.rq_signal.rq_jid, preq->rq_ind.rq_signal.rq_jid); strcpy(request->rq_ind.rq_signal.rq_signame, preq->rq_ind.rq_signal.rq_signame); request->rq_extra = strdup(preq->rq_extra); break; case PBS_BATCH_MessJob: strcpy(request->rq_ind.rq_message.rq_jid, preq->rq_ind.rq_message.rq_jid); request->rq_ind.rq_message.rq_file = preq->rq_ind.rq_message.rq_file; strcpy(request->rq_ind.rq_message.rq_text, preq->rq_ind.rq_message.rq_text); break; default: break; } if ((phead != NULL) && (newpal != NULL)) 
append_link(phead, &newpal->al_link, newpal); *newreq = request; return(0); } else return(PBSE_SYSTEM); }
/*
 * parse_array_request - split a job-array range request ("1-5,8,10-20")
 * into array_request_node entries kept sorted by start index on the
 * list headed at tl.
 *
 * A token that fails to parse, or whose range overlaps a neighbouring
 * range after insertion, is counted as bad. Returns the number of bad
 * tokens (0 means the whole request was accepted).
 */
static int parse_array_request(char *request, tlist_head *tl)

  {
  char               *work;          /* writable copy of the request */
  char              **tok_list;      /* pointers at each comma-separated token */
  char               *cursor;
  int                 tok_total;
  int                 idx;
  int                 range_start;
  int                 range_end;
  int                 range_count;
  int                 invalid_count = 0;
  int                 still_looking;
  array_request_node *node;
  array_request_node *probe;

  work = strdup(request);
  tok_total = array_request_token_count(request);

  tok_list = (char **)malloc(sizeof(char *) * tok_total);

  /* split the copy on commas, one forward pass */

  cursor = work;

  for (idx = 0; idx < tok_total; idx++)
    {
    tok_list[idx] = cursor;

    cursor = strchr(cursor, ',');

    if (cursor != NULL)
      {
      *cursor = '\0';
      cursor++;
      }
    }

  for (idx = 0; idx < tok_total; idx++)
    {
    range_count = array_request_parse_token(tok_list[idx], &range_start, &range_end);

    if (range_count == 0)
      {
      /* token did not parse as an index or range */
      invalid_count++;

      continue;
      }

    node = (array_request_node *)malloc(sizeof(array_request_node));

    node->start = range_start;
    node->end   = range_end;

    CLEAR_LINK(node->request_tokens_link);

    /* keep the list ordered by ascending start index */

    probe = GET_NEXT(*tl);
    still_looking = TRUE;

    while (still_looking)
      {
      if (probe == NULL)
        {
        append_link(tl, &node->request_tokens_link, (void *)node);
        still_looking = FALSE;
        }
      else if (node->start < probe->start)
        {
        insert_link(&probe->request_tokens_link, &node->request_tokens_link,
          (void *)node, LINK_INSET_BEFORE);
        still_looking = FALSE;
        }
      else
        {
        probe = GET_NEXT(probe->request_tokens_link);
        }
      }

    /* a range overlapping either neighbour counts as bad */

    probe = GET_PRIOR(node->request_tokens_link);

    if ((probe != NULL) && (probe->end >= node->start))
      invalid_count++;

    probe = GET_NEXT(node->request_tokens_link);

    if ((probe != NULL) && (probe->start <= node->end))
      invalid_count++;
    }

  free(tok_list);
  free(work);

  return invalid_count;
  }  /* END parse_array_request() */
/*
 * status_job - append a status entry for one job to the reply list.
 *
 * Enforces the query_other_jobs server policy, decides whether the
 * condensed or full attribute report applies, allocates the brp_status
 * header, links it onto pstathd, and fills in the requested attributes.
 *
 * Returns 0 on success, PBSE_PERM when a non-owner may not query,
 * PBSE_SYSTEM on allocation failure, PBSE_NOATTR on attribute errors
 * (with *bad set to the offending attribute index).
 */
int status_job(

  job           *pjob,     /* ptr to job to status */
  batch_request *preq,
  svrattrl      *pal,      /* specific attributes to status */
  tlist_head    *pstathd,  /* RETURN: head of list to append status to */
  bool           condensed,
  int           *bad)      /* RETURN: index of first bad pbs_attribute */

  {
  struct brp_status *status_entry;
  int                owner_flag = 0;
  long               allow_query_others = 0;
  long               full_report_window = JOB_CONDENSED_TIMEOUT;

  /* Make sure procct is removed from the job resource attributes */

  remove_procct(pjob);

  /* determine whether the requester is authorized for this job */

  if (svr_authorize_jobreq(preq, pjob) == 0)
    owner_flag = 1;

  get_svr_attr_l(SRV_ATR_query_others, &allow_query_others);

  /* non-owners may look only when query_other_jobs is enabled */

  if ((allow_query_others == 0) && (owner_flag == 0))
    return(PBSE_PERM);

  get_svr_attr_l(SRV_ATR_job_full_report_time, &full_report_window);

  /* a job modified within the window always gets the full report */

  if ((condensed == true) &&
      (time(NULL) < pjob->ji_mod_time + full_report_window))
    condensed = false;

  /* allocate the reply structure and fill in the header portion */

  status_entry = (struct brp_status *)calloc(1, sizeof(struct brp_status));

  if (status_entry == NULL)
    return(PBSE_SYSTEM);

  CLEAR_LINK(status_entry->brp_stlink);

  status_entry->brp_objtype = MGR_OBJ_JOB;

  strcpy(status_entry->brp_objname, pjob->ji_qs.ji_jobid);

  CLEAR_HEAD(status_entry->brp_attr);

  append_link(pstathd, &status_entry->brp_stlink, status_entry);

  /* add attributes to the status reply */

  *bad = 0;

  if (status_attrib(
        pal,
        job_attr_def,
        pjob->ji_wattr,
        JOB_ATR_LAST,
        preq->rq_perm,
        &status_entry->brp_attr,
        condensed,
        bad,
        owner_flag) != 0)
    return(PBSE_NOATTR);

  return(0);
  }  /* END status_job() */
/*
 * status_job - append a status entry for one job to the reply list.
 *
 * Enforces the query_other_jobs server policy, allocates the brp_status
 * header, links it onto pstathd, and fills in the requested attributes.
 *
 * Returns 0 on success, PBSE_PERM when a non-owner may not query,
 * PBSE_SYSTEM on allocation failure, PBSE_NOATTR on attribute errors
 * (with *bad set to the offending attribute index).
 */
int status_job(

  job                  *pjob,    /* ptr to job to status */

  struct batch_request *preq,
  svrattrl             *pal,     /* specific attributes to status */
  tlist_head           *pstathd, /* RETURN: head of list to append status to */
  int                  *bad)     /* RETURN: index of first bad attribute */

  {
  struct brp_status *pstat;
  int                IsOwner = 0;

  /* see if the client is authorized to status this job */

  if (svr_authorize_jobreq(preq, pjob) == 0)
    IsOwner = 1;

  if (!server.sv_attr[SRV_ATR_query_others].at_val.at_long)
    {
    if (IsOwner == 0)
      {
      return(PBSE_PERM);
      }
    }

  /* allocate reply structure and fill in header portion;
   * calloc zero-initializes, replacing the malloc+memset pair */

  pstat = (struct brp_status *)calloc(1, sizeof(struct brp_status));

  if (pstat == NULL)
    {
    return(PBSE_SYSTEM);
    }

  CLEAR_LINK(pstat->brp_stlink);

  pstat->brp_objtype = MGR_OBJ_JOB;

  strcpy(pstat->brp_objname, pjob->ji_qs.ji_jobid);

  CLEAR_HEAD(pstat->brp_attr);

  append_link(pstathd, &pstat->brp_stlink, pstat);

  /* add attributes to the status reply */

  *bad = 0;

  if (status_attrib(
        pal,
        job_attr_def,
        pjob->ji_wattr,
        JOB_ATR_LAST,
        preq->rq_perm,
        &pstat->brp_attr,
        bad,
        IsOwner))
    {
    return(PBSE_NOATTR);
    }

  return (0);
  }  /* END status_job() */