/*
 * set_proc_ct - action routine for the "procs" resource.
 *
 * Derives the "procct" resource as the requested "procs" count plus the
 * processor count implied by the "nodes" spec (when one is present).
 *
 * @param pprocsp - (I) the "procs" resource entry being acted upon
 * @param pattr   - (I) the job's Resource_List attribute
 * @param actmode - (I) ATR_ACTION_* mode; recovery is a no-op
 *
 * @return 0 on success, PBSE_SYSTEM if a resource definition or entry
 *         cannot be found/created.
 */

int set_proc_ct(

  resource      *pprocsp, /* I */
  pbs_attribute *pattr,   /* I */
  int            actmode) /* I */

  {
  resource     *pnodesp;
  resource_def *pndef;
  resource     *ppct;
  resource_def *ppdef;

  if (actmode == ATR_ACTION_RECOV)
    {
    /* SUCCESS - nothing to recompute on recovery */
    return(0);
    }

  /* set "procct" to count of processors in "nodes" plus "procs" */

  ppdef = find_resc_def(svr_resc_def, "procct", svr_resc_size);

  if (ppdef == NULL)
    {
    return(PBSE_SYSTEM);
    }

  if ((ppct = find_resc_entry(pattr, ppdef)) == NULL)
    {
    if ((ppct = add_resource_entry(pattr, ppdef)) == NULL)
      {
      return(PBSE_SYSTEM);
      }
    }

  pndef = find_resc_def(svr_resc_def, "nodes", svr_resc_size);

  if (pndef == NULL)
    {
    return(PBSE_SYSTEM);
    }

  if ((pnodesp = find_resc_entry(pattr, pndef)) == NULL)
    {
    /* no "nodes" spec - procct is just the requested procs */
    ppct->rs_value.at_val.at_long = pprocsp->rs_value.at_val.at_long;
    }
  else
    {
    /* BUGFIX: the original called count_proc() but discarded its return
     * value, leaving procct equal to "procs" alone.  Accumulate the node
     * spec's processor count, matching the logic in set_node_ct(). */
    ppct->rs_value.at_val.at_long  = pprocsp->rs_value.at_val.at_long;
    ppct->rs_value.at_val.at_long += count_proc(pnodesp->rs_value.at_val.at_str);
    }

  ppct->rs_value.at_flags |= ATR_VFLAG_SET;

  return(0);
  }  /* END set_proc_ct() */
/*
 * set_resc_assigned - adjust the server-wide and queue-wide attributes that
 * track resources assigned to running jobs.
 *
 * op == INCR adds this job's requested resources to the totals; op == DECR
 * subtracts them.  The JOB_SVFLG_RescAssn flag on the job guards against
 * adding or removing the same job twice.
 *
 * Only jobs in execution queues are counted.  The queue returned by
 * get_jobs_queue() is locked on success and must be unlocked on every
 * return path below.
 */

void set_resc_assigned(

  job           *pjob, /* I */
  enum batch_op  op)   /* INCR or DECR */

  {
  resource      *jobrsc;
  resource      *pr;
  pbs_attribute *queru;
  resource_def  *rscdef;
  pbs_attribute *sysru;
  pbs_queue     *pque;
  char           log_buf[LOCAL_LOG_BUF_SIZE];

  if ((pjob == NULL))
    return;

  /* NOTE: get_jobs_queue() can NULL out pjob while acquiring the lock;
   * the else-if at the bottom reports that case */
  if ((pque = get_jobs_queue(&pjob)) != NULL)
    {
    if (pque->qu_qs.qu_type == QTYPE_Execution)
      {
      if (op == DECR)
        {
        /* if freeing completed job resources, ignore constraint (???) */
        /* NO-OP */
        }
      }
    else
      {
      /* non-execution queue: resources were never tracked for this job */
      snprintf(log_buf,sizeof(log_buf),
        "job %s isn't in an execution queue, can't modify resources\njob is in queue %s",
        pjob->ji_qs.ji_jobid,
        pque->qu_qs.qu_name);

      log_err(-1, __func__, log_buf);

      unlock_queue(pque, __func__, NULL, LOGLEVEL);

      return;
      }

    if (op == INCR)
      {
      if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_RescAssn)
        {
        unlock_queue(pque, __func__, NULL, LOGLEVEL);

        return; /* already added in */
        }

      pjob->ji_qs.ji_svrflags |= JOB_SVFLG_RescAssn;
      }
    else if (op == DECR)
      {
      if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_RescAssn) == 0)
        {
        unlock_queue(pque, __func__, NULL, LOGLEVEL);

        return; /* not currently included */
        }

      pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_RescAssn;
      }
    else
      {
      unlock_queue(pque, __func__, NULL, LOGLEVEL);

      return; /* invalid op */
      }

    sysru = &server.sv_attr[SRV_ATR_resource_assn];
    queru = &pque->qu_attr[QE_ATR_ResourceAssn];

    /* walk the job's requested resource list */
    jobrsc = (resource *)GET_NEXT(pjob->ji_wattr[JOB_ATR_resource].at_val.at_list);

    while (jobrsc != NULL)
      {
      rscdef = jobrsc->rs_defin;

      /* if resource usage is to be tracked */

      if ((rscdef->rs_flags & ATR_DFLAG_RASSN) &&
          (jobrsc->rs_value.at_flags & ATR_VFLAG_SET))
        {
        /* update system pbs_attribute of resources assigned */

        pr = find_resc_entry(sysru, rscdef);

        if (pr == NULL)
          {
          pr = add_resource_entry(sysru, rscdef);

          if (pr == NULL)
            {
            unlock_queue(pque, __func__, "sysru", LOGLEVEL);

            return;
            }
          }

        rscdef->rs_set(&pr->rs_value, &jobrsc->rs_value, op);

        /* update queue pbs_attribute of resources assigned */

        pr = find_resc_entry(queru, rscdef);

        if (pr == NULL)
          {
          pr = add_resource_entry(queru, rscdef);

          if (pr == NULL)
            {
            unlock_queue(pque, __func__, "queru", LOGLEVEL);

            return;
            }
          }

        rscdef->rs_set(&pr->rs_value, &jobrsc->rs_value, op);
        }

      jobrsc = (resource *)GET_NEXT(jobrsc->rs_link);
      }  /* END while (jobrsc != NULL) */

    unlock_queue(pque, __func__, "success", LOGLEVEL);
    }
  else if (pjob == NULL)
    {
    log_err(PBSE_JOBNOTFOUND, __func__, "Job lost while acquiring queue 9");
    }

  return;
  } /* END set_resc_assigned() */
/*
 * set_mppnodect - derive the "mppnodect" resource from "mppwidth" and
 * "mppnppn".
 *
 * The estimate is width tasks packed nppn-per-node (rounded up).  If the
 * request is narrower than one node (width < nppn), mppnppn is capped at
 * the width.  When either input resource is absent, mppnodect is set to
 * -1 to flag the estimate as unusable.
 *
 * @return PBSE_NONE on success, PBSE_SYSTEM on definition/entry failure.
 */

int set_mppnodect(

  resource      * UNUSED(res),
  pbs_attribute *attr,
  int            UNUSED(op))

  {
  resource_def *def;
  resource     *ent = NULL;
  int           width = 0;
  int           nppn = 0;
  int           width_seen = 0;
  int           nppn_seen = 0;
  int           estimate;

  /* Pick up the currently known mppwidth value, if any */

  def = find_resc_def(svr_resc_def, "mppwidth", svr_resc_size);

  if ((def != NULL) &&
      ((ent = find_resc_entry(attr, def)) != NULL))
    {
    width = ent->rs_value.at_val.at_long;
    width_seen = 1;
    }

  /* Pick up the currently known mppnppn value, if any */

  def = find_resc_def(svr_resc_def, "mppnppn", svr_resc_size);

  if ((def != NULL) &&
      ((ent = find_resc_entry(attr, def)) != NULL))
    {
    nppn = ent->rs_value.at_val.at_long;
    nppn_seen = 1;

    /* a request narrower than one node caps nppn at the width */

    if ((width != 0) && (width < nppn))
      {
      nppn = width;

      ent->rs_value.at_val.at_long = nppn;
      ent->rs_value.at_flags |= ATR_VFLAG_SET;
      }
    }

  /* estimate the number of nodes needed: ceil(width / nppn) */

  estimate = width;

  if (nppn > 1)
    estimate = (estimate + nppn - 1) / nppn;

  /* find or create the "mppnodect" entry to receive the estimate */

  def = find_resc_def(svr_resc_def, "mppnodect", svr_resc_size);

  if (def == NULL)
    return(PBSE_SYSTEM);

  if (((ent = find_resc_entry(attr, def)) == NULL) &&
      ((ent = add_resource_entry(attr, def)) == NULL))
    return(PBSE_SYSTEM);

  /* -1 flags the estimate as unusable when either input was absent */

  if (width_seen && nppn_seen)
    ent->rs_value.at_val.at_long = estimate;
  else
    ent->rs_value.at_val.at_long = -1;

  ent->rs_value.at_flags |= ATR_VFLAG_SET;

  return(PBSE_NONE);
  } /* END set_mppnodect() */
int set_node_ct( resource *pnodesp, /* I */ pbs_attribute *pattr, /* I */ int actmode) /* I */ { resource *pnct; resource_def *pndef; resource *ppct = NULL; resource_def *ppdef; resource *pprocsp; resource_def *pprocsdef; if (actmode == ATR_ACTION_RECOV) { /* SUCCESS */ return(0); } /* Set "nodect" to count of nodes in "nodes" */ pndef = find_resc_def(svr_resc_def, "nodect", svr_resc_size); if (pndef == NULL) { return(PBSE_SYSTEM); } if ((pnct = find_resc_entry(pattr, pndef)) == NULL) { if ((pnct = add_resource_entry(pattr, pndef)) == NULL) { return(PBSE_SYSTEM); } } pnct->rs_value.at_val.at_long = ctnodes(pnodesp->rs_value.at_val.at_str); pnct->rs_value.at_flags |= ATR_VFLAG_SET; /* Set "neednodes" to "nodes", may be altered by scheduler */ pndef = find_resc_def(svr_resc_def, "neednodes", svr_resc_size); if (pndef == NULL) { return(PBSE_SYSTEM); } if ((pnct = find_resc_entry(pattr, pndef)) == NULL) { if ((pnct = add_resource_entry(pattr, pndef)) == NULL) { return(PBSE_SYSTEM); } } else { pndef->rs_free(&pnct->rs_value); } pndef->rs_decode(&pnct->rs_value, NULL, NULL, pnodesp->rs_value.at_val.at_str, ATR_DFLAG_ACCESS); pnct->rs_value.at_flags |= ATR_VFLAG_SET; /* SUCCESS nodect */ /* set "procct" to count of processors in "nodes" plus "procs" */ ppdef = find_resc_def(svr_resc_def, "procct", svr_resc_size); if (ppdef == NULL) { return(PBSE_SYSTEM); } if ((ppct = find_resc_entry(pattr, ppdef)) == NULL) { if ((ppct = add_resource_entry(pattr, ppdef)) == 0) { return(PBSE_SYSTEM); } } pprocsdef = find_resc_def(svr_resc_def, "procs", svr_resc_size); if (pprocsdef == NULL) { return(PBSE_SYSTEM); } if ((pprocsp = find_resc_entry(pattr, pprocsdef)) == NULL) { ppct->rs_value.at_val.at_long = count_proc(pnodesp->rs_value.at_val.at_str); } else { ppct->rs_value.at_val.at_long = pprocsp->rs_value.at_val.at_long; ppct->rs_value.at_val.at_long += count_proc(pnodesp->rs_value.at_val.at_str); } ppct->rs_value.at_flags |= ATR_VFLAG_SET; /* SUCCESS procct */ return(0); } /* END 
set_node_ct() */
/* int initialize_procct - set pjob->procct plus the resource
 * procct in the Resource_List
 *
 * Assumes the nodes resource has been set on the Resource_List. This should
 * have been done in req_quejob with the set_nodes_attr() function or in
 * set_node_ct and/or set_proc_ct.
 *
 * Returns 0 on success. Non-zero on failure
 */

int initialize_procct(job *pjob)
  {
  char           id[] = "initialize_procct";
  resource      *pnodesp = NULL;
  resource_def  *pnodes_def = NULL;
  resource      *pprocsp = NULL;
  resource_def  *pprocs_def = NULL;
  resource      *procctp = NULL;
  resource_def  *procct_def = NULL;
  pbs_attribute *pattr = NULL;

  pattr = &pjob->ji_wattr[JOB_ATR_resource];

  if(pattr == NULL)
    {
    /* Something is really wrong. ji_wattr[JOB_ATR_resource] should always be set
       by the time this function is called */
    sprintf(log_buffer, "%s: Resource_List is NULL. Cannot proceed", id);
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);
    pbs_errno = PBSE_INTERNAL;
    return(ROUTE_PERM_FAILURE);
    }

  /* Has nodes been initialized? */
  if(pattr->at_flags & ATR_VFLAG_SET)
    {
    /* get the node spec from the nodes resource */
    pnodes_def = find_resc_def(svr_resc_def, "nodes", svr_resc_size);

    if(pnodes_def == NULL)
      {
      sprintf(log_buffer, "%s: Could not get nodes resource definition. Cannot proceed", id);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);
      pbs_errno = PBSE_INTERNAL;
      return(ROUTE_PERM_FAILURE);
      }

    pnodesp = find_resc_entry(pattr, pnodes_def);

    /* Get the procs count if the procs resource attribute is set */
    pprocs_def = find_resc_def(svr_resc_def, "procs", svr_resc_size);

    if(pprocs_def != NULL)
      {
      /* if pprocs_def is NULL we just go on. Otherwise we will get its value now */
      pprocsp = find_resc_entry(pattr, pprocs_def);
      /* We will evaluate pprocsp later. If it is null we do not care */
      }

    /* if neither pnodesp nor pprocsp are set, terminate */
    if(pnodesp == NULL && pprocsp == NULL)
      {
      /* nodes and procs were not set. Hopefully req_quejob set procct to 1 for us already */
      procct_def = find_resc_def(svr_resc_def, "procct", svr_resc_size);

      if(procct_def == NULL)
        {
        sprintf(log_buffer, "%s: Could not get procct resource definition. Cannot proceed", id);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);
        pbs_errno = PBSE_INTERNAL;
        return(ROUTE_PERM_FAILURE);
        }

      procctp = find_resc_entry(pattr, procct_def);

      if(procctp == NULL)
        {
        /* req_quejob did not leave a procct entry either - give up */
        sprintf(log_buffer, "%s: Could not get nodes nor procs entry from Resource_List. Cannot proceed", id);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);
        pbs_errno = PBSE_INTERNAL;
        return(ROUTE_PERM_FAILURE);
        }
      }

    /* we now set pjob->procct and we also set the resource attribute procct */
    procct_def = find_resc_def(svr_resc_def, "procct", svr_resc_size);

    if(procct_def == NULL)
      {
      sprintf(log_buffer, "%s: Could not get procct resource definition. Cannot proceed", id);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);
      pbs_errno = PBSE_INTERNAL;
      return(ROUTE_PERM_FAILURE);
      }

    procctp = find_resc_entry(pattr, procct_def);

    if(procctp == NULL)
      {
      procctp = add_resource_entry(pattr, procct_def);

      if(procctp == NULL)
        {
        sprintf(log_buffer, "%s: Could not add procct resource. Cannot proceed", id);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);
        pbs_errno = PBSE_INTERNAL;
        return(ROUTE_PERM_FAILURE);
        }
      }

    /* Finally the moment of truth. We have the nodes and procs resources.
       Add them to the procct resource */
    procctp->rs_value.at_val.at_long = 0;

    if(pnodesp != NULL)
      {
      procctp->rs_value.at_val.at_long = count_proc(pnodesp->rs_value.at_val.at_str);
      }

    if(pprocsp != NULL)
      {
      procctp->rs_value.at_val.at_long += pprocsp->rs_value.at_val.at_long;
      }

    procctp->rs_value.at_flags |= ATR_VFLAG_SET;
    }
  else
    {
    /* Something is really wrong. ji_wattr[JOB_ATR_resource] should always be set
       by the time this function is called */
    sprintf(log_buffer, "%s: Resource_List not set. Cannot proceed", id);
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);
    pbs_errno = PBSE_INTERNAL;
    return(ROUTE_PERM_FAILURE);
    }

  return(PBSE_NONE);
  } /* END initialize_procct */
/**
 * @brief
 * Update the job attribute for resources used.
 *
 * The first time this is called for a job, set up resource entries for
 * each resource that can be reported for this machine. Fill in the
 * correct values. Return an error code.
 *
 * Assumes that the session ID attribute has already been set.
 *
 * @return int
 * @retval PBSE_NONE for success.
 */

int mom_set_use(job *pjob)
  {
  resource      *pres;
  resource      *pres_req; /* BUGFIX: was used but never declared */
  attribute     *at;
  attribute     *at_req;   /* BUGFIX: was used but never declared */
  resource_def  *rd;
  u_Long        *lp_sz, lnum_sz;
  unsigned long *lp, lnum, oldcput;
  long           dur;
  long           ncpus_req;

  assert(pjob != NULL);

  at = &pjob->ji_wattr[(int)JOB_ATR_resc_used];

  assert(at->at_type == ATR_TYPE_RESC);

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_Suspend) != 0)
    return (PBSE_NONE); /* job suspended, don't track it */

  DBPRT(("%s: entered %s\n", __func__, pjob->ji_qs.ji_jobid))

  at->at_flags |= ATR_VFLAG_MODIFY;

  if ((at->at_flags & ATR_VFLAG_SET) == 0)
    {
    /* first call for this job: create the resc_used entries */

    at->at_flags |= ATR_VFLAG_SET;

    rd = find_resc_def(svr_resc_def, "ncpus", svr_resc_size);
    assert(rd != NULL);
    pres = add_resource_entry(at, rd);
    pres->rs_value.at_flags |= ATR_VFLAG_SET;
    pres->rs_value.at_type = ATR_TYPE_LONG;

    /*
     * get pointer to list of resources *requested* for the job
     * so the ncpus used can be set to ncpus requested
     */
    at_req = &pjob->ji_wattr[(int)JOB_ATR_resource];

    /* BUGFIX: the original asserted on 'at' here, which was already
     * checked above; the newly assigned at_req is what needs checking */
    assert(at_req->at_type == ATR_TYPE_RESC);

    pres_req = find_resc_entry(at_req, rd);

    if ((pres_req != NULL) &&
        ((ncpus_req = pres_req->rs_value.at_val.at_long) != 0))
      pres->rs_value.at_val.at_long = ncpus_req;
    else
      pres->rs_value.at_val.at_long = 0;

    rd = find_resc_def(svr_resc_def, "cput", svr_resc_size);
    assert(rd != NULL);
    pres = add_resource_entry(at, rd);
    pres->rs_value.at_flags |= ATR_VFLAG_SET;
    pres->rs_value.at_type = ATR_TYPE_LONG;
    pres->rs_value.at_val.at_long = 0;

    rd = find_resc_def(svr_resc_def, "cpupercent", svr_resc_size);
    assert(rd != NULL);
    pres = add_resource_entry(at, rd);
    pres->rs_value.at_flags |= ATR_VFLAG_SET;
    pres->rs_value.at_type = ATR_TYPE_LONG;
    pres->rs_value.at_val.at_long = 0;

    rd = find_resc_def(svr_resc_def, "vmem", svr_resc_size);
    assert(rd != NULL);
    pres = add_resource_entry(at, rd);
    pres->rs_value.at_flags |= ATR_VFLAG_SET;
    pres->rs_value.at_type = ATR_TYPE_SIZE;
    pres->rs_value.at_val.at_size.atsv_shift = 10; /* in KB */
    pres->rs_value.at_val.at_size.atsv_units = ATR_SV_BYTESZ;

    rd = find_resc_def(svr_resc_def, "walltime", svr_resc_size);
    assert(rd != NULL);
    pres = add_resource_entry(at, rd);
    pres->rs_value.at_flags |= ATR_VFLAG_SET;
    pres->rs_value.at_type = ATR_TYPE_LONG;

    rd = find_resc_def(svr_resc_def, "mem", svr_resc_size);
    assert(rd != NULL);
    pres = add_resource_entry(at, rd);
    pres->rs_value.at_flags |= ATR_VFLAG_SET;
    pres->rs_value.at_type = ATR_TYPE_SIZE;
    pres->rs_value.at_val.at_size.atsv_shift = 10; /* in KB */
    pres->rs_value.at_val.at_size.atsv_units = ATR_SV_BYTESZ;
    }

  /* cput is monotonically non-decreasing: keep the max seen so far */

  rd = find_resc_def(svr_resc_def, "cput", svr_resc_size);
  assert(rd != NULL);
  pres = find_resc_entry(at, rd);
  assert(pres != NULL);
  lp = (unsigned long *)&pres->rs_value.at_val.at_long;
  oldcput = *lp;
  lnum = MAX(*lp, cput_sum(pjob));
  *lp = lnum;

  /* now calculate weight moving average cpu usage percentage */

  if ((dur = sampletime_ceil + 1 - pjob->ji_sampletim) > PBS_MIN_CPUPERCENT_PERIOD)
    {
    calc_cpupercent(pjob, oldcput, lnum, dur, at);
    }

  pjob->ji_sampletim = sampletime_floor;

  rd = find_resc_def(svr_resc_def, "vmem", svr_resc_size);
  assert(rd != NULL);
  pres = find_resc_entry(at, rd);
  assert(pres != NULL);
  lp_sz = &pres->rs_value.at_val.at_size.atsv_num;
  lnum_sz = (mem_sum(pjob) + 1023) >> 10; /* as KB */
  *lp_sz = MAX(*lp_sz, lnum_sz);

  rd = find_resc_def(svr_resc_def, "walltime", svr_resc_size);
  assert(rd != NULL);
  pres = find_resc_entry(at, rd);
  assert(pres != NULL);
  pres->rs_value.at_val.at_long =
    (long)((double)(time_now - pjob->ji_qs.ji_stime) * wallfactor);

  rd = find_resc_def(svr_resc_def, "mem", svr_resc_size);
  assert(rd != NULL);
  pres = find_resc_entry(at, rd);
  assert(pres != NULL);
  lp_sz = &pres->rs_value.at_val.at_size.atsv_num;
  lnum_sz = (resi_sum(pjob) + 1023) >> 10; /* in KB */
  *lp_sz = MAX(*lp_sz, lnum_sz);

  return (PBSE_NONE);
  }
/*
 * set_node_ct - action routine for the "nodes" resource (C++ variant).
 *
 * Derives "nodect", "neednodes" and "procct" on the job's Resource_List
 * from the "nodes" spec (and "procs", when present).
 *
 * Returns 0 on success, PBSE_SYSTEM on lookup/allocation failure.
 */

int set_node_ct(

  resource      *pnodesp, /* I */
  pbs_attribute *pattr,   /* I */
  int            actmode) /* I */

  {
  resource     *pnct;
  resource_def *pndef;
  resource     *ppct = NULL;
  resource_def *ppdef;
  resource     *pprocsp;
  resource_def *pprocsdef;

  if (actmode == ATR_ACTION_RECOV)
    {
    /* SUCCESS */
    return(0);
    }

  // WARNING: we are potentially re-sizing the vector in the calls to add_resource_entry()
  // below. All attempts to use pnodesp after the calls to add_resource_entry may be using an
  // invalid pointer, so copy data here
  std::string nodes_val(pnodesp->rs_value.at_val.at_str);

  /* Set "nodect" to count of nodes in "nodes" */

  pndef = find_resc_def(svr_resc_def, "nodect", svr_resc_size);

  if (pndef == NULL)
    {
    return(PBSE_SYSTEM);
    }

  if ((pnct = find_resc_entry(pattr, pndef)) == NULL)
    {
    if ((pnct = add_resource_entry(pattr, pndef)) == NULL)
      {
      return(PBSE_SYSTEM);
      }
    }

  pnct->rs_value.at_val.at_long = ctnodes(nodes_val.c_str());
  pnct->rs_value.at_flags |= ATR_VFLAG_SET;

  /* Set "neednodes" to "nodes", may be altered by scheduler */

  pndef = find_resc_def(svr_resc_def, "neednodes", svr_resc_size);

  if (pndef == NULL)
    {
    return(PBSE_SYSTEM);
    }

  if ((pnct = find_resc_entry(pattr, pndef)) == NULL)
    {
    if ((pnct = add_resource_entry(pattr, pndef)) == NULL)
      {
      return(PBSE_SYSTEM);
      }
    }
  else
    {
    /* existing value must be freed before decoding the new one */
    pndef->rs_free(&pnct->rs_value);
    }

  pndef->rs_decode(&pnct->rs_value, NULL, NULL, nodes_val.c_str(), ATR_DFLAG_ACCESS);

  pnct->rs_value.at_flags |= ATR_VFLAG_SET;

  /* SUCCESS nodect */

  /* set "procct" to count of processors in "nodes" plus "procs" */

  ppdef = find_resc_def(svr_resc_def, "procct", svr_resc_size);

  if (ppdef == NULL)
    {
    return(PBSE_SYSTEM);
    }

  if ((ppct = find_resc_entry(pattr, ppdef)) == NULL)
    {
    if ((ppct = add_resource_entry(pattr, ppdef)) == 0)
      {
      return(PBSE_SYSTEM);
      }
    }

  pprocsdef = find_resc_def(svr_resc_def, "procs", svr_resc_size);

  if (pprocsdef == NULL)
    {
    return(PBSE_SYSTEM);
    }

  if ((pprocsp = find_resc_entry(pattr, pprocsdef)) == NULL)
    {
    /* no "procs" request - procct comes from the node spec alone */
    ppct->rs_value.at_val.at_long = count_proc(nodes_val.c_str());
    }
  else
    {
    /* procct = procs + processors implied by the node spec */
    ppct->rs_value.at_val.at_long = pprocsp->rs_value.at_val.at_long;
    ppct->rs_value.at_val.at_long += count_proc(nodes_val.c_str());
    }

  ppct->rs_value.at_flags |= ATR_VFLAG_SET;

  /* SUCCESS procct */

  return(0);
  }  /* END set_node_ct() */
/*
 * chk_job_torun - validate that the job named in a run/stage-in request may
 * be started, and reserve execution hosts for it.
 *
 * Rejects the request (via req_reject) and returns NULL on any failure:
 * bad job state, unauthorized requester, non-execution queue, destination
 * mismatch for checkpointed/staged-in jobs, or host-assignment failure.
 *
 * @param preq  - (I) the batch request (run or stage-in)
 * @param setnn - (I) when 1, also set "neednodes" (TDEV builds only)
 *
 * @return the validated job, or NULL on failure (request already rejected).
 */

static job *chk_job_torun(

  struct batch_request *preq,  /* I */
  int                   setnn) /* I */

  {
  static char *id = "chk_job_torun";

  job  *pjob;

  struct rq_runjob *prun;
  int               rc;

  char  EMsg[1024];
  char  FailHost[1024];
  char  exec_host[1024];
  char *ptr;

  prun = &preq->rq_ind.rq_run;

  if ((pjob = chk_job_request(prun->rq_jid, preq)) == 0)
    {
    /* FAILURE */

    return(NULL);
    }

  if ((pjob->ji_qs.ji_state == JOB_STATE_TRANSIT) ||
      (pjob->ji_qs.ji_state == JOB_STATE_EXITING) ||
      (pjob->ji_qs.ji_substate == JOB_SUBSTATE_STAGEGO) ||
      (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) ||
      (pjob->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING))
    {
    /* FAILURE - job already started */

    req_reject(PBSE_BADSTATE, 0, preq, NULL, "job already running");

    return(NULL);
    }

  if (preq->rq_type == PBS_BATCH_StageIn)
    {
    if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_STAGEIN)
      {
      /* FAILURE - stage-in already in progress */

      req_reject(PBSE_BADSTATE, 0, preq, NULL, NULL);

      return(NULL);
      }
    }

  if ((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0)
    {
    /* FAILURE - run request not authorized */

    req_reject(PBSE_PERM, 0, preq, NULL, NULL);

    return(NULL);
    }

  if (pjob->ji_qhdr->qu_qs.qu_type != QTYPE_Execution)
    {
    /* FAILURE - job must be in execution queue */

    log_err(-1, id, "attempt to start job in non-execution queue");

    req_reject(PBSE_IVALREQ, 0, preq, NULL, "job not in execution queue");

    return(NULL);
    }

  /* where to execute the job */

#ifdef ENABLE_BLCR
  if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn)
#else
  if (pjob->ji_qs.ji_svrflags & (JOB_SVFLG_CHECKPOINT_FILE | JOB_SVFLG_StagedIn))
#endif
    {
    /* job has been checkpointed or files already staged in */
    /* in this case, exec_host must be already set */

    if (prun->rq_destin && *prun->rq_destin) /* If a destination has been specified */
      {
      /* specified destination must match exec_host */

      strcpy(exec_host, pjob->ji_wattr[(int)JOB_ATR_exec_host].at_val.at_str);

      if ((ptr = strchr(exec_host, '/')))
        * ptr = 0; /* For some reason, node name has "/0" on the end (i.e. "node0001/0"). */

      if (strcmp(prun->rq_destin, exec_host) != 0)
        {
        /* FAILURE */

        if (pjob->ji_qs.ji_svrflags & (JOB_SVFLG_CHECKPOINT_FILE))
          req_reject(PBSE_EXECTHERE, 0, preq, NULL, "allocated nodes must match checkpoint location");
        else
          req_reject(PBSE_EXECTHERE, 0, preq, NULL, "allocated nodes must match input file stagein location");

        return(NULL);
        }
      }

    if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_HasNodes) == 0)
      {
      /* re-reserve nodes and leave exec_host as is */

      if ((rc = assign_hosts(  /* inside chk_job_torun() */
                  pjob,
                  pjob->ji_wattr[(int)JOB_ATR_exec_host].at_val.at_str,
                  0,
                  FailHost,
                  EMsg)) != 0)   /* O */
        {
        req_reject(PBSE_EXECTHERE, 0, preq, FailHost, EMsg);

        return(NULL);
        }
      }
    }  /* END if (pjob->ji_qs.ji_svrflags & (JOB_SVFLG_CHECKPOINT_FILE|JOB_SVFLG_StagedIn)) */
  else
    {
    /* make sure exec gpus is clear */

    if (((pjob->ji_wattr[JOB_ATR_exec_gpus].at_flags & ATR_VFLAG_SET) != 0) &&
        (pjob->ji_wattr[JOB_ATR_exec_gpus].at_val.at_str != NULL))
      {
      job_attr_def[(int)JOB_ATR_exec_gpus].at_free(
        &pjob->ji_wattr[JOB_ATR_exec_gpus]);
      }

    /* job has not run before or need not run there again */
    /* reserve nodes and set new exec_host */

    if ((prun->rq_destin == NULL) || (prun->rq_destin[0] == '\0'))
      {
      /* it is possible for the scheduler to pass a hostlist using the
       * rq_extend field--we should use it as the given list
       * as an alternative to rq_destin */

      rc = assign_hosts(pjob, preq->rq_extend, 1, FailHost, EMsg);  /* inside chk_job_torun() */
      }
    else
      {
      rc = assign_hosts(pjob, prun->rq_destin, 1, FailHost, EMsg);  /* inside chk_job_torun() */
      }

    if (rc != 0)
      {
      /* FAILURE - cannot assign correct hosts */

      req_reject(rc, 0, preq, FailHost, EMsg);

      return(NULL);
      }
    }

  if (setnn == 1)
    {
#ifdef TDEV
    /* what should neednodes be set to? */

    resource_def *DRes;  /* resource definition */

    resource *JRes;      /* resource on job */

    attribute *Attr;     /* 'neednodes' attribute */

    Attr = &pjob->ji_wattr[(int)JOB_ATR_resource];

    DRes = find_resc_def(svr_resc_def, "neednodes", svr_resc_size);

    JRes = find_resc_entry(Attr, DRes);

    if ((JRes == NULL) ||
        ((JRes->rs_value.at_flags & ATR_VFLAG_SET) == 0))
      {
      /* resource does not exist or value is not set */

      if (JRes == NULL)
        {
        JRes = add_resource_entry(Attr, DRes);
        }

      if (JRes != NULL)
        {
        if (DRes->rs_defin->rs_set(
              &JRes->rs_value,
              &DRes->rs_value,
              SET) == 0)
          {
          JRes->rs_value.at_flags |= ATR_VFLAG_SET;
          }
        }
      }
#endif /* TDEV */
    }  /* END if (setnn == 1) */

  return(pjob);
  }  /* END chk_job_torun() */
/*
 * CPACreatePartition - create and assign a Cray CPA partition for a job.
 *
 * Reads "size" (required, > 0) and the optional "subnode_list" from the
 * job's Resource_List, creates a CPA node request and partition, assigns
 * it to the job, then records the partition id and cookies back into the
 * Resource_List and the job's environment table.
 *
 * @param pjob - (I) the job to create a partition for
 * @param vtab - (I/O) environment variable table to receive partition vars
 *
 * @return 0 on success, 1 on CPA failure, PBSE_SYSTEM on resource failure.
 */

int CPACreatePartition(

  job              *pjob,  /* I */
  struct var_table *vtab)  /* I */

  {
  char id[] = "CPACreatePartition";

  cpa_node_req_t *NodeReq;

  /* BUGFIX: 'rc' was declared twice in the original (a compile error);
   * the unused local 'Value' was dropped as well */
  int   rc;

  char *Spec;
  int   PPN;
  int   Flags;
  int   Size = 0;
  int   UID;
  char *AcctID = NULL;
  char *JobID;

  char *HostList = NULL;  /* scheduler specified list of hosts to allocate (optional) */

  unsigned long      ParID;        /* O - partition id    */
  unsigned long long AdminCookie;  /* O - admin cookie    */
  unsigned long long AllocCookie;  /* O - alloc cookie    */

  char longbuf[1024];

  resource     *presc;  /* Requested Resource List */
  resource_def *prd;
  attribute    *pattr;

  cpa_nid_list_t Wanted = NULL;

  /* first, get the size, uid, jobid, and subnodelist from the job */

  pattr = &pjob->ji_wattr[JOB_ATR_resource];

  prd = find_resc_def(svr_resc_def, "size", svr_resc_size);
  presc = find_resc_entry(pattr, prd);

  if (presc != NULL)
    {
    Size = presc->rs_value.at_val.at_long;
    }

  UID = pjob->ji_qs.ji_un.ji_momt.ji_exuid;

  if ((Size <= 0) || (UID < 0))
    {
    /* FAILURE */

    sprintf(log_buffer, "ERROR: invalid parameters: Size: %d UID: %d \n",
            Size,
            UID);

    log_err(-1, id, log_buffer);

    return(1);
    }

  pattr = &pjob->ji_wattr[JOB_ATR_resource];

  prd = find_resc_def(svr_resc_def, "subnode_list", svr_resc_size);
  presc = find_resc_entry(pattr, prd);

  if (presc != NULL)
    {
    /* BUGFIX: the attribute value union member is at_str ('at_string'
     * does not exist - every other access in this file uses at_str) */
    HostList = presc->rs_value.at_val.at_str;
    }

  if (pjob->ji_wattr[JOB_ATR_account].at_flags & ATR_VFLAG_SET)
    {
    AcctID = pjob->ji_wattr[JOB_ATR_account].at_val.at_str;
    }

  JobID = pjob->ji_qs.ji_jobid;

  PPN = 1;      /* NOTE: not really supported w/in CPA, always use 1 */
  Flags = 0;    /* NOTE: only allocate compute hosts, always use 0 */
  Spec = NULL;  /* NOTE: required node specification, not used */

  if (HostList != NULL)
    {
    char tmpBuffer[256000];

    int index;

    rc = nid_list_create(
           0,
           MaxListSize,  /* max count */
           0,
           MaxNID,       /* max value */
           &Wanted);     /* O */

    if (rc != 0)
      {
      /* FAILURE */

      printf("nid_list_create: rc=%d (%s)\n",
             rc,
             cpa_rc2str(rc));

      return(1);
      }

    strncpy(tmpBuffer, HostList, sizeof(tmpBuffer));
    tmpBuffer[sizeof(tmpBuffer) - 1] = '\0';

    /* nid_list_destringify() expects comma separators, not colons */

    for (index = 0;tmpBuffer[index] != '\0';index++)
      {
      if (tmpBuffer[index] == ':')
        tmpBuffer[index] = ',';
      }

    rc = nid_list_destringify(tmpBuffer, Wanted);

    if (rc != 0)
      {
      /* FAILURE */

      printf("nid_list_destringify: rc=%d (%s)\n",
             rc,
             cpa_rc2str(rc));

      nid_list_destroy(Wanted);

      return(1);
      }

    if (loglevel >= 3)
      {
      char *buf = NULL;
      int   bufsize = 0;

      rc = nid_list_stringify(Wanted, &buf, &bufsize);

      if (rc == 0)
        {
        snprintf(log_buffer, sizeof(log_buffer), "CPANodeList: %s\n",
                 buf);
        }
      else
        {
        snprintf(log_buffer, sizeof(log_buffer), "CPA nid_list_stringify: rc=%d\n",
                 rc);
        }

      log_record(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        log_buffer);

      free(buf);
      }
    }
  else
    {
    Wanted = NULL;
    }

  NodeReq = cpa_new_node_req(
              Size,   /* number of procs/nodes required by job */
              PPN,
              Flags,
              Spec,
              Wanted);  /* I */

  if (NodeReq == NULL)
    {
    /* FAILURE: cannot alloc memory for node req */

    sprintf(log_buffer, "cpa_new_node_req: NULL\n");

    log_err(-1, id, log_buffer);

    nid_list_destroy(Wanted);

    return(1);
    }

  rc = cpa_create_partition(
         NodeReq,
         CPA_BATCH,
         CPA_NOT_SPECIFIED,
         UID,
         (AcctID != NULL) ? AcctID : "DEFAULT",
         (cpa_partition_id_t *) & ParID,       /* O */
         (cpa_cookie_t *) & AdminCookie,       /* O */
         (cpa_cookie_t *) & AllocCookie);      /* O */

  if (rc != 0)
    {
    /* FAILURE */

    sprintf(log_buffer, "cpa_create_partition: rc=%d (%s)\n",
            rc,
            cpa_rc2str(rc));

    log_err(-1, id, log_buffer);

    nid_list_destroy(Wanted);

    return(1);
    }

  rc = cpa_assign_partition(
         (cpa_partition_id_t)ParID,
         (cpa_cookie_t)AdminCookie,
         JobID,
         1);  /* NOT CURRENTLY USED - should be set to NID of 'master host' */

  /* free memory, nid list no longer required */

  nid_list_destroy(Wanted);

  if (rc != 0)
    {
    /* FAILURE */

    sprintf(log_buffer, "cpa_assign_partition: rc=%d (%s)\n",
            rc,
            cpa_rc2str(rc));

    log_err(-1, id, log_buffer);

    return(1);
    }

  /* save the partition and cookies in the job and vtab */

  prd = find_resc_def(svr_resc_def, "cpapartition", svr_resc_size);

  if (prd == NULL)
    {
    return(PBSE_SYSTEM);
    }

  if ((presc = find_resc_entry(pattr, prd)) == NULL)
    {
    if ((presc = add_resource_entry(pattr, prd)) == NULL)
      {
      return(PBSE_SYSTEM);
      }
    }
  else
    {
    prd->rs_free(&presc->rs_value);
    }

  snprintf(longbuf, sizeof(longbuf), "%lu", ParID);

  /* NOTE(review): rs_decode is called with 4 args here but 5 args
   * (with a trailing perm) elsewhere in this file - confirm which
   * rs_decode signature this translation unit is built against */
  prd->rs_decode(&presc->rs_value, NULL, NULL, longbuf);

  presc->rs_value.at_flags |= ATR_VFLAG_SET;

  bld_env_variables(vtab, "BATCH_PARTITION_ID", longbuf);

  prd = find_resc_def(svr_resc_def, "cpaadmincookie", svr_resc_size);

  if (prd == NULL)
    {
    return(PBSE_SYSTEM);
    }

  if ((presc = find_resc_entry(pattr, prd)) == NULL)
    {
    if ((presc = add_resource_entry(pattr, prd)) == NULL)
      {
      return(PBSE_SYSTEM);
      }
    }
  else
    {
    prd->rs_free(&presc->rs_value);
    }

  snprintf(longbuf, sizeof(longbuf), "%llu", AdminCookie);

  prd->rs_decode(&presc->rs_value, NULL, NULL, longbuf);

  presc->rs_value.at_flags |= ATR_VFLAG_SET;

  /* admincookie doesn't go into job env */

  prd = find_resc_def(svr_resc_def, "cpaalloccookie", svr_resc_size);

  if (prd == NULL)
    {
    return(PBSE_SYSTEM);
    }

  if ((presc = find_resc_entry(pattr, prd)) == NULL)
    {
    if ((presc = add_resource_entry(pattr, prd)) == NULL)
      {
      return(PBSE_SYSTEM);
      }
    }
  else
    {
    prd->rs_free(&presc->rs_value);
    }

  snprintf(longbuf, sizeof(longbuf), "%llu", AllocCookie);

  prd->rs_decode(&presc->rs_value, NULL, NULL, longbuf);

  presc->rs_value.at_flags |= ATR_VFLAG_SET;

  bld_env_variables(vtab, "BATCH_ALLOC_COOKIE", longbuf);

  bld_env_variables(vtab, "BATCH_JOBID", JobID);

  return(0);
  }  /* END CPACreatePartition() */
/*
 * set_node_ct - action routine for the "nodes" resource (server build only;
 * compiled out under PBS_MOM).
 *
 * Validates the node spec, derives "nodect" from it, and reconciles the
 * "ncpus" resource with the cpu count implied by the spec.
 *
 * @return 0 on success; a validation error from validate_nodespec();
 *         PBSE_SYSTEM on definition/entry failure; PBSE_BADATVAL when a
 *         user-set ncpus conflicts with the node spec.
 */

int set_node_ct(resource *pnodesp, attribute *pattr, void *pobj, int type, int actmode)
  {
#ifndef PBS_MOM
  int  nn;        /* num of nodes */
  int  nt;        /* num of tasks (processes) */
  int  hcpp = 0;  /* has :ccp in string */
  long nc;

  resource *pnct;
  resource *pncpus;
  resource_def *pndef;

  if ((actmode == ATR_ACTION_RECOV) ||
      ((pnodesp->rs_value.at_flags & ATR_VFLAG_SET) == 0))
    return (0);

  /* first validate the spec */

  if ((nn = validate_nodespec(pnodesp->rs_value.at_val.at_str)) != 0)
    return nn;

  /* Set "nodect" to count of nodes in "nodes" */

  pndef = find_resc_def(svr_resc_def, "nodect", svr_resc_size);

  if (pndef == (resource_def *)0)
    return (PBSE_SYSTEM);

  if ((pnct = find_resc_entry(pattr, pndef)) == (resource *)0)
    {
    if ((pnct = add_resource_entry(pattr, pndef)) == 0)
      return (PBSE_SYSTEM);
    }

  nn = ctnodes(pnodesp->rs_value.at_val.at_str);

  pnct->rs_value.at_val.at_long = nn;

  pnct->rs_value.at_flags |= ATR_VFLAG_SET | ATR_VFLAG_MODCACHE;

  /* find the number of cpus specified in the node string */

  nt = ctcpus(pnodesp->rs_value.at_val.at_str, &hcpp);

  /* Is "ncpus" set as a separate resource? */

  pndef = find_resc_def(svr_resc_def, "ncpus", svr_resc_size);

  if (pndef == (resource_def *)0)
    return (PBSE_SYSTEM);

  if ((pncpus = find_resc_entry(pattr, pndef)) == (resource *)0)
    {
    if ((pncpus = add_resource_entry(pattr, pndef)) == 0)
      return (PBSE_SYSTEM);
    }

  if (((pncpus->rs_value.at_flags & (ATR_VFLAG_SET | ATR_VFLAG_DEFLT)) ==
       ATR_VFLAG_SET) && (actmode == ATR_ACTION_NEW))
    {
    /* ncpus is already set and not a default and new job */

    nc = pncpus->rs_value.at_val.at_long;

    if (hcpp && (nt != pncpus->rs_value.at_val.at_long))
      {
      /* if cpp string specified, this is an error */
      return (PBSE_BADATVAL);
      }
    else if ((nt == 0) || ((nc % nt) != 0))
      {
      /* BUGFIX: guard nt == 0 - the original computed (nc % nt) with a
       * possibly-zero divisor (undefined behavior).  ncpus must be a
       * (positive) multiple of the number of tasks. */
      return (PBSE_BADATVAL);
      }
    }
  else
    {
    /* ncpus is not set or not a new job (qalter being done) */
    /* force ncpus to the correct thing */
    pncpus->rs_value.at_val.at_long = nt;
    pncpus->rs_value.at_flags |= (ATR_VFLAG_SET | ATR_VFLAG_MODCACHE);
    }

#endif  /* not MOM */

  return (0);
  }
/*
 * decode_resc - decode a single resource (name=value) into a resource-type
 * attribute, adding the resource entry if it does not yet exist.
 *
 * Unknown resource names are decoded into the "unknown" catch-all
 * definition; in that case PBSE_UNKRESC is returned on success so the
 * caller may choose whether to accept it.
 *
 * @return 0 or PBSE_UNKRESC on a successful decode; PBSE_INTERNAL,
 *         PBSE_UNKRESC (NULL name), PBSE_SYSTEM, PBSE_ATTRRO, or the
 *         rs_decode() error code on failure.
 */

int decode_resc(

  pbs_attribute *patr,  /* Modified on Return */
  char          *name,  /* pbs_attribute name */
  char          *rescn, /* I resource name - is used here */
  char          *val,   /* resource value */
  int            perm)  /* access permissions */

  {
  resource     *prsc;
  resource_def *prdef;
  int           rc = 0;
  int           rv;

  if (patr == NULL)
    {
    return(PBSE_INTERNAL);
    }

  if (rescn == NULL)
    {
    return(PBSE_UNKRESC);
    }

  if (!(patr->at_flags & ATR_VFLAG_SET))
    CLEAR_HEAD(patr->at_val.at_list);

  prdef = find_resc_def(svr_resc_def, rescn, svr_resc_size);

  if (prdef == NULL)
    {
    /*
     * didn't find resource with matching name, use unknown;
     * but return PBSE_UNKRESC in case caller doesn't wish to
     * accept unknown resources
     */

    rc = PBSE_UNKRESC;

    /* the "unknown" definition is the last entry in the table */
    prdef = svr_resc_def + (svr_resc_size - 1);
    }

  prsc = find_resc_entry(patr, prdef);

  if (prsc == NULL) /* no current resource entry, add it */

    if ((prsc = add_resource_entry(patr, prdef)) == NULL)
      {
      return(PBSE_SYSTEM);
      }

  /* note special use of ATR_DFLAG_ACCESS, see server/attr_recov() */

  if (((prsc->rs_defin->rs_flags & perm & ATR_DFLAG_WRACC) == 0) &&
      (perm != ATR_DFLAG_ACCESS))
    {
    return(PBSE_ATTRRO);
    }

  patr->at_flags |= ATR_VFLAG_SET | ATR_VFLAG_MODIFY;

  rv = prdef->rs_decode(&prsc->rs_value, name, rescn, val, perm);

  if (rv == 0)
    {
    /* SUCCESS - rs_decode() returns 0 on success; the original comments
     * here had SUCCESS/FAILURE swapped.  rc is 0, or PBSE_UNKRESC when
     * the resource name was unknown (see above). */
    return(rc);
    }

  /* FAILURE - propagate the rs_decode() error code */

  return(rv);
  }