int status_job(

  job                  *pjob,    /* ptr to job to status */
  struct batch_request *preq,
  svrattrl             *pal,     /* specific attributes to status */
  tlist_head           *pstathd, /* RETURN: head of list to append status to */
  int                  *bad)     /* RETURN: index of first bad pbs_attribute */

  {
  struct brp_status *pstat;
  int                IsOwner = 0;
  long               query_others = 0;

  /* see if the client is authorized to status this job */

  if (svr_authorize_jobreq(preq, pjob) == 0)
    IsOwner = 1;

  get_svr_attr_l(SRV_ATR_query_others, &query_others);

  if (!query_others)
    {
    if (IsOwner == 0)
      {
      return(PBSE_PERM);
      }
    }

  /* allocate reply structure and fill in header portion */

  if ((pstat = (struct brp_status *)calloc(1, sizeof(struct brp_status))) == NULL)
    {
    return(PBSE_SYSTEM);
    }

  CLEAR_LINK(pstat->brp_stlink);

  pstat->brp_objtype = MGR_OBJ_JOB;

  strcpy(pstat->brp_objname, pjob->ji_qs.ji_jobid);

  CLEAR_HEAD(pstat->brp_attr);

  append_link(pstathd, &pstat->brp_stlink, pstat);

  /* add attributes to the status reply */

  *bad = 0;

  if (status_attrib(
        pal,
        job_attr_def,
        pjob->ji_wattr,
        JOB_ATR_LAST,
        preq->rq_perm,
        &pstat->brp_attr,
        bad,
        IsOwner))
    {
    return(PBSE_NOATTR);
    }

  return(0);
  }  /* END status_job() */
static int forced_jobpurge(

  job                  *pjob,
  struct batch_request *preq)

  {
  long owner_purge = FALSE;

  /* check about possibly purging the job */

  if (preq->rq_extend != NULL)
    {
    if (!strncmp(preq->rq_extend, delpurgestr, strlen(delpurgestr)))
      {
      get_svr_attr_l(SRV_ATR_OwnerPurge, &owner_purge);

      if (((preq->rq_perm & (ATR_DFLAG_OPRD | ATR_DFLAG_OPWR | ATR_DFLAG_MGRD | ATR_DFLAG_MGWR)) != 0) ||
          ((svr_chk_owner(preq, pjob) == 0) && (owner_purge)))
        {
        force_purge_work(pjob);

        return(PURGE_SUCCESS);
        }
      else
        {
        /* FAILURE */

        req_reject(PBSE_PERM, 0, preq, NULL, NULL);

        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

        return(-1);
        }
      }
    }

  return(PBSE_NONE);
  }  /* END forced_jobpurge() */
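The authorization test above reduces to a pure predicate over the requester's permission bits and ownership. A minimal standalone sketch of that predicate; can_force_purge() and the flag values are hypothetical stand-ins for the ATR_DFLAG_* operator/manager bits, not TORQUE API:

/* Sketch of the forced_jobpurge() authorization rule: operators and
 * managers may always force a purge; an owner may only when the
 * server's owner purge attribute is enabled. Placeholder bit values. */
#include <stdio.h>

#define ATR_DFLAG_OPRD 0x010
#define ATR_DFLAG_OPWR 0x020
#define ATR_DFLAG_MGRD 0x040
#define ATR_DFLAG_MGWR 0x080

static int can_force_purge(int perm, int is_owner, int owner_purge)
  {
  int op_or_mgr = perm & (ATR_DFLAG_OPRD | ATR_DFLAG_OPWR | ATR_DFLAG_MGRD | ATR_DFLAG_MGWR);

  return (op_or_mgr != 0) || (is_owner && owner_purge);
  }

int main(void)
  {
  printf("%d\n", can_force_purge(ATR_DFLAG_OPWR, 0, 0)); /* 1: operator */
  printf("%d\n", can_force_purge(0, 1, 0));              /* 0: owner, purge off */
  printf("%d\n", can_force_purge(0, 1, 1));              /* 1: owner, purge on */
  return 0;
  }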
/**
 * poll_job_task
 *
 * The invocation of this routine is triggered from
 * the pbs_server main_loop code.
 */

void poll_job_task(

  struct work_task *ptask)

  {
  char   *job_id = (char *)ptask->wt_parm1;
  job    *pjob;
  time_t  time_now = time(NULL);
  long    poll_jobs = 0;
  long    job_stat_rate = 0;

  free(ptask->wt_mutex);
  free(ptask);

  if (job_id != NULL)
    {
    pjob = svr_find_job(job_id, FALSE);

    if (pjob != NULL)
      {
      mutex_mgr job_mutex(pjob->ji_mutex, true);
      int       job_state = -1;

      job_state = pjob->ji_qs.ji_state;

      // only do things for running jobs
      if (job_state == JOB_STATE_RUNNING)
        {
        job_mutex.unlock();

        get_svr_attr_l(SRV_ATR_JobStatRate, &job_stat_rate);

        if (time(NULL) - pjob->ji_last_reported_time > job_stat_rate)
          {
          get_svr_attr_l(SRV_ATR_PollJobs, &poll_jobs);

          if (poll_jobs)
            stat_mom_job(job_id);
          }

        /* add another task */
        set_task(WORK_Timed, time_now + (job_stat_rate / 3), poll_job_task, strdup(job_id), FALSE);
        }
      }

    free(job_id);
    }
  }  /* END poll_job_task() */
static void job_delete_nanny(

  struct work_task *pwt)

  {
  job                  *pjob;
  char                 *sigk = "SIGKILL";
  char                 *jobid;
  struct batch_request *newreq;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  time_t                time_now = time(NULL);
  long                  nanny = FALSE;

  /* short-circuit if nanny isn't enabled */
  get_svr_attr_l(SRV_ATR_JobNanny, &nanny);

  if (!nanny)
    {
    jobid = (char *)pwt->wt_parm1;

    if (jobid != NULL)
      {
      pjob = svr_find_job(jobid, FALSE);

      if (pjob != NULL)
        {
        sprintf(log_buf, "exiting job '%s' still exists, sending a SIGKILL", pjob->ji_qs.ji_jobid);
        log_err(-1, "job nanny", log_buf);

        /* build up a Signal Job batch request */
        if ((newreq = alloc_br(PBS_BATCH_SignalJob)) != NULL)
          {
          strcpy(newreq->rq_ind.rq_signal.rq_jid, pjob->ji_qs.ji_jobid);

          snprintf(newreq->rq_ind.rq_signal.rq_signame, sizeof(newreq->rq_ind.rq_signal.rq_signame), "%s", sigk);
          }

        issue_signal(&pjob, sigk, post_job_delete_nanny, newreq);

        if (pjob != NULL)
          {
          apply_job_delete_nanny(pjob, time_now + 60);

          unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
          }
        }
      }
    else
      {
      log_err(ENOMEM, __func__, "Cannot allocate memory");
      }
    }

  if (pwt->wt_parm1 != NULL)
    free(pwt->wt_parm1);

  free(pwt->wt_mutex);
  free(pwt);
  }  /* END job_delete_nanny() */
void post_job_delete_nanny(

  batch_request *preq_sig)

  {
  int   rc;
  job  *pjob;
  char  log_buf[LOCAL_LOG_BUF_SIZE];
  long  nanny = 0;

  if (preq_sig == NULL)
    return;

  rc = preq_sig->rq_reply.brp_code;

  get_svr_attr_l(SRV_ATR_JobNanny, &nanny);

  if (!nanny)
    {
    /* the admin disabled nanny within the last minute or so */
    free_br(preq_sig);

    return;
    }

  /* extract job id from task */
  pjob = svr_find_job(preq_sig->rq_ind.rq_signal.rq_jid, FALSE);

  if (pjob == NULL)
    {
    sprintf(log_buf, "job delete nanny: the job disappeared (this is a BUG!)");

    log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_JOB, preq_sig->rq_ind.rq_signal.rq_jid, log_buf);

    /* no job mutex to release - free the task and bail out */
    free_br(preq_sig);

    return;
    }
  else if (rc == PBSE_UNKJOBID)
    {
    sprintf(log_buf, "job delete nanny returned, but job no longer exists on mom");

    log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_JOB, preq_sig->rq_ind.rq_signal.rq_jid, log_buf);

    free_nodes(pjob);

    set_resc_assigned(pjob, DECR);

    free_br(preq_sig);

    svr_job_purge(pjob);

    return;
    }

  unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

  /* free task */
  free_br(preq_sig);

  return;
  }  /* END post_job_delete_nanny() */
/**
 * poll_job_task
 *
 * The invocation of this routine is triggered from
 * the pbs_server main_loop code. The check of
 * SRV_ATR_PollJobs appears to be redundant.
 */

void poll_job_task(

  struct work_task *ptask)

  {
  char   *job_id = (char *)ptask->wt_parm1;
  job    *pjob;
  time_t  time_now = time(NULL);
  long    poll_jobs = 0;
  int     job_state = -1;

  if (job_id != NULL)
    {
    pjob = svr_find_job(job_id, FALSE);

    if (pjob != NULL)
      {
      mutex_mgr job_mutex(pjob->ji_mutex, true);

      job_state = pjob->ji_qs.ji_state;
      job_mutex.unlock();

      get_svr_attr_l(SRV_ATR_PollJobs, &poll_jobs);

      if ((poll_jobs) &&
          (job_state == JOB_STATE_RUNNING))
        {
        /* we need to throttle the number of outstanding threads doing
         * job polling. This prevents a problem where pbs_server gets
         * hung waiting on I/O from the mom */
        pthread_mutex_lock(poll_job_task_mutex);

        if (current_poll_job_tasks < max_poll_job_tasks)
          {
          current_poll_job_tasks++;
          pthread_mutex_unlock(poll_job_task_mutex);

          stat_mom_job(job_id);

          pthread_mutex_lock(poll_job_task_mutex);
          current_poll_job_tasks--;
          }

        pthread_mutex_unlock(poll_job_task_mutex);

        /* add another task */
        set_task(WORK_Timed, time_now + JobStatRate, poll_job_task, strdup(job_id), FALSE);
        }
      }

    free(job_id);
    }

  free(ptask->wt_mutex);
  free(ptask);
  }  /* END poll_job_task() */
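The counter-plus-mutex throttle above is a general pattern: bump a shared counter under a lock, do the slow work (here, stat_mom_job()'s mom I/O) outside the lock, then decrement. A self-contained sketch of the same idea; all names and the limit are hypothetical, not taken from TORQUE:

/* Standalone sketch of the poll throttle: at most MAX_TASKS callers
 * may be inside do_poll() at once; the rest skip this cycle.
 * Build with: cc -pthread throttle.c */
#include <pthread.h>
#include <stdio.h>

#define MAX_TASKS 2

static pthread_mutex_t counter_mutex = PTHREAD_MUTEX_INITIALIZER;
static int             current_tasks = 0;

static void do_poll(void) { /* stand-in for the slow mom I/O */ }

static void *poll_once(void *arg)
  {
  (void)arg;

  pthread_mutex_lock(&counter_mutex);

  if (current_tasks < MAX_TASKS)
    {
    current_tasks++;
    pthread_mutex_unlock(&counter_mutex);

    do_poll(); /* slow work happens outside the lock */

    pthread_mutex_lock(&counter_mutex);
    current_tasks--;
    }

  pthread_mutex_unlock(&counter_mutex);
  return NULL;
  }

int main(void)
  {
  pthread_t t[4];

  for (int i = 0; i < 4; i++)
    pthread_create(&t[i], NULL, poll_once, NULL);

  for (int i = 0; i < 4; i++)
    pthread_join(t[i], NULL);

  return 0;
  }

Note the matching unlock on both paths: the branch that did the work re-acquires the mutex before decrementing, so the single trailing unlock is always balanced.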
int set_slot_limit(

  char      *request, /* I */
  job_array *pa)      /* O */

  {
  char *pcnt;
  long  max_limit;

  /* check for a max slot limit */
  if (get_svr_attr_l(SRV_ATR_MaxSlotLimit, &max_limit) != PBSE_NONE)
    max_limit = NO_SLOT_LIMIT;

  if ((pcnt = strchr(request, '%')) != NULL)
    {
    /* remove '%' from the request, or else it can't be parsed */
    while (*pcnt == '%')
      {
      *pcnt = '\0';
      pcnt++;
      }

    /* read the number if one is given */
    if (strlen(pcnt) > 0)
      {
      pa->ai_qs.slot_limit = atoi(pcnt);

      if ((max_limit != NO_SLOT_LIMIT) &&
          (max_limit < pa->ai_qs.slot_limit))
        {
        return(INVALID_SLOT_LIMIT);
        }
      }
    else
      {
      pa->ai_qs.slot_limit = max_limit;
      }
    }
  else
    {
    pa->ai_qs.slot_limit = max_limit;
    }

  return(0);
  }  /* END set_slot_limit() */
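The '%' suffix handled above is the array slot-limit convention (e.g. a request of "0-99%5" caps the array at 5 concurrently running subjobs). A hedged standalone sketch of just the suffix parse; parse_slot_limit() is a demo helper, not TORQUE API:

/* Sketch of the "%<n>" slot-limit suffix parse used by set_slot_limit(). */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NO_SLOT_LIMIT -1

static int parse_slot_limit(char *request)
  {
  char *pcnt = strchr(request, '%');

  if (pcnt == NULL)
    return NO_SLOT_LIMIT;     /* no '%': no per-array limit requested */

  while (*pcnt == '%')        /* strip the '%' so the range still parses */
    *pcnt++ = '\0';

  return (*pcnt != '\0') ? atoi(pcnt) : NO_SLOT_LIMIT;
  }

int main(void)
  {
  char req[] = "0-99%5";

  printf("slot limit: %d\n", parse_slot_limit(req)); /* prints 5 */
  return 0;
  }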
int apply_job_delete_nanny(

  struct job *pjob,
  int         delay) /* I */

  {
  enum work_type tasktype;
  long           nanny = FALSE;

  /* short-circuit if nanny isn't enabled or we have a delete nanny */
  get_svr_attr_l(SRV_ATR_JobNanny, &nanny);

  if ((nanny == FALSE) ||
      (pjob->ji_has_delete_nanny == TRUE))
    {
    return(PBSE_NONE);
    }

  if (delay == 0)
    {
    tasktype = WORK_Immed;
    }
  else if (delay > 0)
    {
    tasktype = WORK_Timed;
    }
  else
    {
    log_err(-1, __func__, "negative delay requested for nanny");

    return(-1);
    }

  pjob->ji_has_delete_nanny = TRUE;

  /* add a nanny task at the requested time */
  set_task(tasktype, delay, job_delete_nanny, strdup(pjob->ji_qs.ji_jobid), FALSE);

  return(PBSE_NONE);
  }  /* END apply_job_delete_nanny() */
std::string get_path_jobdata(

  const char *jobid,
  const char *basepath)

  {
  std::string ret_path("");
  long        use_jobs_subdirs = FALSE;

  if ((jobid == NULL) || (basepath == NULL))
    return(ret_path);

  ret_path = basepath;

  // get use_jobs_subdirs value if set
  get_svr_attr_l(SRV_ATR_use_jobs_subdirs, &use_jobs_subdirs);

  // if we are using divided subdirectories in server_priv/{jobs,arrays}
  // then adjust path
  if ((use_jobs_subdirs == TRUE) && isdigit(*jobid))
    {
    char *p = (char *)jobid + 1;

    // point p to the first non-digit in string
    while (isdigit(*p))
      p++;

    // move back 1 char to the last digit of the job id
    p--;

    // append the last digit of the numeric part of the job id on the string
    ret_path.push_back(*p);

    // append slash
    ret_path.push_back('/');
    }

  return(ret_path);
  }
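Worked example of the adjustment above: with use_jobs_subdirs set, job "1234.napali" under "server_priv/jobs/" maps to "server_priv/jobs/4/", i.e. the last digit of the numeric id selects a bucket directory. A standalone C sketch of the same string logic (append_bucket() is a demo helper, and the jobid is made up):

/* Sketch of get_path_jobdata()'s bucket logic: append the last digit
 * of the job id's numeric prefix plus a '/' to the base path. */
#include <ctype.h>
#include <stdio.h>

static void append_bucket(const char *jobid, char *out, size_t outlen)
  {
  const char *p = jobid;

  while (isdigit((unsigned char)*p))
    p++;                                    /* skip the numeric prefix */

  if (p > jobid)
    snprintf(out, outlen, "%c/", *(p - 1)); /* last digit + '/' */
  else
    out[0] = '\0';                          /* id didn't start with a digit */
  }

int main(void)
  {
  char bucket[3];

  append_bucket("1234.napali", bucket, sizeof(bucket));
  printf("server_priv/jobs/%s\n", bucket);  /* server_priv/jobs/4/ */
  return 0;
  }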
int can_queue_new_job(

  char *user_name,
  job  *pjob)

  {
  long         max_queuable = -1;
  int          can_queue_another = TRUE;
  unsigned int num_queued = 0;
  unsigned int num_to_add;

  get_svr_attr_l(SRV_ATR_MaxUserQueuable, &max_queuable);

  if (max_queuable >= 0)
    {
    num_to_add = count_jobs_submitted(pjob);
    num_queued = get_num_queued(&users, user_name);

    if (num_queued + num_to_add > (unsigned int)max_queuable)
      can_queue_another = FALSE;
    }

  return(can_queue_another);
  }  /* END can_queue_new_job() */
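One subtlety above: the counts are unsigned while max_queuable is a signed long with -1 meaning "no limit", so the limit is only cast to unsigned after the >= 0 guard. A compact illustration of the same arithmetic (can_queue() is a hypothetical helper):

/* Sketch of the max_user_queuable check: compare against the limit
 * only when one is configured, and only then cast it to unsigned. */
#include <stdio.h>

static int can_queue(long limit, unsigned int queued, unsigned int to_add)
  {
  if (limit < 0)
    return 1;                                /* -1: no limit configured */

  return (queued + to_add <= (unsigned int)limit);
  }

int main(void)
  {
  printf("%d\n", can_queue(-1, 1000u, 50u)); /* 1: unlimited */
  printf("%d\n", can_queue(10, 8u, 2u));     /* 1: exactly at the limit */
  printf("%d\n", can_queue(10, 8u, 3u));     /* 0: would exceed it */
  return 0;
  }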
int modify_job_attr(

  job      *pjob,  /* I (modified) */
  svrattrl *plist, /* I */
  int       perm,
  int      *bad)   /* O */

  {
  int            allow_unkn = -1;
  long           i;
  pbs_attribute  newattr[JOB_ATR_LAST];
  pbs_attribute *pattr;
  int            rc;
  char           log_buf[LOCAL_LOG_BUF_SIZE];
  pbs_queue     *pque;

  if ((pque = get_jobs_queue(&pjob)) != NULL)
    {
    if (pque->qu_qs.qu_type != QTYPE_Execution)
      allow_unkn = JOB_ATR_UNKN;

    unlock_queue(pque, __func__, NULL, LOGLEVEL);
    }
  else if ((pjob != NULL) && (pjob->ji_parent_job != NULL))
    {
    allow_unkn = JOB_ATR_UNKN;
    }
  else
    {
    log_err(PBSE_JOBNOTFOUND, __func__, "Job lost while acquiring queue 5");

    return(PBSE_JOBNOTFOUND);
    }

  pattr = pjob->ji_wattr;

  /* call attr_atomic_set to decode and set a copy of the attributes */

  rc = attr_atomic_set(
         plist,        /* I */
         pattr,        /* I */
         newattr,      /* O */
         job_attr_def, /* I */
         JOB_ATR_LAST,
         allow_unkn,   /* I */
         perm,         /* I */
         bad);         /* O */

  /* if resource limits are being changed ... */

  if ((rc == 0) &&
      (newattr[JOB_ATR_resource].at_flags & ATR_VFLAG_SET))
    {
    if ((perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0)
      {
      /* If job is running, only manager/operator can raise limits */

      if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
        {
        long lim = TRUE;
        int  comp_resc_lt;

        get_svr_attr_l(SRV_ATR_QCQLimits, &lim);

        comp_resc_lt = comp_resc2(&pjob->ji_wattr[JOB_ATR_resource],
                                  &newattr[JOB_ATR_resource],
                                  lim,
                                  NULL,
                                  LESS);

        if (comp_resc_lt != 0)
          {
          rc = PBSE_PERM;
          }
        }

      /* Also check against queue and system limits */

      if (rc == 0)
        {
        if ((pque = get_jobs_queue(&pjob)) != NULL)
          {
          rc = chk_resc_limits(&newattr[JOB_ATR_resource], pque, NULL);

          unlock_queue(pque, __func__, NULL, LOGLEVEL);
          }
        else if (pjob == NULL)
          {
          log_err(PBSE_JOBNOTFOUND, __func__, "Job lost while acquiring queue 6");

          return(PBSE_JOBNOTFOUND);
          }
        else
          rc = PBSE_QUENOTAVAILABLE;
        }
      }
    }  /* END if ((rc == 0) && ...) */

  /* special check on permissions for hold */

  if ((rc == 0) &&
      (newattr[JOB_ATR_hold].at_flags & ATR_VFLAG_MODIFY))
    {
    /* XOR of the old and new hold values yields only the bits being changed */
    i = newattr[JOB_ATR_hold].at_val.at_long ^ (pattr + JOB_ATR_hold)->at_val.at_long;

    rc = chk_hold_priv(i, perm);
    }

  if (rc == 0)
    {
    for (i = 0; i < JOB_ATR_LAST; i++)
      {
      if (newattr[i].at_flags & ATR_VFLAG_MODIFY)
        {
        if (job_attr_def[i].at_action)
          {
          rc = job_attr_def[i].at_action(&newattr[i], pjob, ATR_ACTION_ALTER);

          if (rc)
            break;
          }
        }
      }  /* END for (i) */

    if ((rc == 0) &&
        ((newattr[JOB_ATR_userlst].at_flags & ATR_VFLAG_MODIFY) ||
         (newattr[JOB_ATR_grouplst].at_flags & ATR_VFLAG_MODIFY)))
      {
      /* need to reset execution uid and gid */
      rc = set_jobexid(pjob, newattr, NULL);
      }

    if ((rc == 0) &&
        (newattr[JOB_ATR_outpath].at_flags & ATR_VFLAG_MODIFY))
      {
      /* need to recheck if JOB_ATR_outpath is a special case of host only */

      if (newattr[JOB_ATR_outpath].at_val.at_str[strlen(newattr[JOB_ATR_outpath].at_val.at_str) - 1] == ':')
        {
        dynamic_string *ds = get_dynamic_string(-1, NULL);

        newattr[JOB_ATR_outpath].at_val.at_str = prefix_std_file(pjob, ds, (int)'o');

        /* don't call free_dynamic_string() because we still want to use the allocated string */
        free(ds);
        }
      /*
       * if the output path was specified and ends with a '/'
       * then append the standard file name
       */
      else if (newattr[JOB_ATR_outpath].at_val.at_str[strlen(newattr[JOB_ATR_outpath].at_val.at_str) - 1] == '/')
        {
        dynamic_string *ds = get_dynamic_string(-1, NULL);

        newattr[JOB_ATR_outpath].at_val.at_str[strlen(newattr[JOB_ATR_outpath].at_val.at_str) - 1] = '\0';

        replace_attr_string(&newattr[JOB_ATR_outpath],
                            (add_std_filename(pjob, newattr[JOB_ATR_outpath].at_val.at_str, (int)'o', ds)));

        /* don't call free_dynamic_string() because we still want to use the allocated string */
        free(ds);
        }
      }

    if ((rc == 0) &&
        (newattr[JOB_ATR_errpath].at_flags & ATR_VFLAG_MODIFY))
      {
      /* need to recheck if JOB_ATR_errpath is a special case of host only */

      if (newattr[JOB_ATR_errpath].at_val.at_str[strlen(newattr[JOB_ATR_errpath].at_val.at_str) - 1] == ':')
        {
        dynamic_string *ds = get_dynamic_string(-1, NULL);

        newattr[JOB_ATR_errpath].at_val.at_str = prefix_std_file(pjob, ds, (int)'e');

        /* don't call free_dynamic_string() because we still want to use the allocated string */
        free(ds);
        }
      /*
       * if the error path was specified and ends with a '/'
       * then append the standard file name
       */
      else if (newattr[JOB_ATR_errpath].at_val.at_str[strlen(newattr[JOB_ATR_errpath].at_val.at_str) - 1] == '/')
        {
        dynamic_string *ds = get_dynamic_string(-1, NULL);

        newattr[JOB_ATR_errpath].at_val.at_str[strlen(newattr[JOB_ATR_errpath].at_val.at_str) - 1] = '\0';

        replace_attr_string(&newattr[JOB_ATR_errpath],
                            (add_std_filename(pjob, newattr[JOB_ATR_errpath].at_val.at_str, (int)'e', ds)));

        /* don't call free_dynamic_string() because we still want to use the allocated string */
        free(ds);
        }
      }
    }  /* END if (rc == 0) */

  if (rc != 0)
    {
    for (i = 0; i < JOB_ATR_LAST; i++)
      job_attr_def[i].at_free(newattr + i);

    /* FAILURE */
    return(rc);
    }  /* END if (rc != 0) */

  /* OK, now copy the new values into the job attribute array */

  for (i = 0; i < JOB_ATR_LAST; i++)
    {
    if (newattr[i].at_flags & ATR_VFLAG_MODIFY)
      {
      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "attr %s modified", job_attr_def[i].at_name);

        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }

      job_attr_def[i].at_free(pattr + i);

      if ((newattr[i].at_type == ATR_TYPE_LIST) ||
          (newattr[i].at_type == ATR_TYPE_RESC))
        {
        list_move(&newattr[i].at_val.at_list, &(pattr + i)->at_val.at_list);
        }
      else
        {
        *(pattr + i) = newattr[i];
        }

      (pattr + i)->at_flags = newattr[i].at_flags;
      }
    }  /* END for (i) */

  /* note, the newattr[] attributes are on the stack, they go away automatically */

  pjob->ji_modified = 1;

  return(0);
  }  /* END modify_job_attr() */
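The hold-permission check above XORs the new and old hold words so that chk_hold_priv() sees exactly the bits being toggled, whether a hold is added or released. Shown in isolation, with placeholder bit values:

/* Sketch of the changed-bits computation used for the hold check. */
#include <stdio.h>

#define HOLD_u 0x1 /* placeholder values for user/other/system holds */
#define HOLD_o 0x2
#define HOLD_s 0x4

int main(void)
  {
  long oldhold = HOLD_u | HOLD_s;
  long newhold = HOLD_u | HOLD_o;
  long changed = newhold ^ oldhold;         /* HOLD_o set, HOLD_s cleared */

  printf("changed mask: 0x%lx\n", changed); /* 0x6 */
  return 0;
  }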
static void req_stat_job_step2(

  struct stat_cntl *cntl) /* I/O (free'd on return) */

  {
  svrattrl              *pal;
  job                   *pjob = NULL;
  struct batch_request  *preq;
  struct batch_reply    *preply;
  int                    rc = 0;
  enum TJobStatTypeEnum  type;
  pbs_queue             *pque = NULL;
  int                    exec_only = 0;
  int                    bad = 0;
  long                   DTime; /* delta time - only report full pbs_attribute list if J->MTime > DTime */
  static svrattrl       *dpal = NULL;
  int                    job_array_index = 0;
  job_array             *pa = NULL;
  char                   log_buf[LOCAL_LOG_BUF_SIZE];
  int                    iter;
  time_t                 time_now = time(NULL);
  long                   poll_jobs = 0;
  char                   job_id[PBS_MAXSVRJOBID + 1];
  int                    job_substate = -1;
  time_t                 job_momstattime = -1;

  preq = cntl->sc_origrq;
  type = (enum TJobStatTypeEnum)cntl->sc_type;
  preply = &preq->rq_reply;

  /* See pbs_server_attributes(1B) for details on "poll_jobs" behaviour */

  if (dpal == NULL)
    {
    /* build 'delta' pbs_attribute list */

    svrattrl   *tpal;
    tlist_head  dalist;
    int         aindex;

    int atrlist[] =
      {
      JOB_ATR_jobname,
      JOB_ATR_resc_used,
      JOB_ATR_LAST
      };

    CLEAR_LINK(dalist);

    for (aindex = 0; atrlist[aindex] != JOB_ATR_LAST; aindex++)
      {
      if ((tpal = attrlist_create("", "", 23)) == NULL)
        {
        return;
        }

      tpal->al_valln = atrlist[aindex];

      if (dpal == NULL)
        dpal = tpal;

      append_link(&dalist, &tpal->al_link, tpal);
      }
    }  /* END if (dpal == NULL) */

  if (type == tjstArray)
    {
    pa = get_array(preq->rq_ind.rq_status.rq_id);

    if (pa == NULL)
      {
      req_reject(PBSE_UNKARRAYID, 0, preq, NULL, "unable to find array");

      return;
      }
    }

  iter = -1;

  get_svr_attr_l(SRV_ATR_PollJobs, &poll_jobs);

  if (!poll_jobs)
    {
    /* polljobs not set - indicates we may need to obtain fresh data from MOM */

    if (cntl->sc_jobid[0] == '\0')
      pjob = NULL;
    else
      pjob = svr_find_job(cntl->sc_jobid, FALSE);

    while (1)
      {
      if (pjob == NULL)
        {
        /* start from the first job */

        if (type == tjstJob)
          {
          pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE);
          }
        else if (type == tjstQueue)
          {
          pjob = next_job(cntl->sc_pque->qu_jobs, &iter);
          }
        else if (type == tjstArray)
          {
          job_array_index = 0;

          /* increment job_array_index until we find a non-null pointer or hit the end */
          while (job_array_index < pa->ai_qs.array_size)
            {
            if (pa->job_ids[job_array_index] != NULL)
              {
              if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
                {
                unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
                break;
                }
              }

            job_array_index++;
            }
          }
        else
          {
          pjob = next_job(&alljobs, &iter);
          }
        }  /* END if (pjob == NULL) */
      else
        {
        strcpy(job_id, pjob->ji_qs.ji_jobid);
        unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

        if (type == tjstJob)
          break;

        if (type == tjstQueue)
          pjob = next_job(cntl->sc_pque->qu_jobs, &iter);
        else if (type == tjstArray)
          {
          pjob = NULL;

          /* increment job_array_index until we find a non-null pointer or hit the end */
          while (++job_array_index < pa->ai_qs.array_size)
            {
            if (pa->job_ids[job_array_index] != NULL)
              {
              if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
                {
                unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
                break;
                }
              }
            }
          }
        else
          pjob = next_job(&alljobs, &iter);
        }

      if (pjob == NULL)
        break;

      strcpy(job_id, pjob->ji_qs.ji_jobid);
      job_substate = pjob->ji_qs.ji_substate;
      job_momstattime = pjob->ji_momstat;
      strcpy(cntl->sc_jobid, job_id);
      unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
      pjob = NULL;

      /* PBS_RESTAT_JOB defaults to 30 seconds */
      if ((job_substate == JOB_SUBSTATE_RUNNING) &&
          ((time_now - job_momstattime) > JobStatRate))
        {
        /* go to MOM for status */
        if ((rc = stat_to_mom(job_id, cntl)) == PBSE_MEM_MALLOC)
          break;

        if (rc != 0)
          {
          pjob = svr_find_job(job_id, FALSE);

          rc = 0;

          continue;
          }

        if (pa != NULL)
          unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

        return; /* will pick up after mom replies */
        }
      }  /* END while (1) */

    if (rc != 0)
      {
      if (pa != NULL)
        unlock_ai_mutex(pa, __func__, "2", LOGLEVEL);

      reply_free(preply);

      req_reject(rc, 0, preq, NULL, "cannot get update from mom");

      return;
      }
    }  /* END if (!poll_jobs) */

  /*
   * now ready for part 3, building the status reply,
   * loop through again
   */

  if ((type == tjstSummarizeArraysQueue) ||
      (type == tjstSummarizeArraysServer))
    {
    /* No array can be owned for these options */
    update_array_statuses();
    }

  if (type == tjstJob)
    pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE);
  else if (type == tjstQueue)
    pjob = next_job(cntl->sc_pque->qu_jobs, &iter);
  else if (type == tjstSummarizeArraysQueue)
    pjob = next_job(cntl->sc_pque->qu_jobs_array_sum, &iter);
  else if (type == tjstSummarizeArraysServer)
    pjob = next_job(&array_summary, &iter);
  else if (type == tjstArray)
    {
    job_array_index = -1;
    pjob = NULL;

    /* increment job_array_index until we find a non-null pointer or hit the end */
    while (++job_array_index < pa->ai_qs.array_size)
      {
      if (pa->job_ids[job_array_index] != NULL)
        {
        if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
          {
          break;
          }
        }
      }
    }
  else
    pjob = next_job(&alljobs, &iter);

  DTime = 0;

  if (preq->rq_extend != NULL)
    {
    char *ptr;

    /* FORMAT: { EXECQONLY | DELTA:<EPOCHTIME> } */

    if (strstr(preq->rq_extend, EXECQUEONLY))
      exec_only = 1;

    ptr = strstr(preq->rq_extend, "DELTA:");

    if (ptr != NULL)
      {
      ptr += strlen("DELTA:");

      DTime = strtol(ptr, NULL, 10);
      }
    }

  if ((type == tjstTruncatedServer) ||
      (type == tjstTruncatedQueue))
    {
    long sentJobCounter;
    long qjcounter;
    long qmaxreport;
    int  iter = -1;

    /* loop through all queues */

    while ((pque = next_queue(&svr_queues, &iter)) != NULL)
      {
      qjcounter = 0;

      if ((exec_only == 1) &&
          (pque->qu_qs.qu_type != QTYPE_Execution))
        {
        /* ignore routing queues */
        unlock_queue(pque, __func__, "ignore queue", LOGLEVEL);

        continue;
        }

      if (((pque->qu_attr[QA_ATR_MaxReport].at_flags & ATR_VFLAG_SET) != 0) &&
          (pque->qu_attr[QA_ATR_MaxReport].at_val.at_long >= 0))
        {
        qmaxreport = pque->qu_attr[QA_ATR_MaxReport].at_val.at_long;
        }
      else
        {
        qmaxreport = TMAX_JOB;
        }

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buf, "giving scheduler up to %ld idle jobs in queue %s\n",
                qmaxreport,
                pque->qu_qs.qu_name);

        log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_QUEUE, pque->qu_qs.qu_name, log_buf);
        }

      sentJobCounter = 0;

      /* loop through jobs in queue */

      if (pjob != NULL)
        unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);

      iter = -1;

      while ((pjob = next_job(pque->qu_jobs, &iter)) != NULL)
        {
        if ((qjcounter >= qmaxreport) &&
            (pjob->ji_qs.ji_state == JOB_STATE_QUEUED))
          {
          /* max_report of queued jobs reached for queue */
          unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL);

          continue;
          }

        pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

        rc = status_job(
               pjob,
               preq,
               (pjob->ji_wattr[JOB_ATR_mtime].at_val.at_long >= DTime) ? pal : dpal,
               &preply->brp_un.brp_status,
               &bad);

        if ((rc != 0) && (rc != PBSE_PERM))
          {
          req_reject(rc, bad, preq, NULL, NULL);

          if (pa != NULL)
            {
            unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
            }

          unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL);
          unlock_queue(pque, __func__, "perm", LOGLEVEL);

          return;
          }

        sentJobCounter++;

        if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED)
          qjcounter++;

        unlock_ji_mutex(pjob, __func__, "8", LOGLEVEL);
        }  /* END foreach (pjob from pque) */

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buf, "sent scheduler %ld total jobs for queue %s\n",
                sentJobCounter,
                pque->qu_qs.qu_name);

        log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_QUEUE, pque->qu_qs.qu_name, log_buf);
        }

      unlock_queue(pque, __func__, "end while", LOGLEVEL);
      }  /* END for (pque) */

    if (pa != NULL)
      unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

    reply_send_svr(preq);

    return;
    }  /* END if ((type == tjstTruncatedServer) || ...) */

  while (pjob != NULL)
    {
    /* go ahead and build the status reply for this job */

    if (exec_only)
      {
      if (cntl->sc_pque != NULL)
        {
        if (cntl->sc_pque->qu_qs.qu_type != QTYPE_Execution)
          goto nextjob;
        }
      else
        {
        if (pa != NULL)
          pthread_mutex_unlock(pa->ai_mutex);

        pque = get_jobs_queue(&pjob);

        if (pa != NULL)
          pthread_mutex_lock(pa->ai_mutex);

        if ((pjob == NULL) || (pque == NULL))
          goto nextjob;

        if (pque->qu_qs.qu_type != QTYPE_Execution)
          {
          unlock_queue(pque, __func__, "not exec", LOGLEVEL);

          goto nextjob;
          }

        unlock_queue(pque, __func__, "exec", LOGLEVEL);
        }
      }

    pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

    rc = status_job(
           pjob,
           preq,
           pal,
           &preply->brp_un.brp_status,
           &bad);

    if ((rc != 0) && (rc != PBSE_PERM))
      {
      if (pa != NULL)
        {
        unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
        }

      unlock_ji_mutex(pjob, __func__, "9", LOGLEVEL);

      req_reject(rc, bad, preq, NULL, NULL);

      return;
      }

    /* get next job */

nextjob:

    if (pjob != NULL)
      unlock_ji_mutex(pjob, __func__, "10", LOGLEVEL);

    if (type == tjstJob)
      break;

    if (type == tjstQueue)
      pjob = next_job(cntl->sc_pque->qu_jobs, &iter);
    else if (type == tjstSummarizeArraysQueue)
      pjob = next_job(cntl->sc_pque->qu_jobs_array_sum, &iter);
    else if (type == tjstSummarizeArraysServer)
      pjob = next_job(&array_summary, &iter);
    else if (type == tjstArray)
      {
      pjob = NULL;

      /* increment job_array_index until we find a non-null pointer or hit the end */
      while (++job_array_index < pa->ai_qs.array_size)
        {
        if (pa->job_ids[job_array_index] != NULL)
          {
          if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
            {
            break;
            }
          }
        }
      }
    else
      pjob = next_job(&alljobs, &iter);

    rc = 0;
    }  /* END while (pjob != NULL) */

  if (pa != NULL)
    {
    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
    }

  reply_send_svr(preq);

  if (LOGLEVEL >= 7)
    {
    log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, "req_statjob",
              "Successfully returned the status of queued jobs\n");
    }

  return;
  }  /* END req_stat_job_step2() */
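The rq_extend parsing above recognizes an exec-queue-only token and a "DELTA:<epochtime>" token; jobs whose mtime predates the epoch are reported with the short "delta" attribute list (dpal) instead of the full one. A standalone sketch of just the DELTA parse, with parse_delta() as a hypothetical helper:

/* Sketch of the DELTA:<epoch> extension parse in req_stat_job_step2(). */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static long parse_delta(const char *extend)
  {
  const char *ptr = (extend != NULL) ? strstr(extend, "DELTA:") : NULL;

  if (ptr == NULL)
    return 0;                   /* no delta: always send full status */

  return strtol(ptr + strlen("DELTA:"), NULL, 10);
  }

int main(void)
  {
  printf("%ld\n", parse_delta("DELTA:1500000000")); /* 1500000000 */
  printf("%ld\n", parse_delta(NULL));               /* 0 */
  return 0;
  }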
int req_stat_job(

  struct batch_request *preq) /* ptr to the decoded request */

  {
  struct stat_cntl      *cntl; /* see svrfunc.h */
  char                  *name;
  job                   *pjob = NULL;
  pbs_queue             *pque = NULL;
  int                    rc = PBSE_NONE;
  long                   poll_jobs = 0;
  char                   log_buf[LOCAL_LOG_BUF_SIZE];
  enum TJobStatTypeEnum  type = tjstNONE;

  /*
   * first, validate the name of the requested object, either
   * a job, a queue, or the whole server.
   */

  if (LOGLEVEL >= 7)
    {
    sprintf(log_buf, "note");

    LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
    }

  /* FORMAT: name = { <JOBID> | <QUEUEID> | '' } */

  name = preq->rq_ind.rq_status.rq_id;

  if (preq->rq_extend != NULL)
    {
    /* evaluate pbs_job_stat() 'extension' field */

    if (!strncasecmp(preq->rq_extend, "truncated", strlen("truncated")))
      {
      /* truncate response by 'max_report' */
      type = tjstTruncatedServer;
      }
    else if (!strncasecmp(preq->rq_extend, "summarize_arrays", strlen("summarize_arrays")))
      {
      type = tjstSummarizeArraysServer;
      }
    }  /* END if (preq->rq_extend != NULL) */

  if (isdigit((int)*name))
    {
    /* status a single job */

    if (is_array(name))
      {
      if (type != tjstSummarizeArraysServer)
        {
        type = tjstArray;
        }
      }
    else
      {
      type = tjstJob;

      if ((pjob = svr_find_job(name, FALSE)) == NULL)
        {
        rc = PBSE_UNKJOBID;
        }
      else
        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
      }
    }
  else if (isalpha(name[0]))
    {
    if (type == tjstNONE)
      type = tjstQueue;
    else if (type == tjstSummarizeArraysServer)
      type = tjstSummarizeArraysQueue;
    else
      type = tjstTruncatedQueue;

    /* if found, this mutex is released later */
    if ((pque = find_queuebyname(name)) == NULL)
      {
      rc = PBSE_UNKQUE;
      }
    }
  else if ((*name == '\0') || (*name == '@'))
    {
    /* status all jobs at server */

    if (type == tjstNONE)
      type = tjstServer;
    }
  else
    {
    rc = PBSE_IVALREQ;
    }

  if (rc != 0)
    {
    /* is invalid - an error */
    req_reject(rc, 0, preq, NULL, NULL);

    return(rc);
    }

  preq->rq_reply.brp_choice = BATCH_REPLY_CHOICE_Status;

  CLEAR_HEAD(preq->rq_reply.brp_un.brp_status);

  cntl = (struct stat_cntl *)calloc(1, sizeof(struct stat_cntl));

  if (cntl == NULL)
    {
    if (pque != NULL)
      unlock_queue(pque, "req_stat_job", "no memory cntl", LOGLEVEL);

    req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

    return(PBSE_SYSTEM);
    }

  if ((type == tjstTruncatedQueue) ||
      (type == tjstTruncatedServer))
    {
    if (pque != NULL)
      {
      unlock_queue(pque, __func__, "", LOGLEVEL);
      pque = NULL;
      }
    }

  cntl->sc_type = (int)type;
  cntl->sc_conn = -1;
  cntl->sc_pque = pque;
  cntl->sc_origrq = preq;
  cntl->sc_post = req_stat_job_step2;
  cntl->sc_jobid[0] = '\0'; /* cause "start from beginning" */

  get_svr_attr_l(SRV_ATR_PollJobs, &poll_jobs);

  if (poll_jobs)
    cntl->sc_post = 0; /* we're not going to make clients wait */

  req_stat_job_step2(cntl); /* go to step 2, see if running is current */

  if (pque != NULL)
    unlock_queue(pque, "req_stat_job", "success", LOGLEVEL);

  free(cntl);

  return(PBSE_NONE);
  }  /* END req_stat_job() */
void svr_mailowner(

  job        *pjob,      /* I */
  int         mailpoint, /* note, single character */
  int         force,     /* if set to MAIL_FORCE, force mail delivery */
  const char *text)      /* (optional) additional message text */

  {
  static const char *memory_err = "Cannot allocate memory to send email";

  char       mailto[1024];
  char      *domain = NULL;
  int        i;
  mail_info *mi;
  long       no_force = FALSE;

  struct array_strings *pas;

  memset(mailto, 0, sizeof(mailto));

  get_svr_attr_str(SRV_ATR_MailDomain, &domain);

  if ((domain != NULL) &&
      (!strcasecmp("never", domain)))
    {
    /* never send user mail under any conditions */
    if (LOGLEVEL >= 3)
      {
      log_event(PBSEVENT_ERROR | PBSEVENT_ADMIN | PBSEVENT_JOB,
                PBS_EVENTCLASS_JOB,
                pjob->ji_qs.ji_jobid,
                "Not sending email: Mail domain set to 'never'\n");
      }

    return;
    }

  if (LOGLEVEL >= 3)
    {
    char tmpBuf[LOG_BUF_SIZE];

    snprintf(tmpBuf, LOG_BUF_SIZE, "preparing to send '%c' mail for job %s to %s (%.64s)\n",
             (char)mailpoint,
             pjob->ji_qs.ji_jobid,
             pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str,
             (text != NULL) ? text : "---");

    log_event(PBSEVENT_ERROR | PBSEVENT_ADMIN | PBSEVENT_JOB,
              PBS_EVENTCLASS_JOB,
              pjob->ji_qs.ji_jobid,
              tmpBuf);
    }

  /*
   * if force is true, force the mail out regardless of mailpoint
   * unless server no_mail_force attribute is set to true
   */
  get_svr_attr_l(SRV_ATR_NoMailForce, &no_force);

  if ((force != MAIL_FORCE) ||
      (no_force == TRUE))
    {
    if (pjob->ji_wattr[JOB_ATR_mailpnts].at_flags & ATR_VFLAG_SET)
      {
      if (*(pjob->ji_wattr[JOB_ATR_mailpnts].at_val.at_str) == MAIL_NONE)
        {
        /* do not send mail. No mail requested on job */
        log_event(PBSEVENT_JOB,
                  PBS_EVENTCLASS_JOB,
                  pjob->ji_qs.ji_jobid,
                  "Not sending email: job requested no e-mail");

        return;
        }

      /* see if user specified mail of this type */
      if (strchr(pjob->ji_wattr[JOB_ATR_mailpnts].at_val.at_str, mailpoint) == NULL)
        {
        /* do not send mail */
        log_event(PBSEVENT_ERROR | PBSEVENT_ADMIN | PBSEVENT_JOB,
                  PBS_EVENTCLASS_JOB,
                  pjob->ji_qs.ji_jobid,
                  "Not sending email: User does not want mail of this type.\n");

        return;
        }
      }
    else if (mailpoint != MAIL_ABORT) /* not set, default to abort */
      {
      log_event(PBSEVENT_ERROR | PBSEVENT_ADMIN | PBSEVENT_JOB,
                PBS_EVENTCLASS_JOB,
                pjob->ji_qs.ji_jobid,
                "Not sending email: Default mailpoint does not include this type.\n");

      return;
      }
    }

  mi = (mail_info *)calloc(1, sizeof(mail_info));

  if (mi == NULL)
    {
    log_err(ENOMEM, __func__, memory_err);

    return;
    }

  /* Who does the mail go to?  If mail-list, them; else owner */

  mailto[0] = '\0';

  if (pjob->ji_wattr[JOB_ATR_mailuser].at_flags & ATR_VFLAG_SET)
    {
    /* has mail user list, send to them rather than owner */

    pas = pjob->ji_wattr[JOB_ATR_mailuser].at_val.at_arst;

    if (pas != NULL)
      {
      for (i = 0; i < pas->as_usedptr; i++)
        {
        if ((strlen(mailto) + strlen(pas->as_string[i]) + 2) < sizeof(mailto))
          {
          strcat(mailto, pas->as_string[i]);
          strcat(mailto, " ");
          }
        }
      }
    }
  else
    {
    /* no mail user list, just send to owner */

    if (domain != NULL)
      {
      snprintf(mailto, sizeof(mailto), "%s@%s",
               pjob->ji_wattr[JOB_ATR_euser].at_val.at_str, domain);

      if (LOGLEVEL >= 5)
        {
        char tmpBuf[LOG_BUF_SIZE];

        snprintf(tmpBuf, sizeof(tmpBuf),
                 "Updated mailto from job owner and mail domain: '%s'\n", mailto);

        log_event(PBSEVENT_ERROR | PBSEVENT_ADMIN | PBSEVENT_JOB,
                  PBS_EVENTCLASS_JOB,
                  pjob->ji_qs.ji_jobid,
                  tmpBuf);
        }
      }
    else
      {
#ifdef TMAILDOMAIN
      snprintf(mailto, sizeof(mailto), "%s@%s",
               pjob->ji_wattr[JOB_ATR_euser].at_val.at_str, TMAILDOMAIN);
#else /* TMAILDOMAIN */
      snprintf(mailto, sizeof(mailto), "%s",
               pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str);
#endif /* TMAILDOMAIN */

      if (LOGLEVEL >= 5)
        {
        char tmpBuf[LOG_BUF_SIZE];

        snprintf(tmpBuf, sizeof(tmpBuf), "Updated mailto from job owner: '%s'\n", mailto);

        log_event(PBSEVENT_ERROR | PBSEVENT_ADMIN | PBSEVENT_JOB,
                  PBS_EVENTCLASS_JOB,
                  pjob->ji_qs.ji_jobid,
                  tmpBuf);
        }
      }
    }

  /* initialize the mail information */

  if ((mi->mailto = strdup(mailto)) == NULL)
    {
    log_err(ENOMEM, __func__, memory_err);

    free(mi);

    return;
    }

  mi->mail_point = mailpoint;

  if (pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str != NULL)
    {
    mi->exec_host = strdup(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str);

    if (mi->exec_host == NULL)
      {
      log_err(ENOMEM, __func__, memory_err);

      free(mi);

      return;
      }
    }
  else
    mi->exec_host = NULL;

  if ((mi->jobid = strdup(pjob->ji_qs.ji_jobid)) == NULL)
    {
    log_err(ENOMEM, __func__, memory_err);

    free(mi);

    return;
    }

  if (pjob->ji_wattr[JOB_ATR_jobname].at_val.at_str != NULL)
    {
    mi->jobname = strdup(pjob->ji_wattr[JOB_ATR_jobname].at_val.at_str);

    if (mi->jobname == NULL)
      {
      log_err(ENOMEM, __func__, memory_err);

      free(mi);

      return;
      }
    }
  else
    mi->jobname = NULL;

  if (text)
    {
    if ((mi->text = strdup(text)) == NULL)
      {
      free(mi);

      log_err(ENOMEM, __func__, memory_err);

      return;
      }
    }
  else
    mi->text = NULL;

  /* have a thread do the work of sending the mail */
  enqueue_threadpool_request(send_the_mail, mi);

  return;
  }  /* END svr_mailowner() */
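The mail-list loop above grows mailto only while the next address plus a separator and the terminating NUL still fit, silently dropping the remainder. The same bounded-strcat pattern in isolation (addresses made up for the demo):

/* Sketch of the bounds-checked concatenation used to build "mailto". */
#include <stdio.h>
#include <string.h>

int main(void)
  {
  const char *addrs[] = { "alice@a.example", "bob@b.example" };
  char        mailto[32] = "";

  for (size_t i = 0; i < sizeof(addrs) / sizeof(addrs[0]); i++)
    {
    /* +2 leaves room for the trailing space and the NUL */
    if (strlen(mailto) + strlen(addrs[i]) + 2 < sizeof(mailto))
      {
      strcat(mailto, addrs[i]);
      strcat(mailto, " ");
      }
    }

  printf("'%s'\n", mailto);
  return 0;
  }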
int req_rerunjob(

  struct batch_request *preq)

  {
  int   rc = PBSE_NONE;
  job  *pjob;
  int   Force;
  int   MgrRequired = TRUE;
  char  log_buf[LOCAL_LOG_BUF_SIZE];

  /* check if requestor is admin, job owner, etc */

  if ((pjob = chk_job_request(preq->rq_ind.rq_rerun, preq)) == 0)
    {
    /* FAILURE */
    /* chk_job_request calls req_reject() */

    rc = PBSE_SYSTEM;

    return rc; /* This needs to be fixed to return an accurate error */
    }

  /* the job must be running or completed */

  if (pjob->ji_qs.ji_state >= JOB_STATE_EXITING)
    {
    if (pjob->ji_wattr[JOB_ATR_checkpoint_name].at_flags & ATR_VFLAG_SET)
      {
      /* allow end-users to rerun checkpointed jobs */

      MgrRequired = FALSE;
      }
    }
  else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* job is running */
    /* NO-OP */
    }
  else
    {
    /* FAILURE - job is in bad state */

    rc = PBSE_BADSTATE;

    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s is in a bad state",
             preq->rq_ind.rq_rerun);

    req_reject(rc, 0, preq, NULL, log_buf);

    unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);

    return rc;
    }

  if ((MgrRequired == TRUE) &&
      ((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0))
    {
    /* FAILURE */

    rc = PBSE_PERM;

    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
             "additional permissions required (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)");

    req_reject(rc, 0, preq, NULL, log_buf);

    unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

    return rc;
    }

  /* the job must be rerunnable */

  if (pjob->ji_wattr[JOB_ATR_rerunable].at_val.at_long == 0)
    {
    /* NOTE: should force override this constraint? maybe (???) */
    /*   no, the user is saying that the job will break, and
     *   IEEE Std 1003.1 specifically says rerun is to be rejected
     *   if rerunable==FALSE -garrick */

    rc = PBSE_NORERUN;

    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s not rerunnable",
             preq->rq_ind.rq_rerun);

    req_reject(rc, 0, preq, NULL, log_buf);

    unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);

    return rc;
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* ask MOM to kill off the job if it is running */
    static const char *rerun = "rerun";
    char              *extra = strdup(rerun);

    rc = issue_signal(&pjob, "SIGKILL", post_rerun, extra);
    }
  else
    {
    if (pjob->ji_wattr[JOB_ATR_hold].at_val.at_long == HOLD_n)
      {
      svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_QUEUED, FALSE);
      }
    else
      {
      svr_setjobstate(pjob, JOB_STATE_HELD, JOB_SUBSTATE_HELD, FALSE);
      }

    /* reset some job attributes */

    pjob->ji_wattr[JOB_ATR_comp_time].at_flags &= ~ATR_VFLAG_SET;
    pjob->ji_wattr[JOB_ATR_reported].at_flags &= ~ATR_VFLAG_SET;

    set_statechar(pjob);

    rc = -1;
    }

  if (preq->rq_extend &&
      !strncasecmp(preq->rq_extend, RERUNFORCE, strlen(RERUNFORCE)))
    Force = 1;
  else
    Force = 0;

  switch (rc)
    {

    case -1:

      /* completed job was requeued */
      /* clear out job completion time if there is one */

      break;

    case 0:

      /* requeue request successful */

      if (pjob != NULL)
        pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

      break;

    case PBSE_SYSTEM: /* This may not be accurate... */

      rc = PBSE_MEM_MALLOC;

      snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Can not allocate memory");

      req_reject(rc, 0, preq, NULL, log_buf);

      return rc;

      break;

    default:

      if (Force == 0)
        {
        rc = PBSE_MOMREJECT;

        snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Rejected by mom");

        req_reject(rc, 0, preq, NULL, log_buf);

        if (pjob != NULL)
          unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);

        return rc;
        }
      else
        {
        int           newstate;
        int           newsubst;
        unsigned int  dummy;
        char         *tmp;
        long          cray_enabled = FALSE;

        if (pjob != NULL)
          {
          get_svr_attr_l(SRV_ATR_CrayEnabled, &cray_enabled);

          if ((cray_enabled == TRUE) &&
              (pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str != NULL))
            tmp = parse_servername(pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str, &dummy);
          else
            tmp = parse_servername(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, &dummy);

          /* Cannot communicate with MOM, forcibly requeue job.
             This is a relatively disgusting thing to do */

          sprintf(log_buf, "rerun req to %s failed (rc=%d), forcibly requeueing job",
                  tmp, rc);

          free(tmp);

          log_event(PBSEVENT_ERROR | PBSEVENT_ADMIN | PBSEVENT_JOB,
                    PBS_EVENTCLASS_JOB,
                    pjob->ji_qs.ji_jobid,
                    log_buf);

          log_err(-1, __func__, log_buf);

          strcat(log_buf, ", previous output files may be lost");

          svr_mailowner(pjob, MAIL_OTHER, MAIL_FORCE, log_buf);

          svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_RERUN3, FALSE);

          rel_resc(pjob); /* free resc assigned to job */

          if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_HOTSTART) == 0)
            {
            /* in case of server shutdown, don't clear exec_host */
            /* will use it on hotstart when next comes up */

            job_attr_def[JOB_ATR_exec_host].at_free(&pjob->ji_wattr[JOB_ATR_exec_host]);
            job_attr_def[JOB_ATR_session_id].at_free(&pjob->ji_wattr[JOB_ATR_session_id]);
            job_attr_def[JOB_ATR_exec_gpus].at_free(&pjob->ji_wattr[JOB_ATR_exec_gpus]);
            }

          pjob->ji_modified = 1; /* force full job save */

          pjob->ji_momhandle = -1;
          pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_StagedIn;

          svr_evaljobstate(pjob, &newstate, &newsubst, 0);
          svr_setjobstate(pjob, newstate, newsubst, FALSE);
          }
        }

      break;
    }  /* END switch (rc) */

  /* So job has run and is to be rerun (not restarted) */

  if (pjob == NULL)
    {
    rc = PBSE_JOB_RERUN;
    }
  else
    {
    pjob->ji_qs.ji_svrflags =
      (pjob->ji_qs.ji_svrflags &
       ~(JOB_SVFLG_CHECKPOINT_FILE | JOB_SVFLG_CHECKPOINT_MIGRATEABLE | JOB_SVFLG_CHECKPOINT_COPIED)) |
      JOB_SVFLG_HASRUN;

    sprintf(log_buf, msg_manager, msg_jobrerun, preq->rq_user, preq->rq_host);
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

    reply_ack(preq);

    /* note in accounting file */
    account_record(PBS_ACCT_RERUN, pjob, NULL);

    unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL);
    }

  return rc;
  }  /* END req_rerunjob() */
int process_request(

  struct tcp_chan *chan) /* file descriptor (socket) to get request */

  {
  int                   rc = PBSE_NONE;
  struct batch_request *request = NULL;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  long                  acl_enable = FALSE;
  long                  state = SV_STATE_DOWN;
  time_t                time_now = time(NULL);
  int                   free_request = TRUE;
  char                  tmpLine[MAXLINE];
  char                 *auth_err = NULL;
  enum conn_type        conn_active;
  unsigned short        conn_socktype;
  unsigned short        conn_authen;
  unsigned long         conn_addr;
  int                   sfds = chan->sock;

  pthread_mutex_lock(svr_conn[sfds].cn_mutex);

  conn_active = svr_conn[sfds].cn_active;
  conn_socktype = svr_conn[sfds].cn_socktype;
  conn_authen = svr_conn[sfds].cn_authen;
  conn_addr = svr_conn[sfds].cn_addr;
  svr_conn[sfds].cn_lasttime = time_now;

  pthread_mutex_unlock(svr_conn[sfds].cn_mutex);

  if ((request = alloc_br(0)) == NULL)
    {
    snprintf(tmpLine, sizeof(tmpLine),
             "cannot allocate memory for request from %lu", conn_addr);

    req_reject(PBSE_MEM_MALLOC, 0, request, NULL, tmpLine);

    free_request = FALSE;

    rc = PBSE_SYSTEM;

    goto process_request_cleanup;
    }

  request->rq_conn = sfds;

  /*
   * Read in the request and decode it to the internal request structure.
   */

  if (conn_active == FromClientDIS || conn_active == ToServerDIS)
    {
#ifdef ENABLE_UNIX_SOCKETS
    if ((conn_socktype & PBS_SOCK_UNIX) &&
        (conn_authen != PBS_NET_CONN_AUTHENTICATED))
      {
      /* get_creds interestingly always returns 0 */
      get_creds(sfds, conn_credent[sfds].username, conn_credent[sfds].hostname);
      }
#endif /* END ENABLE_UNIX_SOCKETS */

    rc = dis_request_read(chan, request);
    }
  else
    {
    char out[80];

    snprintf(tmpLine, MAXLINE,
             "request on invalid type of connection: %d, sock type: %d, from address %s",
             conn_active, conn_socktype, netaddr_long(conn_addr, out));

    log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_REQUEST, "process_req", tmpLine);

    snprintf(tmpLine, sizeof(tmpLine),
             "request on invalid type of connection (%d) from %s",
             conn_active, netaddr_long(conn_addr, out));

    req_reject(PBSE_BADHOST, 0, request, NULL, tmpLine);

    free_request = FALSE;

    rc = PBSE_BADHOST;

    goto process_request_cleanup;
    }

  if (rc == -1)
    {
    /* FAILURE */
    /* premature end of file */

    rc = PBSE_PREMATURE_EOF;

    goto process_request_cleanup;
    }

  if ((rc == PBSE_SYSTEM) ||
      (rc == PBSE_INTERNAL) ||
      (rc == PBSE_SOCKET_CLOSE))
    {
    /* FAILURE */
    /* read error, likely cannot send reply so just disconnect */

    /* ??? not sure about this ??? */

    goto process_request_cleanup;
    }

  if (rc > 0)
    {
    /* FAILURE */

    /*
     * request didn't decode, either garbage or unknown
     * request type, in either case, return reject-reply
     */

    req_reject(rc, 0, request, NULL, "cannot decode message");

    free_request = FALSE;

    goto process_request_cleanup;
    }

  if (get_connecthost(sfds, request->rq_host, PBS_MAXHOSTNAME) != 0)
    {
    sprintf(log_buf, "%s: %lu", pbse_to_txt(PBSE_BADHOST), conn_addr);

    log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_REQUEST, "", log_buf);

    snprintf(tmpLine, sizeof(tmpLine),
             "cannot determine hostname for connection from %lu", conn_addr);

    req_reject(PBSE_BADHOST, 0, request, NULL, tmpLine);

    free_request = FALSE;

    rc = PBSE_BADHOST;

    goto process_request_cleanup;
    }

  if (LOGLEVEL >= 1)
    {
    sprintf(log_buf,
            msg_request,
            reqtype_to_txt(request->rq_type),
            request->rq_user,
            request->rq_host,
            sfds);

    log_event(PBSEVENT_DEBUG2, PBS_EVENTCLASS_REQUEST, "", log_buf);
    }

  /* is the request from a host acceptable to the server */

  if (conn_socktype & PBS_SOCK_UNIX)
    {
    strcpy(request->rq_host, server_name);
    }

  get_svr_attr_l(SRV_ATR_acl_host_enable, &acl_enable);

  if (acl_enable)
    {
    /* acl enabled, check it; always allow myself and nodes */

    struct array_strings *pas = NULL;
    struct pbsnode       *isanode;

    get_svr_attr_arst(SRV_ATR_acl_hosts, &pas);

    isanode = PGetNodeFromAddr(conn_addr);

    if ((isanode == NULL) &&
        (strcmp(server_host, request->rq_host) != 0) &&
        (acl_check_my_array_string(pas, request->rq_host, ACL_Host) == 0))
      {
      char tmpLine[MAXLINE];

      snprintf(tmpLine, sizeof(tmpLine), "request not authorized from host %s",
               request->rq_host);

      req_reject(PBSE_BADHOST, 0, request, NULL, tmpLine);

      free_request = FALSE;

      rc = PBSE_BADHOST;

      goto process_request_cleanup;
      }

    if (isanode != NULL)
      unlock_node(isanode, "process_request", NULL, LOGLEVEL);
    }

  /*
   * determine source (user client or another server) of request.
   * set the permissions granted to the client
   */

  if (conn_authen == PBS_NET_CONN_FROM_PRIVIL)
    {
    /* request came from another server */

    request->rq_fromsvr = 1;

    request->rq_perm =
      ATR_DFLAG_USRD | ATR_DFLAG_USWR |
      ATR_DFLAG_OPRD | ATR_DFLAG_OPWR |
      ATR_DFLAG_MGRD | ATR_DFLAG_MGWR |
      ATR_DFLAG_SvWR;
    }
  else
    {
    /* request not from another server */

    conn_credent[sfds].timestamp = time_now;

    request->rq_fromsvr = 0;

    /*
     * Client must be authenticated by an Authenticate User Request, if not,
     * reject request and close connection. -- The following is retained for
     * compat with old cmds -- The exception to this is of course the Connect
     * Request which cannot have been authenticated, because it contains the
     * needed ticket; so trap it here. Of course, there is no prior
     * authentication on the Authenticate User request either, but it comes
     * over a reserved port and appears from another server, hence is
     * automatically granted authentication.
     *
     * The above is only true with inet sockets. With unix domain sockets, the
     * user creds were read before the first dis_request_read call above.
     * We automatically granted authentication because we can trust the socket
     * creds. Authorization is still granted in svr_get_privilege below.
     */

    if (request->rq_type == PBS_BATCH_Connect)
      {
      req_connect(request);

      if (conn_socktype == PBS_SOCK_INET)
        {
        rc = PBSE_IVALREQ;

        req_reject(rc, 0, request, NULL, NULL);

        free_request = FALSE;

        goto process_request_cleanup;
        }
      }

    if (conn_socktype & PBS_SOCK_UNIX)
      {
      pthread_mutex_lock(svr_conn[sfds].cn_mutex);

      svr_conn[sfds].cn_authen = PBS_NET_CONN_AUTHENTICATED;

      pthread_mutex_unlock(svr_conn[sfds].cn_mutex);
      }

    if (ENABLE_TRUSTED_AUTH == TRUE)
      rc = PBSE_NONE; /* bypass the authentication of the user--trust the client completely */
    else if (munge_on)
      {
      /* If munge_on is true we will validate the connection now */
      if (request->rq_type == PBS_BATCH_AltAuthenUser)
        {
        rc = req_altauthenuser(request);

        free_request = FALSE;

        goto process_request_cleanup;
        }
      else
        {
        rc = authenticate_user(request, &conn_credent[sfds], &auth_err);
        }
      }
    else if (conn_authen != PBS_NET_CONN_AUTHENTICATED)
      /* skip checking user if we did not get an authenticated credential */
      rc = PBSE_BADCRED;
    else
      rc = authenticate_user(request, &conn_credent[sfds], &auth_err);

    if (rc != 0)
      {
      req_reject(rc, 0, request, NULL, auth_err);

      if (auth_err != NULL)
        free(auth_err);

      free_request = FALSE;

      goto process_request_cleanup;
      }

    /*
     * pbs_mom and checkpoint restart scripts both need the authority to do
     * alters and releases on checkpointable jobs. Allow manager permission
     * for root on the jobs execution node.
     */

    if (((request->rq_type == PBS_BATCH_ModifyJob) ||
         (request->rq_type == PBS_BATCH_ReleaseJob)) &&
        (strcmp(request->rq_user, PBS_DEFAULT_ADMIN) == 0))
      {
      job  *pjob;
      char *dptr;
      int   skip = FALSE;
      char  short_host[PBS_MAXHOSTNAME + 1];

      /* make short host name */

      strcpy(short_host, request->rq_host);

      if ((dptr = strchr(short_host, '.')) != NULL)
        {
        *dptr = '\0';
        }

      if ((pjob = svr_find_job(request->rq_ind.rq_modify.rq_objname, FALSE)) != (job *)0)
        {
        if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
          {
          if ((pjob->ji_wattr[JOB_ATR_checkpoint].at_flags & ATR_VFLAG_SET) &&
              ((csv_find_string(pjob->ji_wattr[JOB_ATR_checkpoint].at_val.at_str, "s") != NULL) ||
               (csv_find_string(pjob->ji_wattr[JOB_ATR_checkpoint].at_val.at_str, "c") != NULL) ||
               (csv_find_string(pjob->ji_wattr[JOB_ATR_checkpoint].at_val.at_str, "enabled") != NULL)) &&
              (strstr(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, short_host) != NULL))
            {
            request->rq_perm = svr_get_privilege(request->rq_user, server_host);

            skip = TRUE;
            }
          }

        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
        }

      if (!skip)
        {
        request->rq_perm = svr_get_privilege(request->rq_user, request->rq_host);
        }
      }
    else
      {
      request->rq_perm = svr_get_privilege(request->rq_user, request->rq_host);
      }
    }  /* END else (conn_authen == PBS_NET_CONN_FROM_PRIVIL) */

  /* if server shutting down, disallow new jobs and new running */

  get_svr_attr_l(SRV_ATR_State, &state);

  if (state > SV_STATE_RUN)
    {
    switch (request->rq_type)
      {
      case PBS_BATCH_AsyrunJob:
      case PBS_BATCH_JobCred:
      case PBS_BATCH_MoveJob:
      case PBS_BATCH_QueueJob:
      case PBS_BATCH_RunJob:
      case PBS_BATCH_StageIn:
      case PBS_BATCH_jobscript:

        req_reject(PBSE_SVRDOWN, 0, request, NULL, NULL);

        rc = PBSE_SVRDOWN;

        free_request = FALSE;

        goto process_request_cleanup;

        /*NOTREACHED*/

        break;
      }
    }

  /*
   * dispatch the request to the correct processing function.
   * The processing function must call reply_send() to free
   * the request structure.
   */

  rc = dispatch_request(sfds, request);

  return(rc);

process_request_cleanup:

  if (free_request == TRUE)
    free_br(request);

  return(rc);
  }  /* END process_request() */
int process_status_info(

  const char               *nd_name,
  std::vector<std::string> &status_info)

  {
  const char     *name = nd_name;
  struct pbsnode *current;
  long            mom_job_sync = FALSE;
  long            auto_np = FALSE;
  long            down_on_error = FALSE;
  int             dont_change_state = FALSE;
  pbs_attribute   temp;
  int             rc = PBSE_NONE;
  bool            send_hello = false;

  get_svr_attr_l(SRV_ATR_MomJobSync, &mom_job_sync);
  get_svr_attr_l(SRV_ATR_AutoNodeNP, &auto_np);
  get_svr_attr_l(SRV_ATR_DownOnError, &down_on_error);

  /* Before filling the "temp" pbs_attribute, initialize it.
   * The second and third parameter to decode_arst are never
   * used, so just leave them empty. (GBS) */

  memset(&temp, 0, sizeof(temp));

  if ((rc = decode_arst(&temp, NULL, NULL, NULL, 0)) != PBSE_NONE)
    {
    log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_NODE, __func__, "cannot initialize attribute");

    return(rc);
    }

  /* if original node cannot be found do not process the update */
  if ((current = find_nodebyname(nd_name)) == NULL)
    return(PBSE_NONE);

  // A node we put to sleep is up and running.
  if (current->nd_power_state != POWER_STATE_RUNNING)
    {
    // Make sure we wait for a stray update that came after we changed
    // the state to pass by.
    if ((current->nd_power_state_change_time + NODE_POWER_CHANGE_TIMEOUT) < time(NULL))
      {
      current->nd_power_state = POWER_STATE_RUNNING;
      write_node_power_state();
      }
    }

  /* loop over each string */
  for (unsigned int i = 0; i != status_info.size(); i++)
    {
    const char *str = status_info[i].c_str();

    /* these two options are for switching nodes */
    if (!strncmp(str, NUMA_KEYWORD, strlen(NUMA_KEYWORD)))
      {
      /* if we've already processed some, save this before moving on */
      if (i != 0)
        save_node_status(current, &temp);

      dont_change_state = FALSE;

      if ((current = get_numa_from_str(str, current)) == NULL)
        break;
      else
        continue;
      }
    else if (!strncmp(str, "node=", strlen("node=")))
      {
      /* if we've already processed some, save this before moving on */
      if (i != 0)
        save_node_status(current, &temp);

      dont_change_state = FALSE;

      if ((current = get_node_from_str(str, name, current)) == NULL)
        break;
      else
        {
        if (current->nd_mom_reported_down == TRUE)
          {
          /* There is a race condition if using a mom hierarchy and manually
           * shutting down a non-level 1 mom: if its message that the mom is
           * shutting down gets there before its last status update, the node
           * can incorrectly be set as free again. For that reason, only set
           * a mom back up if it's reporting for itself. */
          if (strcmp(name, str + strlen("node=")) != 0)
            dont_change_state = TRUE;
          else
            current->nd_mom_reported_down = FALSE;
          }

        continue;
        }
      }

    /* add the info to the "temp" pbs_attribute */
    else if (!strcmp(str, START_GPU_STATUS))
      {
      is_gpustat_get(current, i, status_info);
      str = status_info[i].c_str();
      }
    else if (!strcmp(str, START_MIC_STATUS))
      {
      process_mic_status(current, i, status_info);
      str = status_info[i].c_str();
      }
#ifdef PENABLE_LINUX_CGROUPS
    else if (!strncmp(str, "layout", 6))
      {
      if (current->nd_layout == NULL)
        {
        current->nd_layout = new Machine(status_info[i]);
        }

      continue;
      }
#endif
    else if (!strcmp(str, "first_update=true"))
      {
      /* mom is requesting that we send the mom hierarchy file to her */
      //remove_hello(&hellos, current->nd_id);
      send_hello = true;

      /* reset gpu data in case mom reconnects with changed gpus */
      clear_nvidia_gpus(current);
      }
    else if ((rc = decode_arst(&temp, NULL, NULL, str, 0)) != PBSE_NONE)
      {
      DBPRT(("is_stat_get: cannot add attributes\n"));

      free_arst(&temp);

      break;
      }

    if (!strncmp(str, "state", 5))
      {
      if (dont_change_state == FALSE)
        process_state_str(current, str);
      }
    else if ((allow_any_mom == TRUE) &&
             (!strncmp(str, "uname", 5)))
      {
      process_uname_str(current, str);
      }
    else if (!strncmp(str, "me", 2)) /* shorter str compare than "message" */
      {
      if ((!strncmp(str, "message=ERROR", 13)) &&
          (down_on_error == TRUE))
        {
        update_node_state(current, INUSE_DOWN);
        dont_change_state = TRUE;

        set_note_error(current, str);
        }
      }
    else if (!strncmp(str, "macaddr=", 8))
      {
      update_node_mac_addr(current, str + 8);
      }
    else if ((mom_job_sync == TRUE) &&
             (!strncmp(str, "jobdata=", 8)))
      {
      /* update job attributes based on what the MOM gives us */
      update_job_data(current, str + strlen("jobdata="));
      }
    else if ((mom_job_sync == TRUE) &&
             (!strncmp(str, "jobs=", 5)))
      {
      /* walk job list reported by mom */
      size_t         len = strlen(str) + strlen(current->nd_name) + 2;
      char          *jobstr = (char *)calloc(1, len);
      sync_job_info *sji = (sync_job_info *)calloc(1, sizeof(sync_job_info));

      if ((jobstr != NULL) && (sji != NULL))
        {
        sprintf(jobstr, "%s:%s", current->nd_name, str + 5);
        sji->input = jobstr;
        sji->timestamp = time(NULL);

        /* sji must be freed in sync_node_jobs */
        enqueue_threadpool_request(sync_node_jobs, sji, task_pool);
        }
      else
        {
        if (jobstr != NULL)
          {
          free(jobstr);
          }

        if (sji != NULL)
          {
          free(sji);
          }
        }
      }
    else if (auto_np)
      {
      if (!(strncmp(str, "ncpus=", 6)))
        {
        handle_auto_np(current, str);
        }
      }
    }  /* END processing strings */

  if (current != NULL)
    {
    save_node_status(current, &temp);

    unlock_node(current, __func__, NULL, LOGLEVEL);
    }

  if ((rc == PBSE_NONE) && (send_hello == true))
    rc = SEND_HELLO;

  return(rc);
  }  /* END process_status_info() */
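Status updates arrive as flat "key=value" strings that the loop above routes by prefix. A trimmed, self-contained sketch of that dispatch shape (the handlers here just print, and the keys shown are a subset):

/* Sketch of the prefix dispatch in process_status_info(). */
#include <stdio.h>
#include <string.h>

static void dispatch(const char *str)
  {
  if (!strncmp(str, "state=", 6))
    printf("state update: %s\n", str + 6);
  else if (!strncmp(str, "jobs=", 5))
    printf("job list: %s\n", str + 5);
  else
    printf("stored as-is: %s\n", str);
  }

int main(void)
  {
  dispatch("state=free");
  dispatch("jobs=123.napali");
  dispatch("ncpus=8");
  return 0;
  }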
int process_alps_status(

  char           *nd_name,
  dynamic_string *status_info)

  {
  char           *str;
  char           *ccu_p = NULL;
  char           *current_node_id = NULL;
  char            node_index_buf[MAXLINE];
  int             node_index = 0;
  struct pbsnode *parent;
  struct pbsnode *current = NULL;
  int             rc;
  pbs_attribute   temp;
  hash_table_t   *rsv_ht;
  char            log_buf[LOCAL_LOG_BUF_SIZE];

  memset(&temp, 0, sizeof(temp));

  if ((rc = decode_arst(&temp, NULL, NULL, NULL, 0)) != PBSE_NONE)
    {
    log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_NODE, __func__, "cannot initialize attribute");

    return(rc);
    }

  /* if we can't find the parent node, ignore the update */
  if ((parent = find_nodebyname(nd_name)) == NULL)
    return(PBSE_NONE);

  /* keep track of reservations so that they're only processed once per update */
  rsv_ht = create_hash(INITIAL_RESERVATION_HOLDER_SIZE);

  /* loop over each string */
  for (str = status_info->str; str != NULL && *str != '\0'; str += strlen(str) + 1)
    {
    if (!strncmp(str, "node=", strlen("node=")))
      {
      if (str != status_info->str)
        {
        snprintf(node_index_buf, sizeof(node_index_buf), "node_index=%d", node_index++);
        decode_arst(&temp, NULL, NULL, node_index_buf, 0);

        if (current != NULL)
          save_node_status(current, &temp);
        }

      if ((current = determine_node_from_str(str, parent, current)) == NULL)
        break;
      else
        continue;
      }

    if (current == NULL)
      continue;

    /* process the gpu status information separately */
    if (!strcmp(CRAY_GPU_STATUS_START, str))
      {
      process_gpu_status(current, &str);

      continue;
      }
    else if (!strncmp(reservation_id, str, strlen(reservation_id)))
      {
      char *just_rsv_id = str + strlen(reservation_id);

      if (get_value_hash(rsv_ht, just_rsv_id) == -1)
        {
        add_hash(rsv_ht, 1, strdup(just_rsv_id));

        /* sub-functions will attempt to lock a job, so we must unlock the
         * reporter node */
        unlock_node(parent, __func__, NULL, LOGLEVEL);

        process_reservation_id(current, str);

        current_node_id = strdup(current->nd_name);
        unlock_node(current, __func__, NULL, LOGLEVEL);

        /* re-lock the parent */
        if ((parent = find_nodebyname(nd_name)) == NULL)
          {
          /* reporter node disappeared - this shouldn't be possible */
          log_err(PBSE_UNKNODE, __func__, "Alps reporter node disappeared while recording a reservation");

          free_arst(&temp);
          free_all_keys(rsv_ht);
          free_hash(rsv_ht);
          free(current_node_id);

          return(PBSE_NONE);
          }

        if ((current = find_node_in_allnodes(&parent->alps_subnodes, current_node_id)) == NULL)
          {
          /* current node disappeared, this shouldn't be possible either */
          unlock_node(parent, __func__, NULL, LOGLEVEL);

          snprintf(log_buf, sizeof(log_buf),
                   "Current node '%s' disappeared while recording a reservation",
                   current_node_id);

          log_err(PBSE_UNKNODE, __func__, log_buf);

          free_arst(&temp);
          free_all_keys(rsv_ht);
          free_hash(rsv_ht);
          free(current_node_id);

          return(PBSE_NONE);
          }

        free(current_node_id);
        current_node_id = NULL;
        }
      }
    /* save this as is to the status strings */
    else if ((rc = decode_arst(&temp, NULL, NULL, str, 0)) != PBSE_NONE)
      {
      free_arst(&temp);
      free_all_keys(rsv_ht);
      free_hash(rsv_ht);

      return(rc);
      }

    /* perform any special processing */
    if (!strncmp(str, ccu_eq, ac_ccu_eq_len))
      {
      /* save compute unit count in case we need it */
      /* note: this string (ccu_eq (CCU=)) needs to be found before
       * cprocs_eq (CPROCS=) for the node */
      ccu_p = str;
      }
    else if (!strncmp(str, cproc_eq, ac_cproc_eq_len))
      {
      int  ncpus;
      long svr_nppcu_value = 0;

      /*
       * Get the server nppcu value which determines how Hyper-Threaded
       * cores are reported. When the server nppcu value is:
       *
       *   0 - Let ALPS choose whether or not to use Hyper-Threaded cores
       *       (report all cores)
       *   1 - Do not use Hyper-Threaded cores
       *       (report only physical cores (compute unit count))
       *   2 - Use Hyper-Threaded cores
       *       (report all cores)
       */
      get_svr_attr_l(SRV_ATR_nppcu, &svr_nppcu_value);

      if (svr_nppcu_value == NPPCU_NO_USE_HT && ccu_p != NULL)
        {
        /* no HT (nppcu==1), so use compute unit count */
        ncpus = atoi(ccu_p + ac_ccu_eq_len);

        /* use CPROC value if we are using APBASIL protocol < 1.3 */
        if (ncpus == 0)
          ncpus = atoi(str + ac_cproc_eq_len);

        /* reset the pointer */
        ccu_p = NULL;
        }
      else
        {
        /* let ALPS choose (nppcu==0) or use HT (nppcu==2), use actual processor count */
        ncpus = atoi(str + ac_cproc_eq_len);
        }

      set_ncpus(current, parent, ncpus);
      }
    else if (!strncmp(str, state, strlen(state)))
      {
      set_state(current, str);
      }
    }  /* END processing the status update */

  if (current != NULL)
    {
    snprintf(node_index_buf, sizeof(node_index_buf), "node_index=%d", node_index++);
    decode_arst(&temp, NULL, NULL, node_index_buf, 0);
    save_node_status(current, &temp);

    unlock_node(current, __func__, NULL, LOGLEVEL);
    }

  unlock_node(parent, __func__, NULL, LOGLEVEL);

  free_all_keys(rsv_ht);
  free_hash(rsv_ht);

  return(PBSE_NONE);
  }  /* END process_alps_status() */
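The nppcu handling above boils down to a three-way choice of which core count to report. A minimal sketch of that selection; choose_ncpus() and the sample counts are hypothetical, only NPPCU_NO_USE_HT mirrors the source:

/* Sketch of the nppcu selection in process_alps_status(): nppcu==1
 * reports compute units (physical cores), falling back to CPROCS when
 * the compute-unit count is absent (older APBASIL protocol); nppcu==0
 * or 2 reports all (hyper-threaded) cores. */
#include <stdio.h>

#define NPPCU_NO_USE_HT 1

static int choose_ncpus(long nppcu, int compute_units, int cprocs)
  {
  if (nppcu == NPPCU_NO_USE_HT)
    return (compute_units != 0) ? compute_units : cprocs;

  return cprocs;
  }

int main(void)
  {
  printf("%d\n", choose_ncpus(1, 16, 32)); /* 16: physical cores only */
  printf("%d\n", choose_ncpus(1, 0, 32));  /* 32: old protocol fallback */
  printf("%d\n", choose_ncpus(0, 16, 32)); /* 32: let ALPS decide */
  return 0;
  }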
int setup_array_struct(

  job *pjob)

  {
  job_array          *pa;
  array_request_node *rn;
  int                 bad_token_count;
  int                 array_size;
  int                 rc;
  char                log_buf[LOCAL_LOG_BUF_SIZE];
  long                max_array_size;

  pa = (job_array *)calloc(1, sizeof(job_array));

  pa->ai_qs.struct_version = ARRAY_QS_STRUCT_VERSION;

  strcpy(pa->ai_qs.parent_id, pjob->ji_qs.ji_jobid);
  strcpy(pa->ai_qs.fileprefix, pjob->ji_qs.ji_fileprefix);

  snprintf(pa->ai_qs.owner, sizeof(pa->ai_qs.owner), "%s",
           pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str);
  snprintf(pa->ai_qs.submit_host, sizeof(pa->ai_qs.submit_host), "%s",
           get_variable(pjob, pbs_o_host));

  pa->ai_qs.num_cloned = 0;

  CLEAR_HEAD(pa->request_tokens);

  pa->ai_mutex = calloc(1, sizeof(pthread_mutex_t));
  pthread_mutex_init(pa->ai_mutex, NULL);
  lock_ai_mutex(pa, __func__, NULL, LOGLEVEL);

  if (job_save(pjob, SAVEJOB_FULL, 0) != 0)
    {
    /* the array is deleted in svr_job_purge */
    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
    svr_job_purge(pjob);

    /* Does job array need to be removed? */

    if (LOGLEVEL >= 6)
      {
      log_record(PBSEVENT_JOB,
                 PBS_EVENTCLASS_JOB,
                 (pjob != NULL) ? pjob->ji_qs.ji_jobid : "NULL",
                 "cannot save job");
      }

    return(1);
    }

  if ((rc = set_slot_limit(pjob->ji_wattr[JOB_ATR_job_array_request].at_val.at_str, pa)))
    {
    long max_limit = 0;

    get_svr_attr_l(SRV_ATR_MaxSlotLimit, &max_limit);

    /* log before deleting the array - pa must not be used after array_delete() */
    snprintf(log_buf, sizeof(log_buf),
             "Array %s requested a slot limit above the max limit %ld, rejecting\n",
             pa->ai_qs.parent_id,
             max_limit);

    log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, pa->ai_qs.parent_id, log_buf);

    array_delete(pa);

    return(INVALID_SLOT_LIMIT);
    }

  pa->ai_qs.jobs_running = 0;
  pa->ai_qs.num_started = 0;
  pa->ai_qs.num_failed = 0;
  pa->ai_qs.num_successful = 0;

  bad_token_count = parse_array_request(
                      pjob->ji_wattr[JOB_ATR_job_array_request].at_val.at_str,
                      &(pa->request_tokens));

  /* get the number of elements that should be allocated in the array */
  rn = (array_request_node *)GET_NEXT(pa->request_tokens);

  array_size = 0;

  pa->ai_qs.num_jobs = 0;

  while (rn != NULL)
    {
    if (rn->end > array_size)
      array_size = rn->end;

    /* calculate the actual number of jobs (different from array size) */
    pa->ai_qs.num_jobs += rn->end - rn->start + 1;

    rn = (array_request_node *)GET_NEXT(rn->request_tokens_link);
    }

  /* size of array is the biggest index + 1 */
  array_size++;

  if (get_svr_attr_l(SRV_ATR_MaxArraySize, &max_array_size) == PBSE_NONE)
    {
    if (max_array_size < pa->ai_qs.num_jobs)
      {
      array_delete(pa);

      return(ARRAY_TOO_LARGE);
      }
    }

  /* initialize the array */
  pa->job_ids = (char **)calloc(array_size, sizeof(char *));

  if (pa->job_ids == NULL)
    {
    sprintf(log_buf, "Failed to alloc job_ids: job %s", pjob->ji_qs.ji_jobid);

    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);

    return(PBSE_MEM_MALLOC);
    }

  /* remember array_size */
  pa->ai_qs.array_size = array_size;

  CLEAR_HEAD(pa->ai_qs.deps);

  array_save(pa);

  if (bad_token_count > 0)
    {
    array_delete(pa);

    return(2);
    }

  pjob->ji_arraystruct = pa;

  insert_array(pa);

  unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

  return(PBSE_NONE);
  }  /* END setup_array_struct() */
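The sizing pass above distinguishes array_size (highest index + 1, used to allocate job_ids) from num_jobs (the sum of the range widths). A standalone sketch over a pre-parsed request such as "1-5,10", with struct range standing in for array_request_node:

/* Sketch of the array sizing pass in setup_array_struct(): walk the
 * parsed ranges, track the largest end index and the true job count. */
#include <stdio.h>

struct range { int start; int end; };

int main(void)
  {
  struct range tokens[] = { { 1, 5 }, { 10, 10 } }; /* "1-5,10" */
  int ntokens = 2;
  int array_size = 0;
  int num_jobs = 0;

  for (int i = 0; i < ntokens; i++)
    {
    if (tokens[i].end > array_size)
      array_size = tokens[i].end;

    num_jobs += tokens[i].end - tokens[i].start + 1;
    }

  array_size++; /* size of array is the biggest index + 1 */

  printf("array_size=%d num_jobs=%d\n", array_size, num_jobs); /* 11, 6 */
  return 0;
  }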
int finalize_rerunjob(

  struct batch_request *preq,
  job                  *pjob,
  int                   rc)

  {
  int  Force;
  char log_buf[LOCAL_LOG_BUF_SIZE];

  if (preq->rq_extend && !strncasecmp(preq->rq_extend, RERUNFORCE, strlen(RERUNFORCE)))
    Force = 1;
  else
    Force = 0;

  switch (rc)
    {
    case -1:

      /* completed job was requeued */
      /* clear out job completion time if there is one */
      break;

    case 0:

      /* requeue request successful */
      if (pjob != NULL)
        pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

      break;

    case PBSE_SYSTEM: /* This may not be accurate... */

      rc = PBSE_MEM_MALLOC;
      snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Cannot allocate memory");
      req_reject(rc, 0, preq, NULL, log_buf);

      if (pjob != NULL)
        unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);

      return(rc);

    default:

      if (Force == 0)
        {
        rc = PBSE_MOMREJECT;
        snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Rejected by mom");
        req_reject(rc, 0, preq, NULL, log_buf);

        if (pjob != NULL)
          unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);

        return(rc);
        }
      else
        {
        int           newstate;
        int           newsubst;
        unsigned int  dummy;
        char         *tmp;
        long          cray_enabled = FALSE;

        if (pjob != NULL)
          {
          get_svr_attr_l(SRV_ATR_CrayEnabled, &cray_enabled);

          if ((cray_enabled == TRUE) &&
              (pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str != NULL))
            tmp = parse_servername(pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str, &dummy);
          else
            tmp = parse_servername(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, &dummy);

          /* Cannot communicate with MOM, forcibly requeue job.
             This is a relatively disgusting thing to do. */

          sprintf(log_buf, "rerun req to %s failed (rc=%d), forcibly requeueing job",
            tmp, rc);

          free(tmp);

          log_event(PBSEVENT_ERROR | PBSEVENT_ADMIN | PBSEVENT_JOB,
            PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

          log_err(-1, __func__, log_buf);

          strcat(log_buf, ", previous output files may be lost");

          svr_mailowner(pjob, MAIL_OTHER, MAIL_FORCE, log_buf);

          svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_RERUN3, FALSE);

          rel_resc(pjob); /* free resc assigned to job */

          if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_HOTSTART) == 0)
            {
            /* in case of server shutdown, don't clear exec_host */
            /* will use it on hotstart when next comes up */
            job_attr_def[JOB_ATR_exec_host].at_free(&pjob->ji_wattr[JOB_ATR_exec_host]);
            job_attr_def[JOB_ATR_session_id].at_free(&pjob->ji_wattr[JOB_ATR_session_id]);
            job_attr_def[JOB_ATR_exec_gpus].at_free(&pjob->ji_wattr[JOB_ATR_exec_gpus]);
            }

          pjob->ji_modified = 1; /* force full job save */

          pjob->ji_momhandle = -1;
          pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_StagedIn;

          svr_evaljobstate(pjob, &newstate, &newsubst, 0);
          svr_setjobstate(pjob, newstate, newsubst, FALSE);
          }
        }

      break;
    }  /* END switch (rc) */

  /* So job has run and is to be rerun (not restarted) */

  if (pjob == NULL)
    {
    rc = PBSE_JOB_RERUN;
    }
  else
    {
    pjob->ji_qs.ji_svrflags =
      (pjob->ji_qs.ji_svrflags &
       ~(JOB_SVFLG_CHECKPOINT_FILE | JOB_SVFLG_CHECKPOINT_MIGRATEABLE |
         JOB_SVFLG_CHECKPOINT_COPIED)) | JOB_SVFLG_HASRUN;

    sprintf(log_buf, msg_manager, msg_jobrerun, preq->rq_user, preq->rq_host);
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

    reply_ack(preq);

    /* note in accounting file */
    account_record(PBS_ACCT_RERUN, pjob, NULL);

    unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL);
    }

  return(rc);
  }  /* END finalize_rerunjob() */
int status_job(

  job           *pjob,      /* ptr to job to status */
  batch_request *preq,
  svrattrl      *pal,       /* specific attributes to status */
  tlist_head    *pstathd,   /* RETURN: head of list to append status to */
  bool           condensed,
  int           *bad)       /* RETURN: index of first bad pbs_attribute */

  {
  struct brp_status *pstat;
  int                IsOwner = 0;
  long               query_others = 0;
  long               condensed_timeout = JOB_CONDENSED_TIMEOUT;

  /* Make sure procct is removed from the job resource attributes */
  remove_procct(pjob);

  /* see if the client is authorized to status this job */
  if (svr_authorize_jobreq(preq, pjob) == 0)
    IsOwner = 1;

  get_svr_attr_l(SRV_ATR_query_others, &query_others);

  if (!query_others)
    {
    if (IsOwner == 0)
      {
      return(PBSE_PERM);
      }
    }

  get_svr_attr_l(SRV_ATR_job_full_report_time, &condensed_timeout);

  // if the job has been modified within the timeout, send the full output
  if ((condensed == true) &&
      (time(NULL) < pjob->ji_mod_time + condensed_timeout))
    condensed = false;

  /* allocate reply structure and fill in header portion */
  if ((pstat = (struct brp_status *)calloc(1, sizeof(struct brp_status))) == NULL)
    {
    return(PBSE_SYSTEM);
    }

  CLEAR_LINK(pstat->brp_stlink);

  pstat->brp_objtype = MGR_OBJ_JOB;

  strcpy(pstat->brp_objname, pjob->ji_qs.ji_jobid);

  CLEAR_HEAD(pstat->brp_attr);

  append_link(pstathd, &pstat->brp_stlink, pstat);

  /* add attributes to the status reply */

  *bad = 0;

  if (status_attrib(pal,
        job_attr_def,
        pjob->ji_wattr,
        JOB_ATR_LAST,
        preq->rq_perm,
        &pstat->brp_attr,
        condensed,
        bad,
        IsOwner))
    {
    return(PBSE_NOATTR);
    }

  return(0);
  }  /* END status_job() */
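/*
 * Sketch of the condensed-status window checked above (illustrative
 * only; the timeout value is hypothetical): with job_full_report_time
 * set to 300, a job modified two minutes ago is still reported in full
 * even when the caller asked for condensed output.
 */

#include <stdbool.h>
#include <time.h>

static bool report_condensed(bool requested, time_t ji_mod_time, long timeout)

  {
  /* mirrors the check in status_job() */
  if ((requested == true) && (time(NULL) < ji_mod_time + timeout))
    return(false);

  return(requested);
  }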
void rerun_or_kill(

  job  **pjob_ptr, /* I (modified/freed) */
  char  *text)     /* I */

  {
  long       server_state = SV_STATE_DOWN;
  char       log_buf[LOCAL_LOG_BUF_SIZE];
  pbs_queue *pque;
  job       *pjob = *pjob_ptr;

  log_buf[0] = '\0'; /* never log stack garbage if no message gets built below */

  get_svr_attr_l(SRV_ATR_State, &server_state);

  if (pjob->ji_wattr[JOB_ATR_rerunable].at_val.at_long)
    {
    /* job is rerunable, mark it to be requeued */

    issue_signal(&pjob, "SIGKILL", free_br, NULL);

    if (pjob != NULL)
      {
      pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

      if ((pque = get_jobs_queue(&pjob)) != NULL)
        {
        snprintf(log_buf, sizeof(log_buf), "%s%s%s",
          msg_init_queued, pque->qu_qs.qu_name, text);

        unlock_queue(pque, __func__, NULL, LOGLEVEL);
        }
      }
    }
  else if (server_state != SV_STATE_SHUTDEL)
    {
    /* job not rerunable, immediate shutdown - kill it off */
    snprintf(log_buf, sizeof(log_buf), "%s%s", msg_job_abort, text);

    /* need to record log message before purging job */
    log_event(PBSEVENT_SYSTEM | PBSEVENT_JOB | PBSEVENT_DEBUG,
      PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

    job_abt(pjob_ptr, log_buf);

    return;
    }
  else
    {
    /* delayed shutdown, leave job running */
    snprintf(log_buf, sizeof(log_buf), "%s%s", msg_leftrunning, text);
    }

  if (pjob != NULL)
    {
    log_event(PBSEVENT_SYSTEM | PBSEVENT_JOB | PBSEVENT_DEBUG,
      PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
    }

  return;
  }  /* END rerun_or_kill() */
/*
 * get_correct_jobname() - makes sure the job searches for the correct name
 * necessary because of SRV_ATR_display_job_server_suffix and
 * SRV_ATR_job_suffix_alias
 *
 * allocs the correct job name
 * @param jobid (I) - the jobid as passed in (NUM.SERVER_NAME)
 * @return a pointer to the correct job name (alloc'd)
 */

char *get_correct_jobname(

  const char *jobid) /* I */

  {
  char *correct = NULL;
  char *dot;
  char *work_jobid = strdup(jobid);
  /* first suffix could be the server name or the alias */
  char *first_suffix = NULL;
  /* second suffix can only be the alias */
  char *second_suffix = NULL;
  int   server_suffix = TRUE;
  int   len;
  long  display_suffix = TRUE;
  char *alias = NULL;

  get_svr_attr_l(SRV_ATR_display_job_server_suffix, &display_suffix);

  if (display_suffix == FALSE)
    server_suffix = FALSE;

  if ((dot = strchr((char *)work_jobid, '.')) != NULL)
    {
    first_suffix = dot + 1;

    if ((dot = strchr(first_suffix, '.')) != NULL)
      {
      second_suffix = dot + 1;
      }

    dot = NULL;
    }

  /* check current settings */
  get_svr_attr_str(SRV_ATR_job_suffix_alias, &alias);

  if ((alias != NULL) &&
      (server_suffix == TRUE))
    {
    /* display the server suffix and the alias */

    /* check if the alias is already there */
    if (second_suffix != NULL)
      {
      if (strcmp(second_suffix, alias) == 0)
        {
        correct = strdup(jobid);

        if (correct == NULL)
          log_err(-1, __func__, "ERROR: Fatal - Cannot allocate memory\n");

        free(work_jobid);

        return(correct);
        }
      }
    else if (first_suffix == NULL)
      {
      /* alloc memory and sprint, add 3 for 2 '.' and the NULL terminator */
      len = strlen(work_jobid) + strlen(server_name) + strlen(alias) + 3;
      correct = (char *)calloc(1, len);

      if (correct == NULL)
        {
        log_err(-1, __func__, "ERROR: Fatal - Cannot allocate memory\n");
        free(work_jobid);
        return(NULL);
        }

      snprintf(correct, len, "%s.%s.%s", work_jobid, server_name, alias);
      }
    else
      {
      /* add 2 for the '.' and the NULL terminator */
      len = strlen(alias) + 2 + strlen(work_jobid);
      correct = (char *)calloc(1, len);

      if (correct == NULL)
        {
        log_err(-1, __func__, "ERROR: Fatal - Cannot allocate memory\n");
        free(work_jobid);
        return(NULL);
        }

      snprintf(correct, len, "%s.%s", work_jobid, alias);
      }
    }  /* END if (server_suffix && alias) */
  else if (server_suffix == TRUE)
    {
    /* just the server suffix */

    if (first_suffix != NULL)
      {
      correct = strdup(work_jobid);

      if (correct == NULL)
        {
        log_err(-1, __func__, "ERROR: Fatal - Cannot allocate memory\n");
        free(work_jobid);
        return(NULL);
        }
      }
    else
      {
      len = strlen(work_jobid) + strlen(server_name) + 2;
      correct = (char *)calloc(1, len);

      if (correct == NULL)
        {
        log_err(-1, __func__, "ERROR: Fatal - Cannot allocate memory\n");
        free(work_jobid);
        return(NULL);
        }

      snprintf(correct, len, "%s.%s", work_jobid, server_name);
      }
    }  /* END if (just server_suffix) */
  else if (alias != NULL)
    {
    /* just the alias, not the server */

    if (first_suffix == NULL)
      {
      len = strlen(work_jobid) + strlen(alias) + 2;
      correct = (char *)calloc(1, len);

      if (correct == NULL)
        {
        log_err(-1, __func__, "ERROR: Fatal - Cannot allocate memory\n");
        free(work_jobid);
        return(NULL);
        }

      snprintf(correct, len, "%s.%s", work_jobid, alias);
      }
    else
      {
      len = strlen(alias) + 2;

      dot = first_suffix - 1;
      *dot = '\0';

      len += strlen(work_jobid);
      correct = (char *)calloc(1, len);

      if (correct == NULL)
        {
        log_err(-1, __func__, "ERROR: Fatal - Cannot allocate memory\n");
        free(work_jobid);
        return(NULL);
        }

      snprintf(correct, len, "%s.%s", work_jobid, alias);

      *dot = '.';
      }
    }  /* END else if (just alias) */
  else
    {
    /* neither the server suffix nor the alias */

    if (first_suffix != NULL)
      {
      dot = first_suffix - 1;
      *dot = '\0';
      }

    len = strlen(work_jobid) + 1;
    correct = (char *)calloc(1, len);

    if (correct == NULL)
      {
      log_err(-1, __func__, "ERROR: Fatal - Cannot allocate memory\n");
      free(work_jobid);
      return(NULL);
      }

    snprintf(correct, len, "%s", work_jobid);

    if (first_suffix != NULL)
      *dot = '.';
    }

  free(work_jobid);

  return(correct);
  }  /* END get_correct_jobname() */
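/*
 * Worked examples for get_correct_jobname(), assuming server_name is
 * "napali" and job_suffix_alias is "ext" (both values hypothetical):
 *
 *   display_job_server_suffix   job_suffix_alias   "123.napali" becomes
 *   TRUE                        "ext"              "123.napali.ext"
 *   TRUE                        unset              "123.napali"
 *   FALSE                       "ext"              "123.ext"
 *   FALSE                       unset              "123"
 *
 * In every case the caller owns the returned string and must free() it.
 */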
void svr_shutdown(

  int type) /* I */

  {
  pbs_attribute *pattr;
  job           *pjob;
  long           state = SV_STATE_DOWN;
  int            iter;
  char           log_buf[LOCAL_LOG_BUF_SIZE];

  close(lockfds);

  save_queues();

  /* Let's start by logging the shutdown and saving everything */
  get_svr_attr_l(SRV_ATR_State, &state);

  strcpy(log_buf, msg_shutdown_start);

  if (state == SV_STATE_SHUTIMM)
    {
    /* if already shutting down, another Immed/sig will force it */

    if ((type == SHUT_IMMEDIATE) || (type == SHUT_SIG))
      {
      state = SV_STATE_DOWN;
      set_svr_attr(SRV_ATR_State, &state);

      strcat(log_buf, "Forced");

      log_event(PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_DEBUG,
        PBS_EVENTCLASS_SERVER, msg_daemonname, log_buf);

      return;
      }
    }

  if (type == SHUT_IMMEDIATE)
    {
    state = SV_STATE_SHUTIMM;
    set_svr_attr(SRV_ATR_State, &state);

    strcat(log_buf, "Immediate");
    }
  else if (type == SHUT_DELAY)
    {
    state = SV_STATE_SHUTDEL;
    set_svr_attr(SRV_ATR_State, &state);

    strcat(log_buf, "Delayed");
    }
  else if (type == SHUT_QUICK)
    {
    state = SV_STATE_DOWN; /* set to down to brk pbsd_main loop */
    set_svr_attr(SRV_ATR_State, &state);

    strcat(log_buf, "Quick");
    }
  else
    {
    state = SV_STATE_SHUTIMM;
    set_svr_attr(SRV_ATR_State, &state);

    strcat(log_buf, "By Signal");
    }

  log_event(PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_DEBUG,
    PBS_EVENTCLASS_SERVER, msg_daemonname, log_buf);

  if ((type == SHUT_QUICK) || (type == SHUT_SIG)) /* quick, leave jobs as are */
    {
    return;
    }

  svr_save(&server, SVR_SAVE_QUICK);

  iter = -1;

  while ((pjob = next_job(&alljobs, &iter)) != NULL)
    {
    if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
      {
      pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HOTSTART | JOB_SVFLG_HASRUN;

      pattr = &pjob->ji_wattr[JOB_ATR_checkpoint];

      if ((pattr->at_flags & ATR_VFLAG_SET) &&
          ((csv_find_string(pattr->at_val.at_str, "s") != NULL) ||
           (csv_find_string(pattr->at_val.at_str, "c") != NULL) ||
           (csv_find_string(pattr->at_val.at_str, "shutdown") != NULL)))
        {
        /* do checkpoint of job */

        if (shutdown_checkpoint(&pjob) == 0)
          {
          if (pjob != NULL)
            unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

          continue;
          }
        }

      /* if there is no checkpoint (not supported, not allowed, or it failed), */
      /* rerun the job if possible, else kill it */

      rerun_or_kill(&pjob, msg_on_shutdown);
      }

    if (pjob != NULL)
      unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
    }

  return;
  }  /* END svr_shutdown() */
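/*
 * Shutdown type to server state mapping implemented above, summarized
 * for reference:
 *
 *   SHUT_IMMEDIATE -> SV_STATE_SHUTIMM  ("Immediate"; running jobs checkpointed or requeued)
 *   SHUT_DELAY     -> SV_STATE_SHUTDEL  ("Delayed")
 *   SHUT_QUICK     -> SV_STATE_DOWN     ("Quick"; jobs left as they are)
 *   SHUT_SIG       -> SV_STATE_SHUTIMM  ("By Signal"; jobs left as they are)
 *
 * A second immediate or signal shutdown while the server is already in
 * SV_STATE_SHUTIMM forces SV_STATE_DOWN.
 */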
int authenticate_user(

  struct batch_request *preq,    /* I */
  struct credential    *pcred,
  char                **autherr) /* O */

  {
  int    rc;
  char   uath[PBS_MAXUSER + PBS_MAXHOSTNAME + 2]; /* "user@host" plus '\0' */
  time_t time_now = time(NULL);
  char   error_msg[1024];
  bool   acl_enabled = false;

#ifdef MUNGE_AUTH
  if (strncmp(preq->rq_user, pcred->username, PBS_MAXUSER))
    {
    /* extra check for munge */
    struct array_strings *my_acl = NULL;
    char                  uh[PBS_MAXUSER + PBS_MAXHOSTNAME + 2];

    sprintf(uh, "%s@%s", preq->rq_user, pcred->hostname);

    get_svr_attr_arst(SRV_ATR_authusers, &my_acl);

    if ((acl_check_my_array_string(my_acl, uh, ACL_User_Host)) == 0)
      {
      *autherr = strdup("User not in authorized user list.");

      sprintf(error_msg, "%s Requested user %s: requested from host %s",
        *autherr, preq->rq_user, preq->rq_host);
      log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, __func__, error_msg);

      return(PBSE_BADCRED);
      }
    }
#else
  if (strncmp(preq->rq_user, pcred->username, PBS_MAXUSER))
    {
    *autherr = strdup("Users do not match");

    sprintf(error_msg, "%s: Requested user %s: credential user %s: requested from host %s",
      *autherr, preq->rq_user, pcred->username, preq->rq_host);
    log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, __func__, error_msg);

    return(PBSE_BADCRED);
    }
#endif

  if (strncmp(preq->rq_host, pcred->hostname, PBS_MAXHOSTNAME))
    {
    struct sockaddr_in *sai1;
    struct sockaddr_in *sai2;
    struct addrinfo    *addr_info1 = NULL;
    struct addrinfo    *addr_info2 = NULL;

    sai1 = get_cached_addrinfo(preq->rq_host);
    sai2 = get_cached_addrinfo(pcred->hostname);

    if ((sai1 == NULL) &&
        (pbs_getaddrinfo(preq->rq_host, NULL, &addr_info1) == PBSE_NONE))
      {
      sai1 = (struct sockaddr_in *)addr_info1->ai_addr;
      }

    if ((sai2 == NULL) &&
        (pbs_getaddrinfo(pcred->hostname, NULL, &addr_info2) == PBSE_NONE))
      {
      sai2 = (struct sockaddr_in *)addr_info2->ai_addr;
      }

    if ((sai1 == NULL) ||
        (sai2 == NULL) ||
        (memcmp(sai1, sai2, sizeof(struct sockaddr_in))))
      {
      *autherr = strdup("Hosts do not match");

      sprintf(error_msg, "%s: Requested host %s: credential host: %s",
        *autherr, preq->rq_host, pcred->hostname);
      log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, __func__, error_msg);

      return(PBSE_BADCRED);
      }
    }

  if (pcred->timestamp)
    {
    long lifetime = 0;

    if (get_svr_attr_l(SRV_ATR_CredentialLifetime, &lifetime) == PBSE_NONE)
      {
      /* use the configured value if it is set */
      }
    else
      {
      /* if not, use the default */
      lifetime = CREDENTIAL_LIFETIME;
      }

    /* negative values mean that credentials have an infinite lifetime */
    if (lifetime > -1)
      {
      if ((pcred->timestamp - CREDENTIAL_TIME_DELTA > time_now) ||
          (pcred->timestamp + lifetime < time_now))
        {
        return(PBSE_EXPIRED);
        }
      }
    }

  /* If the server's Acl_User check is enabled, verify the user is in the list */
  get_svr_attr_b(SRV_ATR_AclUserEnabled, &acl_enabled);

  if (acl_enabled)
    {
    struct array_strings *acl_users = NULL;

    snprintf(uath, sizeof(uath), "%s@%s", preq->rq_user, preq->rq_host);

    get_svr_attr_arst(SRV_ATR_AclUsers, &acl_users);

    if (acl_check_my_array_string(acl_users, uath, ACL_User) == 0)
      {
      int       my_err;
      pbs_net_t connect_addr = get_hostaddr(&my_err, preq->rq_host);
      pbs_net_t server_addr  = get_hostaddr(&my_err, server_host);

#ifdef __CYGWIN__
      if ((!IamAdminByName(preq->rq_user)) || (connect_addr != server_addr))
        {
        return(PBSE_PERM);
        }
#else /* __CYGWIN__ */
#ifdef PBS_ROOT_ALWAYS_ADMIN
      if ((strcmp(preq->rq_user, PBS_DEFAULT_ADMIN) != 0) ||
          (connect_addr != server_addr))
        {
        return(PBSE_PERM);
        }
#else /* PBS_ROOT_ALWAYS_ADMIN */
      return(PBSE_PERM);
#endif /* PBS_ROOT_ALWAYS_ADMIN */
#endif /* __CYGWIN__ */
      }
    }

  /* A site stub for additional checking */
  rc = site_allow_u(preq->rq_user, preq->rq_host);

  return(rc);
  }  /* END authenticate_user() */
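/*
 * The credential freshness window enforced above, written out: a
 * credential stamped at time T is accepted while
 *
 *   T - CREDENTIAL_TIME_DELTA <= now <= T + lifetime
 *
 * where lifetime comes from the server's credential_lifetime attribute
 * (CREDENTIAL_LIFETIME when unset).  A negative lifetime disables the
 * expiration check entirely.
 */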
/**
 * update_array_values()
 *
 * updates the internal bookkeeping values for a job array
 * @param pa - the array to update
 * @param old_state - the state of the job the event happened on
 * @param event - code for the event that just happened
 * @param job_id - the id of the job the event happened on
 * @param job_atr_hold - that job's hold attribute value
 * @param job_exit_status - that job's exit status
 */

void update_array_values(

  job_array            *pa,        /* I */
  int                   old_state, /* I */
  enum ArrayEventsEnum  event,     /* I */
  char                 *job_id,
  long                  job_atr_hold,
  int                   job_exit_status)

  {
  long moab_compatible;

  switch (event)
    {
    case aeQueue:

      /* NYI, nothing needs to be done for this yet */
      break;

    case aeRun:

      if (old_state != JOB_STATE_RUNNING)
        {
        pa->ai_qs.jobs_running++;
        pa->ai_qs.num_started++;
        }

      break;

    case aeTerminate:

      if (old_state == JOB_STATE_RUNNING)
        {
        if (pa->ai_qs.jobs_running > 0)
          pa->ai_qs.jobs_running--;
        }

      if (job_exit_status == 0)
        {
        pa->ai_qs.num_successful++;
        pa->ai_qs.jobs_done++;
        }
      else
        {
        pa->ai_qs.num_failed++;
        pa->ai_qs.jobs_done++;
        }

      array_save(pa);

      /* update slot limit hold if necessary */
      if (get_svr_attr_l(SRV_ATR_MoabArrayCompatible, &moab_compatible) != PBSE_NONE)
        moab_compatible = FALSE;

      if (moab_compatible != FALSE)
        {
        /* only need to update if the job wasn't previously held */
        if ((job_atr_hold & HOLD_l) == FALSE)
          {
          int  i;
          int  newstate;
          int  newsub;
          job *pj;

          /* find the first held job and release its hold */
          for (i = 0; i < pa->ai_qs.array_size; i++)
            {
            if (pa->job_ids[i] == NULL)
              continue;

            if (!strcmp(pa->job_ids[i], job_id))
              continue;

            if ((pj = svr_find_job(pa->job_ids[i], TRUE)) == NULL)
              {
              free(pa->job_ids[i]);
              pa->job_ids[i] = NULL;
              }
            else
              {
              if (pj->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l)
                {
                pj->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;

                if (pj->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
                  {
                  pj->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
                  }

                svr_evaljobstate(pj, &newstate, &newsub, 1);
                svr_setjobstate(pj, newstate, newsub, FALSE);
                job_save(pj, SAVEJOB_FULL, 0);

                unlock_ji_mutex(pj, __func__, "1", LOGLEVEL);

                break;
                }

              unlock_ji_mutex(pj, __func__, "2", LOGLEVEL);
              }
            }
          }
        }

      break;

    default:

      /* log error? */
      break;
    }

  set_array_depend_holds(pa);
  array_save(pa);
  }  /* END update_array_values() */
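/*
 * Illustrative sketch, not part of the server source: the hold-bit
 * manipulation used above when releasing a slot-limit hold.  The flag
 * values are hypothetical stand-ins for the real HOLD_* constants.
 */

#include <stdio.h>

#define HOLD_u 0x1 /* hypothetical user hold */
#define HOLD_l 0x4 /* hypothetical slot-limit hold */

int main(void)

  {
  long hold = HOLD_l | HOLD_u; /* slot-limit hold plus a user hold */

  hold &= ~HOLD_l;             /* release only the slot-limit hold */

  printf("remaining holds: 0x%lx\n", hold); /* prints 0x1 */

  return(0);
  }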
int acct_job(

  job            *pjob, /* I */
  dynamic_string *ds)   /* O */

  {
  int         rc;
  long        cray_enabled = FALSE;
  int         resc_access_perm = READ_ONLY;
  char        local_buf[MAXLINE * 4];
  pbs_queue  *pque;
  tlist_head  attrlist;
  svrattrl   *pal;

  if (pjob == NULL)
    {
    return(PBSE_NONE);
    }

  CLEAR_HEAD(attrlist);

  /* user */

  /* acct_job is only called from account_jobstr and account_jobend.
     BufSize should be PBS_ACCT_MAX_RCD + 1 in size. */
  sprintf(local_buf, "user=%s ", pjob->ji_wattr[JOB_ATR_euser].at_val.at_str);

  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* group */
  sprintf(local_buf, "group=%s ", pjob->ji_wattr[JOB_ATR_egroup].at_val.at_str);

  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* account */
  if (pjob->ji_wattr[JOB_ATR_account].at_flags & ATR_VFLAG_SET)
    {
    sprintf(local_buf, "account=%s ", pjob->ji_wattr[JOB_ATR_account].at_val.at_str);

    if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
      return(rc);
    }

  /* job name */
  sprintf(local_buf, "jobname=%s ", pjob->ji_wattr[JOB_ATR_jobname].at_val.at_str);

  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  if ((pque = get_jobs_queue(&pjob)) != NULL)
    {
    /* queue name */
    sprintf(local_buf, "queue=%s ", pque->qu_qs.qu_name);

    unlock_queue(pque, __func__, NULL, LOGLEVEL);

    if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
      return(rc);
    }
  else if (pjob == NULL)
    {
    log_err(PBSE_JOBNOTFOUND, __func__, "Job lost while acquiring queue 1");

    return(PBSE_JOBNOTFOUND);
    }

  /* create time */
  sprintf(local_buf, "ctime=%ld ", pjob->ji_wattr[JOB_ATR_ctime].at_val.at_long);

  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* queued time */
  sprintf(local_buf, "qtime=%ld ", pjob->ji_wattr[JOB_ATR_qtime].at_val.at_long);

  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* eligible time, how long ready to run */
  sprintf(local_buf, "etime=%ld ", pjob->ji_wattr[JOB_ATR_etime].at_val.at_long);

  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* execution start time */
  sprintf(local_buf, "start=%ld ", (long)pjob->ji_qs.ji_stime);

  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* owner */
  sprintf(local_buf, "owner=%s ", pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str);

  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* For large clusters strings can get pretty long. We need to see if there
     is a need to allocate a bigger buffer. */

  /* execution host name */
  if (pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str != NULL)
    {
    append_dynamic_string(ds, "exec_host=");
    append_dynamic_string(ds, pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str);

    if ((rc = append_dynamic_string(ds, " ")) != PBSE_NONE)
      return(rc);
    }

  get_svr_attr_l(SRV_ATR_CrayEnabled, &cray_enabled);

  if ((cray_enabled == TRUE) &&
      (pjob->ji_wattr[JOB_ATR_login_node_id].at_flags & ATR_VFLAG_SET))
    {
    append_dynamic_string(ds, "login_node=");
    append_dynamic_string(ds, pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str);

    if ((rc = append_dynamic_string(ds, " ")) != PBSE_NONE)
      return(rc);
    }

  /* now encode the job's resource_list pbs_attribute */
  job_attr_def[JOB_ATR_resource].at_encode(
    &pjob->ji_wattr[JOB_ATR_resource],
    &attrlist,
    job_attr_def[JOB_ATR_resource].at_name,
    NULL,
    ATR_ENCODE_CLIENT,
    resc_access_perm);

  while ((pal = (svrattrl *)GET_NEXT(attrlist)) != NULL)
    {
    /* exec_host can use a lot of buffer space. Use a dynamic string. */
    append_dynamic_string(ds, pal->al_name);

    if (pal->al_resc != NULL)
      {
      append_dynamic_string(ds, ".");
      append_dynamic_string(ds, pal->al_resc);
      }

    append_dynamic_string(ds, "=");
    append_dynamic_string(ds, pal->al_value);

    if ((rc = append_dynamic_string(ds, " ")) != PBSE_NONE)
      return(rc);

    delete_link(&pal->al_link);
    free(pal);
    }  /* END while (pal != NULL) */

#ifdef ATTR_X_ACCT

  /* x attributes */
  if (pjob->ji_wattr[JOB_SITE_ATR_x].at_flags & ATR_VFLAG_SET)
    {
    sprintf(local_buf, "x=%s ", pjob->ji_wattr[JOB_SITE_ATR_x].at_val.at_str);

    if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
      return(rc);
    }

#endif

  /* SUCCESS */

  return(PBSE_NONE);
  }  /* END acct_job() */
int set_jobexid(

  job           *pjob,   /* I */
  pbs_attribute *attrry, /* I */
  char          *EMsg)   /* O (optional,minsize=1024) */

  {
  int            addflags = 0;
  pbs_attribute *pattr;
  char         **pmem;

  struct group  *gpent;
  int            free_puser = FALSE;
  char          *puser = NULL;
  char          *at;
  char          *usr_at_host = NULL;
  int            len;

  struct passwd *pwent = NULL;
  char          *pgrpn;
  int            free_pgrpn = TRUE;
  char           gname[PBS_MAXGRPN + 1];

#ifdef _CRAY

  struct udb    *pudb;

#endif

  char           tmpLine[1024 + 1];
  char           log_buf[LOCAL_LOG_BUF_SIZE];
  long           disable_id_check = 0;
  int            CheckID; /* boolean */

  if (EMsg != NULL)
    EMsg[0] = '\0';

  /* use the passed User_List if set, may be a newly modified one */
  /* if not set, fall back to the job's actual User_List, may be same */

  if (get_svr_attr_l(SRV_ATR_DisableServerIdCheck, &disable_id_check) != PBSE_NONE)
    CheckID = 1;
  else
    CheckID = !disable_id_check;

  if (CheckID == 0)
    {
    /* NOTE: use owner, not userlist - should this be changed? */
    /* Yes, changed 10/17/2007 */

    if (pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str != NULL)
      {
      /* start of change to use userlist instead of owner 10/17/2007 */

      if ((attrry + JOB_ATR_userlst)->at_flags & ATR_VFLAG_SET)
        pattr = attrry + JOB_ATR_userlst;
      else
        pattr = &pjob->ji_wattr[JOB_ATR_userlst];

      if (pjob->ji_wattr[JOB_ATR_proxy_user].at_flags & ATR_VFLAG_SET)
        {
        /* puser is owned by the proxy_user attribute here: do not free it */
        puser = pjob->ji_wattr[JOB_ATR_proxy_user].at_val.at_str;

        /* set the job's owner as the new user, appending @host if
         * the job's owner has that */
        at = strchr(pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str, '@');
        len = strlen(puser) + 1;

        if (at != NULL)
          {
          len += strlen(at);
          usr_at_host = (char *)calloc(len, sizeof(char));
          snprintf(usr_at_host, len, "%s%s", puser, at);
          }
        else
          {
          usr_at_host = (char *)calloc(len, sizeof(char));
          snprintf(usr_at_host, len, "%s", puser);
          }

        free(pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str);
        pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str = usr_at_host;
        }
      else if ((puser = geteusernam(pjob, pattr)) == NULL)
        {
        if (EMsg != NULL)
          snprintf(EMsg, 1024, "cannot locate user name in job");

        return(PBSE_BADUSER);
        }
      else
        {
        /* only geteusernam() allocated a string that we must free */
        free_puser = TRUE;
        }

      sprintf(tmpLine, "%s", puser);

      /* end of change to use userlist instead of owner 10/17/2007 */
      }
    else
      {
      strcpy(tmpLine, "???");
      }
    }  /* END if (CheckID == 0) */
  else
    {
    int perm;

    if ((attrry + JOB_ATR_userlst)->at_flags & ATR_VFLAG_SET)
      pattr = attrry + JOB_ATR_userlst;
    else
      pattr = &pjob->ji_wattr[JOB_ATR_userlst];

    free_puser = TRUE;

    if ((puser = geteusernam(pjob, pattr)) == NULL)
      {
      if (EMsg != NULL)
        snprintf(EMsg, 1024, "cannot locate user name in job");

      return(PBSE_BADUSER);
      }

    pwent = getpwnam_ext(puser);

    perm = svr_get_privilege(puser, get_variable(pjob, (char *)"PBS_O_HOST"));

    if (pwent == NULL)
      {
      snprintf(log_buf, sizeof(log_buf),
        "User %s does not exist in server password file\n", puser);
      log_err(errno, __func__, log_buf);

      if (EMsg != NULL)
        snprintf(EMsg, 1024, "%s", log_buf);

      free(puser);

      return(PBSE_BADUSER);
      }

    if ((pwent->pw_uid == 0) || (perm & ATR_DFLAG_MGWR))
      {
      struct array_strings *pas = NULL;

      get_svr_attr_arst(SRV_ATR_AclRoot, &pas);

      /* add check here for virtual user */
      if (pjob->ji_wattr[JOB_ATR_proxy_user].at_flags & ATR_VFLAG_SET)
        {
        free(puser);
        free_puser = FALSE;

        puser = pjob->ji_wattr[JOB_ATR_proxy_user].at_val.at_str;
        pwent = getpwnam_ext(puser);

        if (pwent == NULL)
          {
          snprintf(log_buf, sizeof(log_buf),
            "User %s does not exist in server password file\n", puser);
          log_err(errno, __func__, log_buf);

          if (EMsg != NULL)
            snprintf(EMsg, 1024, "%s", log_buf);

          return(PBSE_BADUSER);
          }

        /* set the job's owner as the new user */
        at = strchr(pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str, '@');
        len = strlen(puser) + 1;

        if (at != NULL)
          {
          len += strlen(at);
          usr_at_host = (char *)calloc(len, sizeof(char));
          snprintf(usr_at_host, len, "%s%s", puser, at);
          }
        else
          {
          usr_at_host = (char *)calloc(len, sizeof(char));
          snprintf(usr_at_host, len, "%s", puser);
          }

        free(pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str);
        pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str = usr_at_host;
        }
      else if (get_svr_attr_arst(SRV_ATR_AclRoot, &pas) == PBSE_NONE)
        {
        if (acl_check_my_array_string(pas,
              pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str, ACL_User) == 0)
          {
          if (EMsg != NULL)
            snprintf(EMsg, 1024, "root user %s fails ACL check", puser);

          if (free_puser == TRUE)
            free(puser);

          return(PBSE_BADUSER); /* root not allowed */
          }
        }
      else if (pwent->pw_uid == 0)
        {
        if (EMsg != NULL)
          snprintf(EMsg, 1024, "root user %s not allowed", puser);

        if (free_puser == TRUE)
          free(puser);

        return(PBSE_BADUSER); /* root not allowed */
        }
      }  /* END if ((pwent->pw_uid == 0) || (perm & ATR_DFLAG_MGWR)) */
    else if (pjob->ji_wattr[JOB_ATR_proxy_user].at_flags & ATR_VFLAG_SET)
      {
      /* cannot submit a proxy job if not root or a manager */
      if (EMsg != NULL)
        {
        snprintf(EMsg, 1024,
          "User '%s' is attempting to submit a proxy job for user '%s' but is not a manager",
          puser, pjob->ji_wattr[JOB_ATR_proxy_user].at_val.at_str);
        }

      snprintf(log_buf, sizeof(log_buf),
        "User '%s' is attempting to submit a proxy job for user '%s' but is not a manager",
        puser, pjob->ji_wattr[JOB_ATR_proxy_user].at_val.at_str);
      log_err(PBSE_BADUSER, __func__, log_buf);

      if (free_puser == TRUE)
        free(puser);

      return(PBSE_BADUSER);
      }

    if (site_check_user_map(pjob, puser, EMsg, LOGLEVEL) == -1)
      {
      if (free_puser == TRUE)
        free(puser);

      return(PBSE_BADUSER);
      }

    snprintf(tmpLine, sizeof(tmpLine), "%s", puser);
    }  /* END else (CheckID != 0) */

  pattr = attrry + JOB_ATR_euser;

  job_attr_def[JOB_ATR_euser].at_free(pattr);

  job_attr_def[JOB_ATR_euser].at_decode(pattr, NULL, NULL, tmpLine, 0);

#ifdef _CRAY

  /* on cray check UDB (user data base) for permission to batch it */

  if ((pwent != NULL) && (puser != NULL))
    {
    pudb = getudbuid(pwent->pw_uid);

    endudb();

    if (pudb == UDB_NULL)
      {
      if (EMsg != NULL)
        snprintf(EMsg, 1024, "user %s not located in user data base", puser);

      if (free_puser == TRUE)
        free(puser);

      return(PBSE_BADUSER);
      }

    if (pudb->ue_permbits & (PERMBITS_NOBATCH | PERMBITS_RESTRICTED))
      {
      if (free_puser == TRUE)
        free(puser);

      return(PBSE_QACESS);
      }

    /* if account (qsub -A) not specified, set default from UDB */

    pattr = attrry + JOB_ATR_account;

    if ((pattr->at_flags & ATR_VFLAG_SET) == 0)
      {
      job_attr_def[JOB_ATR_account].at_decode(
        pattr, NULL, NULL, (char *)acid2nam(pudb->ue_acids[0]));
      }
    }  /* END if ((pwent != NULL) && (puser != NULL)) */

#endif /* _CRAY */

  /*
   * now figure out the group name under which the job should execute
   * PBS requires that each group have an entry in the group file,
   * see the admin guide for the reason why...
   *
   * use the passed group_list if set, may be a newly modified one
   * if not set, fall back to the job's actual group_list, may be same
   */

  if ((attrry + JOB_ATR_grouplst)->at_flags & ATR_VFLAG_SET)
    pattr = attrry + JOB_ATR_grouplst;
  else
    pattr = &pjob->ji_wattr[JOB_ATR_grouplst];

  /* extract user-specified egroup if it exists */

  pgrpn = getegroup(pjob, pattr);

  if (pgrpn == NULL)
    {
    free_pgrpn = FALSE;

    if ((pwent != NULL) || ((pwent = getpwnam_ext(puser)) != NULL))
      {
      /* egroup not specified - use user login group */

      gpent = getgrgid(pwent->pw_gid);

      if (gpent != NULL)
        {
        pgrpn = gpent->gr_name; /* use group name */
        }
      else
        {
        sprintf(gname, "%ld", (long)pwent->pw_gid);
        pgrpn = gname; /* turn gid into string */
        }
      }
    else if (CheckID == 0)
      {
      strcpy(gname, "???");
      pgrpn = gname;
      }
    else
      {
      log_err(errno, __func__, (char *)"getpwnam failed");

      if (EMsg != NULL)
        snprintf(EMsg, 1024, "user does not exist in server password file");

      if (free_puser == TRUE)
        free(puser);

      return(PBSE_BADUSER);
      }

    /*
     * setting the DEFAULT flag is a "kludgy" way to keep MOM from
     * having to do an unneeded look up of the group file.
     * We needed to have JOB_ATR_egroup set for the server but
     * MOM only wants it if it is not the login group, so there!
     */

    addflags = ATR_VFLAG_DEFLT;
    }  /* END if (pgrpn == NULL) */
  else if (CheckID == 0)
    {
    /* egroup specified - do not validate group within server */
    /* NO-OP */
    }
  else
    {
    /* user specified a group, group must exist and either */
    /* must be user's primary group or the user must be in it */

    gpent = getgrnam_ext(pgrpn);

    if (gpent == NULL)
      {
      /* NOTE: this block is only reached when CheckID != 0, so the old
       * CheckID == 0 branch here was unreachable (and would have freed
       * a stack buffer); it has been dropped */

      if (EMsg != NULL)
        snprintf(EMsg, 1024, "cannot locate group %s in server group file", pgrpn);

      if (free_puser == TRUE)
        free(puser);

      free(pgrpn);

      return(PBSE_BADGRP); /* no such group */
      }

    if (gpent->gr_gid != pwent->pw_gid)
      {
      /* not the primary group */
      pmem = gpent->gr_mem;

      while (*pmem != NULL)
        {
        if (!strcmp(puser, *pmem))
          break;

        ++pmem;
        }

      if (*pmem == NULL)
        {
        /* requested group is not allowed */
        snprintf(log_buf, sizeof(log_buf),
          "user %s is not a member of group %s in server password file",
          puser, pgrpn);
        log_err(-1, __func__, log_buf);

        if (EMsg != NULL)
          snprintf(EMsg, 1024, "%s", log_buf);

        if (free_puser == TRUE)
          free(puser);

        free(pgrpn);

        return(PBSE_BADGRP); /* user not in group */
        }
      }
    }  /* END else (validate the specified group) */

  /* set new group */
  pattr = attrry + JOB_ATR_egroup;

  job_attr_def[JOB_ATR_egroup].at_free(pattr);

  job_attr_def[JOB_ATR_egroup].at_decode(pattr, NULL, NULL, pgrpn, 0);

  pattr->at_flags |= addflags;

  /* SUCCESS */

  if (free_puser == TRUE)
    free(puser);

  if (free_pgrpn == TRUE)
    free(pgrpn);

  return(0);
  }  /* END set_jobexid() */
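/*
 * Execution group resolution order implemented above, summarized: an
 * explicitly requested egroup wins (validated against the group file
 * unless server id checking is disabled); otherwise the submitter's
 * login group is used, by name when the gid resolves and as a numeric
 * string when it does not; "???" appears only when id checking is
 * disabled and the user is unknown.
 */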
int execute_job_delete(

  job                  *pjob, /* M */
  char                 *Msg,  /* I */
  struct batch_request *preq) /* I */

  {
  struct work_task *pwtnew;

  int               rc;
  char             *sigt = "SIGTERM";

  int               has_mutex = TRUE;
  char              log_buf[LOCAL_LOG_BUF_SIZE];
  time_t            time_now = time(NULL);
  long              force_cancel = FALSE;
  long              array_compatible = FALSE;

  chk_job_req_permissions(&pjob, preq);

  if (pjob == NULL)
    {
    /* preq is rejected in chk_job_req_permissions here */
    return(-1);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /* see note in req_delete - not sure this is possible still,
     * but the deleted code is irrelevant now. I will leave this
     * part --dbeer */
    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

    return(-1);
    }

  if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN1 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN2 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN3)
    {
    /* If JOB_SUBSTATE_PRERUN is being sent to MOM, wait till she gets it going */
    /* retry in one second */
    /* If JOB_SUBSTATE_RERUN, RERUN1, RERUN2 or RERUN3 the job is being requeued.
       Wait until finished */

    static time_t cycle_check_when = 0;
    static char   cycle_check_jid[PBS_MAXSVRJOBID + 1];

    if (cycle_check_when != 0)
      {
      if (!strcmp(pjob->ji_qs.ji_jobid, cycle_check_jid) &&
          (time_now - cycle_check_when > 10))
        {
        /* state not updated after 10 seconds */

        /* did the mom ever get it? delete it anyways... */
        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;

        goto jump;
        }

      if (time_now - cycle_check_when > 20)
        {
        /* give up after 20 seconds */
        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;
        }
      }  /* END if (cycle_check_when != 0) */

    if (cycle_check_when == 0)
      {
      /* new PRERUN job located */
      cycle_check_when = time_now;
      strcpy(cycle_check_jid, pjob->ji_qs.ji_jobid);
      }

    sprintf(log_buf, "job cannot be deleted, state=PRERUN, requeuing delete request");

    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

    pwtnew = set_task(WORK_Timed, time_now + 1, post_delete_route, preq, FALSE);

    unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);

    if (pwtnew == NULL)
      {
      req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

      return(-1);
      }
    else
      {
      return(ROUTE_DELETE);
      }
    }  /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */

jump:

  /*
   * Log the delete and, if the requesting client is not the job owner, send mail.
   */

  sprintf(log_buf, "requestor=%s@%s", preq->rq_user, preq->rq_host);

  /* NOTE: should annotate accounting record with extend message (NYI) */
  account_record(PBS_ACCT_DEL, pjob, log_buf);

  sprintf(log_buf, msg_manager, msg_deletejob, preq->rq_user, preq->rq_host);

  log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

  /* NOTE: should incorporate job delete message */

  if (Msg != NULL)
    {
    /* have text message in request extension, add it */
    strcat(log_buf, "\n");
    strcat(log_buf, Msg);
    }

  if ((svr_chk_owner(preq, pjob) != 0) &&
      (pjob->ji_has_delete_nanny == FALSE))
    {
    /* only send email if the owner did not delete the job and a job
     * delete has not previously been attempted */
    svr_mailowner(pjob, MAIL_DEL, MAIL_FORCE, log_buf);

    /*
     * If we sent mail and already sent the extra message
     * then reset message so we don't trigger a redundant email
     * in job_abt()
     */

    if (Msg != NULL)
      {
      Msg = NULL;
      }
    }

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, change restart comment if failed */
    change_restart_comment_if_needed(pjob);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * setup a nanny task to make sure the job is actually deleted (see the
     * comments at job_delete_nanny()).
     */

    if (pjob->ji_has_delete_nanny == TRUE)
      {
      unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

      req_reject(PBSE_IVALREQ, 0, preq, NULL, "job cancel in progress");

      return(-1);
      }

    apply_job_delete_nanny(pjob, time_now + 60);

    /*
     * Send the signal request to MOM. The server will automagically
     * pick up and "finish" off the client request when MOM replies.
     */
    get_batch_request_id(preq);

    if ((rc = issue_signal(&pjob, sigt, post_delete_mom1, strdup(preq->rq_id))))
      {
      /* can't send to MOM */

      req_reject(rc, 0, preq, NULL, NULL);
      }

    /* normally will ack reply when mom responds */
    if (pjob != NULL)
      {
      sprintf(log_buf, msg_delrunjobsig, sigt);

      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

      unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
      }

    return(-1);
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* make a cleanup task if set */
  get_svr_attr_l(SRV_ATR_JobForceCancelTime, &force_cancel);

  if (force_cancel > 0)
    {
    char *dup_jobid = strdup(pjob->ji_qs.ji_jobid);

    set_task(WORK_Timed, time_now + force_cancel, ensure_deleted, dup_jobid, FALSE);
    }

  /* if configured, and this job didn't have a slot limit hold, free a job
   * held with the slot limit hold */
  get_svr_attr_l(SRV_ATR_MoabArrayCompatible, &array_compatible);

  if ((array_compatible != FALSE) &&
      ((pjob->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) == FALSE))
    {
    if ((pjob->ji_arraystruct != NULL) &&
        (pjob->ji_is_array_template == FALSE))
      {
      int        i;
      int        newstate;
      int        newsub;
      job       *tmp;
      job_array *pa = get_jobs_array(&pjob);

      if (pjob == NULL)
        return(-1);

      for (i = 0; i < pa->ai_qs.array_size; i++)
        {
        if (pa->job_ids[i] == NULL)
          continue;

        if (!strcmp(pa->job_ids[i], pjob->ji_qs.ji_jobid))
          continue;

        if ((tmp = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
          {
          free(pa->job_ids[i]);
          pa->job_ids[i] = NULL;
          }
        else
          {
          if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l)
            {
            tmp->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;

            if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
              {
              tmp->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
              }

            svr_evaljobstate(tmp, &newstate, &newsub, 1);
            svr_setjobstate(tmp, newstate, newsub, FALSE);
            job_save(tmp, SAVEJOB_FULL, 0);

            unlock_ji_mutex(tmp, __func__, "5", LOGLEVEL);

            break;
            }

          unlock_ji_mutex(tmp, __func__, "6", LOGLEVEL);
          }
        }

      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "%s: unlocking ai_mutex", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }

      pthread_mutex_unlock(pa->ai_mutex);
      }
    }  /* END MoabArrayCompatible check */

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, do end job processing */
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE);

    /* force new connection */
    pjob->ji_momhandle = -1;

    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "calling on_job_exit from %s", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      }

    set_task(WORK_Immed, 0, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in files, should remove them */
    remove_stagein(&pjob);

    if (pjob != NULL)
      job_abt(&pjob, Msg);

    has_mutex = FALSE;
    }
  else
    {
    /*
     * the job is not transiting (though it may have been) and
     * is not running, so put it into a complete state.
     */
    struct pbs_queue *pque;

    int  KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE, FALSE);

    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      pque->qu_numcompleted++;

      unlock_queue(pque, __func__, NULL, LOGLEVEL);

      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "calling on_job_exit from %s", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }

      pthread_mutex_lock(server.sv_attr_mutex);
      KeepSeconds = attr_ifelse_long(
          &pque->qu_attr[QE_ATR_KeepCompleted],
          &server.sv_attr[SRV_ATR_KeepCompleted],
          0);
      pthread_mutex_unlock(server.sv_attr_mutex);
      }
    else
      KeepSeconds = 0;

    if (pjob != NULL)
      {
      set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit,
        strdup(pjob->ji_qs.ji_jobid), FALSE);
      }
    else
      has_mutex = FALSE;
    }  /* END else (complete the job) */

  if (has_mutex == TRUE)
    unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL);

  return(PBSE_NONE);
  }  /* END execute_job_delete() */
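/*
 * Sketch of the keep_completed fallback used above (illustrative only;
 * the real attr_ifelse_long() operates on pbs_attribute structures):
 * the queue's keep_completed value wins, then the server's, then the
 * supplied default of 0.
 */

static long keep_seconds_fallback(

  int  queue_set,  /* is the queue attribute set? */
  long queue_val,
  int  server_set, /* is the server attribute set? */
  long server_val)

  {
  if (queue_set)
    return(queue_val);

  if (server_set)
    return(server_val);

  return(0);
  }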
int process_alps_status(

  const char               *nd_name,
  std::vector<std::string> &status_info)

  {
  const char     *ccu_p = NULL;
  char           *current_node_id = NULL;
  struct pbsnode *parent;
  struct pbsnode *current = NULL;

#ifdef PENABLE_LINUX_CGROUPS
  int             numa_nodes = 0;
  int             sockets = 0;
#endif

  std::string     temp;

  container::item_container<const char *> rsv_ht;
  char            log_buf[LOCAL_LOG_BUF_SIZE];

  /* if we can't find the parent node, ignore the update */
  if ((parent = find_nodebyname(nd_name)) == NULL)
    return(PBSE_NONE);

  /* loop over each string */
  for (unsigned int i = 0; i < status_info.size(); i++)
    {
    const char *str = status_info[i].c_str();

    if (!strncmp(str, "node=", strlen("node=")))
      {
      if (i != 0)
        {
        if (current != NULL)
          save_node_status(current, temp);

        temp.clear();
        }

      if ((current = determine_node_from_str(str, parent, current)) == NULL)
        break;
      else
        {
#ifdef PENABLE_LINUX_CGROUPS
        sockets = 0;
        numa_nodes = 0;
#endif
        continue;
        }
      }

    if (current == NULL)
      continue;

    /* process the gpu status information separately */
    if (!strcmp(CRAY_GPU_STATUS_START, str))
      {
      process_gpu_status(current, i, status_info);

      continue;
      }
    else if (!strncmp(reservation_id, str, strlen(reservation_id)))
      {
      const char *just_rsv_id = str + strlen(reservation_id);

      rsv_ht.lock();

      if (rsv_ht.find(just_rsv_id) == NULL)
        {
        rsv_ht.insert(just_rsv_id, just_rsv_id);
        rsv_ht.unlock();

        /* sub-functions will attempt to lock a job, so we must unlock the
         * reporter node */
        parent->unlock_node(__func__, NULL, LOGLEVEL);

        process_reservation_id(current, str);

        current_node_id = strdup(current->get_name());
        current->unlock_node(__func__, NULL, LOGLEVEL);

        /* re-lock the parent */
        if ((parent = find_nodebyname(nd_name)) == NULL)
          {
          /* reporter node disappeared - this shouldn't be possible */
          log_err(PBSE_UNKNODE, __func__,
            "Alps reporter node disappeared while recording a reservation");

          free(current_node_id);

          return(PBSE_NONE);
          }

        if ((current = find_node_in_allnodes(parent->alps_subnodes, current_node_id)) == NULL)
          {
          /* current node disappeared, this shouldn't be possible either */
          parent->unlock_node(__func__, NULL, LOGLEVEL);

          snprintf(log_buf, sizeof(log_buf),
            "Current node '%s' disappeared while recording a reservation",
            current_node_id);
          log_err(PBSE_UNKNODE, __func__, log_buf);

          free(current_node_id);

          return(PBSE_NONE);
          }

        free(current_node_id);
        current_node_id = NULL;
        }
      else
        {
        rsv_ht.unlock();
        }
      }
    /* save this as-is to the status strings */
    else
      {
      if (temp.size() > 0)
        temp += ",";

      temp += str;
      }

    /* perform any special processing */
    if (!strncmp(str, ccu_eq, ac_ccu_eq_len))
      {
      /* save the compute unit count in case we need it */
      /* note: this string (ccu_eq (CCU=)) needs to be found before
       * cproc_eq (CPROCS=) for the node */
      ccu_p = str;
      }
    else if (!strncmp(str, cproc_eq, ac_cproc_eq_len))
      {
      int  ncpus;
      long svr_nppcu_value = 0;

      /*
       * Get the server nppcu value, which determines how Hyper-Threaded
       * cores are reported.  When the server nppcu value is:
       *
       *   0 - Let ALPS choose whether or not to use Hyper-Threaded cores
       *       (report all cores)
       *   1 - Do not use Hyper-Threaded cores
       *       (report only physical cores, i.e. the compute unit count)
       *   2 - Use Hyper-Threaded cores
       *       (report all cores)
       */
      get_svr_attr_l(SRV_ATR_nppcu, &svr_nppcu_value);

      if ((svr_nppcu_value == NPPCU_NO_USE_HT) && (ccu_p != NULL))
        {
        /* no HT (nppcu==1), so use the compute unit count */
        ncpus = atoi(ccu_p + ac_ccu_eq_len);

        /* use the CPROC value if we are using APBASIL protocol < 1.3 */
        if (ncpus == 0)
          ncpus = atoi(str + ac_cproc_eq_len);

        /* reset the pointer */
        ccu_p = NULL;
        }
      else
        {
        /* let ALPS choose (nppcu==0) or use HT (nppcu==2):
         * use the actual processor count */
        ncpus = atoi(str + ac_cproc_eq_len);
        }

      set_ncpus(current, parent, ncpus);

#ifdef PENABLE_LINUX_CGROUPS
      if (numa_nodes == 0)
        numa_nodes = 1;

      if ((current->nd_layout.is_initialized() == false) ||
          (current->nd_layout.getTotalThreads() != current->nd_slots.get_total_execution_slots()))
        {
        Machine m(current->nd_slots.get_total_execution_slots(), numa_nodes, sockets);
        current->nd_layout = m;
        }
#endif
      }
    else if (!strncmp(str, state, strlen(state)))
      {
      set_state(current, str);
      }
#ifdef PENABLE_LINUX_CGROUPS
    else if (!strncmp(str, "totmem", 6))
      {
      set_total_memory(current, str);
      }
    else if (!strncmp(str, numas, strlen(numas)))
      {
      /* skip past "numa_nodes=" (11 characters) */
      numa_nodes = strtol(str + 11, NULL, 10);
      }
    else if (!strncmp(str, "socket", 6))
      {
      /* skip past "socket=" (7 characters) */
      sockets = strtol(str + 7, NULL, 10);
      }
#endif
    }  /* END processing the status update */

  if (current != NULL)
    {
    save_node_status(current, temp);
    current->unlock_node(__func__, NULL, LOGLEVEL);
    }

  parent->unlock_node(__func__, NULL, LOGLEVEL);

  return(PBSE_NONE);
  }  /* END process_alps_status() */
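/*
 * Illustrative sketch, not part of the server source: the nppcu
 * decision above reduced to a pure function (NPPCU_NO_USE_HT == 1 per
 * the comment; the CCU/CPROC strings are hypothetical sample values).
 */

#include <stdio.h>
#include <stdlib.h>

static int choose_ncpus(long nppcu, const char *ccu_val, const char *cproc_val)

  {
  int ncpus;

  if ((nppcu == 1) && (ccu_val != NULL))
    {
    ncpus = atoi(ccu_val);      /* physical compute units only */

    if (ncpus == 0)
      ncpus = atoi(cproc_val);  /* old APBASIL: fall back to CPROC */
    }
  else
    ncpus = atoi(cproc_val);    /* all (hyper-threaded) cores */

  return(ncpus);
  }

int main(void)

  {
  printf("%d\n", choose_ncpus(1, "12", "24")); /* prints 12 */
  printf("%d\n", choose_ncpus(0, "12", "24")); /* prints 24 */

  return(0);
  }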