/*
 * queue_cpy_del - hand the whole content of `from` over to `to`.
 *
 * Both queues are locked for the duration of the transfer (`from` first,
 * then `to`) and released in the reverse order.  Afterwards `to` carries the
 * head, tail and length that `from` had, and `from` is reset to the empty
 * state (QUEUE_HEAD_NULL / QUEUE_TAIL_NULL, length 0).  Whatever `to`
 * referenced before the call is simply overwritten.
 *
 * Always returns 0.
 */
int queue_cpy_del(struct queue_head *from, struct queue_head *to)
{
    lock_queue(from);
    lock_queue(to);

    /* Destination adopts the source's chain and element count. */
    to->head      = from->head;
    to->tail      = from->tail;
    to->curLength = from->curLength;

    /* Source becomes an empty queue. */
    from->head      = (queue_body_t *) QUEUE_HEAD_NULL;
    from->tail      = (queue_body_t *) QUEUE_TAIL_NULL;
    from->curLength = 0;

    unlock_queue(to);
    unlock_queue(from);

    return 0;
}
void gpu_worker(void *arg) { hs_worker *worker_arg = (hs_worker *) arg; bind_to_cpu(worker_arg); init_cuda(worker_arg->device_id); // printf("GPU I am id:%d\n", worker_arg->worker_id); pthread_mutex_lock(&worker_arg->mutex); worker_arg->initialized = 1; pthread_cond_signal(&worker_arg->ready); pthread_mutex_unlock(&worker_arg->mutex); _task_t task; while (is_running()) { lock_queue(worker_arg->task_queue); task = pop_task(worker_arg->task_queue); if (task == NULL) { if (is_running()) sleep_worker(worker_arg); unlock_queue(worker_arg->task_queue); continue; } unlock_queue(worker_arg->task_queue); if ((task->task->arch_type & worker_arg->arch) != worker_arg->arch) { push_task(worker_arg->task_queue, task); continue; } execute_task(worker_arg, task); } deinit_cuda(); pthread_exit((void*) 0); }
static work_q_item_t * create_work_item(void) { work_q_item_t *item; /* Try to reuse item from free item queue */ lock_queue(&g_free_q); if ((item = (work_q_item_t *)sq_remfirst(&(g_free_q.q)))) { g_free_q.size--; } unlock_queue(&g_free_q); /* If we there weren't any free items then obtain memory for a new ones */ if (item == NULL) { item = (work_q_item_t *)malloc(k_work_item_allocation_chunk_size * sizeof(work_q_item_t)); if (item) { item->first = 1; lock_queue(&g_free_q); for (size_t i = 1; i < k_work_item_allocation_chunk_size; i++) { (item + i)->first = 0; sq_addfirst(&(item + i)->link, &(g_free_q.q)); } /* Update the queue size and potentially the maximum queue size */ g_free_q.size += k_work_item_allocation_chunk_size - 1; if (g_free_q.size > g_free_q.max_size) { g_free_q.max_size = g_free_q.size; } unlock_queue(&g_free_q); } } /* If we got one then lock the item*/ if (item) { px4_sem_init(&item->wait_sem, 1, 0); /* Caller will wait on this... initially locked */ } /* return the item pointer, or NULL if all failed */ return item; }
/*
 * save_queues - walk every queue in svr_queues and persist it with
 * que_save().
 *
 * Each queue handed out by next_queue() is unlocked again once it has been
 * saved (next_queue() presumably returns it locked -- verify against its
 * definition).
 */
void save_queues()
  {
  struct pbs_queue *pque;
  int               iter = -1;   /* -1 starts the walk at the beginning */

  for (pque = next_queue(&svr_queues, &iter);
       pque != NULL;
       pque = next_queue(&svr_queues, &iter))
    {
    que_save(pque);
    unlock_queue(pque, __func__, NULL, 0);
    }
  } /* END save_queues() */
pbs_queue *find_queuebyname( char *quename) /* I */ { char *pc; pbs_queue *pque = NULL; char qname[PBS_MAXDEST + 1]; char log_buf[LOCAL_LOG_BUF_SIZE+1]; int i; snprintf(qname, sizeof(qname), "%s", quename); if (LOGLEVEL >= 7) { sprintf(log_buf, "%s", quename); LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf); } pc = strchr(qname, (int)'@'); /* strip off server (fragment) */ if (pc != NULL) *pc = '\0'; lock_allques_mutex(&svr_queues, __func__, NULL, LOGLEVEL); i = get_value_hash(svr_queues.ht,qname); if (i >= 0) { pque = svr_queues.ra->slots[i].item; } if (pque != NULL) lock_queue(pque, __func__, NULL, LOGLEVEL); unlock_allques_mutex(&svr_queues, __func__, NULL, LOGLEVEL); if (pque != NULL) { if (pque->q_being_recycled != FALSE) { unlock_queue(pque, __func__, "recycled queue", LOGLEVEL); pque = NULL; } } if (pc != NULL) *pc = '@'; /* restore '@' server portion */ return(pque); } /* END find_queuebyname() */
/*
 * destroy_work_item - retire a work item.
 *
 * The item's wait semaphore is destroyed and the item is pushed onto the
 * free-item queue for later reuse; the queue's size and high-water mark are
 * updated under the queue lock.
 *
 * NOTE(review): create_work_item() initialises wait_sem with px4_sem_init()
 * while this uses plain sem_destroy() -- confirm the two are interchangeable
 * on every target.
 */
static inline void
destroy_work_item(work_q_item_t *item)
{
	sem_destroy(&item->wait_sem);

	lock_queue(&g_free_q);

	sq_addfirst(&item->link, &(g_free_q.q));

	/* One more free item; remember the largest size ever reached. */
	g_free_q.size++;

	if (g_free_q.max_size < g_free_q.size) {
		g_free_q.max_size = g_free_q.size;
	}

	unlock_queue(&g_free_q);
}
/*
 * dequeue_work_item - detach and return the first item of the work queue.
 *
 * The removal and the size bookkeeping happen under the work-queue lock.
 * Returns NULL when sq_remfirst() yields nothing.
 */
static inline work_q_item_t *
dequeue_work_item(void)
{
	lock_queue(&g_work_q);

	work_q_item_t *first = (work_q_item_t *)sq_remfirst(&g_work_q.q);

	if (first != NULL) {
		g_work_q.size--;
	}

	unlock_queue(&g_work_q);

	return first;
}
pbs_queue *find_queuebyname( char *quename) /* I */ { char *pc; pbs_queue *pque = NULL; char qname[PBS_MAXDEST + 1]; int i; snprintf(qname, sizeof(qname), "%s", quename); pc = strchr(qname, (int)'@'); /* strip off server (fragment) */ if (pc != NULL) *pc = '\0'; pthread_mutex_lock(svr_queues.allques_mutex); i = get_value_hash(svr_queues.ht,qname); if (i >= 0) { pque = svr_queues.ra->slots[i].item; } if (pque != NULL) lock_queue(pque, __func__, NULL, LOGLEVEL); pthread_mutex_unlock(svr_queues.allques_mutex); if (pque != NULL) { if (pque->q_being_recycled != FALSE) { unlock_queue(pque, __func__, "recycled queue", LOGLEVEL); pque = NULL; } } if (pc != NULL) *pc = '@'; /* restore '@' server portion */ return(pque); } /* END find_queuebyname() */
/*
 * que_free - release a queue's attribute storage and retire the queue.
 *
 * For every queue attribute the definition's at_free() is invoked; for ACL
 * attributes the (now freed) attribute is additionally flagged MODIFY and
 * passed to save_acl().  The server's queue count is decremented, the
 * user-info holder is freed, and the queue is removed from svr_queues,
 * marked q_being_recycled, handed to the queue recycler and finally
 * unlocked.
 *
 * @param pq                queue to retire; it is unlocked before returning
 * @param sv_qs_mutex_held  FALSE if this function must take
 *                          server.sv_qs_mutex itself around the counter
 *                          update, anything else if the caller holds it
 */
void que_free(

  pbs_queue *pq,
  int        sv_qs_mutex_held)

  {
  int idx;

  /* remove any calloc working pbs_attribute space */
  for (idx = 0; idx < QA_ATR_LAST; idx++)
    {
    attribute_def *def  = &que_attr_def[idx];
    pbs_attribute *attr = &pq->qu_attr[idx];

    def->at_free(attr);

    if (def->at_type != ATR_TYPE_ACL)
      continue;

    /* remove any acl lists associated with the queue */
    attr->at_flags |= ATR_VFLAG_MODIFY;

    save_acl(attr, def, def->at_name, pq->qu_qs.qu_name);
    }

  /* one queue less on the server; lock only if the caller has not done so */
  if (sv_qs_mutex_held == FALSE)
    {
    lock_sv_qs_mutex(server.sv_qs_mutex, __func__);
    server.sv_qs.sv_numque--;
    unlock_sv_qs_mutex(server.sv_qs_mutex, __func__);
    }
  else
    {
    server.sv_qs.sv_numque--;
    }

  free_user_info_holder(pq->qu_uih);

  /* now retire the main structure */
  remove_queue(&svr_queues, pq);
  pq->q_being_recycled = TRUE;
  insert_into_queue_recycler(pq);

  unlock_queue(pq, "que_free", NULL, LOGLEVEL);

  return;
  } /* END que_free() */
/*
 * lock_queue_with_job_held - lock a queue while the caller holds a job.
 *
 * A non-blocking trylock on the queue mutex is attempted first.  If it
 * fails, the job's mutex is released, the queue is locked with a blocking
 * lock_queue(), and the job is looked up again by id with svr_find_job().
 * When that second lookup fails the queue is unlocked again, *pjob_ptr is
 * set to NULL and NULL is returned.
 *
 * @param pque      queue to lock; NULL is passed straight through
 * @param pjob_ptr  in: the job currently held; out: NULL if the job
 *                  disappeared while its mutex was released
 * @return pque with its mutex held, or NULL
 */
pbs_queue *lock_queue_with_job_held(

  pbs_queue  *pque,
  job       **pjob_ptr)

  {
  char  jobid[PBS_MAXSVRJOBID + 1];
  job  *pjob = *pjob_ptr;
  char  log_buf[LOCAL_LOG_BUF_SIZE];

  if (pque == NULL)
    return(pque);

  if (pthread_mutex_trylock(pque->qu_mutex) == 0)
    {
    /* fast path: got the queue without ever letting go of the job */
    if (LOGLEVEL >= 10)
      {
      snprintf(log_buf, sizeof(log_buf), "try lock succeeded for queue %s on job %s", pque->qu_qs.qu_name, pjob->ji_qs.ji_jobid);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
      }

    return(pque);
    }

  /* slow path: remember the job id, give the job up, then wait for the queue */
  strcpy(jobid, pjob->ji_qs.ji_jobid);
  unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
  lock_queue(pque, __func__, NULL, LOGLEVEL);

  /* the job may have gone away while its mutex was released */
  if (svr_find_job(jobid, TRUE) == NULL)
    {
    unlock_queue(pque, __func__, NULL, LOGLEVEL);
    *pjob_ptr = NULL;

    return(NULL);
    }

  return(pque);
  } /* END lock_queue_with_job_held() */
/** * attempt_delete() * deletes a job differently depending on the job's state * * @return TRUE if the job was deleted, FALSE if skipped * @param pjob - a pointer to the job being handled */ int attempt_delete( void *j) /* I */ { int skipped = FALSE; int release_mutex = TRUE; job *pjob; time_t time_now = time(NULL); char log_buf[LOCAL_LOG_BUF_SIZE]; /* job considered deleted if null */ if (j == NULL) return(TRUE); pjob = (job *)j; if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT) { /* I'm not sure if this is still possible since the thread * waits on the job to finish transmiting, but I'll leave * this part here --dbeer */ skipped = TRUE; return(!skipped); } /* END if (pjob->ji_qs.ji_state == JOB_SUBSTATE_TRANSIT) */ else if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) { /* we'll wait for the mom to get this job, then delete it */ skipped = TRUE; } /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */ else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) { /* set up nanny */ if (pjob->ji_has_delete_nanny == FALSE) { apply_job_delete_nanny(pjob, time_now + 60); /* need to issue a signal to the mom, but we don't want to sent an ack to the * client when the mom replies */ issue_signal(&pjob, "SIGTERM", post_delete, NULL); } if (pjob != NULL) { if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0) { /* job has restart file at mom, change restart comment if failed */ change_restart_comment_if_needed(pjob); } unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); } return(!skipped); } /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */ if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0) { /* job has restart file at mom, change restart comment if failed */ change_restart_comment_if_needed(pjob); /* job has restart file at mom, do end job processing */ svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE); pjob->ji_momhandle = -1; /* force new connection */ if (LOGLEVEL >= 7) { sprintf(log_buf, "calling on_job_exit from %s", 
__func__); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf); } set_task(WORK_Immed, 0, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE); } else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0) { /* job has staged-in file, should remove them */ remove_stagein(&pjob); if (pjob != NULL) job_abt(&pjob, NULL); release_mutex = FALSE; } else { /* * the job is not transitting (though it may have been) and * is not running, so put in into a complete state. */ struct pbs_queue *pque; int KeepSeconds = 0; svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE, FALSE); if ((pque = get_jobs_queue(&pjob)) != NULL) { pque->qu_numcompleted++; unlock_queue(pque, __func__, NULL, LOGLEVEL); } if (pjob != NULL) { pthread_mutex_lock(server.sv_attr_mutex); KeepSeconds = attr_ifelse_long( &pque->qu_attr[QE_ATR_KeepCompleted], &server.sv_attr[SRV_ATR_KeepCompleted], 0); pthread_mutex_unlock(server.sv_attr_mutex); if (LOGLEVEL >= 7) { sprintf(log_buf, "calling on_job_exit from %s", __func__); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf); } set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE); } else release_mutex = FALSE; } if (release_mutex == TRUE) unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL); return(!skipped); } /* END attempt_delete() */
int modify_job_attr( job *pjob, /* I (modified) */ svrattrl *plist, /* I */ int perm, int *bad) /* O */ { int allow_unkn = -1; long i; pbs_attribute newattr[JOB_ATR_LAST]; pbs_attribute *pattr; int rc; char log_buf[LOCAL_LOG_BUF_SIZE]; pbs_queue *pque; if ((pque = get_jobs_queue(&pjob)) != NULL) { if (pque->qu_qs.qu_type != QTYPE_Execution) allow_unkn = JOB_ATR_UNKN; unlock_queue(pque, __func__, NULL, LOGLEVEL); } else if (pjob->ji_parent_job != NULL) { allow_unkn = JOB_ATR_UNKN; } else { log_err(PBSE_JOBNOTFOUND, __func__, "Job lost while acquiring queue 5"); return(PBSE_JOBNOTFOUND); } pattr = pjob->ji_wattr; /* call attr_atomic_set to decode and set a copy of the attributes */ rc = attr_atomic_set( plist, /* I */ pattr, /* I */ newattr, /* O */ job_attr_def, /* I */ JOB_ATR_LAST, allow_unkn, /* I */ perm, /* I */ bad); /* O */ /* if resource limits are being changed ... */ if ((rc == 0) && (newattr[JOB_ATR_resource].at_flags & ATR_VFLAG_SET)) { if ((perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0) { /* If job is running, only manager/operator can raise limits */ if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) { long lim = TRUE; int comp_resc_lt; get_svr_attr_l(SRV_ATR_QCQLimits, &lim); comp_resc_lt = comp_resc2(&pjob->ji_wattr[JOB_ATR_resource], &newattr[JOB_ATR_resource], lim, NULL, LESS); if (comp_resc_lt != 0) { rc = PBSE_PERM; } } /* Also check against queue and system limits */ if (rc == 0) { if ((pque = get_jobs_queue(&pjob)) != NULL) { rc = chk_resc_limits( &newattr[JOB_ATR_resource], pque, NULL); unlock_queue(pque, __func__, NULL, LOGLEVEL); } else if (pjob == NULL) { log_err(PBSE_JOBNOTFOUND, __func__, "Job lost while acquiring queue 6"); return(PBSE_JOBNOTFOUND); } else rc = PBSE_QUENOTAVAILABLE; } } } /* END if ((rc == 0) && ...) 
*/ /* special check on permissions for hold */ if ((rc == 0) && (newattr[JOB_ATR_hold].at_flags & ATR_VFLAG_MODIFY)) { i = newattr[JOB_ATR_hold].at_val.at_long ^ (pattr + JOB_ATR_hold)->at_val.at_long; rc = chk_hold_priv(i, perm); } if (rc == 0) { for (i = 0;i < JOB_ATR_LAST;i++) { if (newattr[i].at_flags & ATR_VFLAG_MODIFY) { if (job_attr_def[i].at_action) { rc = job_attr_def[i].at_action( &newattr[i], pjob, ATR_ACTION_ALTER); if (rc) break; } } } /* END for (i) */ if ((rc == 0) && ((newattr[JOB_ATR_userlst].at_flags & ATR_VFLAG_MODIFY) || (newattr[JOB_ATR_grouplst].at_flags & ATR_VFLAG_MODIFY))) { /* need to reset execution uid and gid */ rc = set_jobexid(pjob, newattr, NULL); } if ((rc == 0) && (newattr[JOB_ATR_outpath].at_flags & ATR_VFLAG_MODIFY)) { /* need to recheck if JOB_ATR_outpath is a special case of host only */ if (newattr[JOB_ATR_outpath].at_val.at_str[strlen(newattr[JOB_ATR_outpath].at_val.at_str) - 1] == ':') { dynamic_string *ds = get_dynamic_string(-1, NULL); newattr[JOB_ATR_outpath].at_val.at_str = prefix_std_file(pjob, ds, (int)'o'); /* don't call free_dynamic_string() */ free(ds); } /* * if the output path was specified and ends with a '/' * then append the standard file name */ else if (newattr[JOB_ATR_outpath].at_val.at_str[strlen(newattr[JOB_ATR_outpath].at_val.at_str) - 1] == '/') { dynamic_string *ds = get_dynamic_string(-1, NULL); newattr[JOB_ATR_outpath].at_val.at_str[strlen(newattr[JOB_ATR_outpath].at_val.at_str) - 1] = '\0'; replace_attr_string(&newattr[JOB_ATR_outpath], (add_std_filename(pjob, newattr[JOB_ATR_outpath].at_val.at_str, (int)'o', ds))); /* don't call free_dynamic_string because() we still want to use the allocated string */ free(ds); } } if ((rc == 0) && (newattr[JOB_ATR_errpath].at_flags & ATR_VFLAG_MODIFY)) { /* need to recheck if JOB_ATR_errpath is a special case of host only */ if (newattr[JOB_ATR_errpath].at_val.at_str[strlen(newattr[JOB_ATR_errpath].at_val.at_str) - 1] == ':') { dynamic_string *ds = 
get_dynamic_string(-1, NULL); newattr[JOB_ATR_errpath].at_val.at_str = prefix_std_file(pjob, ds, (int)'e'); /* don't call free_dynamic_string() */ free(ds); } /* * if the error path was specified and ends with a '/' * then append the standard file name */ else if (newattr[JOB_ATR_errpath].at_val.at_str[strlen(newattr[JOB_ATR_errpath].at_val.at_str) - 1] == '/') { dynamic_string *ds = get_dynamic_string(-1, NULL); newattr[JOB_ATR_errpath].at_val.at_str[strlen(newattr[JOB_ATR_errpath].at_val.at_str) - 1] = '\0'; replace_attr_string(&newattr[JOB_ATR_errpath], (add_std_filename(pjob, newattr[JOB_ATR_errpath].at_val.at_str,(int)'e', ds))); /* don't call free_dynamic_string() */ free(ds); } } } /* END if (rc == 0) */ if (rc != 0) { for (i = 0;i < JOB_ATR_LAST;i++) job_attr_def[i].at_free(newattr + i); /* FAILURE */ return(rc); } /* END if (rc != 0) */ /* OK, now copy the new values into the job attribute array */ for (i = 0;i < JOB_ATR_LAST;i++) { if (newattr[i].at_flags & ATR_VFLAG_MODIFY) { if (LOGLEVEL >= 7) { sprintf(log_buf, "attr %s modified", job_attr_def[i].at_name); log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf); } job_attr_def[i].at_free(pattr + i); if ((newattr[i].at_type == ATR_TYPE_LIST) || (newattr[i].at_type == ATR_TYPE_RESC)) { list_move( &newattr[i].at_val.at_list, &(pattr + i)->at_val.at_list); } else { *(pattr + i) = newattr[i]; } (pattr + i)->at_flags = newattr[i].at_flags; } } /* END for (i) */ /* note, the newattr[] attributes are on the stack, they go away automatically */ pjob->ji_modified = 1; return(0); } /* END modify_job_attr() */
static void req_stat_job_step2( struct stat_cntl *cntl) /* I/O (free'd on return) */ { svrattrl *pal; job *pjob = NULL; struct batch_request *preq; struct batch_reply *preply; int rc = 0; enum TJobStatTypeEnum type; pbs_queue *pque = NULL; int exec_only = 0; int bad = 0; long DTime; /* delta time - only report full pbs_attribute list if J->MTime > DTime */ static svrattrl *dpal = NULL; int job_array_index = 0; job_array *pa = NULL; char log_buf[LOCAL_LOG_BUF_SIZE]; int iter; time_t time_now = time(NULL); long poll_jobs = 0; char job_id[PBS_MAXSVRJOBID+1]; int job_substate = -1; time_t job_momstattime = -1; preq = cntl->sc_origrq; type = (enum TJobStatTypeEnum)cntl->sc_type; preply = &preq->rq_reply; /* See pbs_server_attributes(1B) for details on "poll_jobs" behaviour */ if (dpal == NULL) { /* build 'delta' pbs_attribute list */ svrattrl *tpal; tlist_head dalist; int aindex; int atrlist[] = { JOB_ATR_jobname, JOB_ATR_resc_used, JOB_ATR_LAST }; CLEAR_LINK(dalist); for (aindex = 0;atrlist[aindex] != JOB_ATR_LAST;aindex++) { if ((tpal = attrlist_create("", "", 23)) == NULL) { return; } tpal->al_valln = atrlist[aindex]; if (dpal == NULL) dpal = tpal; append_link(&dalist, &tpal->al_link, tpal); } } /* END if (dpal == NULL) */ if (type == tjstArray) { pa = get_array(preq->rq_ind.rq_status.rq_id); if (pa == NULL) { req_reject(PBSE_UNKARRAYID, 0, preq, NULL, "unable to find array"); return; } } iter = -1; get_svr_attr_l(SRV_ATR_PollJobs, &poll_jobs); if (!poll_jobs) { /* polljobs not set - indicates we may need to obtain fresh data from MOM */ if (cntl->sc_jobid[0] == '\0') pjob = NULL; else pjob = svr_find_job(cntl->sc_jobid, FALSE); while (1) { if (pjob == NULL) { /* start from the first job */ if (type == tjstJob) { pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE); } else if (type == tjstQueue) { pjob = next_job(cntl->sc_pque->qu_jobs,&iter); } else if (type == tjstArray) { job_array_index = 0; /* increment job_array_index until we find a non-null pointer or hit 
the end */ while (job_array_index < pa->ai_qs.array_size) { if (pa->job_ids[job_array_index] != NULL) { if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL) { unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL); break; } } job_array_index++; } } else { pjob = next_job(&alljobs,&iter); } } /* END if (pjob == NULL) */ else { strcpy(job_id, pjob->ji_qs.ji_jobid); unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL); if (type == tjstJob) break; if (type == tjstQueue) pjob = next_job(cntl->sc_pque->qu_jobs,&iter); else if (type == tjstArray) { pjob = NULL; /* increment job_array_index until we find a non-null pointer or hit the end */ while (++job_array_index < pa->ai_qs.array_size) { if (pa->job_ids[job_array_index] != NULL) { if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL) { unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL); break; } } } } else pjob = next_job(&alljobs,&iter); } if (pjob == NULL) break; strcpy(job_id, pjob->ji_qs.ji_jobid); job_substate = pjob->ji_qs.ji_substate; job_momstattime = pjob->ji_momstat; strcpy(cntl->sc_jobid, job_id); unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL); pjob = NULL; /* PBS_RESTAT_JOB defaults to 30 seconds */ if ((job_substate == JOB_SUBSTATE_RUNNING) && ((time_now - job_momstattime) > JobStatRate)) { /* go to MOM for status */ if ((rc = stat_to_mom(job_id, cntl)) == PBSE_MEM_MALLOC) break; if (rc != 0) { pjob = svr_find_job(job_id, FALSE); rc = 0; continue; } if (pa != NULL) unlock_ai_mutex(pa, __func__, "1", LOGLEVEL); return; /* will pick up after mom replies */ } } /* END while(1) */ if (rc != 0) { if (pa != NULL) unlock_ai_mutex(pa, __func__, "2", LOGLEVEL); reply_free(preply); req_reject(rc, 0, preq, NULL, "cannot get update from mom"); return; } } /* END if (!server.sv_attr[SRV_ATR_PollJobs].at_val.at_long) */ /* * now ready for part 3, building the status reply, * loop through again */ if ((type == tjstSummarizeArraysQueue) || (type == tjstSummarizeArraysServer)) { /* No array can be owned 
for these options */ update_array_statuses(); } if (type == tjstJob) pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE); else if (type == tjstQueue) pjob = next_job(cntl->sc_pque->qu_jobs,&iter); else if (type == tjstSummarizeArraysQueue) pjob = next_job(cntl->sc_pque->qu_jobs_array_sum,&iter); else if (type == tjstSummarizeArraysServer) pjob = next_job(&array_summary,&iter); else if (type == tjstArray) { job_array_index = -1; pjob = NULL; /* increment job_array_index until we find a non-null pointer or hit the end */ while (++job_array_index < pa->ai_qs.array_size) { if (pa->job_ids[job_array_index] != NULL) { if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL) { break; } } } } else pjob = next_job(&alljobs,&iter); DTime = 0; if (preq->rq_extend != NULL) { char *ptr; /* FORMAT: { EXECQONLY | DELTA:<EPOCHTIME> } */ if (strstr(preq->rq_extend, EXECQUEONLY)) exec_only = 1; ptr = strstr(preq->rq_extend, "DELTA:"); if (ptr != NULL) { ptr += strlen("delta:"); DTime = strtol(ptr, NULL, 10); } } if ((type == tjstTruncatedServer) || (type == tjstTruncatedQueue)) { long sentJobCounter; long qjcounter; long qmaxreport; int iter = -1; /* loop through all queues */ while ((pque = next_queue(&svr_queues,&iter)) != NULL) { qjcounter = 0; if ((exec_only == 1) && (pque->qu_qs.qu_type != QTYPE_Execution)) { /* ignore routing queues */ unlock_queue(pque, __func__, "ignore queue", LOGLEVEL); continue; } if (((pque->qu_attr[QA_ATR_MaxReport].at_flags & ATR_VFLAG_SET) != 0) && (pque->qu_attr[QA_ATR_MaxReport].at_val.at_long >= 0)) { qmaxreport = pque->qu_attr[QA_ATR_MaxReport].at_val.at_long; } else { qmaxreport = TMAX_JOB; } if (LOGLEVEL >= 5) { sprintf(log_buf,"giving scheduler up to %ld idle jobs in queue %s\n", qmaxreport, pque->qu_qs.qu_name); log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_QUEUE,pque->qu_qs.qu_name,log_buf); } sentJobCounter = 0; /* loop through jobs in queue */ if (pjob != NULL) unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL); iter = -1; while 
((pjob = next_job(pque->qu_jobs,&iter)) != NULL) { if ((qjcounter >= qmaxreport) && (pjob->ji_qs.ji_state == JOB_STATE_QUEUED)) { /* max_report of queued jobs reached for queue */ unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL); continue; } pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr); rc = status_job( pjob, preq, (pjob->ji_wattr[JOB_ATR_mtime].at_val.at_long >= DTime) ? pal : dpal, &preply->brp_un.brp_status, &bad); if ((rc != 0) && (rc != PBSE_PERM)) { req_reject(rc, bad, preq, NULL, NULL); if (pa != NULL) { unlock_ai_mutex(pa, __func__, "1", LOGLEVEL); } unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL); unlock_queue(pque, __func__, "perm", LOGLEVEL); return; } sentJobCounter++; if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED) qjcounter++; unlock_ji_mutex(pjob, __func__, "8", LOGLEVEL); } /* END foreach (pjob from pque) */ if (LOGLEVEL >= 5) { sprintf(log_buf,"sent scheduler %ld total jobs for queue %s\n", sentJobCounter, pque->qu_qs.qu_name); log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_QUEUE,pque->qu_qs.qu_name,log_buf); } unlock_queue(pque, __func__, "end while", LOGLEVEL); } /* END for (pque) */ if (pa != NULL) unlock_ai_mutex(pa, __func__, "1", LOGLEVEL); reply_send_svr(preq); return; } /* END if ((type == tjstTruncatedServer) || ...) 
*/ while (pjob != NULL) { /* go ahead and build the status reply for this job */ if (exec_only) { if (cntl->sc_pque != NULL) { if (cntl->sc_pque->qu_qs.qu_type != QTYPE_Execution) goto nextjob; } else { if (pa != NULL) pthread_mutex_unlock(pa->ai_mutex); pque = get_jobs_queue(&pjob); if (pa != NULL) pthread_mutex_lock(pa->ai_mutex); if ((pjob == NULL) || (pque == NULL)) goto nextjob; if (pque->qu_qs.qu_type != QTYPE_Execution) { unlock_queue(pque, __func__, "not exec", LOGLEVEL); goto nextjob; } unlock_queue(pque, __func__, "exec", LOGLEVEL); } } pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr); rc = status_job( pjob, preq, pal, &preply->brp_un.brp_status, &bad); if ((rc != 0) && (rc != PBSE_PERM)) { if (pa != NULL) { unlock_ai_mutex(pa, __func__, "1", LOGLEVEL); } unlock_ji_mutex(pjob, __func__, "9", LOGLEVEL); req_reject(rc, bad, preq, NULL, NULL); return; } /* get next job */ nextjob: if (pjob != NULL) unlock_ji_mutex(pjob, __func__, "10", LOGLEVEL); if (type == tjstJob) break; if (type == tjstQueue) pjob = next_job(cntl->sc_pque->qu_jobs,&iter); else if (type == tjstSummarizeArraysQueue) pjob = next_job(cntl->sc_pque->qu_jobs_array_sum,&iter); else if (type == tjstSummarizeArraysServer) pjob = next_job(&array_summary,&iter); else if (type == tjstArray) { pjob = NULL; /* increment job_array_index until we find a non-null pointer or hit the end */ while (++job_array_index < pa->ai_qs.array_size) { if (pa->job_ids[job_array_index] != NULL) { if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL) { break; } } } } else pjob = next_job(&alljobs,&iter); rc = 0; } /* END while (pjob != NULL) */ if (pa != NULL) { unlock_ai_mutex(pa, __func__, "1", LOGLEVEL); } reply_send_svr(preq); if (LOGLEVEL >= 7) { log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, "req_statjob", "Successfully returned the status of queued jobs\n"); } return; } /* END req_stat_job_step2() */
int req_stat_que( struct batch_request *preq) { char *name; pbs_queue *pque = NULL; struct batch_reply *preply; int rc = 0; int type = 0; char log_buf[LOCAL_LOG_BUF_SIZE+1]; /* * first, validate the name of the requested object, either * a queue, or null for all queues */ name = preq->rq_ind.rq_status.rq_id; if ((*name == '\0') || (*name == '@')) { type = 1; } else { pque = find_queuebyname(name); if (pque == NULL) { rc = PBSE_UNKQUE; snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "can not locate queue %s", name); req_reject(rc, 0, preq, NULL, log_buf); return rc; } } preply = &preq->rq_reply; preply->brp_choice = BATCH_REPLY_CHOICE_Status; CLEAR_HEAD(preply->brp_un.brp_status); if (type == 0) { /* get status of the named queue */ rc = status_que(pque, preq, &preply->brp_un.brp_status); unlock_queue(pque, "req_stat_que", "type == 0", LOGLEVEL); } else { /* pque == NULL before next_queue */ int iter = -1; /* get status of all queues */ while ((pque = next_queue(&svr_queues,&iter)) != NULL) { rc = status_que(pque, preq, &preply->brp_un.brp_status); if (rc != 0) { if (rc != PBSE_PERM) { unlock_queue(pque, "req_stat_que", "break", LOGLEVEL); break; } rc = 0; } unlock_queue(pque, "req_stat_que", "end while", LOGLEVEL); } } if (rc != PBSE_NONE) { reply_free(preply); req_reject(PBSE_NOATTR, rc, preq, NULL, "status_queue failed"); } else { reply_send_svr(preq); } return rc; } /* END req_stat_que() */
/*
 * execute_job_delete - carry out a delete request for one job.
 *
 * After the permission check the job is handled according to its state:
 *  - TRANSIT: nothing is done (-1).
 *  - PRERUN / RERUN*: the delete is re-queued as a timed task one second
 *    later (ROUTE_DELETE); if the same job has been stuck for more than 10
 *    seconds it is deleted anyway.
 *  - RUNNING: a delete nanny is installed and SIGTERM is sent to the MOM;
 *    the client is answered when the MOM replies (-1).
 *  - otherwise the job is moved towards exit/completion directly
 *    (PBSE_NONE).
 * The deletion is logged and accounted, and the owner is mailed when
 * somebody else deleted the job.
 *
 * @param pjob  job to delete; its mutex is released on every path where the
 *              job still exists, except after job_abt() (M)
 * @param Msg   optional text from the request extension, appended to the
 *              mail body; may be NULL (I)
 * @param preq  the originating batch request (I)
 * @return PBSE_NONE, ROUTE_DELETE or -1 as described above
 */
int execute_job_delete(

  job                  *pjob,            /* M */
  char                 *Msg,             /* I */
  struct batch_request *preq)            /* I */

  {
  struct work_task *pwtnew;

  int               rc;
  char             *sigt = "SIGTERM";

  int               has_mutex = TRUE;
  char              log_buf[LOCAL_LOG_BUF_SIZE];
  time_t            time_now = time(NULL);
  long              force_cancel = FALSE;
  long              array_compatible = FALSE;

  chk_job_req_permissions(&pjob,preq);

  if (pjob == NULL)
    {
    /* preq is rejected in chk_job_req_permissions here */
    return(-1);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /* see note in req_delete - not sure this is possible still,
     * but the deleted code is irrelevant now. I will leave this
     * part --dbeer */
    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

    return(-1);
    }

  if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN1 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN2 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN3 )
    {
    /* If JOB_SUBSTATE_PRERUN being sent to MOM, wait till she gets it going */
    /* retry in one second                            */
    /* If JOB_SUBSTATE_RERUN, RERUN1, RERUN2 or RERUN3 the
       job is being requeued. Wait until finished */

    /* remember (across calls) which job we are waiting on and since when */
    static time_t  cycle_check_when = 0;
    static char    cycle_check_jid[PBS_MAXSVRJOBID + 1];

    if (cycle_check_when != 0)
      {
      if (!strcmp(pjob->ji_qs.ji_jobid, cycle_check_jid) &&
          (time_now - cycle_check_when > 10))
        {
        /* state not updated after 10 seconds */

        /* did the mom ever get it? delete it anyways... */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;

        goto jump;
        }

      if (time_now - cycle_check_when > 20)
        {
        /* give up after 20 seconds */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;
        }
      }    /* END if (cycle_check_when != 0) */

    if (cycle_check_when == 0)
      {
      /* new PRERUN job located */

      cycle_check_when = time_now;
      strcpy(cycle_check_jid, pjob->ji_qs.ji_jobid);
      }

    snprintf(log_buf, sizeof(log_buf), "job cannot be deleted, state=PRERUN, requeuing delete request");

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    pwtnew = set_task(WORK_Timed,time_now + 1,post_delete_route,preq,FALSE);

    unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);

    if (pwtnew == NULL)
      {
      req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

      return(-1);
      }
    else
      {
      return(ROUTE_DELETE);
      }
    }  /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */

jump:

  /*
   * Log delete and if requesting client is not job owner, send mail.
   */

  snprintf(log_buf, sizeof(log_buf), "requestor=%s@%s", preq->rq_user, preq->rq_host);

  /* NOTE:  should annotate accounting record with extend message (NYI) */
  account_record(PBS_ACCT_DEL, pjob, log_buf);

  snprintf(log_buf, sizeof(log_buf), msg_manager, msg_deletejob, preq->rq_user, preq->rq_host);

  log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

  /* NOTE:  should incorporate job delete message */

  if (Msg != NULL)
    {
    /* have text message in request extension, add it.  Msg comes from the
     * client, so the append is bounded: an oversized message is truncated
     * instead of overrunning log_buf */
    size_t used = strlen(log_buf);

    snprintf(log_buf + used, sizeof(log_buf) - used, "\n%s", Msg);
    }

  if ((svr_chk_owner(preq, pjob) != 0) &&
      (pjob->ji_has_delete_nanny == FALSE))
    {
    /* only send email if owner did not delete job and job deleted
       has not been previously attempted */
    svr_mailowner(pjob, MAIL_DEL, MAIL_FORCE, log_buf);

    /*
     * If we sent mail and already sent the extra message
     * then reset message so we don't trigger a redundant email
     * in job_abt()
     */
    if (Msg != NULL)
      {
      Msg = NULL;
      }
    }

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, change restart comment if failed */
    change_restart_comment_if_needed(pjob);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * setup a nanny task to make sure the job is actually deleted (see the
     * comments at job_delete_nanny()).
     */

    if (pjob->ji_has_delete_nanny == TRUE)
      {
      unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

      req_reject(PBSE_IVALREQ, 0, preq, NULL, "job cancel in progress");

      return(-1);
      }

    apply_job_delete_nanny(pjob, time_now + 60);

    /*
     * Send signal request to MOM.  The server will automagically
     * pick up and "finish" off the client request when MOM replies.
     */
    get_batch_request_id(preq);

    if ((rc = issue_signal(&pjob, sigt, post_delete_mom1, strdup(preq->rq_id))))
      {
      /* cant send to MOM */
      req_reject(rc, 0, preq, NULL, NULL);
      }

    /* normally will ack reply when mom responds */
    if (pjob != NULL)
      {
      snprintf(log_buf, sizeof(log_buf), msg_delrunjobsig, sigt);
      log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

      unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
      }

    return(-1);
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* make a cleanup task if set */
  get_svr_attr_l(SRV_ATR_JobForceCancelTime, &force_cancel);

  if (force_cancel > 0)
    {
    char *dup_jobid = strdup(pjob->ji_qs.ji_jobid);

    set_task(WORK_Timed, time_now + force_cancel, ensure_deleted, dup_jobid, FALSE);
    }

  /* if configured, and this job didn't have a slot limit hold, free a job
   * held with the slot limit hold */
  get_svr_attr_l(SRV_ATR_MoabArrayCompatible, &array_compatible);

  if ((array_compatible != FALSE) &&
      ((pjob->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) == FALSE))
    {
    if ((pjob->ji_arraystruct != NULL) &&
        (pjob->ji_is_array_template == FALSE))
      {
      int        i;
      int        newstate;
      int        newsub;
      job       *tmp;
      job_array *pa = get_jobs_array(&pjob);

      if (pjob == NULL)
        return(-1);

      /* guard against a job whose array could not be obtained */
      if (pa != NULL)
        {
        for (i = 0; i < pa->ai_qs.array_size; i++)
          {
          if (pa->job_ids[i] == NULL)
            continue;

          /* skip the job that is being deleted */
          if (!strcmp(pa->job_ids[i], pjob->ji_qs.ji_jobid))
            continue;

          if ((tmp = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
            {
            /* stale id: the job no longer exists */
            free(pa->job_ids[i]);
            pa->job_ids[i] = NULL;
            }
          else
            {
            if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l)
              {
              /* release exactly one sibling from its slot-limit hold */
              tmp->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;

              if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
                {
                tmp->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
                }

              svr_evaljobstate(tmp, &newstate, &newsub, 1);
              svr_setjobstate(tmp, newstate, newsub, FALSE);
              job_save(tmp, SAVEJOB_FULL, 0);

              unlock_ji_mutex(tmp, __func__, "5", LOGLEVEL);

              break;
              }

            unlock_ji_mutex(tmp, __func__, "6", LOGLEVEL);
            }
          }

        if (LOGLEVEL >= 7)
          {
          snprintf(log_buf, sizeof(log_buf), "%s: unlocking ai_mutex", __func__);
          log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
          }

        pthread_mutex_unlock(pa->ai_mutex);
        }
      }
    } /* END MoabArrayCompatible check */

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, do end job processing */
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE);

    /* force new connection */
    pjob->ji_momhandle = -1;

    if (LOGLEVEL >= 7)
      {
      snprintf(log_buf, sizeof(log_buf), "calling on_job_exit from %s", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      }

    set_task(WORK_Immed, 0, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in file, should remove them */
    remove_stagein(&pjob);

    if (pjob != NULL)
      job_abt(&pjob, Msg);

    /* the job mutex is not released by this function on this path */
    has_mutex = FALSE;
    }
  else
    {
    /*
     * the job is not transitting (though it may have been) and
     * is not running, so put in into a complete state.
     */
    struct pbs_queue *pque;
    int               KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE, FALSE);

    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      pque->qu_numcompleted++;

      unlock_queue(pque, __func__, NULL, LOGLEVEL);

      if (LOGLEVEL >= 7)
        {
        snprintf(log_buf, sizeof(log_buf), "calling on_job_exit from %s", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }

      /* NOTE(review): pque is read after unlock_queue(), as before */
      pthread_mutex_lock(server.sv_attr_mutex);
      KeepSeconds = attr_ifelse_long(
                      &pque->qu_attr[QE_ATR_KeepCompleted],
                      &server.sv_attr[SRV_ATR_KeepCompleted],
                      0);
      pthread_mutex_unlock(server.sv_attr_mutex);
      }
    else
      KeepSeconds = 0;

    if (pjob != NULL)
      {
      set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
      }
    else
      {
      /* get_jobs_queue() cleared pjob: there is nothing left to unlock */
      has_mutex = FALSE;
      }
    }  /* END else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0) */

  if (has_mutex == TRUE)
    unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL);

  return(PBSE_NONE);
  } /* END execute_job_delete() */
/*
 * get_parent_dest_queues()
 *
 * Looks up the parent and destination queues by name and locks both of them.
 *
 * The job's mutex and the queue currently in *parent are released first, both
 * output pointers are cleared, and the two names are resolved in svr_queues
 * while the all-queues mutex is held. Finally the job is looked up again by
 * its id with svr_find_job().
 *
 * @param queue_parent_name  name of the queue the job is currently in
 * @param queue_dest_name    name of the queue the job is moving to
 * @param parent             I/O: unlocked on entry; set to the locked parent
 *                           queue on success, NULL if the lookup failed
 * @param dest               O: set to the locked destination queue on
 *                           success, NULL if the lookup failed
 * @param pjob_ptr           I/O: job whose mutex is released on entry;
 *                           replaced by the result of svr_find_job()
 *
 * @return PBSE_NONE on success, -1 if either queue cannot be found, if both
 *         names resolve to the same queue, or if the job can no longer be
 *         found. In the last case the queues may already be locked and
 *         stored in *parent / *dest.
 */
int get_parent_dest_queues(

  char       *queue_parent_name,
  char       *queue_dest_name,
  pbs_queue **parent,
  pbs_queue **dest,
  job       **pjob_ptr)

{
  pbs_queue *pque_parent;
  pbs_queue *pque_dest;
  char       jobid[PBS_MAXSVRJOBID + 1];
  char       log_buf[LOCAL_LOG_BUF_SIZE + 1];
  job       *pjob = *pjob_ptr;
  int        index_parent;
  int        index_dest;
  int        rc = PBSE_NONE;

  if (LOGLEVEL >= 7)
  {
    snprintf(log_buf, sizeof(log_buf), "%s", pjob->ji_qs.ji_jobid);
    LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
  }

  /* keep a private copy of the id: the job is unlocked below and is
   * looked up again by id at the end */
  snprintf(jobid, sizeof(jobid), "%s", pjob->ji_qs.ji_jobid);

  unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
  unlock_queue(*parent, __func__, NULL, 0);

  *parent = NULL;
  *dest   = NULL;

  lock_allques_mutex(&svr_queues, __func__, NULL, LOGLEVEL);

  index_parent = get_value_hash(svr_queues.ht, queue_parent_name);
  index_dest   = get_value_hash(svr_queues.ht, queue_dest_name);

  if ((index_parent < 0) ||
      (index_dest < 0))
  {
    rc = -1;
  }
  else
  {
    /* good path */
    pque_parent = svr_queues.ra->slots[index_parent].item;
    pque_dest   = svr_queues.ra->slots[index_dest].item;

    if ((pque_parent == NULL) ||
        (pque_dest == NULL))
    {
      rc = -1;
    }
    else if (pque_parent == pque_dest)
    {
      /* Both names resolve to the same queue. Locking it twice from this
       * thread would block forever unless the queue mutex is recursive
       * (NOTE(review): presumably it is not - confirm), so treat this
       * like any other failed lookup and leave both outputs NULL. */
      rc = -1;
    }
    else
    {
      /* SUCCESS! */
      lock_queue(pque_parent, __func__, NULL, 0);
      lock_queue(pque_dest, __func__, NULL, 0);

      *parent = pque_parent;
      *dest   = pque_dest;

      rc = PBSE_NONE;
    }
  }

  unlock_allques_mutex(&svr_queues, __func__, NULL, LOGLEVEL);

  /* the job may have gone away while its mutex was released */
  if ((*pjob_ptr = svr_find_job(jobid, TRUE)) == NULL)
    rc = -1;

  return(rc);
} /* END get_parent_dest_queues() */
/*
 * get_parent_dest_queues()
 *
 * Resolves the parent and destination queue names to queue objects and
 * locks both queues.
 *
 * Sequence: copy the job id, release the job mutex and the queue in *parent,
 * clear both outputs, resolve the names in svr_queues under
 * svr_queues.allques_mutex, lock the two queues (parent first), then look
 * the job up again with svr_find_job().
 *
 * @param queue_parent_name  name of the job's current queue
 * @param queue_dest_name    name of the destination queue
 * @param parent             I/O: queue unlocked on entry; receives the locked
 *                           parent queue, or NULL when the lookup fails
 * @param dest               O: receives the locked destination queue, or
 *                           NULL when the lookup fails
 * @param pjob_ptr           I/O: job released on entry; overwritten with the
 *                           result of svr_find_job()
 *
 * @return PBSE_NONE on success; -1 when a queue is missing, when both names
 *         refer to the same queue, or when the job can no longer be found
 *         (in that last case the queues may still have been locked and
 *         returned through parent / dest).
 */
int get_parent_dest_queues(

  char       *queue_parent_name,
  char       *queue_dest_name,
  pbs_queue **parent,
  pbs_queue **dest,
  job       **pjob_ptr)

{
  pbs_queue *pque_parent;
  pbs_queue *pque_dest;
  char       jobid[PBS_MAXSVRJOBID + 1];
  job       *pjob = *pjob_ptr;
  int        index_parent;
  int        index_dest;
  int        rc = PBSE_NONE;

  /* the id is needed to find the job again once its mutex was dropped */
  snprintf(jobid, sizeof(jobid), "%s", pjob->ji_qs.ji_jobid);

  unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
  unlock_queue(*parent, __func__, NULL, 0);

  *parent = NULL;
  *dest   = NULL;

  pthread_mutex_lock(svr_queues.allques_mutex);

  index_parent = get_value_hash(svr_queues.ht, queue_parent_name);
  index_dest   = get_value_hash(svr_queues.ht, queue_dest_name);

  if ((index_parent < 0) ||
      (index_dest < 0))
  {
    rc = -1;
  }
  else
  {
    /* good path */
    pque_parent = svr_queues.ra->slots[index_parent].item;
    pque_dest   = svr_queues.ra->slots[index_dest].item;

    if ((pque_parent == NULL) ||
        (pque_dest == NULL))
    {
      rc = -1;
    }
    else if (pque_parent == pque_dest)
    {
      /* Parent and destination are one and the same queue; taking its
       * lock twice in a row would hang this thread unless the mutex is
       * recursive (NOTE(review): assumed non-recursive - confirm).
       * Report failure with both outputs left NULL. */
      rc = -1;
    }
    else
    {
      /* SUCCESS! */
      lock_queue(pque_parent, __func__, NULL, 0);
      lock_queue(pque_dest, __func__, NULL, 0);

      *parent = pque_parent;
      *dest   = pque_dest;

      rc = PBSE_NONE;
    }
  }

  pthread_mutex_unlock(svr_queues.allques_mutex);

  /* re-acquire the job; it may have disappeared in the meantime */
  if ((*pjob_ptr = svr_find_job(jobid, TRUE)) == NULL)
    rc = -1;

  return(rc);
} /* END get_parent_dest_queues() */
/*
 * delete_inactive_job()
 *
 * Handles deletion of a job that is not running. One of three things
 * happens, depending on the job's server flags:
 *   - JOB_SVFLG_CHECKPOINT_FILE: the job is moved to EXITING and
 *     on_job_exit_task is scheduled immediately;
 *   - JOB_SVFLG_StagedIn: staged-in files are removed and the job is aborted
 *     with Msg;
 *   - otherwise: the job is moved to COMPLETE and on_job_exit_task is
 *     scheduled after the queue's / server's keep_completed interval.
 *
 * @param pjob_ptr  I/O: the job to delete; *pjob_ptr is set to NULL if the
 *                  job pointer was cleared by one of the helpers called here
 * @param Msg       message handed to job_abt() for staged-in jobs
 *
 * @return PBSE_BAD_PARAMETER if pjob_ptr or *pjob_ptr is NULL,
 *         PBSE_NONE otherwise
 */
int delete_inactive_job(

  job        **pjob_ptr,
  const char  *Msg)

{
  job  *pjob;
  char  log_buf[LOCAL_LOG_BUF_SIZE];

  if ((pjob_ptr == NULL) ||
      (*pjob_ptr == NULL))
    return(PBSE_BAD_PARAMETER);

  pjob = *pjob_ptr;

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
  {
    /* job has restart file at mom, do end job processing */
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE);

    /* force new connection */
    pjob->ji_momhandle = -1;

    if (LOGLEVEL >= 7)
    {
      snprintf(log_buf, sizeof(log_buf), "calling on_job_exit from %s", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
    }

    set_task(WORK_Immed, 0, on_job_exit_task, strdup(pjob->ji_qs.ji_jobid), FALSE);
  }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
  {
    /* job has staged-in file, should remove them */
    remove_stagein(&pjob);

    if (pjob != NULL)
      job_abt(&pjob, Msg);
  }
  else
  {
    /*
     * the job is not transitting (though it may have been) and
     * is not running, so put in into a complete state.
     */
    struct pbs_queue *pque;
    int               KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE, FALSE);

    if ((pque = get_jobs_queue(&pjob)) != NULL)
    {
      /* Read the queue attribute BEFORE releasing the queue: once
       * unlock_queue() has run, pque must not be dereferenced any more. */
      pthread_mutex_lock(server.sv_attr_mutex);
      KeepSeconds = attr_ifelse_long(
                      &pque->qu_attr[QE_ATR_KeepCompleted],
                      &server.sv_attr[SRV_ATR_KeepCompleted],
                      0);
      pthread_mutex_unlock(server.sv_attr_mutex);

      unlock_queue(pque, __func__, NULL, LOGLEVEL);

      if (LOGLEVEL >= 7)
      {
        snprintf(log_buf, sizeof(log_buf), "calling on_job_exit from %s", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      }
    }
    else
      KeepSeconds = 0;

    /* get_jobs_queue() takes &pjob and may have cleared it */
    if (pjob != NULL)
      set_task(WORK_Timed, time(NULL) + KeepSeconds, on_job_exit_task, strdup(pjob->ji_qs.ji_jobid), FALSE);
  } /* END else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0) */

  /* tell the caller when the job pointer is no longer usable */
  if (pjob == NULL)
    *pjob_ptr = NULL;

  return(PBSE_NONE);
} /* END delete_inactive_job() */
int req_stat_job( struct batch_request *preq) /* ptr to the decoded request */ { struct stat_cntl *cntl; /* see svrfunc.h */ char *name; job *pjob = NULL; pbs_queue *pque = NULL; int rc = PBSE_NONE; char log_buf[LOCAL_LOG_BUF_SIZE]; enum TJobStatTypeEnum type = tjstNONE; /* * first, validate the name of the requested object, either * a job, a queue, or the whole server. */ if (LOGLEVEL >= 7) { sprintf(log_buf, "note"); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf); } /* FORMAT: name = { <JOBID> | <QUEUEID> | '' } */ name = preq->rq_ind.rq_status.rq_id; if (preq->rq_extend != NULL) { /* evaluate pbs_job_stat() 'extension' field */ if (!strncasecmp(preq->rq_extend, "truncated", strlen("truncated"))) { /* truncate response by 'max_report' */ type = tjstTruncatedServer; } else if (!strncasecmp(preq->rq_extend, "summarize_arrays", strlen("summarize_arrays"))) { type = tjstSummarizeArraysServer; } } /* END if (preq->rq_extend != NULL) */ if (isdigit((int)*name)) { /* status a single job */ if (is_array(name)) { if (type != tjstSummarizeArraysServer) { type = tjstArray; } } else { type = tjstJob; if ((pjob = svr_find_job(name, FALSE)) == NULL) { rc = PBSE_UNKJOBID; } else unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); } } else if (isalpha(name[0])) { if (type == tjstNONE) type = tjstQueue; else if (type == tjstSummarizeArraysServer) type = tjstSummarizeArraysQueue; else type = tjstTruncatedQueue; /* if found, this mutex is released later */ if ((pque = find_queuebyname(name)) == NULL) { rc = PBSE_UNKQUE; } } else if ((*name == '\0') || (*name == '@')) { /* status all jobs at server */ if (type == tjstNONE) type = tjstServer; } else { rc = PBSE_IVALREQ; } if (rc != 0) { /* is invalid - an error */ req_reject(rc, 0, preq, NULL, NULL); return(rc); } preq->rq_reply.brp_choice = BATCH_REPLY_CHOICE_Status; CLEAR_HEAD(preq->rq_reply.brp_un.brp_status); cntl = (struct stat_cntl *)calloc(1, sizeof(struct stat_cntl)); if (cntl == NULL) { if (pque != NULL) 
unlock_queue(pque, "req_stat_job", (char *)"no memory cntl", LOGLEVEL); req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL); return(PBSE_SYSTEM); } if ((type == tjstTruncatedQueue) || (type == tjstTruncatedServer)) { if (pque != NULL) { unlock_queue(pque, __func__, "", LOGLEVEL); pque = NULL; } } cntl->sc_type = (int)type; cntl->sc_conn = -1; cntl->sc_pque = pque; cntl->sc_origrq = preq; cntl->sc_post = req_stat_job_step2; cntl->sc_jobid[0] = '\0'; /* cause "start from beginning" */ req_stat_job_step2(cntl); /* go to step 2, see if running is current */ if (pque != NULL) unlock_queue(pque, "req_stat_job", (char *)"success", LOGLEVEL); free(cntl); return(PBSE_NONE); } /* END req_stat_job() */
/*
 * rerun_or_kill()
 *
 * Decides what happens to a job depending on whether it is rerunable and
 * on the server state read from SRV_ATR_State:
 *   - rerunable: SIGKILL is issued and the job's substate is set to
 *     JOB_SUBSTATE_RERUN;
 *   - not rerunable and the server is not in SV_STATE_SHUTDEL: the abort
 *     message is logged and the job is aborted with job_abt();
 *   - otherwise (delayed shutdown): the job is left alone and only a
 *     message is logged.
 *
 * @param pjob_ptr  I (modified/freed): the job; *pjob_ptr is set to NULL
 *                  when the job pointer was cleared while signalling it or
 *                  while fetching its queue
 * @param text      I: text appended to the log message
 */
void rerun_or_kill(

  job  **pjob_ptr, /* I (modified/freed) */
  char  *text)     /* I */

{
  long       server_state = SV_STATE_DOWN;
  char       log_buf[LOCAL_LOG_BUF_SIZE];
  pbs_queue *pque;
  job       *pjob = *pjob_ptr;

  /* start from an empty message so that a path which composes no text
   * never logs uninitialized stack memory */
  log_buf[0] = '\0';

  get_svr_attr_l(SRV_ATR_State, &server_state);

  if (pjob->ji_wattr[JOB_ATR_rerunable].at_val.at_long)
  {
    /* job is rerunable, mark it to be requeued */
    issue_signal(&pjob, "SIGKILL", free_br, NULL);

    if (pjob != NULL)
    {
      pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

      if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
        snprintf(log_buf, sizeof(log_buf), "%s%s%s",
          msg_init_queued,
          pque->qu_qs.qu_name,
          text);

        unlock_queue(pque, __func__, NULL, LOGLEVEL);
      }
    }
  }
  else if (server_state != SV_STATE_SHUTDEL)
  {
    /* job not rerunable, immediate shutdown - kill it off */
    snprintf(log_buf, sizeof(log_buf), "%s%s",
      msg_job_abort,
      text);

    /* need to record log message before purging job */
    log_event(
      PBSEVENT_SYSTEM | PBSEVENT_JOB | PBSEVENT_DEBUG,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buf);

    job_abt(pjob_ptr, log_buf);

    return;
  }
  else
  {
    /* delayed shutdown, leave job running */
    snprintf(log_buf, sizeof(log_buf), "%s%s",
      msg_leftrunning,
      text);
  }

  if (pjob == NULL)
  {
    /* the local job pointer was cleared above; do not leave the caller
     * holding a stale one */
    *pjob_ptr = NULL;
  }
  else if (log_buf[0] != '\0')
  {
    log_event(
      PBSEVENT_SYSTEM | PBSEVENT_JOB | PBSEVENT_DEBUG,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buf);
  }

  return;
} /* END rerun_or_kill() */
static void req_stat_job_step2( struct stat_cntl *cntl) /* I/O (free'd on return) */ { svrattrl *pal; job *pjob = NULL; struct batch_request *preq; struct batch_reply *preply; int rc = 0; enum TJobStatTypeEnum type; pbs_queue *pque = NULL; int exec_only = 0; int bad = 0; long DTime; /* delta time - only report full pbs_attribute list if J->MTime > DTime */ static svrattrl *dpal = NULL; int job_array_index = 0; job_array *pa = NULL; char log_buf[LOCAL_LOG_BUF_SIZE]; all_jobs_iterator *iter; preq = cntl->sc_origrq; type = (enum TJobStatTypeEnum)cntl->sc_type; preply = &preq->rq_reply; /* See pbs_server_attributes(1B) for details on "poll_jobs" behaviour */ if (dpal == NULL) { /* build 'delta' pbs_attribute list */ svrattrl *tpal; tlist_head dalist; int aindex; int atrlist[] = { JOB_ATR_jobname, JOB_ATR_resc_used, JOB_ATR_LAST }; CLEAR_LINK(dalist); for (aindex = 0;atrlist[aindex] != JOB_ATR_LAST;aindex++) { if ((tpal = attrlist_create("", "", 23)) == NULL) { return; } tpal->al_valln = atrlist[aindex]; if (dpal == NULL) dpal = tpal; append_link(&dalist, &tpal->al_link, tpal); } } /* END if (dpal == NULL) */ if (type == tjstArray) { pa = get_array(preq->rq_ind.rq_status.rq_id); if (pa == NULL) { req_reject(PBSE_UNKARRAYID, 0, preq, NULL, "unable to find array"); return; } } { all_jobs *ajptr = NULL; if (type == tjstQueue) ajptr = cntl->sc_pque->qu_jobs; else if (type == tjstSummarizeArraysQueue) ajptr = cntl->sc_pque->qu_jobs_array_sum; else if (type == tjstSummarizeArraysServer) ajptr = &array_summary; else ajptr = &alljobs; ajptr->lock(); iter = ajptr->get_iterator(); ajptr->unlock(); } /* * now ready for part 3, building the status reply, * loop through again */ if ((type == tjstSummarizeArraysQueue) || (type == tjstSummarizeArraysServer)) { /* No array can be owned for these options */ update_array_statuses(); } if (type == tjstJob) pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE); else if (type == tjstQueue) pjob = next_job(cntl->sc_pque->qu_jobs,iter); 
else if (type == tjstSummarizeArraysQueue) pjob = next_job(cntl->sc_pque->qu_jobs_array_sum,iter); else if (type == tjstSummarizeArraysServer) pjob = next_job(&array_summary,iter); else if (type == tjstArray) { job_array_index = -1; pjob = NULL; /* increment job_array_index until we find a non-null pointer or hit the end */ while (++job_array_index < pa->ai_qs.array_size) { if (pa->job_ids[job_array_index] != NULL) { if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL) { break; } } } } else pjob = next_job(&alljobs,iter); DTime = 0; if (preq->rq_extend != NULL) { char *ptr; /* FORMAT: { EXECQONLY | DELTA:<EPOCHTIME> } */ if (strstr(preq->rq_extend, EXECQUEONLY)) exec_only = 1; ptr = strstr(preq->rq_extend, "DELTA:"); if (ptr != NULL) { ptr += strlen("delta:"); DTime = strtol(ptr, NULL, 10); } } if ((type == tjstTruncatedServer) || (type == tjstTruncatedQueue)) { long sentJobCounter; long qjcounter; long qmaxreport; all_queues_iterator *iter = NULL; svr_queues.lock(); iter = svr_queues.get_iterator(); svr_queues.unlock(); /* loop through all queues */ while ((pque = next_queue(&svr_queues,iter)) != NULL) { qjcounter = 0; if ((exec_only == 1) && (pque->qu_qs.qu_type != QTYPE_Execution)) { /* ignore routing queues */ unlock_queue(pque, __func__, "ignore queue", LOGLEVEL); continue; } if (((pque->qu_attr[QA_ATR_MaxReport].at_flags & ATR_VFLAG_SET) != 0) && (pque->qu_attr[QA_ATR_MaxReport].at_val.at_long >= 0)) { qmaxreport = pque->qu_attr[QA_ATR_MaxReport].at_val.at_long; } else { qmaxreport = TMAX_JOB; } if (LOGLEVEL >= 5) { sprintf(log_buf,"giving scheduler up to %ld idle jobs in queue %s\n", qmaxreport, pque->qu_qs.qu_name); log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_QUEUE,pque->qu_qs.qu_name,log_buf); } sentJobCounter = 0; /* loop through jobs in queue */ if (pjob != NULL) unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL); all_jobs_iterator *jobiter = NULL; pque->qu_jobs->lock(); jobiter = pque->qu_jobs->get_iterator(); pque->qu_jobs->unlock(); while 
((pjob = next_job(pque->qu_jobs,jobiter)) != NULL) { if ((qjcounter >= qmaxreport) && (pjob->ji_qs.ji_state == JOB_STATE_QUEUED)) { /* max_report of queued jobs reached for queue */ unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL); continue; } pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr); rc = status_job( pjob, preq, (pjob->ji_wattr[JOB_ATR_mtime].at_val.at_long >= DTime) ? pal : dpal, &preply->brp_un.brp_status, &bad); if ((rc != 0) && (rc != PBSE_PERM)) { req_reject(rc, bad, preq, NULL, NULL); unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL); unlock_queue(pque, __func__, "perm", LOGLEVEL); delete iter; return; } sentJobCounter++; if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED) qjcounter++; unlock_ji_mutex(pjob, __func__, "8", LOGLEVEL); } /* END foreach (pjob from pque) */ if (LOGLEVEL >= 5) { sprintf(log_buf,"sent scheduler %ld total jobs for queue %s\n", sentJobCounter, pque->qu_qs.qu_name); log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_QUEUE,pque->qu_qs.qu_name,log_buf); } unlock_queue(pque, __func__, "end while", LOGLEVEL); } /* END for (pque) */ reply_send_svr(preq); delete iter; return; } /* END if ((type == tjstTruncatedServer) || ...) 
*/ while (pjob != NULL) { /* go ahead and build the status reply for this job */ if (exec_only) { if (cntl->sc_pque != NULL) { if (cntl->sc_pque->qu_qs.qu_type != QTYPE_Execution) goto nextjob; } else { if (pa != NULL) pthread_mutex_unlock(pa->ai_mutex); pque = get_jobs_queue(&pjob); if (pa != NULL) pthread_mutex_lock(pa->ai_mutex); if ((pjob == NULL) || (pque == NULL)) goto nextjob; mutex_mgr pque_mutex = mutex_mgr(pque->qu_mutex, true); if (pque->qu_qs.qu_type != QTYPE_Execution) { goto nextjob; } } } pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr); rc = status_job( pjob, preq, pal, &preply->brp_un.brp_status, &bad); if ((rc != 0) && (rc != PBSE_PERM)) { if (pa != NULL) { unlock_ai_mutex(pa, __func__, "1", LOGLEVEL); } unlock_ji_mutex(pjob, __func__, "9", LOGLEVEL); req_reject(rc, bad, preq, NULL, NULL); delete iter; return; } /* get next job */ nextjob: if (pjob != NULL) unlock_ji_mutex(pjob, __func__, "10", LOGLEVEL); if (type == tjstJob) break; if (type == tjstQueue) pjob = next_job(cntl->sc_pque->qu_jobs,iter); else if (type == tjstSummarizeArraysQueue) pjob = next_job(cntl->sc_pque->qu_jobs_array_sum,iter); else if (type == tjstSummarizeArraysServer) pjob = next_job(&array_summary,iter); else if (type == tjstArray) { pjob = NULL; /* increment job_array_index until we find a non-null pointer or hit the end */ while (++job_array_index < pa->ai_qs.array_size) { if (pa->job_ids[job_array_index] != NULL) { if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL) { break; } } } } else pjob = next_job(&alljobs,iter); rc = 0; } /* END while (pjob != NULL) */ delete iter; if (pa != NULL) { unlock_ai_mutex(pa, __func__, "1", LOGLEVEL); } reply_send_svr(preq); if (LOGLEVEL >= 7) { log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, "req_statjob", "Successfully returned the status of queued jobs\n"); } return; } /* END req_stat_job_step2() */
/*
 * acct_job()
 *
 * Appends the accounting fields of a job to a dynamic string as
 * space-terminated "key=value " pairs: user, group, account (if set),
 * jobname, queue, ctime, qtime, etime, start, owner, exec_host (if present),
 * login_node (when SRV_ATR_CrayEnabled is TRUE and the attribute is set),
 * the encoded resource list and, when built with ATTR_X_ACCT, the x
 * attribute.
 *
 * @param pjob  I: the job to describe; a NULL job is not an error
 * @param ds    O: dynamic string the fields are appended to
 *
 * @return PBSE_NONE on success (or when pjob is NULL), PBSE_JOBNOTFOUND if
 *         the job pointer was cleared while its queue was being fetched,
 *         otherwise the non-PBSE_NONE code returned by
 *         append_dynamic_string().
 */
int acct_job(

  job            *pjob, /* I */
  dynamic_string *ds)   /* O */

{
  int         rc;
  long        cray_enabled = FALSE;
  int         resc_access_perm = READ_ONLY;
  char        local_buf[MAXLINE*4];
  pbs_queue  *pque;

  tlist_head  attrlist;
  svrattrl   *pal;

  if (pjob == NULL)
  {
    return(PBSE_NONE);
  }

  CLEAR_HEAD(attrlist);

  /* Every fixed-size field below is formatted with snprintf() so that a
   * long attribute value is truncated instead of overrunning local_buf. */

  /* user */
  /* acct_job is only called from account_jobstr and account_jobend. BufSize should be
     PBS_ACCT_MAX_RCD + 1 in size. */
  snprintf(local_buf, sizeof(local_buf), "user=%s ",
    pjob->ji_wattr[JOB_ATR_euser].at_val.at_str);

  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* group */
  snprintf(local_buf, sizeof(local_buf), "group=%s ",
    pjob->ji_wattr[JOB_ATR_egroup].at_val.at_str);

  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* account */
  if (pjob->ji_wattr[JOB_ATR_account].at_flags & ATR_VFLAG_SET)
  {
    snprintf(local_buf, sizeof(local_buf), "account=%s ",
      pjob->ji_wattr[JOB_ATR_account].at_val.at_str);

    if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
      return(rc);
  }

  /* job name */
  snprintf(local_buf, sizeof(local_buf), "jobname=%s ",
    pjob->ji_wattr[JOB_ATR_jobname].at_val.at_str);

  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  if ((pque = get_jobs_queue(&pjob)) != NULL)
  {
    /* queue name - copied out before the queue is released */
    snprintf(local_buf, sizeof(local_buf), "queue=%s ", pque->qu_qs.qu_name);

    unlock_queue(pque, __func__, NULL, LOGLEVEL);

    if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
      return(rc);
  }
  else if (pjob == NULL)
  {
    log_err(PBSE_JOBNOTFOUND, __func__, "Job lost while acquiring queue 1");
    return(PBSE_JOBNOTFOUND);
  }

  /* create time */
  snprintf(local_buf, sizeof(local_buf), "ctime=%ld ",
    pjob->ji_wattr[JOB_ATR_ctime].at_val.at_long);

  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* queued time */
  snprintf(local_buf, sizeof(local_buf), "qtime=%ld ",
    pjob->ji_wattr[JOB_ATR_qtime].at_val.at_long);

  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* eligible time, how long ready to run */
  snprintf(local_buf, sizeof(local_buf), "etime=%ld ",
    pjob->ji_wattr[JOB_ATR_etime].at_val.at_long);

  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* execution start time */
  snprintf(local_buf, sizeof(local_buf), "start=%ld ",
    (long)pjob->ji_qs.ji_stime);

  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* user */
  snprintf(local_buf, sizeof(local_buf), "owner=%s ",
    pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str);

  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* For large clusters strings can get pretty long. We need to see if there
     is a need to allocate a bigger buffer */
  /* execution host name */
  if (pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str != NULL)
  {
    append_dynamic_string(ds, "exec_host=");
    append_dynamic_string(ds, pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str);

    if ((rc = append_dynamic_string(ds, " ")) != PBSE_NONE)
      return(rc);
  }

  get_svr_attr_l(SRV_ATR_CrayEnabled, &cray_enabled);

  if ((cray_enabled == TRUE) &&
      (pjob->ji_wattr[JOB_ATR_login_node_id].at_flags & ATR_VFLAG_SET))
  {
    append_dynamic_string(ds, "login_node=");
    append_dynamic_string(ds, pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str);

    if ((rc = append_dynamic_string(ds, " ")) != PBSE_NONE)
      return(rc);
  }

  /* now encode the job's resource_list pbs_attribute */
  job_attr_def[JOB_ATR_resource].at_encode(
    &pjob->ji_wattr[JOB_ATR_resource],
    &attrlist,
    job_attr_def[JOB_ATR_resource].at_name,
    NULL,
    ATR_ENCODE_CLIENT,
    resc_access_perm);

  while ((pal = (svrattrl *)GET_NEXT(attrlist)) != NULL)
  {
    /* exec_host can use a lot of buffer space. Use a dynamic string */
    append_dynamic_string(ds, pal->al_name);

    if (pal->al_resc != NULL)
    {
      append_dynamic_string(ds, ".");
      append_dynamic_string(ds, pal->al_resc);
    }

    append_dynamic_string(ds, "=");
    append_dynamic_string(ds, pal->al_value);

    rc = append_dynamic_string(ds, " ");

    /* this entry is finished with, whether or not the append worked */
    delete_link(&pal->al_link);
    free(pal);

    if (rc != PBSE_NONE)
    {
      /* release the entries that were encoded but not yet consumed so
       * that an append failure does not leak the rest of the list */
      while ((pal = (svrattrl *)GET_NEXT(attrlist)) != NULL)
      {
        delete_link(&pal->al_link);
        free(pal);
      }

      return(rc);
    }
  } /* END while (pal != NULL) */

#ifdef ATTR_X_ACCT

  /* x attributes */
  if (pjob->ji_wattr[JOB_SITE_ATR_x].at_flags & ATR_VFLAG_SET)
  {
    snprintf(local_buf, sizeof(local_buf), "x=%s ",
      pjob->ji_wattr[JOB_SITE_ATR_x].at_val.at_str);

    if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
      return(rc);
  }

#endif

  /* SUCCESS */
  return(PBSE_NONE);
} /* END acct_job() */
/*
 * set_resc_assigned()
 *
 * Adds (INCR) or removes (DECR) a job's resources to/from the
 * "resources assigned" attributes of the server and of the job's queue.
 * JOB_SVFLG_RescAssn on the job records whether its resources are currently
 * counted, so a repeated INCR or DECR does nothing. Only jobs in an
 * execution queue are processed; for any other queue type an error is
 * logged.
 *
 * @param pjob  I: the job whose resource list is applied
 * @param op    INCR or DECR; any other value is ignored
 */
void set_resc_assigned(

  job           *pjob, /* I */
  enum batch_op  op)   /* INCR or DECR */

{
  pbs_queue     *job_queue;
  pbs_attribute *server_assigned;
  pbs_attribute *queue_assigned;
  resource      *job_res;
  resource      *entry;
  resource_def  *def;
  char           log_buf[LOCAL_LOG_BUF_SIZE];

  if (pjob == NULL)
    return;

  job_queue = get_jobs_queue(&pjob);

  if (job_queue == NULL)
  {
    /* no queue: only worth a message when the job itself went away */
    if (pjob == NULL)
      log_err(PBSE_JOBNOTFOUND, __func__, "Job lost while acquiring queue 9");

    return;
  }

  if (job_queue->qu_qs.qu_type != QTYPE_Execution)
  {
    snprintf(log_buf,sizeof(log_buf),
      "job %s isn't in an execution queue, can't modify resources\njob is in queue %s",
      pjob->ji_qs.ji_jobid,
      job_queue->qu_qs.qu_name);

    log_err(-1, __func__, log_buf);

    unlock_queue(job_queue, __func__, NULL, LOGLEVEL);

    return;
  }

  /* flip the bookkeeping flag, bailing out when there is nothing to do */
  if (op == INCR)
  {
    if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_RescAssn) != 0)
    {
      /* already added in */
      unlock_queue(job_queue, __func__, NULL, LOGLEVEL);
      return;
    }

    pjob->ji_qs.ji_svrflags |= JOB_SVFLG_RescAssn;
  }
  else if (op == DECR)
  {
    if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_RescAssn) == 0)
    {
      /* not currently included */
      unlock_queue(job_queue, __func__, NULL, LOGLEVEL);
      return;
    }

    pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_RescAssn;
  }
  else
  {
    /* invalid op */
    unlock_queue(job_queue, __func__, NULL, LOGLEVEL);
    return;
  }

  server_assigned = &server.sv_attr[SRV_ATR_resource_assn];
  queue_assigned  = &job_queue->qu_attr[QE_ATR_ResourceAssn];

  for (job_res = (resource *)GET_NEXT(pjob->ji_wattr[JOB_ATR_resource].at_val.at_list);
       job_res != NULL;
       job_res = (resource *)GET_NEXT(job_res->rs_link))
  {
    def = job_res->rs_defin;

    /* skip resources whose usage is not tracked or that have no value */
    if (!(def->rs_flags & ATR_DFLAG_RASSN) ||
        !(job_res->rs_value.at_flags & ATR_VFLAG_SET))
      continue;

    /* server-wide total */
    entry = find_resc_entry(server_assigned, def);

    if (entry == NULL)
      entry = add_resource_entry(server_assigned, def);

    if (entry == NULL)
    {
      unlock_queue(job_queue, __func__, "sysru", LOGLEVEL);
      return;
    }

    def->rs_set(&entry->rs_value, &job_res->rs_value, op);

    /* per-queue total */
    entry = find_resc_entry(queue_assigned, def);

    if (entry == NULL)
      entry = add_resource_entry(queue_assigned, def);

    if (entry == NULL)
    {
      unlock_queue(job_queue, __func__, "queru", LOGLEVEL);
      return;
    }

    def->rs_set(&entry->rs_value, &job_res->rs_value, op);
  }

  unlock_queue(job_queue, __func__, "success", LOGLEVEL);

  return;
} /* END set_resc_assigned() */
void *queue_route( void *vp) { pbs_queue *pque; job *pjob = NULL; char *queue_name; char log_buf[LOCAL_LOG_BUF_SIZE]; int iter = -1; time_t time_now = time(NULL); queue_name = (char *)vp; if (queue_name == NULL) { sprintf(log_buf, "NULL queue name"); log_err(-1, __func__, log_buf); return(NULL); } if (LOGLEVEL >= 7) { snprintf(log_buf, sizeof(log_buf), "queue name: %s", queue_name); log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_QUEUE, __func__, log_buf); } pthread_mutex_lock(reroute_job_mutex); pque = find_queuebyname(queue_name); if (pque == NULL) { sprintf(log_buf, "Could not find queue %s", queue_name); log_err(-1, __func__, log_buf); free(queue_name); pthread_mutex_unlock(reroute_job_mutex); return(NULL); } while ((pjob = next_job(pque->qu_jobs,&iter)) != NULL) { /* the second condition says we only want to try if routing * has been tried once - this is to let req_commit have the * first crack at routing always */ unlock_queue(pque, __func__, (char *)NULL, 0); if ((pjob->ji_qs.ji_un.ji_routet.ji_rteretry <= time_now - ROUTE_RETRY_TIME) && (pjob->ji_qs.ji_un.ji_routet.ji_rteretry != 0)) { reroute_job(pjob, pque); unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL); } else unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL); } free(queue_name); unlock_queue(pque, __func__, (char *)NULL, 0); pthread_mutex_unlock(reroute_job_mutex); return(NULL); } /* END queue_route() */
static void post_delete_mom1( struct work_task *pwt) { int delay = 0; int dellen = strlen(deldelaystr); job *pjob; pbs_queue *pque; char *preq_clt_id; struct batch_request *preq_sig; /* signal request to MOM */ struct batch_request *preq_clt = NULL; /* original client request */ int rc; time_t time_now = time(NULL); preq_sig = get_remove_batch_request((char *)pwt->wt_parm1); free(pwt->wt_mutex); free(pwt); if (preq_sig == NULL) return; rc = preq_sig->rq_reply.brp_code; preq_clt_id = preq_sig->rq_extra; free_br(preq_sig); if (preq_clt_id != NULL) { preq_clt = get_remove_batch_request(preq_clt_id); free(preq_clt_id); } /* the client request has been handled another way, nothing left to do */ if (preq_clt == NULL) return; pjob = svr_find_job(preq_clt->rq_ind.rq_delete.rq_objname, FALSE); if (pjob == NULL) { /* job has gone away */ req_reject(PBSE_UNKJOBID, 0, preq_clt, NULL, NULL); return; } if (rc) { /* mom rejected request */ if (rc == PBSE_UNKJOBID) { /* MOM claims no knowledge, so just purge it */ log_event( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, "MOM rejected signal during delete"); /* removed the resources assigned to job */ free_nodes(pjob); set_resc_assigned(pjob, DECR); svr_job_purge(pjob); reply_ack(preq_clt); } else { req_reject(rc, 0, preq_clt, NULL, NULL); unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); } return; } if (preq_clt->rq_extend) { if (strncmp(preq_clt->rq_extend, deldelaystr, dellen) == 0) { delay = atoi(preq_clt->rq_extend + dellen); } } reply_ack(preq_clt); /* dont need it, reply now */ /* * if no delay specified in original request, see if kill_delay * queue attribute is set. 
*/ if (delay == 0) { if ((pque = get_jobs_queue(&pjob)) != NULL) { pthread_mutex_lock(server.sv_attr_mutex); delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay], &server.sv_attr[SRV_ATR_KillDelay], 2); pthread_mutex_unlock(server.sv_attr_mutex); unlock_queue(pque, __func__, NULL, LOGLEVEL); } else if (pjob != NULL) return; } set_task(WORK_Timed, delay + time_now, post_delete_mom2, strdup(pjob->ji_qs.ji_jobid), FALSE); /* * Since the first signal has succeeded, let's reschedule the * nanny to be 1 minute after the second phase. */ apply_job_delete_nanny(pjob, time_now + delay + 60); unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL); } /* END post_delete_mom1() */
/*
 * get_parent_dest_queues()
 *
 * Locks the parent and destination queues identified by name.
 *
 * If either name is NULL, or both names are equal, -1 is returned right
 * away and nothing is unlocked or modified. Otherwise the job mutex and
 * the queue in *parent are released, both outputs are cleared, the names
 * are resolved under svr_queues.allques_mutex, both queues are locked
 * (parent first) and the job is looked up again by id.
 *
 * @param queue_parent_name  name of the job's current queue
 * @param queue_dest_name    name of the destination queue
 * @param parent             I/O: receives the locked parent queue
 * @param dest               O: receives the locked destination queue
 * @param pjob_ptr           I/O: replaced by the result of svr_find_job()
 *
 * @return PBSE_NONE on success, -1 on any failure
 */
int get_parent_dest_queues(

  char       *queue_parent_name,
  char       *queue_dest_name,
  pbs_queue **parent,
  pbs_queue **dest,
  job       **pjob_ptr)

{
  pbs_queue *from_queue;
  pbs_queue *to_queue;
  char       saved_id[PBS_MAXSVRJOBID + 1];
  char       log_buf[LOCAL_LOG_BUF_SIZE + 1];
  job       *held_job = *pjob_ptr;
  int        from_index;
  int        to_index;
  int        result;

  strcpy(saved_id, held_job->ji_qs.ji_jobid);

  if ((queue_parent_name == NULL) ||
      (queue_dest_name == NULL))
    return(-1);

  if (strcmp(queue_parent_name, queue_dest_name) == 0)
  {
    /* parent and destination are the same. Job is already in destnation queue. return */
    snprintf(log_buf, sizeof(log_buf), "parent and destination queues are the same: parent %s - dest %s. jobid: %s",
      queue_parent_name,
      queue_dest_name,
      saved_id);

    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);

    return(-1);
  }

  /* let go of the job and its current queue before taking the
   * all-queues mutex */
  unlock_ji_mutex(held_job, __func__, (char *)"1", LOGLEVEL);
  unlock_queue(*parent, __func__, (char *)NULL, 0);

  *parent = NULL;
  *dest   = NULL;

  /* pessimistic default; only the fully successful lookup clears it */
  result = -1;

  pthread_mutex_lock(svr_queues.allques_mutex);

  from_index = get_value_hash(svr_queues.ht, queue_parent_name);
  to_index   = get_value_hash(svr_queues.ht, queue_dest_name);

  if ((from_index >= 0) &&
      (to_index >= 0))
  {
    from_queue = svr_queues.ra->slots[from_index].item;
    to_queue   = svr_queues.ra->slots[to_index].item;

    if ((from_queue != NULL) &&
        (to_queue != NULL))
    {
      lock_queue(from_queue, __func__, (char *)NULL, 0);
      lock_queue(to_queue, __func__, (char *)NULL, 0);

      *parent = from_queue;
      *dest   = to_queue;

      result = PBSE_NONE;
    }
  }

  pthread_mutex_unlock(svr_queues.allques_mutex);

  /* pick the job up again; it may have vanished while unlocked */
  *pjob_ptr = svr_find_job(saved_id, TRUE);

  if (*pjob_ptr == NULL)
    result = -1;

  return(result);
} /* END get_parent_dest_queues() */