END_TEST

/*
 * Unit test for duplicate_request():
 *  - with the mocked allocator disabled (alloc_work = 0) the function must
 *    return NULL instead of crashing, and
 *  - with allocation working, the duplicate must deep-copy the user, host,
 *    extension string, and (for a RunJob request) the run destination.
 */
START_TEST(test_duplicate_request)
  {
  batch_request *preq = (batch_request *)calloc(1, sizeof(batch_request));
  batch_request *dup;

  /* force the mocked allocator to fail so duplicate_request() must fail cleanly */
  alloc_work = 0;
  fail_unless(duplicate_request(preq) == NULL);
  alloc_work = 1;

  /* populate the fields duplicate_request() is expected to copy */
  preq->rq_perm = 1;
  strcpy(preq->rq_user, "dbeer");
  strcpy(preq->rq_host, "napali");
  preq->rq_extend = strdup("tom");
  preq->rq_type = PBS_BATCH_RunJob;
  preq->rq_ind.rq_run.rq_destin = strdup("napali");

  dup = duplicate_request(preq);

  fail_unless(dup != NULL);
  fail_unless(!strcmp(dup->rq_extend, "tom"));
  fail_unless(!strcmp(dup->rq_user, "dbeer"));
  fail_unless(!strcmp(dup->rq_host, "napali"));
  /* NOTE(review): redundant — rq_extend was already checked above */
  fail_unless(!strcmp(dup->rq_extend, "tom"));
  fail_unless(!strcmp(dup->rq_ind.rq_run.rq_destin, "napali"));
  }
/*
 * req_modifyjob - service a Modify Job batch request.
 *
 * For a synchronous modify the work (and the reply) is done inline by
 * modify_job_work(preq).  For PBS_BATCH_AsyModifyJob the client is acked
 * immediately and the work is queued on the threadpool with a duplicate
 * of the request, because reply_ack() frees preq.
 *
 * @param preq (I) - the modify request; ownership is consumed on all paths
 *                   (replied to, or handed to modify_job_work / threadpool).
 * @return always NULL.
 */
void *req_modifyjob(

  batch_request *preq) /* I */

  {
  job      *pjob;
  svrattrl *plist;
  char      log_buf[LOCAL_LOG_BUF_SIZE];

  pjob = chk_job_request(preq->rq_ind.rq_modify.rq_objname, preq);

  if (pjob == NULL)
    {
    /* chk_job_request() has already rejected/replied for us */
    return(NULL);
    }

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr);

  if (plist == NULL)
    {
    /* nothing to do */
    reply_ack(preq);

    /* SUCCESS */
    return(NULL);
    }

  /* modify_job_work() re-acquires the job; drop our lock before handing off */
  job_mutex.unlock();

  /* If async modify, reply now; otherwise reply is handled later */
  if (preq->rq_type == PBS_BATCH_AsyModifyJob)
    {
    /* reply_ack will free preq. We need to copy it before we call reply_ack */
    batch_request *new_preq;

    new_preq = duplicate_request(preq, -1);

    if (new_preq == NULL)
      {
      sprintf(log_buf, "failed to duplicate batch request");
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
      return(NULL);
      }

    get_batch_request_id(new_preq);
    reply_ack(preq);
    new_preq->rq_noreply = TRUE; /* set for no more replies */

    enqueue_threadpool_request((void *(*)(void *))modify_job_work, new_preq);
    }
  else
    modify_job_work(preq);

  return(NULL);
  } /* END req_modifyjob() */
/*
 * modify_whole_array - apply an attribute-modify request to every job in
 * a job array.
 *
 * Iterates pa->job_ids[]; stale ids (job no longer exists) are freed and
 * NULLed.  Each live job is modified with a per-subjob duplicate of preq
 * (NO_MOM_RELAY prevents modify_job() from relaying to the MOM; the
 * duplicate is marked rq_noreply so no client reply is sent per subjob).
 *
 * @param pa             (I/O) - the job array; ai_mutex is held on entry and
 *                               on (normal) return.
 * @param plist          (I)   - attribute list to apply.
 * @param preq           (I)   - the originating request (not consumed here).
 * @param checkpoint_req (I)   - checkpoint flag passed through to modify_job().
 * @return PBSE_NONE on success, the last modify_job() error otherwise,
 *         PBSE_JOB_RECYCLED if the array disappeared mid-iteration, or
 *         PBSE_SYSTEM if a request duplicate could not be allocated.
 */
int modify_whole_array(

  job_array            *pa,             /* I/O */
  svrattrl             *plist,          /* I */
  struct batch_request *preq,           /* I */
  int                   checkpoint_req) /* I */

  {
  int   i;
  int   rc = PBSE_NONE;
  int   modify_job_rc = PBSE_NONE;
  job  *pjob;

  for (i = 0; i < pa->ai_qs.array_size; i++)
    {
    if (pa->job_ids[i] == NULL)
      continue;

    if ((pjob = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
      {
      /* the job vanished - drop our stale id */
      free(pa->job_ids[i]);
      pa->job_ids[i] = NULL;
      }
    else
      {
      /* NO_MOM_RELAY will prevent modify_job from calling relay_to_mom */
      batch_request *array_req = duplicate_request(preq, i);
      mutex_mgr      job_mutex(pjob->ji_mutex, true);

      if (array_req == NULL)
        {
        /* BUGFIX: duplicate_request() can fail (out of memory); previously
         * the NULL return was dereferenced below.  job_mutex releases the
         * job lock on return; ai_mutex stays held as on the normal exit. */
        return(PBSE_SYSTEM);
        }

      pthread_mutex_unlock(pa->ai_mutex);

      array_req->rq_noreply = TRUE;
      rc = modify_job((void **)&pjob, plist, array_req, checkpoint_req, NO_MOM_RELAY);

      if (rc != PBSE_NONE)
        {
        /* remember the last failure but keep going for the other subjobs */
        modify_job_rc = rc;
        }

      /* re-acquire the array (relocks ai_mutex); pjob may be recycled */
      pa = get_jobs_array(&pjob);

      if (pa == NULL)
        {
        if (pjob == NULL)
          job_mutex.set_lock_on_exit(false);

        return(PBSE_JOB_RECYCLED);
        }

      if (pjob == NULL)
        {
        pa->job_ids[i] = NULL;
        job_mutex.set_lock_on_exit(false);
        continue;
        }
      }
    } /* END foreach job in array */

  return(modify_job_rc);
  } /* END modify_whole_array() */
/*
 * queue_a_retry_task - schedule a timed work task that will re-issue preq
 * to the server after PBS_NET_RETRY_TIME seconds.
 *
 * A duplicate of preq is used because preq itself is freed when
 * issue_to_svr() returns success.
 *
 * @param preq      (I) - request to retry (not consumed; duplicated).
 * @param replyfunc (I) - completion callback stored on the work task.
 */
void queue_a_retry_task(

  batch_request *preq,                   /* I */
  void (*replyfunc)(struct work_task *)) /* I */

  {
  /* create a new batch_request because preq is going to be freed when
   * issue_to_svr returns success */
  batch_request    *new_preq = duplicate_request(preq, -1);
  struct work_task *pwt;

  if (new_preq == NULL)
    {
    /* BUGFIX: duplicate_request() can fail; previously the NULL return
     * was dereferenced in get_batch_request_id() */
    return;
    }

  get_batch_request_id(new_preq);

  pwt = set_task(WORK_Timed, (time(NULL) + PBS_NET_RETRY_TIME), reissue_to_svr, new_preq->rq_id, TRUE);

  if (pwt == NULL)
    {
    /* BUGFIX: set_task() failure previously caused a NULL dereference;
     * no task was created, so release the orphaned duplicate */
    free_br(new_preq);
    return;
    }

  pwt->wt_parmfunc = replyfunc;

  pthread_mutex_unlock(pwt->wt_mutex);
  }  /* END queue_a_retry_task() */
/*
 * req_deletejob - service a Delete Job batch request.
 *
 * Dispatches on the request extension:
 *  - PURGECOMP       : scheduler purge of completed jobs; handled entirely
 *                      by purge_completed_jobs().
 *  - free-form text  : treated as an operator/manager message (-m); rejected
 *                      for unprivileged users.
 *  - delasyncstr     : asynchronous delete; the client is acked early via a
 *                      duplicate request (preq_tmp), freed by reply_ack.
 *
 * The object name "all" selects handle_delete_all(); anything else goes to
 * handle_single_delete().  Both consume preq (and preq_tmp if non-NULL).
 *
 * @param preq (I) - the delete request; ownership is consumed.
 * @return PBSE_NONE always (errors are reported to the client, not returned).
 */
int req_deletejob(

  struct batch_request *preq) /* I */

  {
  char                 *Msg = NULL;
  struct batch_request *preq_tmp = NULL;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];

  /* check if we are getting a purgecomplete from scheduler */
  if (preq->rq_extend != NULL)
    {
    if (!strncmp(preq->rq_extend,PURGECOMP,strlen(PURGECOMP)))
      {
      /* purge_completed_jobs will respond with either an ack or reject */
      purge_completed_jobs(preq);

      return(PBSE_NONE);
      }
    else if (strncmp(preq->rq_extend, deldelaystr, strlen(deldelaystr)) &&
             strncmp(preq->rq_extend, delasyncstr, strlen(delasyncstr)) &&
             strncmp(preq->rq_extend, delpurgestr, strlen(delpurgestr)))
      {
      /* have text message in request extension, add it */
      Msg = preq->rq_extend;

      /* Message capability is only for operators and managers.
       * Check if request is authorized */
      if ((preq->rq_perm & (ATR_DFLAG_OPRD | ATR_DFLAG_OPWR |
                            ATR_DFLAG_MGRD | ATR_DFLAG_MGWR)) == 0)
        {
        req_reject(PBSE_PERM, 0, preq, NULL,
          "must have operator or manager privilege to use -m parameter");

        return(PBSE_NONE);
        }
      }
    /* check if we are getting a asynchronous delete */
    else if (!strncmp(preq->rq_extend,delasyncstr,strlen(delasyncstr)))
      {
      /*
       * Respond with an ack now instead of after MOM processing
       * Create a new batch request and fill it in. It will be freed by reply_ack
       */
      snprintf(log_buf,sizeof(log_buf), "Deleting job asynchronously");
      log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,preq->rq_ind.rq_delete.rq_objname,log_buf);

      /* NOTE(review): a NULL return here (allocation failure) is tolerated
       * downstream - both handlers check preq_tmp before using it */
      preq_tmp = duplicate_request(preq);
      }
    }

  if (strcasecmp(preq->rq_ind.rq_delete.rq_objname,"all") == 0)
    {
    handle_delete_all(preq, preq_tmp, Msg);
    }
  else
    {
    handle_single_delete(preq, preq_tmp, Msg);
    }

  return(PBSE_NONE);
  } /* END req_deletejob() */
/*
 * handle_delete_all - delete every job on the server ("qdel all").
 *
 * Works on a no-reply duplicate of preq (preq_dup) so each per-job delete
 * can be replied-to/freed independently while preq remains available for
 * the final client reply.  execute_job_delete()/forced_jobpurge() consume
 * preq_dup, so it is re-duplicated at the bottom of each loop iteration.
 *
 * @param preq     (I) - original request; receives the final ack/reject.
 * @param preq_tmp (I) - early-ack duplicate for async deletes, or NULL.
 * @param Msg      (I) - optional operator message passed to each delete.
 * @return PBSE_NONE always.
 */
int handle_delete_all(

  struct batch_request *preq,
  struct batch_request *preq_tmp,
  char                 *Msg)

  {
  /* don't use the actual request so we can reply about all of the jobs */
  /* NOTE(review): duplicate_request() results (here and inside the loop)
   * are dereferenced without a NULL check - an allocation failure would
   * crash; verify against the rest of the file's conventions */
  struct batch_request *preq_dup = duplicate_request(preq);
  job                  *pjob;
  int                   iter = -1;
  int                   failed_deletes = 0;
  int                   total_jobs = 0;
  int                   rc = PBSE_NONE;
  char                  tmpLine[MAXLINE];

  preq_dup->rq_noreply = TRUE;

  if (preq_tmp != NULL)
    {
    /* async delete: ack the client now; no further replies on preq */
    reply_ack(preq_tmp);
    preq->rq_noreply = TRUE; /* set for no more replies */
    }

  while ((pjob = next_job(&alljobs, &iter)) != NULL)
    {
    if ((rc = forced_jobpurge(pjob, preq_dup)) == PURGE_SUCCESS)
      {
      continue;
      }

    if (pjob->ji_qs.ji_state >= JOB_STATE_EXITING)
      {
      /* already on the way out - skip it */
      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

      continue;
      }

    total_jobs++;

    /* mutex is freed below */
    if (rc == PBSE_NONE)
      {
      if ((rc = execute_job_delete(pjob, Msg, preq_dup)) == PBSE_NONE)
        reply_ack(preq_dup);

      /* mark this as NULL because it has been freed */
      preq_dup = NULL;
      }

    if (rc != PURGE_SUCCESS)
      {
      /* duplicate the preq so we don't have a problem with double frees */
      preq_dup = duplicate_request(preq);
      preq_dup->rq_noreply = TRUE;

      if ((rc == MOM_DELETE) ||
          (rc == ROUTE_DELETE))
        failed_deletes++;
      }
    }

  if (failed_deletes == 0)
    {
    reply_ack(preq);

    /* PURGE SUCCESS means this was qdel -p all. In this case no reply_*()
     * functions have been called */
    if (rc == PURGE_SUCCESS)
      {
      free_br(preq_dup);
      preq_dup = NULL;
      }
    }
  else
    {
    snprintf(tmpLine,sizeof(tmpLine),"Deletes failed for %d of %d jobs",
      failed_deletes,
      total_jobs);

    req_reject(PBSE_SYSTEM, 0, preq, NULL, tmpLine);
    }

  /* preq_dup happens at the end of the loop, so free the extra one if
   * it is there */
  if (preq_dup != NULL)
    free_br(preq_dup);

  return(PBSE_NONE);
  } /* END handle_delete_all() */
/*
 * req_checkpointjob - service a Checkpoint Job batch request.
 *
 * If the job is running and its checkpoint attribute enables checkpointing
 * ("s", "c", or "enabled"), a duplicate of the request is relayed to the
 * MOM; otherwise the request is rejected.  The client reply is issued via
 * process_checkpoint_reply() on the relay path or req_reject() on failure.
 *
 * @param preq (I) - the checkpoint request; ownership is consumed on all
 *                   paths (rejected, or replied-to after the MOM relay).
 * @return always NULL.
 */
void *req_checkpointjob(

  batch_request *preq) /* I */

  {
  job           *pjob;
  int            rc;
  pbs_attribute *pattr;
  char           log_buf[LOCAL_LOG_BUF_SIZE];
  batch_request *dup_req = NULL;

  if ((pjob = chk_job_request(preq->rq_ind.rq_manager.rq_objname, preq)) == NULL)
    {
    /* chk_job_request() already replied */
    return(NULL);
    }

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  pattr = &pjob->ji_wattr[JOB_ATR_checkpoint];

  if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
      ((pattr->at_flags & ATR_VFLAG_SET) &&
       ((csv_find_string(pattr->at_val.at_str, "s") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "c") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "enabled") != NULL))))
    {
    /* have MOM attempt checkpointing */

    if ((dup_req = duplicate_request(preq)) == NULL)
      {
      req_reject(PBSE_SYSTEM, 0, preq, NULL, "failure to allocate memory");
      }

    /* The dup_req is freed in relay_to_mom (failure)
     * or in issue_Drequest (success) */
    else if ((rc = relay_to_mom(&pjob, dup_req, NULL)) != PBSE_NONE)
      {
      req_reject(rc, 0, preq, NULL, NULL);
      free_br(dup_req);

      if (pjob == NULL)
        job_mutex.set_unlock_on_exit(false);
      }
    else
      {
      if (pjob != NULL)
        {
        pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHECKPOINT_FILE;

        job_save(pjob, SAVEJOB_QUICK, 0);

        /* BUGFIX: log_buf was previously passed to log_event() without
         * ever being written (uninitialized stack buffer) */
        snprintf(log_buf, sizeof(log_buf), "checkpoint requested and relayed to mom");
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
        pjob = NULL;
        }
      else
        job_mutex.set_unlock_on_exit(false);

      process_checkpoint_reply(dup_req);
      }
    }
  else
    {
    /* Job does not have checkpointing enabled, so reject the request */

    /* BUGFIX: log_buf was previously logged uninitialized here as well */
    snprintf(log_buf, sizeof(log_buf), "job is not checkpointable");
    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    req_reject(PBSE_IVALREQ, 0, preq, NULL, "job is not checkpointable");
    }

  return(NULL);
  } /* END req_checkpointjob() */
int req_holdjob( batch_request *vp) /* I */ { long *hold_val; int newstate; int newsub; long old_hold; job *pjob; char *pset; int rc; pbs_attribute temphold; pbs_attribute *pattr; batch_request *preq = (struct batch_request *)vp; char log_buf[LOCAL_LOG_BUF_SIZE]; batch_request *dup_req = NULL; pjob = chk_job_request(preq->rq_ind.rq_hold.rq_orig.rq_objname, preq); if (pjob == NULL) { return(PBSE_NONE); } mutex_mgr job_mutex(pjob->ji_mutex, true); /* cannot do anything until we decode the holds to be set */ if ((rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, (const char **)&pset, &temphold)) != 0) { req_reject(rc, 0, preq, NULL, NULL); return(PBSE_NONE); } /* if other than HOLD_u is being set, must have privil */ if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0) { req_reject(rc, 0, preq, NULL, NULL); return(PBSE_NONE); } hold_val = &pjob->ji_wattr[JOB_ATR_hold].at_val.at_long; old_hold = *hold_val; *hold_val |= temphold.at_val.at_long; pjob->ji_wattr[JOB_ATR_hold].at_flags |= ATR_VFLAG_SET; sprintf(log_buf, msg_jobholdset, pset, preq->rq_user, preq->rq_host); pattr = &pjob->ji_wattr[JOB_ATR_checkpoint]; if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) && ((pattr->at_flags & ATR_VFLAG_SET) && ((csv_find_string(pattr->at_val.at_str, "s") != NULL) || (csv_find_string(pattr->at_val.at_str, "c") != NULL) || (csv_find_string(pattr->at_val.at_str, "enabled") != NULL)))) { /* have MOM attempt checkpointing */ /* ** The jobid in the request always have the server suffix attached ** which is dropped when the server attribute ** 'display_job_server_suffix' is FALSE and so will in the MOM's. ** Therefore, it must be passed as the server to the MOM so she can ** find it to hold. 
*/ if (strncmp(pjob->ji_qs.ji_jobid, preq->rq_ind.rq_hold.rq_orig.rq_objname, PBS_MAXSVRJOBID)) snprintf(preq->rq_ind.rq_hold.rq_orig.rq_objname, sizeof(preq->rq_ind.rq_hold.rq_orig.rq_objname), "%s", pjob->ji_qs.ji_jobid); if ((dup_req = duplicate_request(preq)) == NULL) { req_reject(rc, 0, preq, NULL, "memory allocation failure"); } /* The dup_req is freed in relay_to_mom (failure) * or in issue_Drequest (success) */ else if ((rc = relay_to_mom(&pjob, dup_req, NULL)) != PBSE_NONE) { free_br(dup_req); *hold_val = old_hold; /* reset to the old value */ req_reject(rc, 0, preq, NULL, "relay to mom failed"); if (pjob == NULL) job_mutex.set_unlock_on_exit(false); } else { if (pjob != NULL) { pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_FILE; job_save(pjob, SAVEJOB_QUICK, 0); /* fill in log_buf again, since relay_to_mom changed it */ sprintf(log_buf, msg_jobholdset, pset, preq->rq_user, preq->rq_host); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf); unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL); pjob = NULL; reply_ack(preq); } else job_mutex.set_unlock_on_exit(false); process_hold_reply(dup_req); } } #ifdef ENABLE_BLCR else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) { /* * This system is configured with BLCR checkpointing to be used, * but this Running job does not have checkpointing enabled, * so we reject the request */ log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf); req_reject(PBSE_IVALREQ, 0, preq, NULL, "job not held since checkpointing is expected but not enabled for job"); } #endif else { /* everything went well, may need to update the job state */ log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf); if (old_hold != *hold_val) { /* indicate attributes changed */ pjob->ji_modified = 1; svr_evaljobstate(*pjob, newstate, newsub, 0); svr_setjobstate(pjob, newstate, newsub, FALSE); } reply_ack(preq); } return(PBSE_NONE); } /* END req_holdjob() */
/*
 * req_signaljob - service a Signal Job batch request.
 *
 * Validates that the job is running, enforces op/mgr privilege for the
 * pseudo-signals SUSPEND/RESUME, then relays a duplicate of the request to
 * the MOM.  For PBS_BATCH_AsySignalJob the client is acked immediately via
 * a throwaway duplicate and the original is marked no-reply.  On a
 * successful relay, preq is freed here and the final client reply happens
 * in post_signal_req() after the MOM responds.
 *
 * @param preq (I) - the signal request; consumed on all paths.
 * @return PBSE_NONE normally, PBSE_MEM_MALLOC if the async-ack duplicate
 *         could not be allocated.
 */
int req_signaljob(

  batch_request *preq) /* I */

  {
  job           *pjob;
  int            rc;
  char           log_buf[LOCAL_LOG_BUF_SIZE];
  batch_request *dup_req = NULL;

  /* preq free'd in error cases */
  if ((pjob = chk_job_request(preq->rq_ind.rq_signal.rq_jid, preq)) == 0)
    {
    return(PBSE_NONE);
    }

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  /* the job must be running */
  if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
    req_reject(PBSE_BADSTATE, 0, preq, NULL, NULL);

    return(PBSE_NONE);
    }

  /* Special pseudo signals for suspend and resume require op/mgr */
  if (!strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_RESUME) ||
      !strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_SUSPEND))
    {
    if ((preq->rq_perm & (ATR_DFLAG_OPRD | ATR_DFLAG_OPWR |
                          ATR_DFLAG_MGRD | ATR_DFLAG_MGWR)) == 0)
      {
      /* for suspend/resume, must be mgr/op */
      req_reject(PBSE_PERM, 0, preq, NULL, NULL);

      return(PBSE_NONE);
      }
    }

  /* save job ptr for post_signal_req() */
  preq->rq_extra = strdup(pjob->ji_qs.ji_jobid);

  /* FIXME: need a race-free check for available free subnodes before
   * resuming a suspended job */

#ifdef DONOTSUSPINTJOB
  /* interactive jobs don't resume correctly so don't allow a suspend */
  if (!strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_SUSPEND) &&
      (pjob->ji_wattr[JOB_ATR_interactive].at_flags & ATR_VFLAG_SET) &&
      (pjob->ji_wattr[JOB_ATR_interactive].at_val.at_long > 0))
    {
    req_reject(PBSE_JOBTYPE, 0, preq, NULL, NULL);

    return(PBSE_NONE);
    }
#endif

  if (LOGLEVEL >= 6)
    {
    char ipstr[128];

    sprintf(log_buf, "relaying signal request to mom %s",
      netaddr_long(pjob->ji_qs.ji_un.ji_exect.ji_momaddr,ipstr));

    log_record(PBSEVENT_SCHED,PBS_EVENTCLASS_REQUEST,"req_signaljob",log_buf);
    }

  /* send reply for asynchronous suspend */
  if (preq->rq_type == PBS_BATCH_AsySignalJob)
    {
    /* reply_ack will free preq.
       We need to copy it before we call reply_ack */
    /* here the DUPLICATE is acked (and freed by reply_ack) while the
     * original continues with rq_noreply set - the reverse of the
     * req_modifyjob() pattern, but equivalent */
    batch_request *new_preq;

    new_preq = duplicate_request(preq, -1);

    if (new_preq == NULL)
      {
      sprintf(log_buf, "failed to duplicate batch request");
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
      return(PBSE_MEM_MALLOC);
      }

    get_batch_request_id(new_preq);
    reply_ack(new_preq);
    preq->rq_noreply = TRUE;
    }

  /* pass the request on to MOM */

  if ((dup_req = duplicate_request(preq)) == NULL)
    {
    req_reject(PBSE_SYSTEM, 0, preq, NULL, "can not allocate memory");
    }

  /* The dup_req is freed in relay_to_mom (failure)
   * or in issue_Drequest (success) */
  else
    {
    rc = relay_to_mom(&pjob, dup_req, NULL);

    if (pjob != NULL)
      job_mutex.unlock();
    else
      job_mutex.set_unlock_on_exit(false);

    if (rc != PBSE_NONE)
      {
      free_br(dup_req);
      req_reject(rc, 0, preq, NULL, NULL);  /* unable to get to MOM */
      }
    else
      {
      post_signal_req(dup_req);
      free_br(preq);
      }
    }

  /* If successful we ack after mom replies to us, we pick up in post_signal_req() */

  return(PBSE_NONE);
  } /* END req_signaljob() */
/*
 * delete_all_work - threadpool worker that deletes every job on the server
 * ("qdel all").
 *
 * Serialized through qdel_all_tracker so only one delete-all per user runs
 * at a time.  Works on a no-reply duplicate (preq_dup) so each per-job
 * delete can be replied-to/freed independently; the duplicate is consumed
 * by forced_jobpurge()/execute_job_delete() and re-created each iteration.
 * The final ack/reject goes out on the original preq.
 *
 * @param vp (I) - batch_request *; consumed before return.
 * @return always NULL.
 */
void *delete_all_work(

  void *vp)

  {
  batch_request *preq = (batch_request *)vp;

  /* only one delete-all at a time per user; otherwise just ack and leave */
  if (qdel_all_tracker.start_deleting_all_if_possible(preq->rq_user, preq->rq_perm) == false)
    {
    reply_ack(preq);
    return(NULL);
    }

  /* NOTE(review): duplicate_request() results (here and in the loop) are
   * dereferenced without NULL checks - an allocation failure would crash;
   * verify against the conventions used elsewhere in this file */
  batch_request     *preq_dup = duplicate_request(preq);
  job               *pjob;
  all_jobs_iterator *iter = NULL;
  int                failed_deletes = 0;
  int                total_jobs = 0;
  int                rc = PBSE_NONE;
  char               tmpLine[MAXLINE];
  char              *Msg = preq->rq_extend;

  alljobs.lock();
  iter = alljobs.get_iterator();
  alljobs.unlock();

  preq_dup->rq_noreply = TRUE;

  while ((pjob = next_job(&alljobs, iter)) != NULL)
    {
    // use mutex manager to make sure job mutex locks are properly handled at exit
    mutex_mgr job_mutex(pjob->ji_mutex, true);

    if ((rc = forced_jobpurge(pjob, preq_dup)) == PURGE_SUCCESS)
      {
      job_mutex.set_unlock_on_exit(false);

      continue;
      }

    if (pjob->ji_qs.ji_state >= JOB_STATE_EXITING)
      {
      job_mutex.unlock();

      if(rc == -1)
        {
        //forced_jobpurge freed preq_dup so reallocate it.
        preq_dup = duplicate_request(preq);
        preq_dup->rq_noreply = TRUE;
        }

      continue;
      }

    total_jobs++;

    /* mutex is freed below */
    if (rc == PBSE_NONE)
      {
      if ((rc = execute_job_delete(pjob, Msg, preq_dup)) == PBSE_NONE)
        {
        // execute_job_delete() handles mutex so don't unlock on exit
        job_mutex.set_unlock_on_exit(false);
        reply_ack(preq_dup);
        }

      /* preq_dup has been freed at this point. Either reallocate it or set it to NULL*/
      if (rc == PURGE_SUCCESS)
        {
        preq_dup = duplicate_request(preq);
        preq_dup->rq_noreply = TRUE;
        }
      else
        preq_dup = NULL;
      }

    if (rc != PURGE_SUCCESS)
      {
      /* duplicate the preq so we don't have a problem with double frees */
      preq_dup = duplicate_request(preq);
      preq_dup->rq_noreply = TRUE;

      if ((rc == MOM_DELETE) ||
          (rc == ROUTE_DELETE))
        failed_deletes++;
      }
    }

  delete iter;

  /* release the per-user delete-all serialization */
  qdel_all_tracker.done_deleting_all(preq->rq_user, preq->rq_perm);

  if (failed_deletes == 0)
    {
    reply_ack(preq);

    /* PURGE SUCCESS means this was qdel -p all.
       In this case no reply_*()
     * functions have been called */
    if (rc == PURGE_SUCCESS)
      {
      free_br(preq_dup);
      preq_dup = NULL;
      }
    }
  else
    {
    snprintf(tmpLine,sizeof(tmpLine),"Deletes failed for %d of %d jobs",
      failed_deletes,
      total_jobs);

    req_reject(PBSE_SYSTEM, 0, preq, NULL, tmpLine);
    }

  /* preq_dup happens at the end of the loop, so free the extra one if
   * it is there */
  if (preq_dup != NULL)
    free_br(preq_dup);

  return(NULL);
  } /* END delete_all_work() */