static void reissue_to_svr( struct work_task *pwt) { time_t time_now = time(NULL); char *br_id = pwt->wt_parm1; batch_request *preq = get_remove_batch_request(br_id); /* if not timed-out, retry send to remote server */ if (preq != NULL) { if (((time_now - preq->rq_time) > PBS_NET_RETRY_LIMIT) || (issue_to_svr(preq->rq_host, preq, pwt->wt_parmfunc) != PBSE_NONE)) { /* either timed-out or got hard error, tell post-function */ pwt->wt_aux = -1; /* seen as error by post function */ pwt->wt_event = -1; /* seen as connection by post func */ if (pwt->wt_parmfunc != NULL) ((void (*)())pwt->wt_parmfunc)(pwt); } } free(pwt->wt_mutex); free(pwt); } /* END reissue_to_svr() */
static void post_message_req( struct work_task *pwt) { struct batch_request *preq; char log_buf[LOCAL_LOG_BUF_SIZE]; svr_disconnect(pwt->wt_event); /* close connection to MOM */ preq = get_remove_batch_request(pwt->wt_parm1); free(pwt->wt_mutex); free(pwt); /* preq has been hadnled previously */ if (preq == NULL) return; preq->rq_conn = preq->rq_orgconn; /* restore socket to client */ sprintf(log_buf, msg_messagejob, preq->rq_reply.brp_code); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, preq->rq_ind.rq_message.rq_jid, log_buf); if (preq->rq_reply.brp_code) req_reject(preq->rq_reply.brp_code, 0, preq, NULL, NULL); else reply_ack(preq); } /* END post_message_req() */
/*
 * post_delete_route - work-task callback that re-runs a job delete request.
 *
 * Fetches (and unregisters) the batch request whose id is stored in
 * pwt->wt_parm1 and, when it is still present, passes it to req_deletejob().
 * The work task and its mutex are freed in every case.
 */
static void post_delete_route(struct work_task *pwt)
{
  batch_request *request;

  request = get_remove_batch_request((char *)pwt->wt_parm1);

  if (request != NULL)
  {
    req_deletejob(request);
  }

  free(pwt->wt_mutex);
  free(pwt);
} /* END post_delete_route() */
/*
 * release_req - work-task callback that simply disposes of a batch request.
 *
 * The request registered under the id in pwt->wt_parm1 is removed from the
 * table and freed if it still exists.  A connection recorded in
 * pwt->wt_event is closed unless the field holds -1.  Finally the work task
 * and its mutex are freed.
 */
void release_req(struct work_task *pwt)
{
  char          *request_id = pwt->wt_parm1;
  batch_request *request    = get_remove_batch_request(request_id);

  if (request != NULL)
  {
    free_br(request);
  }

  /* -1 marks "no connection" */
  if (pwt->wt_event != -1)
  {
    svr_disconnect(pwt->wt_event);
  }

  free(pwt->wt_mutex);
  free(pwt);
} /* END release_req() */
void reissue_to_svr( struct work_task *pwt) { time_t time_now = time(NULL); char *br_id; batch_request *preq; char *serverName = NULL; if (pwt == NULL) return; br_id = (char *)pwt->wt_parm1; preq = get_remove_batch_request(br_id); /* if not timed-out, retry send to remote server */ if (preq != NULL) { if (preq->rq_host[0] != '\0') serverName = strdup(preq->rq_host); else { free(pwt->wt_mutex); free(pwt); return; } if (((time_now - preq->rq_time) > PBS_NET_RETRY_LIMIT) || (issue_to_svr(serverName, &preq, pwt->wt_parmfunc) != PBSE_NONE)) { /* either timed-out or got hard error, tell post-function */ pwt->wt_aux = -1; /* seen as error by post function */ pwt->wt_event = -1; /* seen as connection by post func */ if (pwt->wt_parmfunc != NULL) (* pwt->wt_parmfunc)(pwt); } } if (serverName) free(serverName); free(pwt->wt_mutex); free(pwt); } /* END reissue_to_svr() */
void chkpt_xfr_hold( struct work_task *ptask) { job *pjob; struct batch_request *preq; char log_buf[LOCAL_LOG_BUF_SIZE]; preq = get_remove_batch_request(ptask->wt_parm1); free(ptask->wt_mutex); free(ptask); if ((preq == NULL) || (preq->rq_extra == NULL)) return; if ((pjob = svr_find_job(preq->rq_extra, FALSE)) == NULL) return; if (LOGLEVEL >= 7) { sprintf(log_buf, "BLCR copy completed (state is %s-%s)", PJobState[pjob->ji_qs.ji_state], PJobSubState[pjob->ji_qs.ji_substate]); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf); } free_br(preq); set_task(WORK_Immed, 0, mom_cleanup_checkpoint_hold, strdup(pjob->ji_qs.ji_jobid), FALSE); unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); return; } /* END chkpt_xfr_hold() */
static void process_checkpoint_reply( struct work_task *pwt) { job *pjob; struct batch_request *preq; svr_disconnect(pwt->wt_event); /* close connection to MOM */ preq = get_remove_batch_request(pwt->wt_parm1); free(pwt->wt_mutex); free(pwt); /* preq handled previously */ if (preq == NULL) return; preq->rq_conn = preq->rq_orgconn; /* restore client socket */ if ((pjob = svr_find_job(preq->rq_ind.rq_manager.rq_objname, FALSE)) == NULL) { log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, preq->rq_ind.rq_manager.rq_objname, msg_postmomnojob); req_reject(PBSE_UNKJOBID, 0, preq, NULL, msg_postmomnojob); } else { /* record that MOM has a checkpoint file */ account_record(PBS_ACCT_CHKPNT, pjob, "Checkpointed"); /* note in accounting file */ reply_ack(preq); unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); } } /* END process_checkpoint_reply() */
static void post_delete_mom1( struct work_task *pwt) { int delay = 0; int dellen = strlen(deldelaystr); job *pjob; pbs_queue *pque; char *preq_clt_id; struct batch_request *preq_sig; /* signal request to MOM */ struct batch_request *preq_clt = NULL; /* original client request */ int rc; time_t time_now = time(NULL); preq_sig = get_remove_batch_request((char *)pwt->wt_parm1); free(pwt->wt_mutex); free(pwt); if (preq_sig == NULL) return; rc = preq_sig->rq_reply.brp_code; preq_clt_id = preq_sig->rq_extra; free_br(preq_sig); if (preq_clt_id != NULL) { preq_clt = get_remove_batch_request(preq_clt_id); free(preq_clt_id); } /* the client request has been handled another way, nothing left to do */ if (preq_clt == NULL) return; pjob = svr_find_job(preq_clt->rq_ind.rq_delete.rq_objname, FALSE); if (pjob == NULL) { /* job has gone away */ req_reject(PBSE_UNKJOBID, 0, preq_clt, NULL, NULL); return; } if (rc) { /* mom rejected request */ if (rc == PBSE_UNKJOBID) { /* MOM claims no knowledge, so just purge it */ log_event( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, "MOM rejected signal during delete"); /* removed the resources assigned to job */ free_nodes(pjob); set_resc_assigned(pjob, DECR); svr_job_purge(pjob); reply_ack(preq_clt); } else { req_reject(rc, 0, preq_clt, NULL, NULL); unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); } return; } if (preq_clt->rq_extend) { if (strncmp(preq_clt->rq_extend, deldelaystr, dellen) == 0) { delay = atoi(preq_clt->rq_extend + dellen); } } reply_ack(preq_clt); /* dont need it, reply now */ /* * if no delay specified in original request, see if kill_delay * queue attribute is set. 
*/ if (delay == 0) { if ((pque = get_jobs_queue(&pjob)) != NULL) { pthread_mutex_lock(server.sv_attr_mutex); delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay], &server.sv_attr[SRV_ATR_KillDelay], 2); pthread_mutex_unlock(server.sv_attr_mutex); unlock_queue(pque, __func__, NULL, LOGLEVEL); } else if (pjob != NULL) return; } set_task(WORK_Timed, delay + time_now, post_delete_mom2, strdup(pjob->ji_qs.ji_jobid), FALSE); /* * Since the first signal has succeeded, let's reschedule the * nanny to be 1 minute after the second phase. */ apply_job_delete_nanny(pjob, time_now + delay + 60); unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL); } /* END post_delete_mom1() */
static void post_job_delete_nanny( struct work_task *pwt) { struct batch_request *preq_sig; /* signal request to MOM */ int rc; job *pjob; char log_buf[LOCAL_LOG_BUF_SIZE]; long nanny = 0; preq_sig = get_remove_batch_request((char *)pwt->wt_parm1); free(pwt->wt_mutex); free(pwt); if (preq_sig == NULL) return; rc = preq_sig->rq_reply.brp_code; get_svr_attr_l(SRV_ATR_JobNanny, &nanny); if (!nanny) { /* the admin disabled nanny within the last minute or so */ free_br(preq_sig); return; } /* extract job id from task */ pjob = svr_find_job(preq_sig->rq_ind.rq_signal.rq_jid, FALSE); if (pjob == NULL) { sprintf(log_buf, "job delete nanny: the job disappeared (this is a BUG!)"); log_event(PBSEVENT_ERROR,PBS_EVENTCLASS_JOB,preq_sig->rq_ind.rq_signal.rq_jid,log_buf); } else if (rc == PBSE_UNKJOBID) { sprintf(log_buf, "job delete nanny returned, but does not exist on mom"); log_event(PBSEVENT_ERROR,PBS_EVENTCLASS_JOB,preq_sig->rq_ind.rq_signal.rq_jid,log_buf); free_nodes(pjob); set_resc_assigned(pjob, DECR); free_br(preq_sig); svr_job_purge(pjob); return; } unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); /* free task */ free_br(preq_sig); return; } /* END post_job_delete_nanny() */
/*
 * delay_and_send_sig_kill - follow-up to a rerun signal sent to MOM:
 * answers the original client request and schedules the kill signal.
 *
 * @param preq_sig  the signal request that came back from MOM; its
 *                  rq_extend field holds the id of the original client
 *                  request.  NULL is tolerated.  This function does not
 *                  release preq_sig.
 *
 * Flow:
 *  1. Fetch the client request by id; if it is gone, nothing is left to do.
 *  2. Resolve the job with chk_job_request(); on failure that function has
 *     already rejected the request, so simply return.
 *  3. If MOM rejected the signal: for PBSE_UNKJOBID release the job's nodes
 *     and resources, purge the job and acknowledge the client; any other
 *     code is passed back as a rejection.
 *  4. Choose the delay: the job's user_kill_delay attribute when set,
 *     otherwise the queue/server KillDelay attribute (default 0).
 *  5. Acknowledge the client and queue send_sig_kill for now + delay.
 *
 * The job id is duplicated while the job is still known to be valid.
 * get_jobs_queue() takes the address of the job pointer and may clear it,
 * and the final set_task() call runs after the job mutex has been released,
 * so neither the error message nor the task reads the job structure later.
 *
 * NOTE(review): in the PBSE_UNKJOBID branch the job is unlocked explicitly
 * while pjob_mutex is still in scope - confirm mutex_mgr does not unlock a
 * second time on destruction.
 */
void delay_and_send_sig_kill(batch_request *preq_sig)
{
  int            delay = 0;
  job           *pjob;
  pbs_queue     *pque;
  batch_request *preq_clt = NULL;  /* original client request */
  int            rc;
  time_t         time_now = time(NULL);
  char           log_buf[LOCAL_LOG_BUF_SIZE];
  char          *jobid_copy;

  if (preq_sig == NULL)
    return;

  rc = preq_sig->rq_reply.brp_code;

  if (preq_sig->rq_extend != NULL)
  {
    preq_clt = get_remove_batch_request(preq_sig->rq_extend);
  }

  /* the client request has been handled another way, nothing left to do */
  if (preq_clt == NULL)
    return;

  if ((pjob = chk_job_request(preq_clt->rq_ind.rq_rerun, preq_clt)) == NULL)
  {
    /* job has gone away, chk_job_request() calls req_reject() on failure */
    return;
  }

  mutex_mgr pjob_mutex = mutex_mgr(pjob->ji_mutex, true);

  if (rc)
  {
    /* mom rejected request */
    if (rc == PBSE_UNKJOBID)
    {
      /* MOM claims no knowledge, so just purge it */
      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        "MOM rejected signal during rerun");

      /* remove the resources assigned to job */
      free_nodes(pjob);
      set_resc_assigned(pjob, DECR);

      unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

      svr_job_purge(pjob);

      reply_ack(preq_clt);
    }
    else
    {
      pjob_mutex.unlock();
      req_reject(rc, 0, preq_clt, NULL, NULL);
    }

    return;
  }

  // Apply the user delay first so it takes precedence.
  if (pjob->ji_wattr[JOB_ATR_user_kill_delay].at_flags & ATR_VFLAG_SET)
    delay = pjob->ji_wattr[JOB_ATR_user_kill_delay].at_val.at_long;

  /* take the id now: pjob may be cleared by get_jobs_queue() and must not
   * be read once its mutex has been released */
  jobid_copy = strdup(pjob->ji_qs.ji_jobid);

  if ((pque = get_jobs_queue(&pjob)) != NULL)
  {
    mutex_mgr pque_mutex = mutex_mgr(pque->qu_mutex, true);
    mutex_mgr server_mutex = mutex_mgr(server.sv_attr_mutex, false);

    if (delay == 0)
    {
      delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay],
                               &server.sv_attr[SRV_ATR_KillDelay],
                               0);
    }
  }
  else
  {
    /* why is the pque null? Something went wrong */
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "jobid %s returned a null queue",
             (jobid_copy != NULL) ? jobid_copy : "");
    free(jobid_copy);
    req_reject(PBSE_UNKQUE, 0, preq_clt, NULL, log_buf);
    return;
  }

  pjob_mutex.unlock();

  reply_ack(preq_clt);

  set_task(WORK_Timed, delay + time_now, send_sig_kill, jobid_copy, FALSE);
} // END delay_and_send_sig_kill()
void array_delete_wt( struct work_task *ptask) { struct batch_request *preq; job_array *pa; int i; char log_buf[LOCAL_LOG_BUF_SIZE]; int num_jobs = 0; int num_prerun = 0; job *pjob; preq = get_remove_batch_request((char *)ptask->wt_parm1); free(ptask->wt_mutex); free(ptask); if (preq == NULL) return; pa = get_array(preq->rq_ind.rq_delete.rq_objname); if (pa == NULL) { /* jobs must have exited already */ reply_ack(preq); return; } for (i = 0; i < pa->ai_qs.array_size; i++) { if (pa->job_ids[i] == NULL) continue; if ((pjob = svr_find_job(pa->job_ids[i], FALSE)) == NULL) { free(pa->job_ids[i]); pa->job_ids[i] = NULL; } else { num_jobs++; if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) { num_prerun++; /* mom still hasn't gotten job?? delete anyway */ if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0) { /* job has restart file at mom, do end job processing */ change_restart_comment_if_needed(pjob); svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE); pjob->ji_momhandle = -1; /* force new connection */ if (LOGLEVEL >= 7) { sprintf(log_buf, "calling on_job_exit from %s", __func__); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf); } set_task(WORK_Immed, 0, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE); unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); } } else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0) { /* job has staged-in file, should remove them */ remove_stagein(&pjob); if (pjob != NULL) { /* job_abt() calls svr_job_purge which will try to lock the array again */ pthread_mutex_unlock(pa->ai_mutex); job_abt(&pjob, NULL); pthread_mutex_lock(pa->ai_mutex); } } else { /* job_abt() calls svr_job_purge which will try to lock the array again */ pthread_mutex_unlock(pa->ai_mutex); job_abt(&pjob, NULL); pthread_mutex_lock(pa->ai_mutex); } } /* END if (ji_substate == JOB_SUBSTATE_PRERUN) */ } /* END for each job in array */ pthread_mutex_unlock(pa->ai_mutex); if (LOGLEVEL >= 7) { sprintf(log_buf, "%s: 
unlocked ai_mutex", __func__); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf); } if (num_jobs == num_prerun) { reply_ack(preq); } else { req_deletearray(preq); } } /* END array_delete_wt() */
static void post_modify_req( struct work_task *pwt) { struct batch_request *preq; job *pjob; char log_buf[LOCAL_LOG_BUF_SIZE]; svr_disconnect(pwt->wt_event); /* close connection to MOM */ preq = get_remove_batch_request(pwt->wt_parm1); free(pwt->wt_mutex); free(pwt); if (preq == NULL) return; preq->rq_conn = preq->rq_orgconn; /* restore socket to client */ if ((preq->rq_reply.brp_code) && (preq->rq_reply.brp_code != PBSE_UNKJOBID)) { sprintf(log_buf, msg_mombadmodify, preq->rq_reply.brp_code); log_event( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, preq->rq_ind.rq_modify.rq_objname, log_buf); req_reject(preq->rq_reply.brp_code, 0, preq, NULL, NULL); } else { if (preq->rq_reply.brp_code == PBSE_UNKJOBID) { if ((pjob = svr_find_job(preq->rq_ind.rq_modify.rq_objname, FALSE)) == NULL) { req_reject(preq->rq_reply.brp_code, 0, preq, NULL, NULL); return; } else { if (LOGLEVEL >= 0) { sprintf(log_buf, "post_modify_req: PBSE_UNKJOBID for job %s in state %s-%s, dest = %s", (pjob->ji_qs.ji_jobid != NULL) ? pjob->ji_qs.ji_jobid : "", PJobState[pjob->ji_qs.ji_state], PJobSubState[pjob->ji_qs.ji_substate], pjob->ji_qs.ji_destin); log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf); } unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); } } reply_ack(preq); } return; } /* END post_modify_req() */
/*
 * delay_and_send_sig_kill - follow-up to a rerun signal sent to MOM:
 * answers the original client request and schedules the kill signal.
 *
 * @param preq_sig  the signal request that came back from MOM; its
 *                  rq_extend field holds the id of the original client
 *                  request.  NULL is tolerated.  The request is released
 *                  here with free_br().
 *
 * Flow:
 *  1. Remember MOM's reply code, fetch the client request by id, release
 *     the signal request.  If the client request is gone, return.
 *  2. Resolve the job with chk_job_request().
 *     NOTE(review): chk_job_request() is understood to reject the request
 *     itself when it fails, so no second req_reject() is issued here -
 *     confirm against its definition.
 *  3. If MOM rejected the signal: for PBSE_UNKJOBID release the job's nodes
 *     and resources, purge the job and acknowledge the client; any other
 *     code is passed back as a rejection.
 *  4. Read the queue/server KillDelay attribute (default 0).
 *     get_jobs_queue() takes the address of the job pointer and may clear
 *     it; if the job is gone afterwards the client request is rejected with
 *     PBSE_UNKJOBID and no job mutex is touched.
 *  5. Copy the job id, unlock the job, acknowledge the client, and queue
 *     send_sig_kill for now + delay.  The id is copied before the unlock so
 *     the job structure is not read once its mutex has been released.
 */
void delay_and_send_sig_kill(batch_request *preq_sig)
{
  int                   delay = 0;
  job                  *pjob;
  pbs_queue            *pque;
  struct batch_request *preq_clt = NULL;  /* original client request */
  int                   rc;
  time_t                time_now = time(NULL);
  char                 *jobid_copy;

  if (preq_sig == NULL)
    return;

  rc = preq_sig->rq_reply.brp_code;

  if (preq_sig->rq_extend != NULL)
  {
    preq_clt = get_remove_batch_request(preq_sig->rq_extend);
  }

  free_br(preq_sig);

  /* the client request has been handled another way, nothing left to do */
  if (preq_clt == NULL)
    return;

  if ((pjob = chk_job_request(preq_clt->rq_ind.rq_rerun, preq_clt)) == NULL)
  {
    /* job has gone away; the request must not be rejected a second time */
    return;
  }

  if (rc)
  {
    /* mom rejected request */
    if (rc == PBSE_UNKJOBID)
    {
      /* MOM claims no knowledge, so just purge it */
      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        "MOM rejected signal during rerun");

      /* remove the resources assigned to job */
      free_nodes(pjob);
      set_resc_assigned(pjob, DECR);

      unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

      svr_job_purge(pjob);

      reply_ack(preq_clt);
    }
    else
    {
      unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
      req_reject(rc, 0, preq_clt, NULL, NULL);
    }

    return;
  }

  if ((pque = get_jobs_queue(&pjob)) != NULL)
  {
    mutex_mgr pque_mutex = mutex_mgr(pque->qu_mutex, true);

    pthread_mutex_lock(server.sv_attr_mutex);
    delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay],
                             &server.sv_attr[SRV_ATR_KillDelay],
                             0);
    pthread_mutex_unlock(server.sv_attr_mutex);
  }
  else if (pjob == NULL)
  {
    /* the job vanished while its queue was being acquired: there is no job
     * mutex to release, but the client still needs an answer */
    req_reject(PBSE_UNKJOBID, 0, preq_clt, NULL, NULL);
    return;
  }

  /* take the id while the job is still locked */
  jobid_copy = strdup(pjob->ji_qs.ji_jobid);

  unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

  reply_ack(preq_clt);

  set_task(WORK_Timed, delay + time_now, send_sig_kill, jobid_copy, FALSE);
}
static void process_hold_reply( struct work_task *pwt) { job *pjob; pbs_attribute temphold; struct batch_request *preq; int newstate; int newsub; int rc; char *pset; char log_buf[LOCAL_LOG_BUF_SIZE]; svr_disconnect(pwt->wt_event); /* close connection to MOM */ preq = get_remove_batch_request(pwt->wt_parm1); free(pwt->wt_mutex); free(pwt); /* preq was handled previously */ if (preq == NULL) return; preq->rq_conn = preq->rq_orgconn; /* restore client socket */ if ((pjob = svr_find_job(preq->rq_ind.rq_hold.rq_orig.rq_objname, FALSE)) == NULL) { log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, preq->rq_ind.rq_hold.rq_orig.rq_objname, msg_postmomnojob); req_reject(PBSE_UNKJOBID, 0, preq, NULL, msg_postmomnojob); return; } else if (preq->rq_reply.brp_code != 0) { rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &pset, &temphold); if (rc == 0) { rc = job_attr_def[JOB_ATR_hold].at_set(&pjob->ji_wattr[JOB_ATR_hold], &temphold, DECR); } pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING; /* reset it */ pjob->ji_modified = 1; /* indicate attributes changed */ svr_evaljobstate(pjob, &newstate, &newsub, 0); svr_setjobstate(pjob, newstate, newsub, FALSE); /* saves job */ if (preq->rq_reply.brp_code != PBSE_NOSUP) { sprintf(log_buf, msg_mombadhold, preq->rq_reply.brp_code); log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf); req_reject(preq->rq_reply.brp_code, 0, preq, NULL, log_buf); } else { reply_ack(preq); } } else { /* record that MOM has a checkpoint file */ /* PBS_CHECKPOINT_MIGRATEABLE is defined as zero therefore this code will never fire. * And if these flags are not set, start_exec will not try to run the job from * the checkpoint image file. 
*/ pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHECKPOINT_FILE; if (preq->rq_reply.brp_auxcode) /* checkpoint can be moved */ { pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHECKPOINT_FILE; pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_MIGRATEABLE; } pjob->ji_modified = 1; /* indicate attributes changed */ svr_evaljobstate(pjob, &newstate, &newsub, 0); svr_setjobstate(pjob, newstate, newsub, FALSE); /* saves job */ account_record(PBS_ACCT_CHKPNT, pjob, "Checkpointed and held"); /* note in accounting file */ reply_ack(preq); } unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); } /* END process_hold_reply() */