static void post_chkpt(struct work_task *ptask) { job *pjob; struct batch_request *preq; preq = (struct batch_request *)ptask->wt_parm1; pjob = find_job(preq->rq_ind.rq_hold.rq_orig.rq_objname); if (!preq || !pjob) return; if (preq->rq_reply.brp_code == 0) { /* checkpointed ok */ if (preq->rq_reply.brp_auxcode) { /* chkpt can be moved */ pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHKPT; pjob->ji_qs.ji_svrflags |= JOB_SVFLG_ChkptMig; pjob->ji_modified = 1; (void)job_save(pjob, SAVEJOB_QUICK); } account_record(PBS_ACCT_CHKPNT, pjob, (char *)0); } else { /* need to try rerun if possible or just abort the job */ if (preq->rq_reply.brp_code != PBSE_CKPBSY) { pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHKPT; pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING; pjob->ji_modified = 1; (void)job_save(pjob, SAVEJOB_QUICK); if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) rerun_or_kill(pjob, msg_on_shutdown); } } release_req(ptask); }
/*
 * post_checkpoint - examine the reply carried by a checkpoint batch request
 *   and adjust the job named in that request.
 *
 *   preq - the batch request; a NULL request is ignored.
 *
 *   The job is looked up with svr_find_job().  When it is found:
 *     - reply code 0 with a non-zero aux code: JOB_SVFLG_CHECKPOINT_FILE is
 *       cleared and JOB_SVFLG_HASRUN / JOB_SVFLG_CHECKPOINT_MIGRATEABLE set;
 *     - non-zero reply code: JOB_SVFLG_CHECKPOINT_FILE is cleared, the
 *       substate is put back to JOB_SUBSTATE_RUNNING, and a job whose state
 *       is JOB_STATE_RUNNING is passed to rerun_or_kill().
 *
 *   The request is always freed with free_br().  If the job pointer is still
 *   non-NULL at the end, its mutex is released via unlock_ji_mutex().
 */
void post_checkpoint(

  batch_request *preq)

  {
  job *pjob;

  if (preq == NULL)
    return;

  pjob = svr_find_job(preq->rq_ind.rq_hold.rq_orig.rq_objname, FALSE);

  if (pjob != NULL)
    {
    if (preq->rq_reply.brp_code != 0)
      {
      /* need to try rerun if possible or just abort the job */
      pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHECKPOINT_FILE;
      pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING;

      /* rerun_or_kill() is handed &pjob, so pjob is re-tested below */
      if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
        rerun_or_kill(&pjob, msg_on_shutdown);
      }
    else if (preq->rq_reply.brp_auxcode)
      {
      /* checkpointed ok and the checkpoint can be moved */
      pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHECKPOINT_FILE;
      pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_MIGRATEABLE;
      }
    }

  free_br(preq);

  if (pjob != NULL)
    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
  } /* END post_checkpoint() */
static void post_checkpoint( struct work_task *ptask) { job *pjob; struct batch_request *preq; preq = (struct batch_request *)ptask->wt_parm1; pjob = find_job(preq->rq_ind.rq_hold.rq_orig.rq_objname); if (preq->rq_reply.brp_code == 0) { /* checkpointed ok */ if (preq->rq_reply.brp_auxcode) /* checkpoint can be moved */ { pjob->ji_qs.ji_svrflags = (pjob->ji_qs.ji_svrflags & ~JOB_SVFLG_CHECKPOINT_FILE) | JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_MIGRATEABLE; } } else { /* need to try rerun if possible or just abort the job */ if (pjob) { pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHECKPOINT_FILE; pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING; if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) rerun_or_kill(pjob, msg_on_shutdown); } } release_req(ptask); } /* END post_checkpoint() */
void svr_shutdown(int type) { attribute *pattr; job *pjob; job *pnxt; long *state; int wait_for_secondary = 0; /* Lets start by logging shutdown and saving everything */ state = &server.sv_attr[(int)SRV_ATR_State].at_val.at_long; (void)strcpy(log_buffer, msg_shutdown_start); if (*state == SV_STATE_SHUTIMM) { /* if already shuting down, another Immed/sig will force it */ if ((type == SHUT_IMMEDIATE) || (type == SHUT_SIG)) { *state = SV_STATE_DOWN; (void)strcat(log_buffer, "Forced"); log_event(PBSEVENT_SYSTEM|PBSEVENT_ADMIN|PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, LOG_NOTICE, msg_daemonname, log_buffer); return; } } /* in failover environments, need to communicate with Secondary */ /* and for these two where the Primary is going down, mark to */ /* wait for the acknowledgement from the Secondary */ if (type & SHUT_WHO_SECDRY) { if (failover_send_shutdown(FAILOVER_SecdShutdown) == 0) wait_for_secondary = 1; } else if (type & SHUT_WHO_IDLESECDRY) { if (failover_send_shutdown(FAILOVER_SecdGoInactive) == 0) wait_for_secondary = 1; } /* what is the manner of our demise? 
*/ type = type & SHUT_MASK; if (type == SHUT_IMMEDIATE) { *state = SV_STATE_SHUTIMM; (void)strcat(log_buffer, "Immediate"); } else if (type == SHUT_DELAY) { *state = SV_STATE_SHUTDEL; (void)strcat(log_buffer, "Delayed"); } else if (type == SHUT_QUICK) { *state = SV_STATE_DOWN; /* set to down to brk pbsd_main loop */ (void)strcat(log_buffer, "Quick"); } else { *state = SV_STATE_DOWN; (void)strcat(log_buffer, "By Signal"); type = SHUT_QUICK; } log_event(PBSEVENT_SYSTEM|PBSEVENT_ADMIN|PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, LOG_NOTICE, msg_daemonname, log_buffer); if (wait_for_secondary) *state |= SV_STATE_PRIMDLY; /* wait for reply from Secondary */ if (type == SHUT_QUICK) /* quick, leave jobs as are */ return; svr_save_db(&server, SVR_SAVE_QUICK); pnxt = (job *)GET_NEXT(svr_alljobs); while ((pjob = pnxt) != (job *)0) { pnxt = (job *)GET_NEXT(pjob->ji_alljobs); if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) { pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HOTSTART; pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN; pattr = &pjob->ji_wattr[(int)JOB_ATR_chkpnt]; if ((pattr->at_val.at_str) && (*pattr->at_val.at_str != 'n')) { /* do checkpoint of job */ if (shutdown_chkpt(pjob) == 0) continue; } /* if not checkpoint (not supported, not allowed, or fails */ /* rerun if possible, else kill job */ rerun_or_kill(pjob, msg_on_shutdown); } } return; }
void svr_shutdown( int type) /* I */ { attribute *pattr; job *pjob; job *pnxt; long *state; /* Lets start by logging shutdown and saving everything */ state = &server.sv_attr[(int)SRV_ATR_State].at_val.at_long; strcpy(log_buffer, msg_shutdown_start); if (*state == SV_STATE_SHUTIMM) { /* if already shuting down, another Immed/sig will force it */ if ((type == SHUT_IMMEDIATE) || (type == SHUT_SIG)) { *state = SV_STATE_DOWN; strcat(log_buffer, "Forced"); log_event( PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, msg_daemonname, log_buffer); return; } } if (type == SHUT_IMMEDIATE) { *state = SV_STATE_SHUTIMM; strcat(log_buffer, "Immediate"); } else if (type == SHUT_DELAY) { *state = SV_STATE_SHUTDEL; strcat(log_buffer, "Delayed"); } else if (type == SHUT_QUICK) { *state = SV_STATE_DOWN; /* set to down to brk pbsd_main loop */ strcat(log_buffer, "Quick"); } else { *state = SV_STATE_SHUTIMM; strcat(log_buffer, "By Signal"); } log_event( PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, msg_daemonname, log_buffer); if ((type == SHUT_QUICK) || (type == SHUT_SIG)) /* quick, leave jobs as are */ { return; } svr_save(&server, SVR_SAVE_QUICK); pnxt = (job *)GET_NEXT(svr_alljobs); while ((pjob = pnxt) != NULL) { pnxt = (job *)GET_NEXT(pjob->ji_alljobs); if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) { pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HOTSTART | JOB_SVFLG_HASRUN; pattr = &pjob->ji_wattr[(int)JOB_ATR_checkpoint]; if ((pattr->at_flags & ATR_VFLAG_SET) && ((csv_find_string(pattr->at_val.at_str, "s") != NULL) || (csv_find_string(pattr->at_val.at_str, "c") != NULL) || (csv_find_string(pattr->at_val.at_str, "shutdown") != NULL))) { /* do checkpoint of job */ if (shutdown_checkpoint(pjob) == 0) continue; } /* if no checkpoint (not supported, not allowed, or fails */ /* rerun if possible, else kill job */ rerun_or_kill(pjob, msg_on_shutdown); } } return; } /* END svr_shutdown() */
void svr_shutdown( int type) /* I */ { pbs_attribute *pattr; job *pjob; long state = SV_STATE_DOWN; int iter; char log_buf[LOCAL_LOG_BUF_SIZE]; close(lockfds); save_queues(); /* Lets start by logging shutdown and saving everything */ get_svr_attr_l(SRV_ATR_State, &state); strcpy(log_buf, msg_shutdown_start); if (state == SV_STATE_SHUTIMM) { /* if already shuting down, another Immed/sig will force it */ if ((type == SHUT_IMMEDIATE) || (type == SHUT_SIG)) { state = SV_STATE_DOWN; set_svr_attr(SRV_ATR_State, &state); strcat(log_buf, "Forced"); log_event( PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, msg_daemonname, log_buf); return; } } if (type == SHUT_IMMEDIATE) { state = SV_STATE_SHUTIMM; set_svr_attr(SRV_ATR_State, &state); strcat(log_buf, "Immediate"); } else if (type == SHUT_DELAY) { state = SV_STATE_SHUTDEL; set_svr_attr(SRV_ATR_State, &state); strcat(log_buf, "Delayed"); } else if (type == SHUT_QUICK) { state = SV_STATE_DOWN; /* set to down to brk pbsd_main loop */ set_svr_attr(SRV_ATR_State, &state); strcat(log_buf, "Quick"); } else { state = SV_STATE_SHUTIMM; set_svr_attr(SRV_ATR_State, &state); strcat(log_buf, "By Signal"); } log_event( PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, msg_daemonname, log_buf); if ((type == SHUT_QUICK) || (type == SHUT_SIG)) /* quick, leave jobs as are */ { return; } svr_save(&server, SVR_SAVE_QUICK); iter = -1; while ((pjob = next_job(&alljobs,&iter)) != NULL) { if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) { pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HOTSTART | JOB_SVFLG_HASRUN; pattr = &pjob->ji_wattr[JOB_ATR_checkpoint]; if ((pattr->at_flags & ATR_VFLAG_SET) && ((csv_find_string(pattr->at_val.at_str, "s") != NULL) || (csv_find_string(pattr->at_val.at_str, "c") != NULL) || (csv_find_string(pattr->at_val.at_str, "shutdown") != NULL))) { /* do checkpoint of job */ if (shutdown_checkpoint(&pjob) == 0) { if (pjob != NULL) unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); continue; 
} } /* if no checkpoint (not supported, not allowed, or fails */ /* rerun if possible, else kill job */ rerun_or_kill(&pjob, msg_on_shutdown); } if (pjob != NULL) unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL); } return; } /* END svr_shutdown() */