static int svr_send_checkpoint( job *pjob, /* I */ struct batch_request *preq, /* I */ int state, /* I */ int substate) /* I */ { struct batch_request *momreq = 0; int rc; momreq = cpy_checkpoint(momreq, pjob, JOB_ATR_checkpoint_name, CKPT_DIR_IN); if (momreq == NULL) { /* no files to send, go directly to sending job to mom */ return(svr_strtjob2(pjob, preq)); } /* save job id for post_checkpointsend */ momreq->rq_extra = malloc(PBS_MAXSVRJOBID + 1); if (momreq->rq_extra == 0) { return(PBSE_SYSTEM); } strcpy(momreq->rq_extra, pjob->ji_qs.ji_jobid); rc = relay_to_mom( pjob->ji_qs.ji_un.ji_exect.ji_momaddr, momreq, post_checkpointsend); if (rc == 0) { svr_setjobstate(pjob, state, substate); /* * checkpoint copy started ok - reply to client as copy may * take too long to wait. */ if (preq != NULL) reply_ack(preq); } else { free(momreq->rq_extra); } return(rc); } /* END svr_send_checkpoint() */
END_TEST START_TEST(cpy_checkpoint_test) { struct job *test_job = job_alloc(); struct batch_request *result = cpy_checkpoint(NULL, test_job, JOB_ATR_checkpoint_name, CKPT_DIR_IN); struct batch_request *initial = alloc_br(/*PBS_BATCH_CheckpointJob*/0); fail_unless(result == NULL, "NULL batch_request input fail"); result = cpy_checkpoint(initial, NULL, JOB_ATR_checkpoint_name, CKPT_DIR_IN); fail_unless(result == NULL, "NULL job input fail"); /*TODO: add test for valid input, invalid dir value*/ }
int modify_job( void **j, /* O */ svrattrl *plist, /* I */ struct batch_request *preq, /* I */ int checkpoint_req, /* I */ int flag) /* I */ { int bad = 0; int i; int newstate; int newsubstate; resource_def *prsd; int rc; int sendmom = 0; int copy_checkpoint_files = FALSE; char log_buf[LOCAL_LOG_BUF_SIZE]; struct batch_request *dup_req = NULL; job *pjob = (job *)*j; if (pjob == NULL) { sprintf(log_buf, "job structure is NULL"); log_err(PBSE_IVALREQ, __func__, log_buf); return(PBSE_IVALREQ); } /* cannot be in exiting or transit, exiting has already been checked */ if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT) { /* FAILURE */ snprintf(log_buf,sizeof(log_buf), "Cannot modify job '%s' in transit\n", pjob->ji_qs.ji_jobid); log_err(PBSE_BADSTATE, __func__, log_buf); return(PBSE_BADSTATE); } if (((checkpoint_req == CHK_HOLD) || (checkpoint_req == CHK_CONT)) && (pjob->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING)) { /* May need to request copy of the checkpoint file from mom */ copy_checkpoint_files = TRUE; if (checkpoint_req == CHK_HOLD) { sprintf(log_buf,"setting jobsubstate for %s to RERUN\n", pjob->ji_qs.ji_jobid); pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN; job_save(pjob, SAVEJOB_QUICK, 0); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf); /* remove checkpoint restart file if there is one */ if (pjob->ji_wattr[JOB_ATR_restart_name].at_flags & ATR_VFLAG_SET) { cleanup_restart_file(pjob); } } } /* if job is running, special checks must be made */ /* NOTE: must determine if job exists down at MOM - this will occur if job is running, job is held, or job was held and just barely released (ie qhold/qrls) */ /* COMMENTED OUT BY JOSH B IN 2.3 DUE TO MAJOR PROBLEMS w/ CUSTOMERS * --FIX and uncomment once we know what is really going on. * * We now know that ji_destin gets set on a qmove and that the mom does not * have the job at that point. * if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) || ((pjob->ji_qs.ji_state == JOB_STATE_HELD) && (pjob->ji_qs.ji_destin[0] != '\0')) || ((pjob->ji_qs.ji_state == JOB_STATE_QUEUED) && (pjob->ji_qs.ji_destin[0] != '\0'))) */ if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) { while (plist != NULL) { /* is the pbs_attribute modifiable in RUN state ? */ i = find_attr(job_attr_def, plist->al_name, JOB_ATR_LAST); if ((i < 0) || ((job_attr_def[i].at_flags & ATR_DFLAG_ALTRUN) == 0)) { /* FAILURE */ snprintf(log_buf,sizeof(log_buf), "Cannot modify attribute '%s' while running\n", plist->al_name); log_err(PBSE_MODATRRUN, __func__, log_buf); return PBSE_MODATRRUN; } /* NOTE: only explicitly specified job attributes are routed down to MOM */ if (i == JOB_ATR_resource) { /* is the specified resource modifiable while */ /* the job is running */ prsd = find_resc_def(svr_resc_def, plist->al_resc, svr_resc_size); if (prsd == NULL) { /* FAILURE */ snprintf(log_buf,sizeof(log_buf), "Unknown attribute '%s'\n", plist->al_name); log_err(PBSE_UNKRESC, __func__, log_buf); return(PBSE_UNKRESC); } if ((prsd->rs_flags & ATR_DFLAG_ALTRUN) == 0) { /* FAILURE */ snprintf(log_buf,sizeof(log_buf), "Cannot modify attribute '%s' while running\n", plist->al_name); log_err(PBSE_MODATRRUN, __func__, log_buf); return(PBSE_MODATRRUN); } sendmom = 1; } /* else if ((i == JOB_ATR_checkpoint_name) || (i == JOB_ATR_variables)) { sendmom = 1; } */ plist = (svrattrl *)GET_NEXT(plist->al_link); } } /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */ /* modify the job's attributes */ bad = 0; plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr); rc = modify_job_attr(pjob, plist, preq->rq_perm, &bad); if (rc) { /* FAILURE */ snprintf(log_buf,sizeof(log_buf), "Cannot set attributes for job '%s'\n", pjob->ji_qs.ji_jobid); log_err(rc, __func__, log_buf); if (rc == PBSE_JOBNOTFOUND) *j = NULL; return(rc); } /* Reset any defaults resource limit which might have been unset */ set_resc_deflt(pjob, NULL, FALSE); /* if job is not running, may need to change its state */ if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING) { svr_evaljobstate(pjob, &newstate, &newsubstate, 0); svr_setjobstate(pjob, newstate, newsubstate, FALSE); } else { job_save(pjob, SAVEJOB_FULL, 0); } sprintf(log_buf, msg_manager, msg_jobmod, preq->rq_user, preq->rq_host); log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf); /* if a resource limit changed for a running job, send to MOM */ if (sendmom) { /* if the NO_MOM_RELAY flag is set the calling function will call relay_to_mom so we do not need to do it here */ if (flag != NO_MOM_RELAY) { /* The last number is unused unless this is an array */ if ((rc = copy_batchrequest(&dup_req, preq, 0, -1)) != 0) { } /* The dup_req is freed in relay_to_mom (failure) * or in issue_Drequest (success) */ else if ((rc = relay_to_mom(&pjob, dup_req, post_modify_req))) { if (pjob != NULL) { snprintf(log_buf,sizeof(log_buf), "Unable to relay information to mom for job '%s'\n", pjob->ji_qs.ji_jobid); log_err(rc, __func__, log_buf); } return(rc); /* unable to get to MOM */ } } return(PBSE_RELAYED_TO_MOM); } if (copy_checkpoint_files) { struct batch_request *momreq = 0; momreq = cpy_checkpoint(momreq, pjob, JOB_ATR_checkpoint_name, CKPT_DIR_OUT); if (momreq != NULL) { /* have files to copy */ momreq->rq_extra = strdup(pjob->ji_qs.ji_jobid); /* The momreq is freed in relay_to_mom (failure) * or in issue_Drequest (success) */ if (checkpoint_req == CHK_HOLD) { rc = relay_to_mom(&pjob, momreq, chkpt_xfr_hold); } else { rc = relay_to_mom(&pjob, momreq, chkpt_xfr_done); } if (rc != 0) { if (pjob != NULL) { snprintf(log_buf,sizeof(log_buf), "Unable to relay information to mom for job '%s'\n", pjob->ji_qs.ji_jobid); log_err(rc, __func__, log_buf); } return(PBSE_NONE); /* come back when mom replies */ } } else { log_err(-1, __func__, "Failed to get batch request"); } } return(PBSE_NONE); } /* END modify_job() */