END_TEST

/*
 * svr_job_purge_test
 *
 * Checks two inputs to svr_job_purge():
 *   - a NULL job must yield a non-zero return value
 *   - a freshly allocated job must yield a return value >= -1
 */
START_TEST(svr_job_purge_test)
  {
  struct job *test_job = NULL;
  int         result = svr_job_purge(test_job);

  fail_unless(result != 0, "NULL job input fail");

  test_job = job_alloc();

  /* store the return value: without the assignment the assertion below
   * would only re-check the result of the NULL-input call above */
  result = svr_job_purge(test_job);
  fail_unless(result >= -1, "empty job input fail: %d", result);/*TODO: fix -1 via log_job_record mock*/
  }
/*
 * force_purge_work()
 *
 * Logs that the job is being purged without checking its MOM, frees the
 * job's nodes, decrements the job's assigned resources when its queue is an
 * execution queue, and purges the job if the pointer is still set.
 *
 * @param pjob - job to purge; dereferenced unconditionally on entry
 */
void force_purge_work(job *pjob)
  {
  pbs_queue *job_queue;
  char       msg[LOCAL_LOG_BUF_SIZE];

  snprintf(msg, sizeof(msg), "purging job %s without checking MOM", pjob->ji_qs.ji_jobid);
  log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, msg);

  free_nodes(pjob);

  job_queue = get_jobs_queue(&pjob);

  if (job_queue != NULL)
    {
    /* sample the queue type while the queue is still held */
    int in_exec_queue = (pjob->ji_qhdr->qu_qs.qu_type == QTYPE_Execution);

    unlock_queue(job_queue, __func__, NULL, LOGLEVEL);

    if (in_exec_queue)
      set_resc_assigned(pjob, DECR);
    }

  /* get_jobs_queue() was handed &pjob, so the pointer is re-checked here */
  if (pjob == NULL)
    return;

  svr_job_purge(pjob);
  } /* END force_purge_work() */
/*
 * post_job_delete_nanny()
 *
 * Handles the reply to a signal request issued by the job delete nanny.
 *
 *  - nanny attribute disabled: the request is freed and nothing else happens
 *  - job no longer found: an error is logged and the request is freed
 *  - reply code PBSE_UNKJOBID: the job's nodes and assigned resources are
 *    released and the job is purged
 *  - otherwise: the job is unlocked and the request is freed
 *
 * @param preq_sig - the completed signal request; freed here on every path
 *                   (a NULL request is ignored)
 */
void post_job_delete_nanny(batch_request *preq_sig)
  {
  int   rc;
  job  *pjob;
  char  log_buf[LOCAL_LOG_BUF_SIZE];
  long  nanny = 0;

  if (preq_sig == NULL)
    return;

  rc = preq_sig->rq_reply.brp_code;

  get_svr_attr_l(SRV_ATR_JobNanny, &nanny);

  if (!nanny)
    {
    /* the admin disabled nanny within the last minute or so */
    free_br(preq_sig);

    return;
    }

  /* extract job id from task */
  pjob = svr_find_job(preq_sig->rq_ind.rq_signal.rq_jid, FALSE);

  if (pjob == NULL)
    {
    snprintf(log_buf, sizeof(log_buf),
      "job delete nanny: the job disappeared (this is a BUG!)");
    log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_JOB, preq_sig->rq_ind.rq_signal.rq_jid, log_buf);

    /* there is no job to unlock: return here instead of falling through
     * to unlock_ji_mutex() with a NULL pointer */
    free_br(preq_sig);

    return;
    }

  if (rc == PBSE_UNKJOBID)
    {
    snprintf(log_buf, sizeof(log_buf),
      "job delete nanny returned, but does not exist on mom");
    log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_JOB, preq_sig->rq_ind.rq_signal.rq_jid, log_buf);

    free_nodes(pjob);

    set_resc_assigned(pjob, DECR);

    free_br(preq_sig);

    svr_job_purge(pjob);

    return;
    }

  unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

  /* free task */
  free_br(preq_sig);
  } /* END post_job_delete_nanny() */
/*
 * force_purge_work()
 *
 * Purges a job without checking its MOM: logs the action, frees the job's
 * nodes, decrements assigned resources when the job's queue is an execution
 * queue, runs the termination dependency step, marks the job complete and
 * finally purges it.
 *
 * get_jobs_queue() is handed &pjob and the code checks pjob for NULL
 * afterwards, so everything that touches the job after that call is
 * performed only while the pointer is still set.
 *
 * @param pjob - job to purge; dereferenced unconditionally on entry
 */
void force_purge_work(job *pjob)
  {
  char       log_buf[LOCAL_LOG_BUF_SIZE];
  pbs_queue *pque;

  snprintf(log_buf, sizeof(log_buf), "purging job %s without checking MOM", pjob->ji_qs.ji_jobid);
  log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

  free_nodes(pjob);

  if ((pque = get_jobs_queue(&pjob)) != NULL)
    {
    mutex_mgr pque_mutex = mutex_mgr(pque->qu_mutex, true);

    if (pque->qu_qs.qu_type == QTYPE_Execution)
      {
      /* release the queue before adjusting the assigned resources */
      pque_mutex.unlock();
      set_resc_assigned(pjob, DECR);
      }
    }

  /* the job may be gone after get_jobs_queue(); do not touch it then */
  if (pjob == NULL)
    return;

  depend_on_term(pjob);

  svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE, FALSE);

  if (is_ms_on_server(pjob))
    {
    if (LOGLEVEL >= 7)
      {
      snprintf(log_buf, sizeof(log_buf),
        "Mother Superior is on the server, not cleaning spool files in %s",
        __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      }

    svr_job_purge(pjob, 1);
    }
  else
    svr_job_purge(pjob);
  } /* END force_purge_work() */
int close_quejob_by_jobid( char *job_id) { int rc = PBSE_NONE; job *pjob = NULL; if (LOGLEVEL >= 10) { LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, job_id); } if ((pjob = svr_find_job(job_id, FALSE)) == NULL) { rc = PBSE_JOBNOTFOUND; return(rc); } mutex_mgr pjob_mutex = mutex_mgr(pjob->ji_mutex, true); if (pjob->ji_qs.ji_substate != JOB_SUBSTATE_TRANSICM) { remove_job(&newjobs,pjob); svr_job_purge(pjob); pjob = NULL; } else if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_HERE) { remove_job(&newjobs,pjob); pjob->ji_qs.ji_state = JOB_STATE_QUEUED; pjob->ji_qs.ji_substate = JOB_SUBSTATE_QUEUED; rc = svr_enquejob(pjob, FALSE, -1, false); if ((rc == PBSE_JOBNOTFOUND) || (rc == PBSE_JOB_RECYCLED)) { pjob = NULL; } else if (rc != PBSE_NONE) { job_abt(&pjob, msg_err_noqueue); pjob = NULL; } } if (pjob == NULL) pjob_mutex.set_lock_on_exit(false); return(rc); } /* close_quejob_by_jobid() */
END_TEST START_TEST(svr_job_purge_test) { struct job* test_job = NULL; int result = svr_job_purge(test_job); fail_unless(result != 0, "NULL job input fail"); called_remove_job = 0; dequejob_rc = PBSE_JOB_NOT_IN_QUEUE; test_job = job_alloc(); test_job->ji_qs.ji_substate = JOB_SUBSTATE_QUEUED; test_job->ji_qs.ji_state = JOB_STATE_QUEUED; result = svr_job_purge(test_job); fail_unless(result == 0, "non-queued job fail", result); // called_remove_job once means we didn't call job_free fail_unless(called_remove_job == 1); dequejob_rc = 0; result = svr_job_purge(test_job); fail_unless(result == 0, "queued job fail: %d", result); // Calling remove_job twice means we did call job_free fail_unless(called_remove_job == 3); }
static void post_delete_mom1( struct work_task *pwt) { int delay = 0; int dellen = strlen(deldelaystr); job *pjob; pbs_queue *pque; char *preq_clt_id; struct batch_request *preq_sig; /* signal request to MOM */ struct batch_request *preq_clt = NULL; /* original client request */ int rc; time_t time_now = time(NULL); preq_sig = get_remove_batch_request((char *)pwt->wt_parm1); free(pwt->wt_mutex); free(pwt); if (preq_sig == NULL) return; rc = preq_sig->rq_reply.brp_code; preq_clt_id = preq_sig->rq_extra; free_br(preq_sig); if (preq_clt_id != NULL) { preq_clt = get_remove_batch_request(preq_clt_id); free(preq_clt_id); } /* the client request has been handled another way, nothing left to do */ if (preq_clt == NULL) return; pjob = svr_find_job(preq_clt->rq_ind.rq_delete.rq_objname, FALSE); if (pjob == NULL) { /* job has gone away */ req_reject(PBSE_UNKJOBID, 0, preq_clt, NULL, NULL); return; } if (rc) { /* mom rejected request */ if (rc == PBSE_UNKJOBID) { /* MOM claims no knowledge, so just purge it */ log_event( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, "MOM rejected signal during delete"); /* removed the resources assigned to job */ free_nodes(pjob); set_resc_assigned(pjob, DECR); svr_job_purge(pjob); reply_ack(preq_clt); } else { req_reject(rc, 0, preq_clt, NULL, NULL); unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); } return; } if (preq_clt->rq_extend) { if (strncmp(preq_clt->rq_extend, deldelaystr, dellen) == 0) { delay = atoi(preq_clt->rq_extend + dellen); } } reply_ack(preq_clt); /* dont need it, reply now */ /* * if no delay specified in original request, see if kill_delay * queue attribute is set. 
*/ if (delay == 0) { if ((pque = get_jobs_queue(&pjob)) != NULL) { pthread_mutex_lock(server.sv_attr_mutex); delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay], &server.sv_attr[SRV_ATR_KillDelay], 2); pthread_mutex_unlock(server.sv_attr_mutex); unlock_queue(pque, __func__, NULL, LOGLEVEL); } else if (pjob != NULL) return; } set_task(WORK_Timed, delay + time_now, post_delete_mom2, strdup(pjob->ji_qs.ji_jobid), FALSE); /* * Since the first signal has succeeded, let's reschedule the * nanny to be 1 minute after the second phase. */ apply_job_delete_nanny(pjob, time_now + delay + 60); unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL); } /* END post_delete_mom1() */
/*
 * delay_and_send_sig_kill()
 *
 * Runs after the first signal of a rerun was answered by the MOM. Recovers
 * the original client request (its id is carried in preq_sig->rq_extend)
 * and then:
 *  - purges the job when the MOM answered PBSE_UNKJOBID
 *  - forwards any other MOM error to the client
 *  - otherwise acknowledges the client and schedules send_sig_kill after the
 *    kill delay (the job's user_kill_delay first, else the queue/server
 *    KillDelay)
 *
 * @param preq_sig - the completed signal request; it is not freed here
 */
void delay_and_send_sig_kill(batch_request *preq_sig)
  {
  int            delay = 0;
  job           *pjob;
  pbs_queue     *pque;
  batch_request *preq_clt = NULL;  /* original client request */
  int            rc;
  time_t         time_now = time(NULL);
  char           log_buf[LOCAL_LOG_BUF_SIZE];
  char          *jobid_copy;

  if (preq_sig == NULL)
    return;

  rc = preq_sig->rq_reply.brp_code;

  if (preq_sig->rq_extend != NULL)
    {
    preq_clt = get_remove_batch_request(preq_sig->rq_extend);
    }

  /* the client request has been handled another way, nothing left to do */
  if (preq_clt == NULL)
    return;

  if ((pjob = chk_job_request(preq_clt->rq_ind.rq_rerun, preq_clt)) == NULL)
    {
    /* job has gone away, chk_job_request() calls req_reject() on failure */
    return;
    }

  mutex_mgr pjob_mutex = mutex_mgr(pjob->ji_mutex, true);

  if (rc)
    {
    /* mom rejected request */
    if (rc == PBSE_UNKJOBID)
      {
      /* MOM claims no knowledge, so just purge it */
      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        "MOM rejected signal during rerun");

      /* removed the resources assigned to job */
      free_nodes(pjob);

      set_resc_assigned(pjob, DECR);

      /* Unlock through the manager, as the sibling branch below does, so
       * it does not unlock a second time at scope exit after the job has
       * been purged. */
      pjob_mutex.unlock();

      svr_job_purge(pjob);

      reply_ack(preq_clt);
      }
    else
      {
      pjob_mutex.unlock();

      req_reject(rc, 0, preq_clt, NULL, NULL);
      }

    return;
    }

  // Apply the user delay first so it takes precedence.
  if (pjob->ji_wattr[JOB_ATR_user_kill_delay].at_flags & ATR_VFLAG_SET)
    delay = pjob->ji_wattr[JOB_ATR_user_kill_delay].at_val.at_long;

  if ((pque = get_jobs_queue(&pjob)) != NULL)
    {
    mutex_mgr pque_mutex = mutex_mgr(pque->qu_mutex, true);
    mutex_mgr server_mutex = mutex_mgr(server.sv_attr_mutex, false);

    if (delay == 0)
      {
      delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay],
                               &server.sv_attr[SRV_ATR_KillDelay],
                               0);
      }
    }
  else
    {
    /* why is the pque null. Something went wrong */

    /* get_jobs_queue() is handed &pjob and may have cleared it; fall back
     * to the job id stored in the client request for the message */
    const char *jobid = (pjob != NULL) ? pjob->ji_qs.ji_jobid
                                       : preq_clt->rq_ind.rq_rerun;

    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "jobid %s returned a null queue", jobid);

    /* with the job gone its mutex must not be touched at scope exit
     * (same handling as close_quejob_by_jobid) */
    if (pjob == NULL)
      pjob_mutex.set_lock_on_exit(false);

    req_reject(PBSE_UNKQUE, 0, preq_clt, NULL, log_buf);

    return;
    }

  /* copy the job id while the job is still locked */
  jobid_copy = strdup(pjob->ji_qs.ji_jobid);

  pjob_mutex.unlock();

  reply_ack(preq_clt);

  set_task(WORK_Timed, delay + time_now, send_sig_kill, jobid_copy, FALSE);
  } // END delay_and_send_sig_kill()
/*
 * finish_moving_processing()
 *
 * Completes a move-job request once the outcome of the move is known.
 *
 * On LOCUTION_SUCCESS the staged-in files and copied checkpoint are removed
 * (when the matching server flags are set), the local job is purged and the
 * request is acknowledged. For every other status the job state is
 * re-evaluated, the job is unlocked and the request is rejected with
 * PBSE_ROUTEREJ.
 *
 * @param pjob   - the job that was moved (nothing is done when NULL)
 * @param req    - the originating request; must be of type PBS_BATCH_MoveJob
 * @param status - outcome of the move
 */
void finish_moving_processing(job *pjob, struct batch_request *req, int status)
  {
  char log_buf[LOCAL_LOG_BUF_SIZE];

  if (req->rq_type != PBS_BATCH_MoveJob)
    {
    sprintf(log_buf, "bad request type %d\n", req->rq_type);
    log_err(-1, __func__, log_buf);

    return;
    }

  if (pjob == NULL)
    return;

  if (status != LOCUTION_SUCCESS)
    {
    int newstate;
    int newsub;

    /* force re-eval of job state out of Transit */
    svr_evaljobstate(*pjob, newstate, newsub, 1);
    svr_setjobstate(pjob, newstate, newsub, FALSE);

    unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

    req_reject(PBSE_ROUTEREJ, 0, req, NULL, NULL);

    return;
    }

  /* the move succeeded: purge server's job structure */
  if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn)
    remove_stagein(&pjob);

  if ((pjob != NULL) &&
      (pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_COPIED))
    remove_checkpoint(&pjob);

  /* build "<msg_movejob><msg_manager ...>" in two bounded steps */
  snprintf(log_buf, sizeof(log_buf), "%s", msg_movejob);

  size_t used = strlen(log_buf);

  snprintf(log_buf + used, sizeof(log_buf) - used, msg_manager,
    req->rq_ind.rq_move.rq_destin,
    req->rq_user,
    req->rq_host);

  if (pjob != NULL)
    svr_job_purge(pjob);

  reply_ack(req);
  } /* END finish_moving_processing() */
/*
 * finish_routing_processing()
 *
 * Completes a routing attempt for a job.
 *
 *  - LOCUTION_SUCCESS: staged-in files / copied checkpoint are removed when
 *    flagged, then the local job is purged
 *  - LOCUTION_FAIL while the job is in JOB_SUBSTATE_ABORT: the job is set
 *    back to queued, the owner is mailed and the job is unlocked
 *  - LOCUTION_FAIL otherwise: the destination is recorded via add_dest()
 *    and routing is retried
 *  - any other status: routing is retried
 *
 * A retry that ends in rejection or any other failure aborts the job.
 *
 * @param pjob   - the job being routed (nothing is done when NULL)
 * @param status - outcome of the routing attempt
 */
void finish_routing_processing(job *pjob, int status)
  {
  int newstate;
  int newsub;

  if (pjob == NULL)
    return;

  if (LOGLEVEL >= 10)
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, pjob->ji_qs.ji_jobid);

  if (status == LOCUTION_SUCCESS)
    {
    /* normal return, job was routed */
    if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn)
      remove_stagein(&pjob);

    if (pjob == NULL)
      return;

    if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_COPIED)
      remove_checkpoint(&pjob);

    if (pjob != NULL)
      svr_job_purge(pjob); /* need to remove server job struct */

    return;
    }

  if (status == LOCUTION_FAIL)
    {
    /* permanent rejection (or signal) */
    if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_ABORT)
      {
      /* job delete in progress, just set to queued status */
      svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_ABORT, FALSE);

      svr_mailowner(pjob, 'a', TRUE, "Couldn't route job to remote server");

      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

      return;
      }

    add_dest(pjob);  /* else mark destination as bad */
    }

  /* try routing again */
  svr_mailowner(pjob, 'a', TRUE, "Couldn't route job to remote server");

  /* force re-eval of job state out of Transit */
  svr_evaljobstate(*pjob, newstate, newsub, 1);
  svr_setjobstate(pjob, newstate, newsub, FALSE);

  status = job_route(pjob);

  if (status == PBSE_ROUTEREJ)
    job_abt(&pjob, pbse_to_txt(PBSE_ROUTEREJ));
  else if (status != 0)
    job_abt(&pjob, msg_routexceed);
  else
    unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
  } /* END finish_routing_processing() */
/*
 * setup_array_struct()
 *
 * Allocates and initialises the job_array structure for an array "template"
 * job: copies identifying fields from the job, saves the job, applies the
 * slot limit, parses the array request into tokens, sizes the job id table,
 * saves the array and finally attaches it to the job and inserts it.
 *
 * @param pjob - the array template job
 * @return PBSE_NONE on success
 *         PBSE_MEM_MALLOC when an allocation fails
 *         1 when the job cannot be saved (the job is purged)
 *         INVALID_SLOT_LIMIT when set_slot_limit() fails
 *         ARRAY_TOO_LARGE when the array exceeds SRV_ATR_MaxArraySize
 *         2 when the array request contained bad tokens
 *
 * Note: array_delete() also purges the template job, so pjob must not be
 * used by this function after an array_delete(pa) call.
 */
int setup_array_struct(job *pjob)
  {
  job_array          *pa;
  array_request_node *rn;

  int                 bad_token_count;
  int                 array_size;
  int                 rc;
  char                log_buf[LOCAL_LOG_BUF_SIZE];
  long                max_array_size;

  pa = (job_array *)calloc(1, sizeof(job_array));

  if (pa == NULL)
    return(PBSE_MEM_MALLOC);

  pa->ai_qs.struct_version = ARRAY_QS_STRUCT_VERSION;

  /* bounded copies, matching the owner/submit_host handling below */
  snprintf(pa->ai_qs.parent_id, sizeof(pa->ai_qs.parent_id), "%s",
    pjob->ji_qs.ji_jobid);
  snprintf(pa->ai_qs.fileprefix, sizeof(pa->ai_qs.fileprefix), "%s",
    pjob->ji_qs.ji_fileprefix);
  snprintf(pa->ai_qs.owner, sizeof(pa->ai_qs.owner), "%s",
    pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str);
  /* NOTE(review): assumes get_variable() does not return NULL here - confirm */
  snprintf(pa->ai_qs.submit_host, sizeof(pa->ai_qs.submit_host), "%s",
    get_variable(pjob, pbs_o_host));

  pa->ai_qs.num_cloned = 0;
  CLEAR_HEAD(pa->request_tokens);

  pa->ai_mutex = calloc(1, sizeof(pthread_mutex_t));

  if (pa->ai_mutex == NULL)
    {
    free(pa);

    return(PBSE_MEM_MALLOC);
    }

  pthread_mutex_init(pa->ai_mutex, NULL);

  lock_ai_mutex(pa, __func__, NULL, LOGLEVEL);

  if (job_save(pjob, SAVEJOB_FULL, 0) != 0)
    {
    /* the array is deleted in svr_job_purge */
    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

    /* log first: the job id must not be read once the job is purged */
    if (LOGLEVEL >= 6)
      {
      log_record(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        "cannot save job");
      }

    svr_job_purge(pjob);
    /* Does job array need to be removed? */

    return(1);
    }

  if ((rc = set_slot_limit(pjob->ji_wattr[JOB_ATR_job_array_request].at_val.at_str, pa)))
    {
    long max_limit = 0;
    get_svr_attr_l(SRV_ATR_MaxSlotLimit, &max_limit);

    /* log before array_delete(): it frees pa, so parent_id cannot be read
     * afterwards */
    snprintf(log_buf, sizeof(log_buf),
      "Array %s requested a slot limit above the max limit %ld, rejecting\n",
      pa->ai_qs.parent_id,
      max_limit);

    log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, pa->ai_qs.parent_id, log_buf);

    array_delete(pa);

    return(INVALID_SLOT_LIMIT);
    }

  pa->ai_qs.jobs_running = 0;
  pa->ai_qs.num_started = 0;
  pa->ai_qs.num_failed = 0;
  pa->ai_qs.num_successful = 0;

  bad_token_count = parse_array_request(
                      pjob->ji_wattr[JOB_ATR_job_array_request].at_val.at_str,
                      &(pa->request_tokens));

  /* get the number of elements that should be allocated in the array */
  rn = (array_request_node *)GET_NEXT(pa->request_tokens);
  array_size = 0;
  pa->ai_qs.num_jobs = 0;

  while (rn != NULL)
    {
    if (rn->end > array_size)
      array_size = rn->end;

    /* calculate the actual number of jobs (different from array size) */
    pa->ai_qs.num_jobs += rn->end - rn->start + 1;

    rn = (array_request_node *)GET_NEXT(rn->request_tokens_link);
    }

  /* size of array is the biggest index + 1 */
  array_size++;

  if (get_svr_attr_l(SRV_ATR_MaxArraySize, &max_array_size) == PBSE_NONE)
    {
    if (max_array_size < pa->ai_qs.num_jobs)
      {
      array_delete(pa);

      return(ARRAY_TOO_LARGE);
      }
    }

  /* initialize the array */
  pa->job_ids = calloc(array_size, sizeof(char *));

  if (pa->job_ids == NULL)
    {
    snprintf(log_buf, sizeof(log_buf), "Failed to alloc job_ids: job %s",
      pjob->ji_qs.ji_jobid);
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);

    /* NOTE(review): this path returns with pa still allocated and its
     * mutex still locked - confirm whether the caller cleans up */
    return(PBSE_MEM_MALLOC);
    }

  /* remember array_size */
  pa->ai_qs.array_size = array_size;

  CLEAR_HEAD(pa->ai_qs.deps);

  array_save(pa);

  if (bad_token_count > 0)
    {
    array_delete(pa);

    return 2;
    }

  pjob->ji_arraystruct = pa;

  insert_array(pa);

  unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

  return(PBSE_NONE);
  } /* END setup_array_struct() */
/* delete a job array struct from memory and disk. This is used when the number
 * of jobs that belong to the array becomes zero.
 *
 * Steps, in order: remove the array from the server's list, unlock and free
 * its mutex, unlink the on-disk copy, free the request tokens, the job id
 * table and the dependency lists, purge the template job named by
 * parent_id, and free the struct itself.
 *
 * @param pa - the array to delete; it must not be used after this call
 * @return always 0; a failure to unlink the on-disk file is logged but not
 *         reported to the caller
 */
int array_delete(job_array *pa)
  {
  int                      i;
  char                     path[MAXPATHLEN + 1];
  char                     log_buf[LOCAL_LOG_BUF_SIZE];
  array_request_node      *rn;

  struct array_depend     *pdep;
  struct array_depend_job *pdj;

  /* first thing to do is take this out of the servers list of all arrays */
  remove_array(pa);

  /* unlock the mutex and free it */
  unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
  free(pa->ai_mutex);

  /* delete the on disk copy of the struct */
  snprintf(path, sizeof(path), "%s%s%s",
    path_arrays, pa->ai_qs.fileprefix, ARRAY_FILE_SUFFIX);

  if (unlink(path))
    {
    /* keep errno from unlink(); formatting the message could change it */
    int saved_errno = errno;

    /* bounded: path may be as long as MAXPATHLEN */
    snprintf(log_buf, sizeof(log_buf), "unable to delete %s", path);
    log_err(saved_errno, "array_delete", log_buf);
    }

  /* clear array request linked list */
  for (rn = (array_request_node *)GET_NEXT(pa->request_tokens);
       rn != NULL;
       rn = (array_request_node *)GET_NEXT(pa->request_tokens))
    {
    delete_link(&rn->request_tokens_link);
    free(rn);
    }

  /* free the memory for the job pointers; the table may never have been
   * allocated when the array is deleted early in its setup */
  if (pa->job_ids != NULL)
    {
    for (i = 0; i < pa->ai_qs.array_size; i++)
      free(pa->job_ids[i]);

    free(pa->job_ids);
    }

  /* free the dependencies, if any */
  for (pdep = (struct array_depend *)GET_NEXT(pa->ai_qs.deps);
       pdep != NULL;
       pdep = (struct array_depend *)GET_NEXT(pa->ai_qs.deps))
    {
    delete_link(&pdep->dp_link);

    for (pdj = (struct array_depend_job *)GET_NEXT(pdep->dp_jobs);
         pdj != NULL;
         pdj = (struct array_depend_job *)GET_NEXT(pdep->dp_jobs))
      {
      delete_link(&pdj->dc_link);
      free(pdj);
      }

    free(pdep);
    }

  /* purge the "template" job, this also deletes the shared script file for the array*/
  if (pa->ai_qs.parent_id[0] != '\0')
    {
    job *pjob;

    if ((pjob = svr_find_job(pa->ai_qs.parent_id, FALSE)) != NULL)
      svr_job_purge(pjob);
    }

  /* free the memory allocated for the struct */
  free(pa);

  return 0;
  } /* END array_delete() */
/*
 * delay_and_send_sig_kill()
 *
 * Runs after the first signal of a rerun was answered by the MOM. Frees the
 * signal request, recovers the original client request (its id is carried in
 * preq_sig->rq_extend) and then:
 *  - purges the job when the MOM answered PBSE_UNKJOBID
 *  - forwards any other MOM error to the client
 *  - otherwise acknowledges the client and schedules send_sig_kill after the
 *    queue/server KillDelay
 *
 * @param preq_sig - the completed signal request; freed here
 */
void delay_and_send_sig_kill(batch_request *preq_sig)
  {
  int                   delay = 0;
  job                  *pjob;
  pbs_queue            *pque;
  struct batch_request *preq_clt = NULL;  /* original client request */
  int                   rc;
  time_t                time_now = time(NULL);
  char                 *jobid_copy;

  if (preq_sig == NULL)
    return;

  rc = preq_sig->rq_reply.brp_code;

  if (preq_sig->rq_extend != NULL)
    {
    preq_clt = get_remove_batch_request(preq_sig->rq_extend);
    }

  free_br(preq_sig);

  /* the client request has been handled another way, nothing left to do */
  if (preq_clt == NULL)
    return;

  if ((pjob = chk_job_request(preq_clt->rq_ind.rq_rerun, preq_clt)) == NULL)
    {
    /* job has gone away; chk_job_request() calls req_reject() on failure,
     * so the request must not be rejected a second time here */
    return;
    }

  if (rc)
    {
    /* mom rejected request */
    if (rc == PBSE_UNKJOBID)
      {
      /* MOM claims no knowledge, so just purge it */
      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        "MOM rejected signal during rerun");

      /* removed the resources assigned to job */
      free_nodes(pjob);

      set_resc_assigned(pjob, DECR);

      unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

      svr_job_purge(pjob);

      reply_ack(preq_clt);
      }
    else
      {
      unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

      req_reject(rc, 0, preq_clt, NULL, NULL);
      }

    return;
    }

  if ((pque = get_jobs_queue(&pjob)) != NULL)
    {
    mutex_mgr pque_mutex = mutex_mgr(pque->qu_mutex, true);

    pthread_mutex_lock(server.sv_attr_mutex);
    delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay],
                             &server.sv_attr[SRV_ATR_KillDelay],
                             0);
    pthread_mutex_unlock(server.sv_attr_mutex);
    }
  else if (pjob == NULL)
    {
    /* The job vanished during the queue lookup: there is no job mutex to
     * release. Answer the client so the request is not dropped. */
    req_reject(PBSE_UNKJOBID, 0, preq_clt, NULL, NULL);

    return;
    }

  /* copy the job id while the job is still locked */
  jobid_copy = strdup(pjob->ji_qs.ji_jobid);

  unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

  reply_ack(preq_clt);

  set_task(WORK_Timed, delay + time_now, send_sig_kill, jobid_copy, FALSE);
  }