END_TEST START_TEST(svr_find_job_test) { struct job* result = svr_find_job(NULL,0); fail_unless(result == NULL, "NULL job id input fail"); result = svr_find_job((char *)"",0); fail_unless(result == NULL, "empty job id input fail"); }
void *single_delete_work( void *vp) { int rc = -1; batch_request *preq = (batch_request *)vp; char *jobid = preq->rq_ind.rq_delete.rq_objname; job *pjob; char *Msg = preq->rq_extend; // TRUE is the same for non-heterogeneous jobs as FALSE. For heterogeneous // jobs simply delete one to trigger the other being deleted as well. pjob = svr_find_job(jobid, TRUE); if (pjob == NULL) { req_reject(PBSE_JOBNOTFOUND, 0, preq, NULL, "job unexpectedly deleted"); } else { /* mutex is freed below */ if ((rc = forced_jobpurge(pjob, preq)) == PBSE_NONE) rc = execute_job_delete(pjob, Msg, preq); if ((rc == PBSE_NONE) || (rc == PURGE_SUCCESS)) reply_ack(preq); } return(NULL); } /* END single_delete_work() */
/*
 * release_whole_array()
 *
 * Walks every slot of the array and releases each job that still exists.
 * Slots whose job can no longer be found are freed and cleared.
 *
 * @param pa   - (I/O) the array whose jobs are released
 * @param preq - (I) the release request being serviced
 * @return - PBSE_NONE on success, otherwise the first non-zero code
 *           returned by release_job()
 */
int release_whole_array(

  job_array            *pa,
  struct batch_request *preq)

  {
  for (int slot = 0; slot < pa->ai_qs.array_size; slot++)
    {
    if (pa->job_ids[slot] == NULL)
      continue;

    job *member = svr_find_job(pa->job_ids[slot], FALSE);

    if (member == NULL)
      {
      /* the job is gone: drop the stale id */
      free(pa->job_ids[slot]);
      pa->job_ids[slot] = NULL;

      continue;
      }

    /* the manager goes out of scope at the end of each iteration and on the error return */
    mutex_mgr job_mutex(member->ji_mutex, true);

    int release_rc = release_job(preq, member, pa);

    if (release_rc != 0)
      return(release_rc);
    }

  /* SUCCESS */
  return(PBSE_NONE);
  } /* END release_whole_array */
/*
 * chk_job_request()
 *
 * Looks up the job named by a request and runs the permission check on it.
 * When the job cannot be found the request is rejected with PBSE_UNKJOBID.
 *
 * @param jobid - (I) id of the job the request refers to
 * @param preq  - (I) the request being validated
 * @return - the job pointer, or NULL when the job is unknown or the
 *           permission check cleared it
 */
job *chk_job_request(

  char                 *jobid,
  struct batch_request *preq)

  {
  job *found = svr_find_job(jobid, FALSE);

  if (found != NULL)
    {
    /* chk_job_req_permissions() sets the pointer to NULL when not authorized */
    chk_job_req_permissions(&found, preq);

    return(found);
    }

  log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, jobid, pbse_to_txt(PBSE_UNKJOBID));

  req_reject(PBSE_UNKJOBID, 0, preq, NULL, "cannot locate job");

  return(NULL);
  } /* END chk_job_request() */
/*
 * lock_queue_with_job_held()
 *
 * Acquires the queue's mutex while the caller holds a job. A try-lock is
 * attempted first; if that fails the job is released, the queue is locked
 * with a blocking call, and the job is looked up again by id.
 *
 * @param pque     - the queue to lock (may be NULL)
 * @param pjob_ptr - in/out pointer to the held job; set to NULL when the
 *                   job can no longer be found after the re-lookup
 * @return - the locked queue, or NULL if pque was NULL or the job vanished
 *           (in which case the queue has been unlocked again)
 */
pbs_queue *lock_queue_with_job_held(

  pbs_queue  *pque,
  job       **pjob_ptr)

  {
  char  saved_id[PBS_MAXSVRJOBID + 1];
  job  *held = *pjob_ptr;

  if (pque == NULL)
    return(pque);

  if (LOGLEVEL >= 10 )
    log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_QUEUE, __func__, pque->qu_qs.qu_name);

  /* fast path: a zero return means the queue was acquired without blocking */
  if (pthread_mutex_trylock(pque->qu_mutex) == 0)
    return(pque);

  /* slow path: remember the id, drop the job, then block on the queue */
  strcpy(saved_id, held->ji_qs.ji_jobid);
  unlock_ji_mutex(held, __func__, "1", LOGLEVEL);
  lock_queue(pque, __func__, NULL, LOGLEVEL);

  held = svr_find_job(saved_id, TRUE);

  if (held == NULL)
    {
    /* the job disappeared while it was unlocked */
    unlock_queue(pque, __func__, NULL, 0);
    *pjob_ptr = NULL;

    return(NULL);
    }

  return(pque);
  } /* END lock_queue_with_job_held() */
/*
 * lock_queue_with_job_held()
 *
 * Acquires the queue's mutex while the caller holds a job. A try-lock is
 * attempted first; if that fails the job is released, the queue is locked
 * with a blocking call, and the job is looked up again by id.
 *
 * @param pque     - the queue to lock (may be NULL)
 * @param pjob_ptr - in/out pointer to the held job; set to NULL when the
 *                   job can no longer be found after the re-lookup
 * @return - the locked queue, or NULL if pque was NULL or the job vanished
 *           (in which case the queue has been unlocked again)
 */
pbs_queue *lock_queue_with_job_held(

  pbs_queue  *pque,
  job       **pjob_ptr)

  {
  /* + 1 leaves room for the terminating NUL of a maximum-length job id */
  char  jobid[PBS_MAXSVRJOBID + 1];
  job  *pjob = *pjob_ptr;

  if (pque != NULL)
    {
    if (pthread_mutex_trylock(pque->qu_mutex))
      {
      /* try-lock failed: remember the id, drop the job, block on the queue */
      strcpy(jobid, pjob->ji_qs.ji_jobid);
      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
      lock_queue(pque, __func__, NULL, LOGLEVEL);

      if ((pjob = svr_find_job(jobid, TRUE)) == NULL)
        {
        /* the job disappeared while it was unlocked */
        unlock_queue(pque, __func__, NULL, 0);
        pque = NULL;
        *pjob_ptr = NULL;
        }
      }
    }

  return(pque);
  } /* END lock_queue_with_job_held() */
/* * check_exiting_jobs() * * loops over the recorded exiting job information and retries * jobs that have been stale long enough. */ int check_exiting_jobs() { exiting_jobs_info_iterator *iter = NULL; char *jobid; job *pjob; while ((jobid = get_next_retryable_jobid(&iter)) != NULL) { if ((pjob = svr_find_job(jobid, TRUE)) == NULL) { remove_from_exiting_list_by_jobid(jobid); free(jobid); } else { mutex_mgr pjob_mutex = mutex_mgr(pjob->ji_mutex, true); if (pjob->ji_qs.ji_state == JOB_STATE_COMPLETE) { remove_from_exiting_list_by_jobid(jobid); free(jobid); } else { pjob_mutex.unlock(); /* jobid is freed in on_job_exit() */ retry_job_exit(jobid); } } } /* END loop over exiting job information */ return(PBSE_NONE); } /* END check_exiting_jobs() */
int record_reservation( struct pbsnode *pnode, char *rsv_id) { struct pbssubn *sub_node; job *pjob; int found_job = FALSE; for (sub_node = pnode->nd_psn; sub_node != NULL; sub_node = sub_node->next) { if (sub_node->jobs != NULL) { if ((pjob = svr_find_job(sub_node->jobs->jobid, TRUE)) != NULL) { pjob->ji_wattr[JOB_ATR_reservation_id].at_val.at_str = strdup(rsv_id); pjob->ji_wattr[JOB_ATR_reservation_id].at_flags = ATR_VFLAG_SET; create_alps_reservation(pjob); found_job = TRUE; unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); break; } } } if (found_job == FALSE) return(-1); return(PBSE_NONE); } /* END record_reservation() */
/*
 * retry_job_exit()
 *
 * Counts another exit attempt for the tracked job. Below the attempt limit
 * the exit processing is started again; once the limit is reached the job
 * is force-purged (if it still exists) and its entry is removed.
 *
 * @param jeri - the retry record for the exiting job
 * @return - always PBSE_NONE
 */
int retry_job_exit(

  job_exiting_retry_info *jeri)

  {
  jeri->attempts++;

  if (jeri->attempts < MAX_EXITING_RETRY_ATTEMPTS)
    {
    char msg[LOCAL_LOG_BUF_SIZE];

    snprintf(msg, sizeof(msg), "Retrying job exiting for job %s", jeri->jobid);
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, msg);

    jeri->last_attempt = time(NULL);

    /* on_job_exit() receives its own copy of the id */
    on_job_exit(NULL, strdup(jeri->jobid));

    return(PBSE_NONE);
    }

  /* job has been attempted the maximum number of times. Destroy the job */
  job *stale = svr_find_job(jeri->jobid, TRUE);

  if (stale != NULL)
    force_purge_work(stale);

  remove_entry_from_exiting_list(jeri);

  return(PBSE_NONE);
  } /* END retry_job_exit() */
int handle_single_delete( struct batch_request *preq, struct batch_request *preq_tmp, char *Msg) { char *jobid = preq->rq_ind.rq_delete.rq_objname; job *pjob = svr_find_job(jobid, FALSE); if (pjob == NULL) { log_event(PBSEVENT_DEBUG,PBS_EVENTCLASS_JOB,jobid,pbse_to_txt(PBSE_UNKJOBID)); req_reject(PBSE_UNKJOBID, 0, preq, NULL, "cannot locate job"); } else { unlock_ji_mutex(pjob, __func__, NULL, 0); /* send the asynchronous reply if needed */ if (preq_tmp != NULL) { reply_ack(preq_tmp); preq->rq_noreply = TRUE; /* set for no more replies */ enqueue_threadpool_request(single_delete_work, preq); } else single_delete_work(preq); } return(PBSE_NONE); } /* END handle_single_delete() */
/*
 * get_next_status_job()
 *
 * Returns the next job to report for a status request, choosing the source
 * container from the request type held in cntl->sc_type.
 *
 * @param cntl            - status control block; sc_type selects the source
 * @param job_array_index - in/out cursor used only for array requests; it is
 *                          advanced to the slot of the returned job, or to
 *                          the array size when no further job is found
 * @param pa              - the array being walked (array requests only)
 * @param iter            - iterator used for every non-array source
 * @return - the next job, or NULL when the source is exhausted
 */
job *get_next_status_job(

  struct stat_cntl  *cntl,
  int               &job_array_index,
  job_array         *pa,
  all_jobs_iterator *iter)

  {
  if (cntl->sc_type == tjstQueue)
    return(next_job(cntl->sc_pque->qu_jobs,iter));

  if (cntl->sc_type == tjstSummarizeArraysQueue)
    return(next_job(cntl->sc_pque->qu_jobs_array_sum,iter));

  if (cntl->sc_type == tjstSummarizeArraysServer)
    return(next_job(&array_summary,iter));

  if (cntl->sc_type != tjstArray)
    return(next_job(&alljobs, iter));

  /* array request: advance the cursor until a slot yields a job or we hit the end */
  for (++job_array_index; job_array_index < pa->ai_qs.array_size; ++job_array_index)
    {
    if (pa->job_ids[job_array_index] == NULL)
      continue;

    job *candidate = svr_find_job(pa->job_ids[job_array_index], FALSE);

    if (candidate != NULL)
      return(candidate);
    }

  return(NULL);
  } /* END get_next_status_job() */
/* * send_sig_kill * * The SIGTERM has been sent and we've waited for the kill_delay so now send the SIGKILL. * @pre-cond: pwt must point to a valid task * @pre-cond: pwt->wt_parm1 must point to a valid character string * */ void send_sig_kill( struct work_task *pwt) { job *pjob; char *job_id = (char *)pwt->wt_parm1; static const char *rerun = "rerun"; free(pwt->wt_mutex); free(pwt); if (job_id == NULL) return; if ((pjob = svr_find_job(job_id, FALSE)) == NULL) { free(job_id); return; } char *extra = strdup(rerun); free(job_id); if (issue_signal(&pjob, "SIGKILL", post_rerun, extra, NULL) == 0) { pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN; pjob->ji_qs.ji_svrflags = (pjob->ji_qs.ji_svrflags & ~(JOB_SVFLG_CHECKPOINT_FILE |JOB_SVFLG_CHECKPOINT_MIGRATEABLE | JOB_SVFLG_CHECKPOINT_COPIED)) | JOB_SVFLG_HASRUN; } unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL); } /* END send_sig_kill() */
/*
 * post_rerun()
 *
 * Callback run with the reply to a rerun signal request. When the reply
 * carries a non-zero code the rejection is logged and, if the job still
 * exists, its state is re-evaluated and set again.
 *
 * @param preq - the signal request carrying the reply (may be NULL)
 */
void post_rerun(

  batch_request *preq)

  {
  int   newstate;
  int   newsub;
  job  *pjob;
  char  log_buf[LOCAL_LOG_BUF_SIZE];

  if (preq == NULL)
    return;

  /* a zero reply code needs no follow-up */
  if (preq->rq_reply.brp_code == 0)
    return;

  /* bounded write so the message can never overrun log_buf */
  snprintf(log_buf, sizeof(log_buf), "rerun signal reject by mom: %s - %d",
    preq->rq_ind.rq_signal.rq_jid,
    preq->rq_reply.brp_code);

  log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,__func__,log_buf);

  if ((pjob = svr_find_job(preq->rq_ind.rq_signal.rq_jid, FALSE)))
    {
    mutex_mgr job_mutex(pjob->ji_mutex, true);

    /* newstate and newsub are filled in by svr_evaljobstate() */
    svr_evaljobstate(*pjob, newstate, newsub, 1);
    svr_setjobstate(pjob, newstate, newsub, FALSE);
    }

  return;
  } /* END post_rerun() */
/*
 * release_whole_array()
 *
 * Walks every slot of the array and releases each job that still exists.
 * Slots whose job can no longer be found are freed and cleared. The job is
 * unlocked after release_job() on both the success and the failure path.
 *
 * @param pa   - (I/O) the array whose jobs are released
 * @param preq - (I) the release request being serviced
 * @return - PBSE_NONE on success, otherwise the first non-zero code
 *           returned by release_job()
 */
int release_whole_array(

  job_array            *pa,
  struct batch_request *preq)

  {
  int slot;

  for (slot = 0; slot < pa->ai_qs.array_size; slot++)
    {
    job *member;
    int  release_rc;

    if (pa->job_ids[slot] == NULL)
      continue;

    member = svr_find_job(pa->job_ids[slot], FALSE);

    if (member == NULL)
      {
      /* the job is gone: drop the stale id */
      free(pa->job_ids[slot]);
      pa->job_ids[slot] = NULL;

      continue;
      }

    release_rc = release_job(preq, member);

    if (release_rc != 0)
      {
      unlock_ji_mutex(member, __func__, (char *)"1", LOGLEVEL);

      return(release_rc);
      }

    unlock_ji_mutex(member, __func__, (char *)"2", LOGLEVEL);
    }

  /* SUCCESS */
  return(PBSE_NONE);
  } /* END release_whole_array */
void *single_delete_work( void *vp) { int rc = -1; batch_request *preq = (batch_request *)vp; char *jobid = preq->rq_ind.rq_delete.rq_objname; job *pjob; char *Msg = preq->rq_extend; pjob = svr_find_job(jobid, FALSE); if (pjob == NULL) { req_reject(PBSE_JOBNOTFOUND, 0, preq, NULL, "job unexpectedly deleted"); } else { /* mutex is freed below */ if ((rc = forced_jobpurge(pjob, preq)) == PBSE_NONE) rc = execute_job_delete(pjob, Msg, preq); if ((rc == PBSE_NONE) || (rc == PURGE_SUCCESS)) reply_ack(preq); } return(NULL); } /* END single_delete_work() */
int check_exiting_jobs() { int iter = -1; job_exiting_retry_info *jeri; job *pjob; time_t time_now = time(NULL); while ((jeri = (job_exiting_retry_info *)next_from_hash_map(exiting_jobs_info, &iter)) != NULL) { if (time_now - jeri->last_attempt > EXITING_RETRY_TIME) { if ((pjob = svr_find_job(jeri->jobid, TRUE)) == NULL) { remove_entry_from_exiting_list(jeri); } else { if (pjob->ji_qs.ji_state == JOB_STATE_COMPLETE) { remove_entry_from_exiting_list(jeri); unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL); } else { unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL); retry_job_exit(jeri); } } } } return(PBSE_NONE); } /* END check_exiting_jobs() */
int is_orphaned( char *rsv_id) { int index; int orphaned = FALSE; job *pjob; alps_reservation *ar = NULL; pthread_mutex_lock(alps_reservations.rh_mutex); index = get_value_hash(alps_reservations.rh_ht, rsv_id); if (index != -1) ar = (alps_reservation *)alps_reservations.rh_alps_rsvs->slots[index].item; pthread_mutex_unlock(alps_reservations.rh_mutex); if (ar != NULL) { if ((pjob = svr_find_job(ar->job_id, TRUE)) != NULL) { if (pjob->ji_qs.ji_state == JOB_STATE_COMPLETE) orphaned = TRUE; unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); } else orphaned = TRUE; } else orphaned = TRUE; return(orphaned); } /* END is_orphaned() */
/*
 * process_checkpoint_reply()
 *
 * Handles the reply to a checkpoint request: restores the client socket on
 * the request, then either rejects it (job unknown) or writes a checkpoint
 * accounting record and acknowledges it.
 *
 * @param preq - the checkpoint request; NULL means it was handled previously
 */
void process_checkpoint_reply(

  batch_request *preq)

  {
  /* preq handled previously */
  if (preq == NULL)
    return;

  preq->rq_conn = preq->rq_orgconn;  /* restore client socket */

  job *checkpointed = svr_find_job(preq->rq_ind.rq_manager.rq_objname, FALSE);

  if (checkpointed == NULL)
    {
    log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, preq->rq_ind.rq_manager.rq_objname, msg_postmomnojob);

    req_reject(PBSE_UNKJOBID, 0, preq, NULL, msg_postmomnojob);

    return;
    }

  /* the manager stays in scope until after the acknowledgement is sent */
  mutex_mgr job_mutex = mutex_mgr(checkpointed->ji_mutex, true);

  /* record that MOM has a checkpoint file; note in accounting file */
  account_record(PBS_ACCT_CHKPNT, checkpointed, "Checkpointed");

  reply_ack(preq);
  } /* END process_checkpoint_reply() */
static void job_delete_nanny( struct work_task *pwt) { job *pjob; char *sigk = "SIGKILL"; char *jobid; struct batch_request *newreq; char log_buf[LOCAL_LOG_BUF_SIZE]; time_t time_now = time(NULL); long nanny = FALSE; /* short-circuit if nanny isn't enabled */ get_svr_attr_l(SRV_ATR_JobNanny, &nanny); if (!nanny) { jobid = (char *)pwt->wt_parm1; if (jobid != NULL) { pjob = svr_find_job(jobid, FALSE); if (pjob != NULL) { sprintf(log_buf, "exiting job '%s' still exists, sending a SIGKILL", pjob->ji_qs.ji_jobid); log_err(-1, "job nanny", log_buf); /* build up a Signal Job batch request */ if ((newreq = alloc_br(PBS_BATCH_SignalJob)) != NULL) { strcpy(newreq->rq_ind.rq_signal.rq_jid, pjob->ji_qs.ji_jobid); snprintf(newreq->rq_ind.rq_signal.rq_signame, sizeof(newreq->rq_ind.rq_signal.rq_signame), "%s", sigk); } issue_signal(&pjob, sigk, post_job_delete_nanny, newreq); if (pjob != NULL) { apply_job_delete_nanny(pjob, time_now + 60); unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); } } } else { log_err(ENOMEM, __func__, "Cannot allocate memory"); } } if (pwt->wt_parm1 != NULL) free(pwt->wt_parm1); free(pwt->wt_mutex); free(pwt); } /* END job_delete_nanny() */
/*
 * post_job_delete_nanny()
 *
 * Callback run with the reply to the nanny's SIGKILL request. If the reply
 * code is PBSE_UNKJOBID the job's nodes and resources are released and the
 * job is purged. Otherwise the job, when found, is simply unlocked. The
 * request is freed on every path.
 *
 * @param preq_sig - the signal request carrying the reply (may be NULL)
 */
void post_job_delete_nanny(

  batch_request *preq_sig)

  {
  int   rc;
  job  *pjob;
  char  log_buf[LOCAL_LOG_BUF_SIZE];
  long  nanny = 0;

  if (preq_sig == NULL)
    return;

  rc = preq_sig->rq_reply.brp_code;

  get_svr_attr_l(SRV_ATR_JobNanny, &nanny);

  if (!nanny)
    {
    /* the admin disabled nanny within the last minute or so */
    free_br(preq_sig);

    return;
    }

  /* look the job up by the id carried in the signal request */
  pjob = svr_find_job(preq_sig->rq_ind.rq_signal.rq_jid, FALSE);

  if (pjob == NULL)
    {
    snprintf(log_buf, sizeof(log_buf), "job delete nanny: the job disappeared (this is a BUG!)");

    log_event(PBSEVENT_ERROR,PBS_EVENTCLASS_JOB,preq_sig->rq_ind.rq_signal.rq_jid,log_buf);
    }
  else if (rc == PBSE_UNKJOBID)
    {
    snprintf(log_buf, sizeof(log_buf), "job delete nanny returned, but does not exist on mom");

    log_event(PBSEVENT_ERROR,PBS_EVENTCLASS_JOB,preq_sig->rq_ind.rq_signal.rq_jid,log_buf);

    free_nodes(pjob);

    set_resc_assigned(pjob, DECR);

    free_br(preq_sig);

    /* the job is purged, so it is not unlocked here */
    svr_job_purge(pjob);

    return;
    }
  else
    {
    /* only unlock a job that was actually found */
    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
    }

  /* free task */
  free_br(preq_sig);

  return;
  } /* END post_job_delete_nanny() */
/*
 * svr_find_job_by_id()
 *
 * Looks a job up by its internal numeric id: the id is translated to a job
 * name through job_mapper and the name is passed to svr_find_job().
 *
 * @param internal_job_id - the numeric id to resolve
 * @return - whatever svr_find_job() returns for the mapped name
 */
job *svr_find_job_by_id(

  int internal_job_id)

  {
  return(svr_find_job(job_mapper.get_name(internal_job_id), TRUE));
  } /* END svr_find_job_by_id() */
/*
 * modify_whole_array()
 *
 * Applies a modify request to every job of an array that still exists.
 * Slots whose job can no longer be found are freed and cleared. Each job
 * gets its own duplicate of the request with replies disabled.
 *
 * @param pa             - (I/O) the array whose jobs are modified
 * @param plist          - (I) the attribute list to apply
 * @param preq           - (I) the original modify request
 * @param checkpoint_req - (I) passed through to modify_job()
 * @return - PBSE_JOB_RECYCLED if the array cannot be re-acquired after a
 *           job was modified; otherwise the last non-PBSE_NONE code from
 *           modify_job(), or PBSE_NONE if every call succeeded
 */
int modify_whole_array(

  job_array            *pa,             /* I/O */
  svrattrl             *plist,          /* I */
  struct batch_request *preq,           /* I */
  int                   checkpoint_req) /* I */

  {
  int   i;
  int   rc = PBSE_NONE;
  int   modify_job_rc = PBSE_NONE;
  job  *pjob;

  for (i = 0; i < pa->ai_qs.array_size; i++)
    {
    if (pa->job_ids[i] == NULL)
      continue;

    if ((pjob = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
      {
      /* the job is gone: drop the stale id */
      free(pa->job_ids[i]);
      pa->job_ids[i] = NULL;
      }
    else
      {
      /* NO_MOM_RELAY will prevent modify_job from calling relay_to_mom */
      /* NOTE(review): the result of duplicate_request() is used without a
       * NULL check - confirm it cannot fail */
      batch_request *array_req = duplicate_request(preq, i);
      mutex_mgr      job_mutex(pjob->ji_mutex, true);

      /* the array is released while the job is being modified */
      pthread_mutex_unlock(pa->ai_mutex);

      array_req->rq_noreply = TRUE;

      rc = modify_job((void **)&pjob, plist, array_req, checkpoint_req, NO_MOM_RELAY);

      /* remember the failure but keep going with the remaining jobs */
      if (rc != PBSE_NONE)
        {
        modify_job_rc = rc;
        }

      /* presumably re-acquires the array and may clear pjob - confirm
       * against get_jobs_array() */
      pa = get_jobs_array(&pjob);

      if (pa == NULL)
        {
        /* with no job left the manager must not touch its mutex on exit */
        if (pjob == NULL)
          job_mutex.set_lock_on_exit(false);

        return(PBSE_JOB_RECYCLED);
        }

      if (pjob == NULL)
        {
        /* the job went away: clear its slot (the id string is not freed here) */
        pa->job_ids[i] = NULL;
        job_mutex.set_lock_on_exit(false);

        continue;
        }
      }
    } /* END foreach job in array */

  return(modify_job_rc);
  } /* END modify_whole_array() */
/*
 * post_modify_req()
 *
 * Handles the reply to a modify request that was relayed onward. Any error
 * code other than PBSE_UNKJOBID is logged and the request is rejected with
 * it. PBSE_UNKJOBID is acknowledged as long as the job is still known
 * locally; otherwise the request is rejected with that code.
 *
 * @param preq - the modify request carrying the reply (may be NULL)
 */
void post_modify_req(

  batch_request *preq)

  {
  char log_buf[LOCAL_LOG_BUF_SIZE];

  if (preq == NULL)
    return;

  preq->rq_conn = preq->rq_orgconn;  /* restore socket to client */

  /* a real failure: log it and pass the code back */
  if ((preq->rq_reply.brp_code != 0) &&
      (preq->rq_reply.brp_code != PBSE_UNKJOBID))
    {
    sprintf(log_buf, msg_mombadmodify, preq->rq_reply.brp_code);

    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      preq->rq_ind.rq_modify.rq_objname,
      log_buf);

    req_reject(preq->rq_reply.brp_code, 0, preq, NULL, NULL);

    return;
    }

  if (preq->rq_reply.brp_code == PBSE_UNKJOBID)
    {
    job *modified = svr_find_job(preq->rq_ind.rq_modify.rq_objname, FALSE);

    if (modified == NULL)
      {
      req_reject(preq->rq_reply.brp_code, 0, preq, NULL, NULL);

      return;
      }

    /* the manager goes out of scope before the acknowledgement below */
    mutex_mgr job_mutex(modified->ji_mutex, true);

    if (LOGLEVEL >= 0)
      {
      sprintf(log_buf, "post_modify_req: PBSE_UNKJOBID for job %s in state %s-%s, dest = %s",
        modified->ji_qs.ji_jobid,
        PJobState[modified->ji_qs.ji_state],
        PJobSubState[modified->ji_qs.ji_substate],
        modified->ji_qs.ji_destin);

      log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,modified->ji_qs.ji_jobid,log_buf);
      }
    }

  reply_ack(preq);

  return;
  } /* END post_modify_req() */
/* * record_reservation() * * @pre-cond: pnode and rsv_id must be valid pointers * @post-cond: the reservation will be recorded in pbs_server's tracking mechanism * and on the job which has the node reserved, or -1 is returned and the reservation * is not recorded. * @param - pnode the node which is reporting the reservation * @param - rsv_id the id of the reservation being reported * @return - PBSE_NONE if the reservation was successfully recorded, -1 otherwise */ int record_reservation( struct pbsnode *pnode, const char *rsv_id) { job *pjob; bool found_job = false; char jobid[PBS_MAXSVRJOBID + 1]; for (unsigned int i = 0; i < pnode->nd_job_usages.size(); i++) { /* cray only allows one job per node, so any valid job will be the job that is * reserving this node. */ job_usage_info *jui = pnode->nd_job_usages[i]; strcpy(jobid, jui->jobid); unlock_node(pnode, __func__, NULL, LOGLEVEL); if ((pjob = svr_find_job(jobid, TRUE)) != NULL) { mutex_mgr job_mutex(pjob->ji_mutex, true); pjob->ji_wattr[JOB_ATR_reservation_id].at_val.at_str = strdup(rsv_id); pjob->ji_wattr[JOB_ATR_reservation_id].at_flags = ATR_VFLAG_SET; /* add environment variable BATCH_PARTITION_ID */ char buf[1024]; snprintf(buf, sizeof(buf), "BATCH_PARTITION_ID=%s", rsv_id); pbs_attribute tempattr; clear_attr(&tempattr, &job_attr_def[JOB_ATR_variables]); job_attr_def[JOB_ATR_variables].at_decode(&tempattr, NULL, NULL, buf, 0); job_attr_def[JOB_ATR_variables].at_set( &pjob->ji_wattr[JOB_ATR_variables], &tempattr, INCR); job_attr_def[JOB_ATR_variables].at_free(&tempattr); track_alps_reservation(pjob); found_job = true; job_mutex.unlock(); lock_node(pnode, __func__, NULL, LOGLEVEL); break; } else lock_node(pnode, __func__, NULL, LOGLEVEL); } if (found_job == false) return(-1); return(PBSE_NONE); } /* END record_reservation() */
void finish_move_process( char *job_id, batch_request *preq, long time, const char *node_name, int status, int type, int mom_err) { char log_buf[LOCAL_LOG_BUF_SIZE+1]; job *pjob = svr_find_job(job_id, TRUE); if (pjob == NULL) { /* somehow the job has been deleted mid-runjob */ snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Job %s was deleted while servicing move request", job_id); if (preq != NULL) { if (mom_err != PBSE_NONE) req_reject(mom_err, 0, preq, node_name, log_buf); else req_reject(PBSE_JOBNOTFOUND, 0, preq, node_name, log_buf); } } else { mutex_mgr job_mutex(pjob->ji_mutex, true); switch (type) { case MOVE_TYPE_Move: finish_moving_processing(pjob, preq, status); break; case MOVE_TYPE_Route: finish_routing_processing(pjob, status); break; case MOVE_TYPE_Exec: job_mutex.unlock(); finish_sendmom(job_id, preq, time, node_name, status, mom_err); break; } /* END switch (type) */ } } /* END finish_move_process() */
/*
 * remove_job_from_exiting_list()
 *
 * Removes the job from the exiting list. The job is unlocked before the
 * removal and looked up again by id afterwards, so the caller's pointer is
 * replaced with the result of that second lookup.
 *
 * @param pjob - in/out pointer to the held job; overwritten with the
 *               result of the re-lookup, which may be NULL
 * @return - the result of remove_from_exiting_list_by_jobid()
 */
int remove_job_from_exiting_list(

  job **pjob)

  {
  job *held = *pjob;

  /* keep a private copy of the id; the job is about to be released */
  std::string saved_id(held->ji_qs.ji_jobid);

  unlock_ji_mutex(held,__func__, NULL, LOGLEVEL);

  int removal_rc = remove_from_exiting_list_by_jobid(saved_id.c_str());

  *pjob = svr_find_job((char *)saved_id.c_str(),FALSE);

  return removal_rc;
  } /* END remove_job_from_exiting_list() */
/** * poll _job_task * * The invocation of this routine is triggered from * the pbs_server main_loop code. The check of * SRV_ATR_PollJobs appears to be redundant. */ void poll_job_task( struct work_task *ptask) { char *job_id = (char *)ptask->wt_parm1; job *pjob; time_t time_now = time(NULL); long poll_jobs = 0; int job_state = -1; if (job_id != NULL) { pjob = svr_find_job(job_id, FALSE); if (pjob != NULL) { mutex_mgr job_mutex(pjob->ji_mutex, true); job_state = pjob->ji_qs.ji_state; job_mutex.unlock(); get_svr_attr_l(SRV_ATR_PollJobs, &poll_jobs); if ((poll_jobs) && (job_state == JOB_STATE_RUNNING)) { /* we need to throttle the number of outstanding threads are doing job polling. This prevents a problem where pbs_server gets hung waiting on I/O from the mom */ pthread_mutex_lock(poll_job_task_mutex); if (current_poll_job_tasks < max_poll_job_tasks) { current_poll_job_tasks++; pthread_mutex_unlock(poll_job_task_mutex); stat_mom_job(job_id); pthread_mutex_lock(poll_job_task_mutex); current_poll_job_tasks--; } pthread_mutex_unlock(poll_job_task_mutex); /* add another task */ set_task(WORK_Timed, time_now + JobStatRate, poll_job_task, strdup(job_id), FALSE); } } free(job_id); } free(ptask->wt_mutex); free(ptask); } /* END poll_job_task() */
/*
 * delete_whole_array()
 *
 * iterates over the array and deletes the whole thing. Slots whose job can
 * no longer be found are freed and cleared; jobs at or beyond the exiting
 * state are left alone.
 *
 * @param pa - the array to be deleted
 * @return - the number of jobs whose delete attempt failed, or
 *           NO_JOBS_IN_ARRAY when no slot held a job that could be found
 */
int delete_whole_array(

  job_array *pa) /* I */

  {
  int slot;
  int skipped = 0;
  int live_jobs = 0;

  for (slot = 0; slot < pa->ai_qs.array_size; slot++)
    {
    job *member;

    if (pa->job_ids[slot] == NULL)
      continue;

    member = svr_find_job(pa->job_ids[slot], FALSE);

    if (member == NULL)
      {
      free(pa->job_ids[slot]);
      pa->job_ids[slot] = NULL;

      continue;
      }

    live_jobs++;

    if (member->ji_qs.ji_state >= JOB_STATE_EXITING)
      {
      /* invalid state for request, skip */
      unlock_ji_mutex(member, __func__, "1", LOGLEVEL);

      continue;
      }

    /* the array is released while the delete is attempted */
    pthread_mutex_unlock(pa->ai_mutex);

    if (attempt_delete(member) == FALSE)
      {
      /* if the job was deleted, this mutex would be taken care of elsewhere.
       * When it fails, release it here */
      unlock_ji_mutex(member, __func__, "2", LOGLEVEL);

      skipped++;
      }

    pthread_mutex_lock(pa->ai_mutex);
    }

  if (live_jobs == 0)
    return(NO_JOBS_IN_ARRAY);

  return(skipped);
  } /* END delete_whole_array() */
/*
 * get_next_retryable_jobid()
 *
 * Advances through the exiting-jobs container and returns the id of the
 * next job whose last exit attempt is older than EXITING_RETRY_TIME.
 * Entries that have already reached MAX_EXITING_RETRY_ATTEMPTS are removed
 * along the way and their jobs, if still present, are force-purged.
 *
 * @param iter - in/out iterator; created on the first call when *iter is
 *               NULL and reused on later calls
 * @return - a strdup()'d job id for the caller to free, or NULL when no
 *           further retryable entry exists
 */
char *get_next_retryable_jobid(

  exiting_jobs_info_iterator **iter)

  {
  job_exiting_retry_info *jeri;
  job                    *pjob;
  time_t                  time_now = time(NULL);
  char                    log_buf[LOCAL_LOG_BUF_SIZE];

  exiting_jobs_info.lock();

  if (*iter == NULL)
    {
    *iter = exiting_jobs_info.get_iterator();
    }

  while ((jeri = (*iter)->get_next_item()) != NULL)
    {
    /* not stale yet */
    if (time_now - jeri->last_attempt <= EXITING_RETRY_TIME)
      continue;

    if (jeri->attempts >= MAX_EXITING_RETRY_ATTEMPTS)
      {
      /* keep a private copy of the id: the record is freed just below and
       * must not be read after that */
      std::string jid(jeri->jobid);

      exiting_jobs_info.remove(jeri->jobid);
      free(jeri);

      /* the container is released while the job is looked up and purged */
      exiting_jobs_info.unlock();

      if ((pjob = svr_find_job((char *)jid.c_str(), TRUE)) != NULL)
        {
        snprintf(log_buf, sizeof(log_buf),
          "Job %s has had its exiting re-tried %d times, purging.",
          jid.c_str(), MAX_EXITING_RETRY_ATTEMPTS);

        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);

        force_purge_work(pjob);
        }

      exiting_jobs_info.lock();
      }
    else
      {
      jeri->attempts++;
      jeri->last_attempt = time_now;

      /* copy the id while the container is still locked */
      char *jobid = strdup(jeri->jobid);

      exiting_jobs_info.unlock();

      return(jobid);
      }
    }

  exiting_jobs_info.unlock();

  return(NULL);
  } /* END get_next_retryable_jobid() */
/*
 * issue_signal()
 *
 * Builds a Signal Job batch request for the given job and relays it to the
 * mom. On success the job is unlocked, the callback is invoked with the
 * request, and the job is looked up again by id.
 *
 * @param pjob_ptr - in/out pointer to the job to signal; on success it is
 *                   replaced with the result of the re-lookup (possibly
 *                   NULL), on failure it is set to NULL only if
 *                   relay_to_mom() cleared the job
 * @param signame  - name of the signal to send
 * @param func     - callback invoked with the request after a successful relay
 * @param extra    - stored in the request's rq_extra field
 * @return - PBSE_SYSTEM if the request cannot be allocated, otherwise the
 *           return code of relay_to_mom()
 */
int issue_signal(

  job  **pjob_ptr,
  char  *signame, /* name of the signal to send */
  void  (*func)(batch_request *),
  void  *extra) /* extra parameter to be stored in sig request */

  {
  int                   rc;
  job                  *pjob = *pjob_ptr;

  struct batch_request *newreq;
  char                  jobid[PBS_MAXSVRJOBID + 1];

  /* build up a Signal Job batch request */
  if ((newreq = alloc_br(PBS_BATCH_SignalJob)) == NULL)
    {
    /* FAILURE */
    return(PBSE_SYSTEM);
    }

  newreq->rq_extra = extra;

  strcpy(newreq->rq_ind.rq_signal.rq_jid, pjob->ji_qs.ji_jobid);

  /* bounded copy of the signal name into the request */
  snprintf(newreq->rq_ind.rq_signal.rq_signame, sizeof(newreq->rq_ind.rq_signal.rq_signame), "%s", signame);

  /* The newreq is freed in relay_to_mom (failure)
   * or in issue_Drequest (success) */
  /* NOTE(review): the failure branch below also calls free_br(newreq), which
   * does not match the note above - confirm which side owns the request */
  rc = relay_to_mom(&pjob, newreq, NULL);

  if ((rc == PBSE_NONE) &&
      (pjob != NULL))
    {
    /* remember the id, release the job, run the callback, then find the job again */
    strcpy(jobid, pjob->ji_qs.ji_jobid);
    unlock_ji_mutex(pjob, __func__, NULL, 0);
    func(newreq);

    *pjob_ptr = svr_find_job((char *)jobid, TRUE);
    }
  else
    {
    free_br(newreq);

    /* tell the caller when relay_to_mom() cleared the job */
    if (pjob == NULL)
      *pjob_ptr = NULL;
    }

  return(rc);
  } /* END issue_signal() */