static void post_modify_req( struct work_task *pwt) { struct batch_request *preq; job *pjob; svr_disconnect(pwt->wt_event); /* close connection to MOM */ preq = pwt->wt_parm1; preq->rq_conn = preq->rq_orgconn; /* restore socket to client */ if ((preq->rq_reply.brp_code) && (preq->rq_reply.brp_code != PBSE_UNKJOBID)) { sprintf(log_buffer, msg_mombadmodify, preq->rq_reply.brp_code); log_event( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, preq->rq_ind.rq_modify.rq_objname, log_buffer); req_reject(preq->rq_reply.brp_code, 0, preq, NULL, NULL); } else { if (preq->rq_reply.brp_code == PBSE_UNKJOBID) { if ((pjob = find_job(preq->rq_ind.rq_modify.rq_objname)) == NULL) { req_reject(preq->rq_reply.brp_code, 0, preq, NULL, NULL); return; } else { if (LOGLEVEL >= 0) { sprintf(log_buffer, "post_modify_req: PBSE_UNKJOBID for job %s in state %s-%s, dest = %s", (pjob->ji_qs.ji_jobid != NULL) ? pjob->ji_qs.ji_jobid : "", PJobState[pjob->ji_qs.ji_state], PJobSubState[pjob->ji_qs.ji_substate], pjob->ji_qs.ji_destin); LOG_EVENT( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer); } } } reply_ack(preq); } return; } /* END post_modify_req() */
/*
 * release_req - work-task callback that discards a finished request.
 *
 * Frees the batch request stored in wt_parm1, then closes the connection
 * recorded in wt_event unless there is none (-1) or the task is flagged
 * as rpp (wt_aux2 == 1).
 */
void release_req(struct work_task *pwt)
{
	struct batch_request *done_req = (struct batch_request *)pwt->wt_parm1;

	free_br(done_req);

	if (pwt->wt_event == -1)
		return;		/* no connection recorded */

	if (pwt->wt_aux2 == 1)
		return;		/* rpp: nothing to disconnect here */

	svr_disconnect(pwt->wt_event);
}
static void post_message_req( struct work_task *pwt) { struct batch_request *preq; char log_buf[LOCAL_LOG_BUF_SIZE]; svr_disconnect(pwt->wt_event); /* close connection to MOM */ preq = get_remove_batch_request(pwt->wt_parm1); free(pwt->wt_mutex); free(pwt); /* preq has been hadnled previously */ if (preq == NULL) return; preq->rq_conn = preq->rq_orgconn; /* restore socket to client */ sprintf(log_buf, msg_messagejob, preq->rq_reply.brp_code); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, preq->rq_ind.rq_message.rq_jid, log_buf); if (preq->rq_reply.brp_code) req_reject(preq->rq_reply.brp_code, 0, preq, NULL, NULL); else reply_ack(preq); } /* END post_message_req() */
static void process_checkpoint_reply( struct work_task *pwt) { job *pjob; struct batch_request *preq; svr_disconnect(pwt->wt_event); /* close connection to MOM */ preq = pwt->wt_parm1; preq->rq_conn = preq->rq_orgconn; /* restore client socket */ if ((pjob = find_job(preq->rq_ind.rq_manager.rq_objname)) == (job *)0) { LOG_EVENT(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, preq->rq_ind.rq_manager.rq_objname, msg_postmomnojob); req_reject(PBSE_UNKJOBID, 0, preq, NULL, msg_postmomnojob); } else { /* record that MOM has a checkpoint file */ account_record(PBS_ACCT_CHKPNT, pjob, "Checkpointed"); /* note in accounting file */ reply_ack(preq); } }
/*
 * release_req - work-task callback that discards a finished request.
 *
 * Frees the batch request stored in wt_parm1 and, when the task carries a
 * connection handle (wt_event != -1), closes that connection.
 */
void release_req(struct work_task *pwt)
{
  struct batch_request *done_req = (struct batch_request *)pwt->wt_parm1;

  free_br(done_req);

  if (pwt->wt_event == -1)
    {
    /* no connection recorded on this task */
    return;
    }

  svr_disconnect(pwt->wt_event);
}
/*
 * release_req - work-task callback that discards a finished request.
 *
 * wt_parm1 carries a batch-request id. The request is looked up (and removed)
 * through get_remove_batch_request() and freed if it was still registered.
 * The connection in wt_event is closed unless it is -1, and finally the
 * work task and its mutex are released.
 */
void release_req(struct work_task *pwt)
{
  char          *br_id = pwt->wt_parm1;
  batch_request *preq  = get_remove_batch_request(br_id);

  if (preq != NULL)
    {
    free_br(preq);
    }

  if (pwt->wt_event != -1)
    {
    svr_disconnect(pwt->wt_event);
    }

  /* the task itself is owned by this callback */
  free(pwt->wt_mutex);
  free(pwt);
} /* END release_req() */
static void post_message_req(struct work_task *pwt) { struct batch_request *preq; if (pwt->wt_aux2 != 1) /* not rpp */ svr_disconnect(pwt->wt_event); /* close connection to MOM */ preq = pwt->wt_parm1; preq->rq_conn = preq->rq_orgconn; /* restore socket to client */ (void)sprintf(log_buffer, msg_messagejob, preq->rq_reply.brp_code); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, preq->rq_ind.rq_message.rq_jid, log_buffer); if (preq->rq_reply.brp_code) req_reject(preq->rq_reply.brp_code, 0, preq); else reply_ack(preq); }
static void post_py_spawn_req(struct work_task *pwt) { struct batch_request *preq; char tmp_buf[128] = ""; if (pwt->wt_aux2 != 1) /* not rpp */ svr_disconnect(pwt->wt_event); /* close connection to MOM */ preq = pwt->wt_parm1; preq->rq_conn = preq->rq_orgconn; /* restore socket to client */ if (preq->rq_reply.brp_code == 0) sprintf(tmp_buf, " exit value %d", preq->rq_reply.brp_auxcode); sprintf(log_buffer, "Python spawn status %d%s", preq->rq_reply.brp_code, tmp_buf); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, preq->rq_ind.rq_py_spawn.rq_jid, log_buffer); reply_send(preq); }
static void process_gpu_request_reply( struct work_task *pwt) { char *id = "process_gpu_request_reply"; struct batch_request *preq; svr_disconnect(pwt->wt_event); /* close connection to MOM */ preq = pwt->wt_parm1; preq->rq_conn = preq->rq_orgconn; /* restore client socket */ if (preq->rq_reply.brp_code != 0) { sprintf(log_buffer, "MOM failed on GPU request, rc = %d", preq->rq_reply.brp_code); log_err(errno, id, log_buffer); req_reject(preq->rq_reply.brp_code, 0, preq, NULL, log_buffer); } else { /* record that MOM changed gpu mode */ if (LOGLEVEL >= 7) { sprintf( log_buffer, "GPU control request completed for node %s gpuid %s mode %d reset_perm %d reset_vol %d", preq->rq_ind.rq_gpuctrl.rq_momnode, preq->rq_ind.rq_gpuctrl.rq_gpuid, preq->rq_ind.rq_gpuctrl.rq_gpumode, preq->rq_ind.rq_gpuctrl.rq_reset_perm, preq->rq_ind.rq_gpuctrl.rq_reset_vol); log_ext(-1, id, log_buffer, LOG_INFO); } reply_ack(preq); } }
static void process_checkpoint_reply( struct work_task *pwt) { job *pjob; struct batch_request *preq; svr_disconnect(pwt->wt_event); /* close connection to MOM */ preq = get_remove_batch_request(pwt->wt_parm1); free(pwt->wt_mutex); free(pwt); /* preq handled previously */ if (preq == NULL) return; preq->rq_conn = preq->rq_orgconn; /* restore client socket */ if ((pjob = svr_find_job(preq->rq_ind.rq_manager.rq_objname, FALSE)) == NULL) { log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, preq->rq_ind.rq_manager.rq_objname, msg_postmomnojob); req_reject(PBSE_UNKJOBID, 0, preq, NULL, msg_postmomnojob); } else { /* record that MOM has a checkpoint file */ account_record(PBS_ACCT_CHKPNT, pjob, "Checkpointed"); /* note in accounting file */ reply_ack(preq); unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); } } /* END process_checkpoint_reply() */
static void post_hold(struct work_task *pwt) { int code; job *pjob; struct batch_request *preq; int conn_idx; if (pwt->wt_aux2 != 1) svr_disconnect(pwt->wt_event); /* close connection to MOM */ preq = pwt->wt_parm1; code = preq->rq_reply.brp_code; preq->rq_conn = preq->rq_orgconn; /* restore client socket */ if (pwt->wt_aux2 != 1) { /* not rpp */ conn_idx = connection_find_actual_index(preq->rq_conn); if (conn_idx == -1) { req_reject(PBSE_SYSTEM, 0, preq); return; } svr_conn[conn_idx].cn_authen &= ~PBS_NET_CONN_NOTIMEOUT; } pjob = find_job(preq->rq_ind.rq_hold.rq_orig.rq_objname); if (pjob == (job *)0) { log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_DEBUG, preq->rq_ind.rq_hold.rq_orig.rq_objname, msg_postmomnojob); req_reject(PBSE_UNKJOBID, 0, preq); return; } if (code != 0) { if (code != PBSE_CKPBSY) pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING; /* reset it */ if (code != PBSE_NOSUP) { /* a "real" error - log message with return error code */ (void)sprintf(log_buffer, msg_mombadhold, code); log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_DEBUG, pjob->ji_qs.ji_jobid, log_buffer); /* send message back to server for display to user */ reply_text(preq, code, log_buffer); return; } } else if (code == 0) { /* record that MOM has a checkpoint file */ if (preq->rq_reply.brp_auxcode) /* chkpt can be moved */ pjob->ji_qs.ji_svrflags = (pjob->ji_qs.ji_svrflags & ~JOB_SVFLG_CHKPT) | JOB_SVFLG_HASRUN | JOB_SVFLG_ChkptMig; pjob->ji_modified = 1; /* indicate attributes changed */ (void)job_save(pjob, SAVEJOB_QUICK); /* note in accounting file */ account_record(PBS_ACCT_CHKPNT, pjob, (char *)0); } reply_ack(preq); }
static void post_modify_req( struct work_task *pwt) { struct batch_request *preq; job *pjob; char log_buf[LOCAL_LOG_BUF_SIZE]; svr_disconnect(pwt->wt_event); /* close connection to MOM */ preq = get_remove_batch_request(pwt->wt_parm1); free(pwt->wt_mutex); free(pwt); if (preq == NULL) return; preq->rq_conn = preq->rq_orgconn; /* restore socket to client */ if ((preq->rq_reply.brp_code) && (preq->rq_reply.brp_code != PBSE_UNKJOBID)) { sprintf(log_buf, msg_mombadmodify, preq->rq_reply.brp_code); log_event( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, preq->rq_ind.rq_modify.rq_objname, log_buf); req_reject(preq->rq_reply.brp_code, 0, preq, NULL, NULL); } else { if (preq->rq_reply.brp_code == PBSE_UNKJOBID) { if ((pjob = svr_find_job(preq->rq_ind.rq_modify.rq_objname, FALSE)) == NULL) { req_reject(preq->rq_reply.brp_code, 0, preq, NULL, NULL); return; } else { if (LOGLEVEL >= 0) { sprintf(log_buf, "post_modify_req: PBSE_UNKJOBID for job %s in state %s-%s, dest = %s", (pjob->ji_qs.ji_jobid != NULL) ? pjob->ji_qs.ji_jobid : "", PJobState[pjob->ji_qs.ji_state], PJobSubState[pjob->ji_qs.ji_substate], pjob->ji_qs.ji_destin); log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf); } unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); } } reply_ack(preq); } return; } /* END post_modify_req() */
/** * @brief * main - the initialization and main loop of pbs_daemon */ int main(int argc, char *argv[]) { char jobfile[MAXPATHLEN+1]; char jobfile_full[MAXPATHLEN+1]; pbs_net_t hostaddr = 0; int port = -1; int move_type = -1; pbs_list_head attrl; enum conn_type cntype = ToServerDIS; int con = -1; char *destin; int encode_type; int i; job *jobp; char job_id[PBS_MAXSVRJOBID+1]; attribute *pattr; struct attropl *pqjatr; /* list (single) of attropl for quejob */ char script_name[MAXPATHLEN+1]; int in_server = -1; char *param_name, *param_val; char buf[4096]; struct hostent *hp; struct in_addr addr; char *credbuf = NULL; size_t credlen = 0; int prot = PROT_TCP; /*the real deal or output version and exit?*/ execution_mode(argc, argv); /* If we are not run with real and effective uid of 0, forget it */ pbs_loadconf(0); if (!isAdminPrivilege(getlogin())) { fprintf(stderr, "%s: Must be run by root\n", argv[0]); exit(SEND_JOB_FATAL); } /* initialize the pointers in the resource_def array */ for (i = 0; i < (svr_resc_size - 1); ++i) svr_resc_def[i].rs_next = &svr_resc_def[i+1]; /* last entry is left with null pointer */ /* set single threaded mode */ pbs_client_thread_set_single_threaded_mode(); /* disable attribute verification */ set_no_attribute_verification(); /* initialize the thread context */ if (pbs_client_thread_init_thread_context() != 0) { fprintf(stderr, "%s: Unable to initialize thread context\n", argv[0]); exit(SEND_JOB_FATAL); } if(set_msgdaemonname("PBS_send_job")) { fprintf(stderr, "Out of memory\n"); return 1; } winsock_init(); connection_init(); while (fgets(buf, sizeof(buf), stdin) != NULL) { buf[strlen(buf)-1] = '\0'; /* gets rid of newline */ param_name = buf; param_val = strchr(buf, '='); if (param_val) { *param_val = '\0'; param_val++; } else { /* bad param_val -- skipping */ break; } if (strcmp(param_name, "jobfile") == 0) { jobfile[0] = '\0'; strncpy(jobfile, param_val, MAXPATHLEN); } else if (strcmp(param_name, "destaddr") == 0) { hostaddr = 
atol(param_val); } else if (strcmp(param_name, "destport") == 0) { port = atoi(param_val); } else if (strcmp(param_name, "move_type") == 0) { move_type = atoi(param_val); } else if (strcmp(param_name, "in_server") == 0) { in_server = atoi(param_val); } else if (strcmp(param_name, "server_name") == 0) { server_name[0] = '\0'; strncpy(server_name, param_val, PBS_MAXSERVERNAME); } else if (strcmp(param_name, "server_host") == 0) { server_host[0] = '\0'; strncpy(server_host, param_val, (sizeof(server_host) - 1)); } else if (strcmp(param_name, "server_addr") == 0) { pbs_server_addr = atol(param_val); } else if (strcmp(param_name, "server_port") == 0) { pbs_server_port_dis = atoi(param_val); } else if (strcmp(param_name, "log_file") == 0) { log_file = strdup(param_val); } else if (strcmp(param_name, "path_log") == 0) { path_log[0] = '\0'; strncpy(path_log, param_val, MAXPATHLEN); } else if (strcmp(param_name, "path_jobs") == 0) { path_jobs = strdup(param_val); } else if (strcmp(param_name, "path_spool") == 0) { path_spool = strdup(param_val); } else if (strcmp(param_name, "path_rescdef") == 0) { path_rescdef = strdup(param_val); } else if (strcmp(param_name, "path_users") == 0) { path_users = strdup(param_val); } else if (strcmp(param_name, "path_hooks_workdir") == 0) { path_hooks_workdir = strdup(param_val); if (path_hooks_workdir == NULL) exit(SEND_JOB_FATAL); } else if (strcmp(param_name, "svr_history_enable") == 0) { svr_history_enable = atol(param_val); } else if (strcmp(param_name, "svr_history_duration") == 0) { svr_history_duration = atol(param_val); } else if (strcmp(param_name, "single_signon_password_enable") == 0) { if (decode_b(&server.sv_attr[(int)SRV_ATR_ssignon_enable], NULL, NULL, param_val) != 0) { fprintf(stderr, "%s: failed to set ssignon_password_enable\n", argv[0]); exit(SEND_JOB_FATAL); } } else if (strcmp(param_name, "script_name") == 0) { strncpy(script_name, param_val, MAXPATHLEN + 1); } else break; } time(&time_now); 
(void)log_open_main(log_file, path_log, 1); /* silent open */ if (setup_resc(1) == -1) { /* log_buffer set in setup_resc */ log_err(-1, "pbsd_send_job(setup_resc)", log_buffer); return (-1); } if( strlen(jobfile) == 0 || hostaddr == 0 || port == 0 || move_type == -1 || \ in_server == -1 || strlen(server_name) == 0 || strlen(server_host) == 0 || \ pbs_server_addr == 0 || pbs_server_port_dis == 0 || \ strlen(path_log) == 0 || path_jobs == NULL || \ path_spool == NULL || path_users == NULL ) { log_err(-1, "pbs_send_job", "error on one of the parameters"); log_close(0); /* silent close */ exit(SEND_JOB_FATAL); } CLEAR_HEAD(task_list_immed); CLEAR_HEAD(task_list_timed); CLEAR_HEAD(task_list_event); CLEAR_HEAD(svr_queues); CLEAR_HEAD(svr_alljobs); CLEAR_HEAD(svr_newjobs); CLEAR_HEAD(svr_allresvs); CLEAR_HEAD(svr_newresvs); CLEAR_HEAD(svr_deferred_req); CLEAR_HEAD(svr_unlicensedjobs); strcpy(jobfile_full, path_jobs); strcat(jobfile_full, jobfile); if (chk_save_file(jobfile_full) != 0) { sprintf(log_buffer, "Error opening jobfile=%s", jobfile); log_err(-1, __func__, log_buffer); goto fatal_exit; } if ((jobp=job_recov_fs(jobfile, RECOV_SUBJOB)) == NULL) { sprintf(log_buffer, "Failed to recreate job in jobfile=%s", jobfile); log_err(-1, __func__, log_buffer); goto fatal_exit; } /* now delete the temp job file that was created by job_save_fs in server code * jobs are in database now, no need to keep in filesystem */ unlink(jobfile_full); if (in_server) append_link(&svr_alljobs, &jobp->ji_alljobs, jobp); /* select attributes/resources to send based on move type */ if (move_type == MOVE_TYPE_Exec) { resc_access_perm = ATR_DFLAG_MOM; encode_type = ATR_ENCODE_MOM; cntype = ToServerDIS; } else { resc_access_perm = ATR_DFLAG_USWR | ATR_DFLAG_OPWR | ATR_DFLAG_MGWR | ATR_DFLAG_SvRD; encode_type = ATR_ENCODE_SVR; svr_dequejob(jobp); } CLEAR_HEAD(attrl); pattr = jobp->ji_wattr; for (i=0; i < (int)JOB_ATR_LAST; i++) { if ((job_attr_def+i)->at_flags & resc_access_perm) { 
(void)(job_attr_def+i)->at_encode(pattr+i, &attrl, (job_attr_def+i)->at_name, NULL, encode_type, NULL); } } attrl_fixlink(&attrl); /* script name is passed from parent */ /* get host name */ pbs_loadconf(0); addr.s_addr = htonl(hostaddr); hp = gethostbyaddr((void *)&addr, sizeof(struct in_addr), AF_INET); if (hp == NULL) { sprintf(log_buffer, "%s: h_errno=%d", inet_ntoa(addr), h_errno); log_err(-1, __func__, log_buffer); } else { /* read any credential file */ (void)get_credential(hp->h_name, jobp, PBS_GC_BATREQ, &credbuf, &credlen); } /* save the job id for when after we purge the job */ (void)strcpy(job_id, jobp->ji_qs.ji_jobid); con = -1; DIS_tcparray_init(); for (i=0; i<RETRY; i++) { pbs_errno = 0; /* connect to receiving server with retries */ if (i > 0) { /* recycle after an error */ if (con >= 0) svr_disconnect(con); if (should_retry_route(pbs_errno) == -1) { goto fatal_exit; /* fatal error, don't retry */ } sleep(1<<i); } if ((con = svr_connect(hostaddr, port, 0, cntype, prot)) == PBS_NET_RC_FATAL) { (void)sprintf(log_buffer, "send_job failed to %lx port %d", hostaddr, port); log_err(pbs_errno, __func__, log_buffer); goto fatal_exit; } else if (con == PBS_NET_RC_RETRY) { pbs_errno = WSAECONNREFUSED; /* should retry */ continue; } /* * if the job is substate JOB_SUBSTATE_TRNOUTCM which means * we are recovering after being down or a late failure, we * just want to send the "read-to-commit/commit" */ if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUTCM) { if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUT) { jobp->ji_qs.ji_substate = JOB_SUBSTATE_TRNOUT; } pqjatr = &((svrattrl *)GET_NEXT(attrl))->al_atopl; destin = jobp->ji_qs.ji_destin; if (PBSD_queuejob(con, jobp->ji_qs.ji_jobid, destin, pqjatr, NULL, prot, NULL)== 0) { if (pbs_errno == PBSE_JOBEXIST && move_type == MOVE_TYPE_Exec) { /* already running, mark it so */ log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_JOB, LOG_INFO, jobp->ji_qs.ji_jobid, "Mom reports job already running"); goto ok_exit; } else if 
((pbs_errno == PBSE_HOOKERROR) || (pbs_errno == PBSE_HOOK_REJECT) || (pbs_errno == PBSE_HOOK_REJECT_RERUNJOB) || (pbs_errno == PBSE_HOOK_REJECT_DELETEJOB)) { char name_buf[MAXPATHLEN+1]; int rfd; int len; char *reject_msg; int err; err = pbs_errno; reject_msg = pbs_geterrmsg(con); (void)snprintf(log_buffer, sizeof(log_buffer), "send of job to %s failed error = %d reject_msg=%s", destin, err, reject_msg?reject_msg:""); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, jobp->ji_qs.ji_jobid, log_buffer); (void)strcpy(name_buf, path_hooks_workdir); (void)strcat(name_buf, jobp->ji_qs.ji_jobid); (void)strcat(name_buf, HOOK_REJECT_SUFFIX); if ((reject_msg != NULL) && (reject_msg[0] != '\0')) { if ((rfd = open(name_buf, O_RDWR|O_CREAT|O_TRUNC, 0600)) == -1) { snprintf(log_buffer, sizeof(log_buffer), "open of reject file %s failed: errno %d", name_buf, errno); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, jobp->ji_qs.ji_jobid, log_buffer); } else { secure_file(name_buf, "Administrators", READS_MASK|WRITES_MASK|STANDARD_RIGHTS_REQUIRED); setmode(rfd, O_BINARY); len = strlen(reject_msg)+1; /* write also trailing null char */ if (write(rfd, reject_msg, len) != len) { snprintf(log_buffer, sizeof(log_buffer), "write to file %s incomplete: errno %d", name_buf, errno); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, jobp->ji_qs.ji_jobid, log_buffer); } close(rfd); } } if (err == PBSE_HOOKERROR) exit(SEND_JOB_HOOKERR); if (err == PBSE_HOOK_REJECT) exit(SEND_JOB_HOOK_REJECT); if (err == PBSE_HOOK_REJECT_RERUNJOB) exit(SEND_JOB_HOOK_REJECT_RERUNJOB); if (err == PBSE_HOOK_REJECT_DELETEJOB) exit(SEND_JOB_HOOK_REJECT_DELETEJOB); } else { (void)sprintf(log_buffer, "send of job to %s failed error = %d", destin, pbs_errno); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, jobp->ji_qs.ji_jobid, log_buffer); continue; } } if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT) { if (PBSD_jscript(con, script_name, prot, NULL) != 0) continue; } if (credlen > 0) { int ret; ret = 
PBSD_jcred(con, jobp->ji_extended.ji_ext.ji_credtype, credbuf, credlen, prot, NULL); if ((ret == 0) || (i == (RETRY - 1))) free(credbuf); /* free credbuf if credbuf is sent successfully OR */ /* at the end of all retry attempts */ if (ret != 0) continue; } if ((move_type == MOVE_TYPE_Exec) && (jobp->ji_qs.ji_svrflags & JOB_SVFLG_HASRUN) && (hostaddr != pbs_server_addr)) { /* send files created on prior run */ if ((move_job_file(con, jobp, StdOut, prot) != 0) || (move_job_file(con, jobp, StdErr, prot) != 0) || (move_job_file(con, jobp, Chkpt, prot) != 0)) continue; } jobp->ji_qs.ji_substate = JOB_SUBSTATE_TRNOUTCM; } if (PBSD_rdytocmt(con, job_id, prot, NULL) != 0) continue; if (PBSD_commit(con, job_id, prot, NULL) != 0) goto fatal_exit; goto ok_exit; /* This child process is all done */ } if (con >= 0) svr_disconnect(con); /* * If connection is actively refused by the execution node(or mother superior) OR * the execution node(or mother superior) is rejecting request with error * PBSE_BADHOST(failing to authorize server host), the node should be marked down. */ if ((move_type == MOVE_TYPE_Exec) && (pbs_errno == WSAECONNREFUSED || pbs_errno == PBSE_BADHOST)) { i = SEND_JOB_NODEDW; } else if (should_retry_route(pbs_errno) == -1) { i = SEND_JOB_FATAL; } else { i = SEND_JOB_RETRY; } (void)sprintf(log_buffer, "send_job failed with error %d", pbs_errno); log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_NOTICE, jobp->ji_qs.ji_jobid, log_buffer); log_close(0); net_close(-1); unlink(script_name); exit(i); fatal_exit: if (con >= 0) svr_disconnect(con); log_close(0); net_close(-1); unlink(script_name); exit(SEND_JOB_FATAL); ok_exit: if (con >= 0) svr_disconnect(con); log_close(0); net_close(-1); unlink(script_name); exit(SEND_JOB_OK); }
/*
 * send_request_to_remote_server - encode a batch request onto an open
 * connection and read the reply.
 *
 * The socket behind connection[conn] is looked up under the connection's
 * mutex and stored in request->rq_conn. The request is then encoded
 * according to request->rq_type, either through one of the PBSD_*_put
 * helpers or by writing the DIS header, body and extension directly and
 * flushing the channel. An unsupported type is logged and yields rc = -1.
 *
 * Whatever the send result, DIS_reply_read() is attempted; when it fails,
 * its code is stored in the reply with a NULL choice. The channel is then
 * cleaned up and the connection closed.
 *
 * @param conn     index into connection[]
 * @param request  request to send; its rq_conn and rq_reply are updated
 * @return the send/encode result (PBSE_NONE on success), or
 *         PBSE_MEM_MALLOC when the channel could not be set up.
 */
int send_request_to_remote_server(

  int            conn,
  batch_request *request)

{
  struct attropl  *attr_ops;
  struct svrattrl *first_attr;
  struct tcp_chan *chan = NULL;
  char             log_buf[LOCAL_LOG_BUF_SIZE];
  int              rc = PBSE_NONE;
  int              reply_rc = PBSE_NONE;
  int              sock = 0;

  pthread_mutex_lock(connection[conn].ch_mutex);
  sock = connection[conn].ch_socket;
  pthread_mutex_unlock(connection[conn].ch_mutex);

  request->rq_conn = sock;

  chan = DIS_tcp_setup(sock);

  if (chan == NULL)
    {
    log_err(PBSE_MEM_MALLOC, __func__,
      "Could not allocate memory for socket buffer");
    close_conn(sock, FALSE);
    return(PBSE_MEM_MALLOC);
    }

  /* the request is bound to another server, encode/send the request */
  switch (request->rq_type)
    {
    case PBS_BATCH_DeleteJob:

      rc = PBSD_mgr_put(
             conn,
             PBS_BATCH_DeleteJob,
             MGR_CMD_DELETE,
             MGR_OBJ_JOB,
             request->rq_ind.rq_delete.rq_objname,
             NULL,
             NULL);

      break;

    case PBS_BATCH_HoldJob:

      attrl_fixlink(&request->rq_ind.rq_hold.rq_orig.rq_attr);

      first_attr = (struct svrattrl *)GET_NEXT(request->rq_ind.rq_hold.rq_orig.rq_attr);
      attr_ops = &first_attr->al_atopl;

      rc = PBSD_mgr_put(
             conn,
             PBS_BATCH_HoldJob,
             MGR_CMD_SET,
             MGR_OBJ_JOB,
             request->rq_ind.rq_hold.rq_orig.rq_objname,
             attr_ops,
             NULL);

      break;

    case PBS_BATCH_CheckpointJob:

      rc = PBSD_mgr_put(
             conn,
             PBS_BATCH_CheckpointJob,
             MGR_CMD_SET,
             MGR_OBJ_JOB,
             request->rq_ind.rq_hold.rq_orig.rq_objname,
             NULL,
             NULL);

      break;

    case PBS_BATCH_GpuCtrl:

      rc = PBSD_gpu_put(
             conn,
             request->rq_ind.rq_gpuctrl.rq_momnode,
             request->rq_ind.rq_gpuctrl.rq_gpuid,
             request->rq_ind.rq_gpuctrl.rq_gpumode,
             request->rq_ind.rq_gpuctrl.rq_reset_perm,
             request->rq_ind.rq_gpuctrl.rq_reset_vol,
             NULL);

      break;

    case PBS_BATCH_MessJob:

      rc = PBSD_msg_put(
             conn,
             request->rq_ind.rq_message.rq_jid,
             request->rq_ind.rq_message.rq_file,
             request->rq_ind.rq_message.rq_text,
             NULL);

      break;

    case PBS_BATCH_ModifyJob:
    case PBS_BATCH_AsyModifyJob:

      attrl_fixlink(&request->rq_ind.rq_modify.rq_attr);

      first_attr = (struct svrattrl *)GET_NEXT(request->rq_ind.rq_modify.rq_attr);
      attr_ops = (struct attropl *)&first_attr->al_atopl;

      rc = PBSD_mgr_put(
             conn,
             request->rq_type,
             MGR_CMD_SET,
             MGR_OBJ_JOB,
             request->rq_ind.rq_modify.rq_objname,
             attr_ops,
             NULL);

      break;

    case PBS_BATCH_Rerun:

      rc = encode_DIS_ReqHdr(chan, PBS_BATCH_Rerun, msg_daemonname);

      if (rc == 0)
        rc = encode_DIS_JobId(chan, request->rq_ind.rq_rerun);

      if (rc == 0)
        rc = encode_DIS_ReqExtend(chan, 0);

      if (rc == 0)
        rc = DIS_tcp_wflush(chan);

      break;

    case PBS_BATCH_RegistDep:

      rc = encode_DIS_ReqHdr(chan, PBS_BATCH_RegistDep, msg_daemonname);

      if (rc == 0)
        rc = encode_DIS_Register(chan, request);

      if (rc == 0)
        rc = encode_DIS_ReqExtend(chan, 0);

      if (rc == 0)
        rc = DIS_tcp_wflush(chan);

      break;

    case PBS_BATCH_AsySignalJob:
    case PBS_BATCH_SignalJob:

      rc = PBSD_sig_put(
             conn,
             request->rq_ind.rq_signal.rq_jid,
             request->rq_ind.rq_signal.rq_signame,
             request->rq_extra);

      break;

    case PBS_BATCH_StatusJob:

      rc = PBSD_status_put(
             conn,
             PBS_BATCH_StatusJob,
             request->rq_ind.rq_status.rq_id,
             NULL,
             NULL);

      break;

    case PBS_BATCH_TrackJob:

      rc = encode_DIS_ReqHdr(chan, PBS_BATCH_TrackJob, msg_daemonname);

      if (rc == 0)
        rc = encode_DIS_TrackJob(chan, request);

      if (rc == 0)
        rc = encode_DIS_ReqExtend(chan, 0);

      if (rc == 0)
        rc = DIS_tcp_wflush(chan);

      break;

    case PBS_BATCH_ReturnFiles:

      rc = encode_DIS_ReqHdr(chan, PBS_BATCH_ReturnFiles, msg_daemonname);

      if (rc == 0)
        rc = encode_DIS_ReturnFiles(chan, request);

      if (rc == 0)
        rc = encode_DIS_ReqExtend(chan, 0);

      if (rc == 0)
        rc = DIS_tcp_wflush(chan);

      break;

    case PBS_BATCH_CopyFiles:

      rc = encode_DIS_ReqHdr(chan, PBS_BATCH_CopyFiles, msg_daemonname);

      if (rc == 0)
        rc = encode_DIS_CopyFiles(chan, request);

      if (rc == 0)
        rc = encode_DIS_ReqExtend(chan, 0);

      if (rc == 0)
        rc = DIS_tcp_wflush(chan);

      break;

    case PBS_BATCH_DelFiles:

      /* delete-files shares the copy-files body encoding */
      rc = encode_DIS_ReqHdr(chan, PBS_BATCH_DelFiles, msg_daemonname);

      if (rc == 0)
        rc = encode_DIS_CopyFiles(chan, request);

      if (rc == 0)
        rc = encode_DIS_ReqExtend(chan, 0);

      if (rc == 0)
        rc = DIS_tcp_wflush(chan);

      break;

    case PBS_BATCH_DeleteReservation:

      /* header and extension only - this request has no body */
      rc = encode_DIS_ReqHdr(chan, PBS_BATCH_DeleteReservation, msg_daemonname);

      if (rc == 0)
        rc = encode_DIS_ReqExtend(chan, request->rq_extend);

      if (rc == 0)
        rc = DIS_tcp_wflush(chan);

      break;

    default:

      sprintf(log_buf, msg_issuebad, request->rq_type);
      log_err(-1, __func__, log_buf);

      rc = -1;

      break;
    }  /* END switch (request->rq_type) */

  reply_rc = DIS_reply_read(chan, &request->rq_reply);

  if (reply_rc != 0)
    {
    sprintf(log_buf, "DIS_reply_read failed: %d", reply_rc);
    log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);

    request->rq_reply.brp_code = reply_rc;
    request->rq_reply.brp_choice = BATCH_REPLY_CHOICE_NULL;
    }

  DIS_tcp_cleanup(chan);
  svr_disconnect(conn);

  return(rc);
} /* END send_request_to_remote_server() */
/*
 * req_stat_job_step2 - continue a job-status request: refresh stale data
 * from MOM where needed, then build and send the status reply.
 *
 * Phase 2 (only when the server's PollJobs attribute is not set): walk the
 * jobs selected by cntl->sc_type, resuming after cntl->sc_jobid when it is
 * non-empty. For each running job whose last MOM status is older than
 * JobStatRate, stat_to_mom() is issued and this function returns; the walk
 * is picked up again from that job after MOM replies.
 *
 * Phase 3: walk the selected jobs again and add each one's status to the
 * reply via status_job(), then send the reply. For the "truncated" types
 * the walk is per queue and limits the number of queued jobs reported per
 * queue to that queue's MaxReport attribute (TMAX_JOB when unset).
 *
 * cntl is freed by this function on the paths that reach phase 3 and on the
 * phase-2 failure path.
 *
 * NOTE(review): `bad` is not declared in this function; presumably it is a
 * file-scope variable - confirm.
 */
static void req_stat_job_step2(

  struct stat_cntl *cntl)  /* I/O (freed on return) */

  {
  svrattrl              *pal;
  job                   *pjob = NULL;

  struct batch_request  *preq;

  struct batch_reply    *preply;
  int                    rc = 0;

  enum TJobStatTypeEnum  type;

  pbs_queue             *pque = NULL;
  int                    exec_only = 0;

  int                    IsTruncated = 0;

  long                   DTime;  /* delta time - only report full attribute list if J->MTime > DTime */

  static svrattrl       *dpal = NULL;  /* built once, reused by every later call */

  int                    job_array_index = 0;
  job_array             *pa = NULL;

  preq   = cntl->sc_origrq;
  type   = (enum TJobStatTypeEnum)cntl->sc_type;
  preply = &preq->rq_reply;

  /* See pbs_server_attributes(1B) for details on "poll_jobs" behaviour */

  /* NOTE:  If IsTruncated is true, should walk all queues and walk jobs in each queue
            until max_reported is reached (NYI) */

  if (dpal == NULL)
    {
    /* build 'delta' attribute list */

    svrattrl *tpal;

    tlist_head dalist;

    int aindex;

    int atrlist[] =
      {
      JOB_ATR_jobname,
      JOB_ATR_resc_used,
      JOB_ATR_LAST
      };

    /* NOTE(review): dalist is a local list head while dpal is static, so the
     * entries stay linked to a head that goes out of scope - confirm this
     * is harmless for how dpal is consumed. */
    CLEAR_LINK(dalist);

    for (aindex = 0;atrlist[aindex] != JOB_ATR_LAST;aindex++)
      {
      if ((tpal = attrlist_create("", "", 23)) == NULL)
        {
        /* NOTE(review): returns without replying to preq or freeing cntl */
        return;
        }

      /* the attribute index is carried in al_valln */
      tpal->al_valln = atrlist[aindex];

      if (dpal == NULL)
        dpal = tpal;

      append_link(&dalist, &tpal->al_link, tpal);
      }
    }  /* END if (dpal == NULL) */

  if (type == tjstArray)
    {
    /* NOTE(review): pa is dereferenced below without a NULL check -
     * confirm get_array() cannot fail at this point */
    pa = get_array(preq->rq_ind.rq_status.rq_id);
    }

  if (!server.sv_attr[(int)SRV_ATR_PollJobs].at_val.at_long)
    {
    /* polljobs not set - indicates we may need to obtain fresh data from
       MOM */

    /* an empty sc_jobid means this is the first pass; otherwise resume
       after the job that was last sent to MOM */
    if (cntl->sc_jobid[0] == '\0')
      pjob = NULL;
    else
      pjob = find_job(cntl->sc_jobid);

    while (1)
      {
      if (pjob == NULL)
        {
        /* start from the first job */

        if (type == tjstJob)
          {
          pjob = find_job(preq->rq_ind.rq_status.rq_id);
          }
        else if (type == tjstQueue)
          {
          pjob = (job *)GET_NEXT(cntl->sc_pque->qu_jobs);
          }
        else if (type == tjstArray)
          {
          job_array_index = 0;
          /* increment job_array_index until we find a non-null pointer or hit the end */
          while (job_array_index < pa->ai_qs.array_size && (pjob = pa->jobs[job_array_index]) == NULL)
            job_array_index++;
          }
        else
          {
          if ((type == tjstTruncatedServer) || (type == tjstTruncatedQueue))
            IsTruncated = TRUE;

          pjob = (job *)GET_NEXT(svr_alljobs);
          }
        }    /* END if (pjob == NULL) */
      else
        {
        /* get next job */

        if (type == tjstJob)
          break;

        if (type == tjstQueue)
          pjob = (job *)GET_NEXT(pjob->ji_jobque);
        else
          pjob = (job *)GET_NEXT(pjob->ji_alljobs);

        /* for arrays the value chosen just above is discarded and the
           next populated slot of the array is used instead */
        if (type == tjstArray)
          {
          pjob = NULL;
          /* increment job_array_index until we find a non-null pointer or hit the end */
          while (++job_array_index < pa->ai_qs.array_size && (pjob = pa->jobs[job_array_index]) == NULL)
            ;
          }
        }

      if (pjob == NULL)
        break;

      /* PBS_RESTAT_JOB defaults to 30 seconds */

      if ((pjob->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING) &&
          ((time_now - pjob->ji_momstat) > JobStatRate))
        {
        /* go to MOM for status */

        /* remember where to resume once MOM has answered */
        strcpy(cntl->sc_jobid, pjob->ji_qs.ji_jobid);

        if ((rc = stat_to_mom(pjob, cntl)) == PBSE_SYSTEM)
          {
          break;
          }

        if (rc != 0)
          {
          /* any other failure: keep the stale data and move on */
          rc = 0;

          continue;
          }

        return; /* will pick up after mom replies */
        }
      }    /* END while(1) */

    if (cntl->sc_conn >= 0)
      svr_disconnect(cntl->sc_conn);  /* close connection to MOM */

    if (rc != 0)
      {
      free(cntl);

      reply_free(preply);

      req_reject(rc, 0, preq, NULL, "cannot get update from mom");

      return;
      }
    }    /* END if (!server.sv_attr[(int)SRV_ATR_PollJobs].at_val.at_long) */

  /*
   * now ready for part 3, building the status reply,
   * loop through again
   */

  if (type == tjstSummarizeArraysQueue || type == tjstSummarizeArraysServer)
    {
    update_array_statuses();
    }

  /* pick the first job of the walk for the requested scope */
  if (type == tjstJob)
    pjob = find_job(preq->rq_ind.rq_status.rq_id);

  else if (type == tjstQueue)
    pjob = (job *)GET_NEXT(cntl->sc_pque->qu_jobs);

  else if (type == tjstSummarizeArraysQueue)
    pjob = (job *)GET_NEXT(cntl->sc_pque->qu_jobs_array_sum);

  else if (type == tjstSummarizeArraysServer)
    pjob = (job *)GET_NEXT(svr_jobs_array_sum);

  else if (type == tjstArray)
    {
    job_array_index = 0;
    pjob = NULL;
    /* increment job_array_index until we find a non-null pointer or hit the end */
    while (job_array_index < pa->ai_qs.array_size && (pjob = pa->jobs[job_array_index]) == NULL)
      job_array_index++;
    }
  else
    pjob = (job *)GET_NEXT(svr_alljobs);

  DTime = 0;

  if (preq->rq_extend != NULL)
    {
    char *ptr;

    /* FORMAT:  { EXECQONLY | DELTA:<EPOCHTIME> } */

    if (strstr(preq->rq_extend, EXECQUEONLY))
      exec_only = 1;

    ptr = strstr(preq->rq_extend, "DELTA:");

    if (ptr != NULL)
      {
      /* "delta:" has the same length as the "DELTA:" tag searched for */
      ptr += strlen("delta:");

      DTime = strtol(ptr, NULL, 10);
      }
    }

  /* cntl is not referenced past this point */
  free(cntl);

  if ((type == tjstTruncatedServer) || (type == tjstTruncatedQueue))
    {
    long sentJobCounter;
    long qjcounter;
    long qmaxreport;

    /* loop through all queues */

    for (pque = (pbs_queue *)GET_NEXT(svr_queues);
         pque != NULL;
         pque = (pbs_queue *)GET_NEXT(pque->qu_link))
      {
      qjcounter = 0;

      if ((exec_only == 1) && (pque->qu_qs.qu_type != QTYPE_Execution))
        {
        /* ignore routing queues */

        continue;
        }

      if (((pque->qu_attr[QA_ATR_MaxReport].at_flags & ATR_VFLAG_SET) != 0) &&
          (pque->qu_attr[QA_ATR_MaxReport].at_val.at_long >= 0))
        {
        qmaxreport = pque->qu_attr[QA_ATR_MaxReport].at_val.at_long;
        }
      else
        {
        qmaxreport = TMAX_JOB;
        }

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buffer,"giving scheduler up to %ld idle jobs in queue %s\n",
          qmaxreport,
          pque->qu_qs.qu_name);

        log_event(
          PBSEVENT_SYSTEM,
          PBS_EVENTCLASS_QUEUE,
          pque->qu_qs.qu_name,
          log_buffer);
        }

      sentJobCounter = 0;

      /* loop through jobs in queue */

      for (pjob = (job *)GET_NEXT(pque->qu_jobs);
           pjob != NULL;
           pjob = (job *)GET_NEXT(pjob->ji_jobque))
        {
        if ((qjcounter >= qmaxreport) &&
            (pjob->ji_qs.ji_state == JOB_STATE_QUEUED))
          {
          /* max_report of queued jobs reached for queue */

          continue;
          }

        pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

        /* jobs modified at or after DTime get the requested attribute
           list, older ones only the short 'delta' list */
        rc = status_job(
               pjob,
               preq,
               (pjob->ji_wattr[(int)JOB_ATR_mtime].at_val.at_long >= DTime) ? pal : dpal,
               &preply->brp_un.brp_status,
               &bad);

        if ((rc != 0) && (rc != PBSE_PERM))
          {
          req_reject(rc, bad, preq, NULL, NULL);

          return;
          }

        sentJobCounter++;

        if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED)
          qjcounter++;
        }    /* END for (pjob) */

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buffer,"sent scheduler %ld total jobs for queue %s\n",
          sentJobCounter,
          pque->qu_qs.qu_name);

        log_event(
          PBSEVENT_SYSTEM,
          PBS_EVENTCLASS_QUEUE,
          pque->qu_qs.qu_name,
          log_buffer);
        }
      }      /* END for (pque) */

    reply_send(preq);

    return;
    }        /* END if ((type == tjstTruncatedServer) || ...) */

  while (pjob != NULL)
    {
    /* go ahead and build the status reply for this job */

    if (exec_only)
      {
      /* NOTE(review): pque is dereferenced without a NULL check -
       * confirm find_queuebyname() cannot fail for a linked job */
      pque = find_queuebyname(pjob->ji_qs.ji_queue);

      if (pque->qu_qs.qu_type != QTYPE_Execution)
        goto nextjob;
      }

    pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

    rc = status_job(
           pjob,
           preq,
           pal,
           &preply->brp_un.brp_status,
           &bad);

    if ((rc != 0) && (rc != PBSE_PERM))
      {
      req_reject(rc, bad, preq, NULL, NULL);

      return;
      }

    /* get next job */

nextjob:

    if (type == tjstJob)
      break;

    if (type == tjstQueue)
      pjob = (job *)GET_NEXT(pjob->ji_jobque);
    else if (type == tjstSummarizeArraysQueue)
      pjob = (job *)GET_NEXT(pjob->ji_jobque_array_sum);
    else if (type == tjstSummarizeArraysServer)
      pjob = (job *)GET_NEXT(pjob->ji_jobs_array_sum);
    else if (type == tjstArray)
      {
      pjob = NULL;
      /* increment job_array_index until we find a non-null pointer or hit the end */
      while (++job_array_index < pa->ai_qs.array_size && (pjob = pa->jobs[job_array_index]) == NULL)
        ;
      }
    else
      pjob = (job *)GET_NEXT(pjob->ji_alljobs);

    rc = 0;
    }  /* END while (pjob != NULL) */

  reply_send(preq);

  if (LOGLEVEL >= 7)
    {
    log_event(PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      "req_statjob",
      "Successfully returned the status of queued jobs\n");
    }

  return;
  }  /* END req_stat_job_step2() */
/*
 * stat_to_mom - send a Status Job request to the MOM associated with a job.
 *
 * A PBS_BATCH_StatusJob request is allocated, cntl is stored in the
 * request's rq_extra field, a connection to the job's MOM is opened and the
 * request is issued with stat_update passed as the completion callback.
 *
 * pjob - job whose MOM is to be queried (I)
 * cntl - status control block; sc_conn receives the svr_connect() result (I/O)
 *
 * Returns:
 *   0               the request was issued
 *   PBSE_SYSTEM     no batch request could be allocated
 *   PBSE_NORELYMOM  the MOM's node is flagged INUSE_DELETED or INUSE_DOWN
 *   other           the negative svr_connect() result, or the non-zero
 *                   issue_Drequest() result
 *
 * On every non-zero return the request allocated here has been released.
 */
int stat_to_mom(
  job              *pjob,  /* I */
  struct stat_cntl *cntl)  /* I/O */
{
  struct batch_request *newrq;
  int                   rc;
  struct work_task     *pwt = NULL;
  struct pbsnode       *node;

  if ((newrq = alloc_br(PBS_BATCH_StatusJob)) == NULL)
    {
    return(PBSE_SYSTEM);
    }

  /* set up status request, save address of cntl in request for later */

  newrq->rq_extra = (void *)cntl;

  if (cntl->sc_type == 1)
    strcpy(newrq->rq_ind.rq_status.rq_id, pjob->ji_qs.ji_jobid);
  else
    newrq->rq_ind.rq_status.rq_id[0] = '\0';  /* get stat of all */

  CLEAR_HEAD(newrq->rq_ind.rq_status.rq_attr);

  /* if MOM is down just return stale information */

  node = tfind_addr(pjob->ji_qs.ji_un.ji_exect.ji_momaddr);

  if ((node != NULL) &&
      (node->nd_state & (INUSE_DELETED | INUSE_DOWN)))
    {
    if (LOGLEVEL >= 6)
      {
      sprintf(log_buffer, "node '%s' is allocated to job but in state '%s'",
        node->nd_name,
        (node->nd_state & INUSE_DELETED) ? "deleted" : "down");

      log_event(
        PBSEVENT_SYSTEM,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        log_buffer);
      }

    /* the request was never issued, so nobody else will release it */

    free_br(newrq);

    return(PBSE_NORELYMOM);
    }

  /* get connection to MOM */

  cntl->sc_conn = svr_connect(
                    pjob->ji_qs.ji_un.ji_exect.ji_momaddr,
                    pbs_mom_port,
                    process_Dreply,
                    ToServerDIS);

  /* a negative connection handle is itself the error code */

  if ((rc = cntl->sc_conn) >= 0)
    rc = issue_Drequest(cntl->sc_conn, newrq, stat_update, &pwt);

  if (rc != 0)
    {
    /* request failed: undo the task, the request and the connection */

    if (pwt != NULL)
      delete_task(pwt);

    free_br(newrq);

    if (cntl->sc_conn >= 0)
      svr_disconnect(cntl->sc_conn);
    }  /* END if (rc != 0) */

  return(rc);
}  /* END stat_to_mom() */
static void process_hold_reply( struct work_task *pwt) { job *pjob; struct batch_request *preq; int newstate; int newsub; attribute temphold; char *pset; int rc; svr_disconnect(pwt->wt_event); /* close connection to MOM */ preq = pwt->wt_parm1; preq->rq_conn = preq->rq_orgconn; /* restore client socket */ if ((pjob = find_job(preq->rq_ind.rq_hold.rq_orig.rq_objname)) == (job *)0) { LOG_EVENT(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, preq->rq_ind.rq_hold.rq_orig.rq_objname, msg_postmomnojob); req_reject(PBSE_UNKJOBID, 0, preq, NULL, msg_postmomnojob); } else if (preq->rq_reply.brp_code != 0) { rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &pset, &temphold); if (rc == 0) { rc = job_attr_def[(int)JOB_ATR_hold].at_set(&pjob->ji_wattr[(int)JOB_ATR_hold], &temphold, DECR); } pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING; /* reset it */ pjob->ji_modified = 1; /* indicate attributes changed */ svr_evaljobstate(pjob, &newstate, &newsub, 0); svr_setjobstate(pjob, newstate, newsub); /* saves job */ if (preq->rq_reply.brp_code != PBSE_NOSUP) { sprintf(log_buffer, msg_mombadhold, preq->rq_reply.brp_code); LOG_EVENT(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer); req_reject(preq->rq_reply.brp_code, 0, preq, NULL, log_buffer); } else { reply_ack(preq); } } else { /* record that MOM has a checkpoint file */ /* PBS_CHECKPOINT_MIGRATEABLE is defined as zero therefore this code will never fire. * And if these flags are not set, start_exec will not try to run the job from * the checkpoint image file. 
*/ pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHECKPOINT_FILE; if (preq->rq_reply.brp_auxcode) /* checkpoint can be moved */ { pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHECKPOINT_FILE; pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_MIGRATEABLE; } pjob->ji_modified = 1; /* indicate attributes changed */ svr_evaljobstate(pjob, &newstate, &newsub, 0); svr_setjobstate(pjob, newstate, newsub); /* saves job */ account_record(PBS_ACCT_CHKPNT, pjob, "Checkpointed and held"); /* note in accounting file */ reply_ack(preq); } }
/**
 * @brief
 *	Send a job over the network to some other server or MOM.
 * @par
 *	When TCP is configured (pbs_conf.pbs_use_tcp == 1), the move is an
 *	execution move and the job carries no grid-proxy credential, the work
 *	is handed to send_job_exec() and its result is returned unchanged.
 * @par
 *	Otherwise, under Linux/Unix a child process is forked to do the work:
 *	it connects to the destination host and port and walks through the
 *	protocol (queue job, script, credential, prior-run files,
 *	ready-to-commit, commit), retrying up to RETRY times.  Under Windows
 *	the helper program pbs_send_job is spawned and fed its parameters
 *	through a pipe.
 * @par
 *	If the job has a script, it is first copied to a temporary file; the
 *	Linux/Unix child removes that file before it exits.
 *
 * @param[in]	jobp      - pointer to the job being sent.
 * @param[in]	hostaddr  - the address of host to send job to, host byte order.
 * @param[in]	port      - the destination port, host byte order
 * @param[in]	move_type - the type of move (e.g. MOVE_TYPE_Exec)
 * @param[in]	post_func - the function to execute once the child process
 *			    sending job completes
 * @param[in]	preq      - batch request stored in the work task as input
 *			    data for 'post_func'
 *
 * @return	int
 * @retval	2		parent : success (child forked / spawned)
 * @retval	-1		parent : on failure (pbs_errno set to error number)
 * @retval	SEND_JOB_OK	child  : success, job sent
 * @retval	SEND_JOB_FATAL	child  : permanent failure or rejection
 * @retval	SEND_JOB_RETRY	child  : failed but try again
 * @retval	SEND_JOB_NODEDW	child  : execution node down, retry different node
 * @retval	SEND_JOB_HOOKERR / SEND_JOB_HOOK_REJECT /
 *		SEND_JOB_HOOK_REJECT_RERUNJOB / SEND_JOB_HOOK_REJECT_DELETEJOB
 *				child  : the receiving side's hook failed or
 *					 rejected the job
 */
int
send_job(job *jobp, pbs_net_t hostaddr, int port, int move_type,
	void (*post_func)(struct work_task *), struct batch_request *preq)
{
#ifdef WIN32
	char		cmdline[80];
	pio_handles	pio;
	char		buf[4096];
	struct work_task *ptask;
	int		newstate;
	int		newsub;
	long		tempval;
	char		script_name[MAXPATHLEN+1];
	int		gridproxy_cred = 0;

#ifdef PBS_CRED_GRIDPROXY
	if (jobp->ji_extended.ji_ext.ji_credtype == PBS_CREDTYPE_GRIDPROXY)
		gridproxy_cred = 1;
#endif

	if (pbs_conf.pbs_use_tcp == 1 &&
		move_type == MOVE_TYPE_Exec &&
		gridproxy_cred == 0) {
		return (send_job_exec(jobp, hostaddr, port, preq));
	}

	sprintf(cmdline, "%s/sbin/pbs_send_job", pbs_conf.pbs_exec_path);

	if (win_popen(cmdline, "w", &pio, NULL) == 0) {
		errno = GetLastError();
		pbs_errno = errno;
		(void)sprintf(log_buffer, "executing %s for job %s failed errno=%d",
			cmdline, jobp->ji_qs.ji_jobid, errno);
		log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_ERR,
			jobp->ji_qs.ji_jobid, log_buffer);

		/* force re-eval of job state out of Transit */
		svr_evaljobstate(jobp, &newstate, &newsub, 1);
		svr_setjobstate(jobp, newstate, newsub);

		win_pclose(&pio);
		return (-1);
	}

	ptask = set_task(WORK_Deferred_Child, (long)pio.pi.hProcess,
		post_func, preq);
	if (!ptask) {
		log_err(errno, __func__, msg_err_malloc);
		errno = ENOMEM;
		pbs_errno = errno;
		win_pclose(&pio);

		/* force re-eval of job state out of Transit */
		svr_evaljobstate(jobp, &newstate, &newsub, 1);
		svr_setjobstate(jobp, newstate, newsub);

		return (-1);
	} else {
		ptask->wt_parm2 = jobp;
		append_link(&((job *)jobp)->ji_svrtask,
			&ptask->wt_linkobj, ptask);
	}

	script_name[0] = '\0';

	/* if job has a script read it from database */
	if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT) {
		/*
		 * copy the job script from database to a temp file
		 * PBSD_jscript works with a file
		 * delete it at the end of the send
		 *
		 * The buffer is passed as a plain char pointer, the same way
		 * the non-Windows branch below passes it.
		 */
		if (svr_create_tmp_jobscript(jobp, script_name) != 0) {
			pbs_errno = PBSE_SYSTEM;
			snprintf(log_buffer, sizeof(log_buffer),
				"Failed to create temporary job script for job %s",
				jobp->ji_qs.ji_jobid);
			log_err(pbs_errno, "send_job", log_buffer);
			win_pclose2(&pio);
			return (-1);
		}
	}

	addpid(pio.pi.hProcess);

	/* our job is to calc eligible time accurately and save it */
	/* on new server, accrue type should be calc afresh */
	/* Note: if job is being sent for execution on mom, then don't calc eligible time */
	if ((jobp->ji_wattr[(int)JOB_ATR_accrue_type].at_val.at_long == JOB_ELIGIBLE) &&
		(server.sv_attr[(int)SRV_ATR_EligibleTimeEnable].at_val.at_long == 1) &&
		(move_type != MOVE_TYPE_Exec)) {
		tempval = ((long)time_now -
			jobp->ji_wattr[(int)JOB_ATR_sample_starttime].at_val.at_long);
		jobp->ji_wattr[(int)JOB_ATR_eligible_time].at_val.at_long += tempval;
		jobp->ji_wattr[(int)JOB_ATR_eligible_time].at_flags |= ATR_VFLAG_MODCACHE;
	}

	/*
	 * in windows code, a child process "w32_send_job" handles the send
	 * This needs the job information, so we save using the filesystem
	 * This avoids the child process from having to "connect" to the
	 * database again.
	 * The file is deleted by the send_job child process when it has done
	 * recovering the job
	 */
	job_save_fs(jobp, SAVEJOB_FULLFORCE);	/* so the spawned process can get a fresh copy of job */

	/* hand the helper its parameters, one "key=value" line at a time */
	if (*jobp->ji_qs.ji_fileprefix != '\0')
		sprintf(buf, "jobfile=%s%s\n", jobp->ji_qs.ji_fileprefix,
			JOB_FILE_SUFFIX);
	else
		sprintf(buf, "jobfile=%s%s\n", jobp->ji_qs.ji_jobid,
			JOB_FILE_SUFFIX);
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "destaddr=%ld\n", hostaddr);
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "destport=%d\n", port);
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "move_type=%d\n", move_type);
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "in_server=%d\n", is_linked(&svr_alljobs, &jobp->ji_alljobs));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "server_name=%s\n", (server_name?server_name:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "server_host=%s\n", (server_host?server_host:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "server_addr=%ld\n", pbs_server_addr);
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "server_port=%d\n", pbs_server_port_dis);
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "log_file=%s\n", (log_file?log_file:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "path_log=%s\n", (path_log?path_log:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "path_jobs=%s\n", (path_jobs?path_jobs:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "path_spool=%s\n", (path_spool?path_spool:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "path_rescdef=%s\n", (path_rescdef?path_rescdef:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "path_users=%s\n", (path_users?path_users:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "path_hooks_workdir=%s\n",
		(path_hooks_workdir?path_hooks_workdir:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "svr_history_enable=%ld\n", svr_history_enable);
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "svr_history_duration=%ld\n", svr_history_duration);
	win_pwrite(&pio, buf, strlen(buf));

	if ((server.sv_attr[SRV_ATR_ssignon_enable].at_flags & ATR_VFLAG_SET) &&
		(server.sv_attr[SRV_ATR_ssignon_enable].at_val.at_long == 1))
		strcpy(buf, "single_signon_password_enable=1\n");
	else
		strcpy(buf, "single_signon_password_enable=0\n");
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "script_name=%s\n", script_name);
	win_pwrite(&pio, buf, strlen(buf));

	strcpy(buf, "quit\n");
	win_pwrite(&pio, buf, strlen(buf));

	win_pclose2(&pio);	/* closes all handles except the process handle */
	return (2);
#else
	pbs_list_head	 attrl;
	enum conn_type	 cntype = ToServerDIS;
	int		 con;
	char		*credbuf = NULL;
	size_t		 credlen = 0;
	char		*destin = jobp->ji_qs.ji_destin;
	int		 encode_type;
	int		 i;
	char		 job_id[PBS_MAXSVRJOBID+1];
	attribute	*pattr;
	pid_t		 pid;
	struct attropl	*pqjatr;	/* list (single) of attropl for quejob */
	char		 script_name[MAXPATHLEN+1];
	struct work_task *ptask;
	struct hostent	*hp;
	struct in_addr	 addr;
	long		 tempval;
	int		 gridproxy_cred = 0;
	int		 rpp = 0;

#ifdef PBS_CRED_GRIDPROXY
	if (jobp->ji_extended.ji_ext.ji_credtype == PBS_CREDTYPE_GRIDPROXY)
		gridproxy_cred = 1;
#endif

	if (pbs_conf.pbs_use_tcp == 1 &&
		move_type == MOVE_TYPE_Exec &&
		gridproxy_cred == 0) {
		return (send_job_exec(jobp, hostaddr, port, preq));
	}

	script_name[0] = '\0';

	/* if job has a script read it from database */
	if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT) {
		/*
		 * copy the job script from database to a temp file
		 * PBSD_jscript works with a file
		 * delete it at the end of the send
		 */
		if (svr_create_tmp_jobscript(jobp, script_name) != 0) {
			pbs_errno = PBSE_SYSTEM;
			snprintf(log_buffer, sizeof(log_buffer),
				"Failed to create temporary job script for job %s",
				jobp->ji_qs.ji_jobid);
			log_err(pbs_errno, "send_job", log_buffer);
			return -1;
		}
	}

	pid = fork();
	if (pid == -1) {	/* Error on fork */
		log_err(errno, __func__, "fork failed\n");
		/*
		 * No child exists to clean up after us, so the temporary
		 * script created above has to be removed here.
		 */
		if (script_name[0] != '\0')
			(void)unlink(script_name);
		pbs_errno = PBSE_SYSTEM;
		return -1;
	}

	if (pid != 0) {		/* The parent (main server) */
		ptask = set_task(WORK_Deferred_Child, pid, post_func, preq);
		if (!ptask) {
			log_err(errno, __func__, msg_err_malloc);
			/* honour the "-1 => pbs_errno set" contract */
			pbs_errno = PBSE_SYSTEM;
			return (-1);
		} else {
			ptask->wt_parm2 = jobp;
			append_link(&((job *)jobp)->ji_svrtask,
				&ptask->wt_linkobj, ptask);
		}
		return 2;
	}

	/*
	 * the child process
	 *
	 * From here on every path ends in exit(); nothing returns to the
	 * caller.
	 */

	DBPRT(("%s: child started, sending to port %d\n", __func__, port))
	rpp_terminate();

	/* Unprotect child from being killed by kernel */
	daemon_protect(0, PBS_DAEMON_PROTECT_OFF);

#ifdef WIN32
	/* get host name */
	/*
	 * If host address is loopback address then do not resolve with dns
	 * Use "localhost" as the host name.
	 */
	if ((htonl(hostaddr) == loopback_addr->sin_addr.s_addr)) {
		(void)get_credential(LOCALHOST_SHORTNAME, jobp, PBS_GC_BATREQ,
			&credbuf, &credlen);
	} else {
#endif
		addr.s_addr = htonl(hostaddr);
		hp = gethostbyaddr((void *)&addr, sizeof(struct in_addr), AF_INET);
		if (hp == NULL) {
			sprintf(log_buffer, "%s: h_errno=%d",
				inet_ntoa(addr), h_errno);
			log_err(-1, __func__, log_buffer);
		} else {
			/* read any credential file */
			(void)get_credential(hp->h_name, jobp, PBS_GC_BATREQ,
				&credbuf, &credlen);
		}
#ifdef WIN32
	}
#endif

	/* encode job attributes to be moved */

	CLEAR_HEAD(attrl);

	/* select attributes/resources to send based on move type */

	if (move_type == MOVE_TYPE_Exec) {
		resc_access_perm = ATR_DFLAG_MOM;
		encode_type = ATR_ENCODE_MOM;
		cntype = ToServerDIS;
	} else {
		resc_access_perm = ATR_DFLAG_USWR | ATR_DFLAG_OPWR |
			ATR_DFLAG_MGWR | ATR_DFLAG_SvRD;
		encode_type = ATR_ENCODE_SVR;
		svr_dequejob(jobp);	/* clears default resource settings */
	}

	/* our job is to calc eligible time accurately and save it */
	/* on new server, accrue type should be calc afresh */
	/* Note: if job is being sent for execution on mom, then don't calc eligible time */
	if ((jobp->ji_wattr[(int)JOB_ATR_accrue_type].at_val.at_long == JOB_ELIGIBLE) &&
		(server.sv_attr[(int)SRV_ATR_EligibleTimeEnable].at_val.at_long == 1) &&
		(move_type != MOVE_TYPE_Exec)) {
		tempval = ((long)time_now -
			jobp->ji_wattr[(int)JOB_ATR_sample_starttime].at_val.at_long);
		jobp->ji_wattr[(int)JOB_ATR_eligible_time].at_val.at_long += tempval;
		jobp->ji_wattr[(int)JOB_ATR_eligible_time].at_flags |= ATR_VFLAG_MODCACHE;
	}

	pattr = jobp->ji_wattr;
	for (i = 0; i < (int)JOB_ATR_LAST; i++) {
		if ((job_attr_def+i)->at_flags & resc_access_perm) {
			(void)(job_attr_def+i)->at_encode(pattr+i, &attrl,
				(job_attr_def+i)->at_name, (char *)0,
				encode_type, NULL);
		}
	}
	attrl_fixlink(&attrl);

	/* save the job id for when after we purge the job */

	(void)strcpy(job_id, jobp->ji_qs.ji_jobid);

	pbs_errno = 0;
	con = -1;

	for (i = 0; i < RETRY; i++) {

		/* connect to receiving server with retries */

		if (i > 0) {	/* recycle after an error */
			if (con >= 0)
				svr_disconnect(con);
			if (should_retry_route(pbs_errno) == -1) {
				/* delete the temp script file */
				unlink(script_name);
				exit(SEND_JOB_FATAL);	/* fatal error, don't retry */
			}
			sleep(1<<i);
		}

		if ((con = svr_connect(hostaddr, port, 0, cntype, rpp)) ==
			PBS_NET_RC_FATAL) {
			(void)sprintf(log_buffer, "send_job failed to %lx port %d",
				hostaddr, port);
			log_err(pbs_errno, __func__, log_buffer);
			/* delete the temp script file */
			unlink(script_name);
			if ((move_type == MOVE_TYPE_Exec) &&
				(pbs_errno == PBSE_BADCRED))
				exit(SEND_JOB_NODEDW);
			exit(SEND_JOB_FATAL);
		} else if (con == PBS_NET_RC_RETRY) {
			pbs_errno = ECONNREFUSED;	/* should retry */
			continue;
		}

		/*
		 * if the job is substate JOB_SUBSTATE_TRNOUTCM which means
		 * we are recovering after being down or a late failure, we
		 * just want to send the "commit"
		 */

		if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUTCM) {

			if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUT) {
				jobp->ji_qs.ji_substate = JOB_SUBSTATE_TRNOUT;
			}

			pqjatr = &((svrattrl *)GET_NEXT(attrl))->al_atopl;
			if (PBSD_queuejob(con, jobp->ji_qs.ji_jobid, destin,
				pqjatr, (char *)0, rpp, NULL) == 0) {
				if (pbs_errno == PBSE_JOBEXIST &&
					move_type == MOVE_TYPE_Exec) {
					/* already running, mark it so */
					log_event(PBSEVENT_ERROR,
						PBS_EVENTCLASS_JOB, LOG_INFO,
						jobp->ji_qs.ji_jobid,
						"Mom reports job already running");
					/* delete the temp script file */
					unlink(script_name);
					exit(SEND_JOB_OK);

				} else if ((pbs_errno == PBSE_HOOKERROR) ||
					(pbs_errno == PBSE_HOOK_REJECT) ||
					(pbs_errno == PBSE_HOOK_REJECT_RERUNJOB) ||
					(pbs_errno == PBSE_HOOK_REJECT_DELETEJOB)) {
					char	 name_buf[MAXPATHLEN+1];
					int	 rfd;
					int	 len;
					char	*reject_msg;
					int	 err;

					/* keep the code: later calls may overwrite pbs_errno */
					err = pbs_errno;

					reject_msg = pbs_geterrmsg(con);
					(void)sprintf(log_buffer,
						"send of job to %s failed error = %d reject_msg=%s",
						destin, err,
						reject_msg?reject_msg:"");
					log_event(PBSEVENT_JOB,
						PBS_EVENTCLASS_JOB, LOG_INFO,
						jobp->ji_qs.ji_jobid,
						log_buffer);

					/* <hooks workdir><jobid><reject suffix>, bounded */
					(void)snprintf(name_buf, sizeof(name_buf),
						"%s%s%s", path_hooks_workdir,
						jobp->ji_qs.ji_jobid,
						HOOK_REJECT_SUFFIX);

					if ((reject_msg != NULL) &&
						(reject_msg[0] != '\0')) {

						if ((rfd = open(name_buf,
							O_RDWR|O_CREAT|O_TRUNC, 0600)) == -1) {
							sprintf(log_buffer,
								"open of reject file %s failed: errno %d",
								name_buf, errno);
							log_event(PBSEVENT_JOB,
								PBS_EVENTCLASS_JOB,
								LOG_INFO,
								jobp->ji_qs.ji_jobid,
								log_buffer);
						} else {
#ifdef WIN32
							secure_file(name_buf, "Administrators",
								READS_MASK|WRITES_MASK|STANDARD_RIGHTS_REQUIRED);
							setmode(rfd, O_BINARY);
#endif
							len = strlen(reject_msg)+1;
							/* write also trailing null char */
							if (write(rfd, reject_msg, len) != len) {
								sprintf(log_buffer,
									"write to file %s incomplete: errno %d",
									name_buf, errno);
								log_event(PBSEVENT_JOB,
									PBS_EVENTCLASS_JOB,
									LOG_INFO,
									jobp->ji_qs.ji_jobid,
									log_buffer);
							}
							close(rfd);
						}
					}

					/*
					 * err is one of the four hook codes tested
					 * above, so one of the exits below is always
					 * taken; drop the temp script first.
					 */
					unlink(script_name);

					if (err == PBSE_HOOKERROR)
						exit(SEND_JOB_HOOKERR);
					if (err == PBSE_HOOK_REJECT)
						exit(SEND_JOB_HOOK_REJECT);
					if (err == PBSE_HOOK_REJECT_RERUNJOB)
						exit(SEND_JOB_HOOK_REJECT_RERUNJOB);
					if (err == PBSE_HOOK_REJECT_DELETEJOB)
						exit(SEND_JOB_HOOK_REJECT_DELETEJOB);
				} else {
					(void)sprintf(log_buffer,
						"send of job to %s failed error = %d",
						destin, pbs_errno);
					log_event(PBSEVENT_JOB,
						PBS_EVENTCLASS_JOB, LOG_INFO,
						jobp->ji_qs.ji_jobid,
						log_buffer);
					continue;
				}
			}

			if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT) {
				if (PBSD_jscript(con, script_name, rpp, NULL) != 0)
					continue;
			}

			if (credlen > 0) {
				/*
				 * credbuf is deliberately NOT freed here.  A
				 * later step of this attempt (e.g.
				 * move_job_file) may still fail and send us
				 * round the retry loop, which sends the
				 * credential again; freeing it after the first
				 * successful send would turn that into a
				 * use-after-free and a double free.  The child
				 * always ends in exit(), which releases it.
				 */
				if (PBSD_jcred(con,
					jobp->ji_extended.ji_ext.ji_credtype,
					credbuf, credlen, rpp, NULL) != 0)
					continue;
			}

			if ((move_type == MOVE_TYPE_Exec) &&
				(jobp->ji_qs.ji_svrflags & JOB_SVFLG_HASRUN) &&
				(hostaddr != pbs_server_addr)) {
				/* send files created on prior run */
				if ((move_job_file(con, jobp, StdOut, rpp, NULL) != 0) ||
					(move_job_file(con, jobp, StdErr, rpp, NULL) != 0) ||
					(move_job_file(con, jobp, Chkpt, rpp, NULL) != 0))
					continue;
			}

			jobp->ji_qs.ji_substate = JOB_SUBSTATE_TRNOUTCM;
		}

		if (PBSD_rdytocmt(con, job_id, rpp, NULL) != 0)
			continue;

		if (PBSD_commit(con, job_id, rpp, NULL) != 0) {
			/* delete the temp script file */
			unlink(script_name);
			exit(SEND_JOB_FATAL);
		}

		svr_disconnect(con);
		/* delete the temp script file */
		unlink(script_name);
		exit(SEND_JOB_OK);	/* This child process is all done */
	}

	if (con >= 0)
		svr_disconnect(con);

	/*
	 * If connection is actively refused by the execution node(or mother
	 * superior) OR the execution node(or mother superior) is rejecting
	 * request with error PBSE_BADHOST(failing to authorize server host),
	 * the node should be marked down.
	 */
	if ((move_type == MOVE_TYPE_Exec) &&
		(pbs_errno == ECONNREFUSED || pbs_errno == PBSE_BADHOST)) {
		i = SEND_JOB_NODEDW;
	} else if (should_retry_route(pbs_errno) == -1) {
		i = SEND_JOB_FATAL;
	} else {
		i = SEND_JOB_RETRY;
	}

	(void)sprintf(log_buffer, "send_job failed with error %d", pbs_errno);
	log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_NOTICE,
		jobp->ji_qs.ji_jobid, log_buffer);

	/* delete the temp script file */
	unlink(script_name);
	exit(i);
	return -1;		/* NOT REACHED */
#endif /* !WIN32 */
}
static void post_signal_req( struct work_task *pwt) { job *pjob; struct batch_request *preq; svr_disconnect(pwt->wt_event); /* disconnect from MOM */ preq = pwt->wt_parm1; preq->rq_conn = preq->rq_orgconn; /* restore client socket */ if (preq->rq_reply.brp_code) { log_event( PBSEVENT_DEBUG, PBS_EVENTCLASS_REQUEST, preq->rq_ind.rq_signal.rq_jid, pbse_to_txt(PBSE_MOMREJECT)); errno = 0; req_reject(preq->rq_reply.brp_code, 0, preq, NULL, NULL); } else { pjob = preq->rq_extra; if (strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_SUSPEND) == 0) { if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_Suspend) == 0) { pjob->ji_qs.ji_svrflags |= JOB_SVFLG_Suspend; set_statechar(pjob); job_save(pjob, SAVEJOB_QUICK, 0); /* release resources allocated to suspended job - NORWAY */ free_nodes(pjob); } } else if (strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_RESUME) == 0) { if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_Suspend) { /* re-allocate assigned node to resumed job - NORWAY */ set_old_nodes(pjob); pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_Suspend; set_statechar(pjob); job_save(pjob, SAVEJOB_QUICK, 0); } } reply_ack(preq); } return; } /* END post_signal_req() */
/*
 * send_job - fork a child that sends a job to another server or to a MOM.
 *
 * Parent: blocks SIGCHLD, forks, registers a WORK_Deferred_Child task
 * (post_func, with jobp as wt_parm1 and data as wt_parm2), links the task
 * to the job, unblocks SIGCHLD and, at LOGLEVEL >= 1, records the dispatch
 * in the first free slot of the Dispatch* tables.
 *
 * Child: encodes the job attributes selected by move_type, then tries up
 * to RETRY times to connect and run the protocol (queue job, script,
 * prior-run files, ready-to-commit, commit).  The child never returns; it
 * reports through its exit status:
 *    0  job sent (or the MOM reports the job already exists)
 *    1  fatal failure, do not retry
 *    2  any other failure
 *   10  the queue-job or commit request timed out
 *
 * jobp      - job to send
 * hostaddr  - host address, host byte order
 * port      - service port, host byte order
 * move_type - move, route, or execute
 * post_func - work-task function run after the child terminates
 * data      - optional batch_request pointer stored in the work task
 *
 * Returns (parent only) ROUTE_DEFERRED when the child was started, or
 * ROUTE_PERM_FAILURE with pbs_errno set to PBSE_SYSTEM on failure.
 */
int send_job(
  job       *jobp,
  pbs_net_t  hostaddr,   /* host address, host byte order */
  int        port,       /* service port, host byte order */
  int        move_type,  /* move, route, or execute */
  void     (*post_func)(struct work_task *),  /* after move */
  void      *data)       /* ptr to optional batch_request to be put */
                         /* in the work task structure */
{
  tlist_head        attrl;
  enum conn_type    cntype = ToServerDIS;
  int               con;
  char             *destin = jobp->ji_qs.ji_destin;
  int               encode_type;
  int               i;
  int               NumRetries;

  char             *id = "send_job";

  attribute        *pattr;
  pid_t             pid;

  struct attropl   *pqjatr;      /* list (single) of attropl for quejob */
  char             *safail = "sigaction failed\n";
  char             *spfail = "sigprocmask failed\n";
  char              script_name[MAXPATHLEN + 1];
  const char       *script_prefix;
  sigset_t          child_set, all_set;

  struct sigaction  child_action;

  struct work_task *ptask;

  mbool_t           Timeout = FALSE;

  char             *pc;

  sigemptyset(&child_set);
  sigaddset(&child_set, SIGCHLD);
  sigfillset(&all_set);

  /* block SIGCHLD until work task is established */

  if (sigprocmask(SIG_BLOCK, &child_set, NULL) == -1)
    {
    log_err(errno, id, spfail);

    pbs_errno = PBSE_SYSTEM;

    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      jobp->ji_qs.ji_jobid,
      "cannot set signal mask");

    return(ROUTE_PERM_FAILURE);
    }

  if (LOGLEVEL >= 6)
    {
    /* the formatted message used to be built and then thrown away */

    sprintf(log_buffer, "about to send job - type=%d",
      move_type);

    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      jobp->ji_qs.ji_jobid,
      log_buffer);

    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      jobp->ji_qs.ji_jobid,
      "forking in send_job");
    }

  pid = fork();

  if (pid == -1)
    {
    /* error on fork */

    log_err(errno, id, "fork failed\n");

    if (sigprocmask(SIG_UNBLOCK, &child_set, NULL) == -1)
      log_err(errno, id, spfail);

    pbs_errno = PBSE_SYSTEM;

    return(ROUTE_PERM_FAILURE);
    }

  if (pid != 0)
    {
    /* The parent (main server) */

    /* create task to monitor job startup */

    /* CRI:   need way to report to scheduler job is starting, not started */

    ptask = set_task(WORK_Deferred_Child, pid, post_func, jobp);

    if (ptask == NULL)
      {
      log_err(errno, id, msg_err_malloc);

      /*
       * Do not leave SIGCHLD blocked in the server, and report the
       * failure through pbs_errno like the other failure exits do.
       */

      if (sigprocmask(SIG_UNBLOCK, &child_set, NULL) == -1)
        log_err(errno, id, spfail);

      pbs_errno = PBSE_SYSTEM;

      return(ROUTE_PERM_FAILURE);
      }

    ptask->wt_parm2 = data;

    append_link(
      &((job *)jobp)->ji_svrtask,
      &ptask->wt_linkobj,
      ptask);

    /* now can unblock SIGCHLD */

    if (sigprocmask(SIG_UNBLOCK, &child_set, NULL) == -1)
      log_err(errno, id, spfail);

    if (LOGLEVEL >= 1)
      {
      extern long   DispatchTime[];
      extern job   *DispatchJob[];
      extern char  *DispatchNode[];

      extern time_t time_now;

      struct pbsnode *NP;

      /* record job dispatch time in the first free slot */

      int jindex;

      for (jindex = 0; jindex < 20; jindex++)
        {
        if (DispatchJob[jindex] == NULL)
          {
          DispatchTime[jindex] = time_now;

          DispatchJob[jindex] = jobp;

          if ((NP = PGetNodeFromAddr(hostaddr)) != NULL)
            DispatchNode[jindex] = NP->nd_name;
          else
            DispatchNode[jindex] = NULL;

          break;
          }
        }
      }

    /* SUCCESS */

    return(ROUTE_DEFERRED);
    }  /* END if (pid != 0) */

  /*
   * the child process
   *
   * set up signal catcher for error return
   */

  rpp_terminate();

  child_action.sa_handler = net_move_die;

  sigfillset(&child_action.sa_mask);

  child_action.sa_flags = 0;

  if (sigaction(SIGHUP, &child_action, NULL))
    log_err(errno, id, safail);

  if (sigaction(SIGINT, &child_action, NULL))
    log_err(errno, id, safail);

  if (sigaction(SIGQUIT, &child_action, NULL))
    log_err(errno, id, safail);

  /* signal handling is set, now unblock */

  if (sigprocmask(SIG_UNBLOCK, &child_set, NULL) == -1)
    log_err(errno, id, spfail);

  /* encode job attributes to be moved */

  CLEAR_HEAD(attrl);

  /* select attributes/resources to send based on move type */

  if (move_type == MOVE_TYPE_Exec)
    {
    /* moving job to MOM - ie job start */

    resc_access_perm = ATR_DFLAG_MOM;
    encode_type = ATR_ENCODE_MOM;
    cntype = ToServerDIS;
    }
  else
    {
    /* moving job to alternate server? */

    resc_access_perm =
      ATR_DFLAG_USWR |
      ATR_DFLAG_OPWR |
      ATR_DFLAG_MGWR |
      ATR_DFLAG_SvRD;

    encode_type = ATR_ENCODE_SVR;

    /* clear default resource settings */

    svr_dequejob(jobp);
    }

  pattr = jobp->ji_wattr;

  for (i = 0; i < JOB_ATR_LAST; i++)
    {
    /* session_id is sent as well when a checkpoint name is set */

    if (((job_attr_def + i)->at_flags & resc_access_perm) ||
        ((strncmp((job_attr_def + i)->at_name, "session_id", 10) == 0) &&
         (jobp->ji_wattr[JOB_ATR_checkpoint_name].at_flags & ATR_VFLAG_SET)))
      {
      (job_attr_def + i)->at_encode(
        pattr + i,
        &attrl,
        (job_attr_def + i)->at_name,
        NULL,
        encode_type);
      }
    }    /* END for (i) */

  attrl_fixlink(&attrl);

  /*
   * put together the job script file name:
   * <path_jobs><file prefix><script suffix>, bounded by the buffer size
   */

  if (jobp->ji_wattr[JOB_ATR_job_array_request].at_flags & ATR_VFLAG_SET)
    script_prefix = jobp->ji_arraystruct->ai_qs.fileprefix;
  else
    script_prefix = jobp->ji_qs.ji_fileprefix;

  snprintf(script_name, sizeof(script_name), "%s%s%s",
    path_jobs,
    script_prefix,
    JOB_SCRIPT_SUFFIX);

  pbs_errno = 0;

  con = -1;

  for (NumRetries = 0; NumRetries < RETRY; NumRetries++)
    {
    int rc;

    /* connect to receiving server with retries */

    if (NumRetries > 0)
      {
      /* recycle after an error */

      if (con >= 0)
        svr_disconnect(con);

      /* check pbs_errno from previous attempt */

      if (should_retry_route(pbs_errno) == -1)
        {
        sprintf(log_buffer, "child failed in previous commit request for job %s",
          jobp->ji_qs.ji_jobid);

        log_err(pbs_errno, id, log_buffer);

        exit(1); /* fatal error, don't retry */
        }

      sleep(1 << NumRetries);
      }

    /* NOTE:  on node hangs, svr_connect is successful */

    if ((con = svr_connect(hostaddr, port, 0, cntype)) == PBS_NET_RC_FATAL)
      {
      sprintf(log_buffer, "send_job failed to %lx port %d",
        hostaddr,
        port);

      log_err(pbs_errno, id, log_buffer);

      exit(1);
      }

    if (con == PBS_NET_RC_RETRY)
      {
      pbs_errno = 0; /* should retry */

      continue;
      }

    /*
     * if the job is substate JOB_SUBSTATE_TRNOUTCM which means
     * we are recovering after being down or a late failure, we
     * just want to send the "ready-to-commit/commit"
     */

    if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUTCM)
      {
      if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUT)
        {
        jobp->ji_qs.ji_substate = JOB_SUBSTATE_TRNOUT;

        job_save(jobp, SAVEJOB_QUICK);
        }

      pqjatr = &((svrattrl *)GET_NEXT(attrl))->al_atopl;

      if ((pc = PBSD_queuejob(
                  con,
                  jobp->ji_qs.ji_jobid,
                  destin,
                  pqjatr,
                  NULL)) == NULL)
        {
        if ((pbs_errno == PBSE_EXPIRED) || (pbs_errno == PBSE_READ_REPLY_TIMEOUT))
          {
          /* queue job timeout based on pbs_tcp_timeout */

          Timeout = TRUE;
          }

        if ((pbs_errno == PBSE_JOBEXIST) && (move_type == MOVE_TYPE_Exec))
          {
          /* already running, mark it so */

          log_event(
            PBSEVENT_ERROR,
            PBS_EVENTCLASS_JOB,
            jobp->ji_qs.ji_jobid,
            "MOM reports job already running");

          exit(0);
          }

        sprintf(log_buffer, "send of job to %s failed error = %d",
          destin,
          pbs_errno);

        log_event(
          PBSEVENT_JOB,
          PBS_EVENTCLASS_JOB,
          jobp->ji_qs.ji_jobid,
          log_buffer);

        continue;
        }  /* END if ((pc = PBSD_queuejob() == NULL) */

      free(pc);

      if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT)
        {
        if (PBSD_jscript(con, script_name, jobp->ji_qs.ji_jobid) != 0)
          continue;
        }

      /* XXX may need to change the logic below, if we are sending the job to
         a mom on the same host and the mom and server are not sharing the same
         spool directory, then we still need to move the file */

      if ((move_type == MOVE_TYPE_Exec) &&
          (jobp->ji_qs.ji_svrflags & JOB_SVFLG_HASRUN) &&
          (hostaddr != pbs_server_addr))
        {
        /* send files created on prior run */

        if ((move_job_file(con, jobp, StdOut) != 0) ||
            (move_job_file(con, jobp, StdErr) != 0) ||
            (move_job_file(con, jobp, Checkpoint) != 0))
          {
          continue;
          }
        }

      /* ignore signals */

      if (sigprocmask(SIG_BLOCK, &all_set, NULL) == -1)
        log_err(errno, id, "sigprocmask\n");

      jobp->ji_qs.ji_substate = JOB_SUBSTATE_TRNOUTCM;

      job_save(jobp, SAVEJOB_QUICK);
      }
    else
      {
      /* ignore signals */

      if (sigprocmask(SIG_BLOCK, &all_set, NULL) == -1)
        log_err(errno, id, "sigprocmask\n");
      }

    if (PBSD_rdytocmt(con, jobp->ji_qs.ji_jobid) != 0)
      {
      /* let signals through again before the next attempt */

      if (sigprocmask(SIG_UNBLOCK, &all_set, NULL) == -1)
        log_err(errno, id, "sigprocmask\n");

      continue;
      }

    if ((rc = PBSD_commit(con, jobp->ji_qs.ji_jobid)) != 0)
      {
      int errno2;

      /* NOTE:  errno is modified by log_err */

      errno2 = errno;

      sprintf(log_buffer, "send_job commit failed, rc=%d (%s)",
        rc,
        (connection[con].ch_errtxt != NULL) ? connection[con].ch_errtxt : "N/A");

      log_ext(errno2, id, log_buffer, LOG_WARNING);

      /* if failure occurs, pbs_mom should purge job and pbs_server should set *
         job state to idle w/error msg */

      if (errno2 == EINPROGRESS)
        {
        /* request is still being processed */

        /* increase tcp_timeout in qmgr? */

        Timeout = TRUE;

        /* do we need a continue here? */

        sprintf(log_buffer, "child commit request timed-out for job %s, increase tcp_timeout?",
          jobp->ji_qs.ji_jobid);

        log_ext(errno2, id, log_buffer, LOG_WARNING);

        /* don't retry on timeout--break out and report error! */

        break;
        }
      else
        {
        sprintf(log_buffer, "child failed in commit request for job %s",
          jobp->ji_qs.ji_jobid);

        log_ext(errno2, id, log_buffer, LOG_CRIT);

        /* FAILURE */

        exit(1);
        }
      }    /* END if ((rc = PBSD_commit(con,jobp->ji_qs.ji_jobid)) != 0) */

    svr_disconnect(con);

    /* child process is done */

    /* SUCCESS */

    exit(0);
    }  /* END for (NumRetries) */

  if (con >= 0)
    svr_disconnect(con);

  if (Timeout == TRUE)
    {
    /* 10 indicates that job migrate timed out, server will mark node down *
          and abort the job - see post_sendmom() */

    sprintf(log_buffer, "child timed-out attempting to start job %s",
      jobp->ji_qs.ji_jobid);

    log_ext(pbs_errno, id, log_buffer, LOG_WARNING);

    exit(10);
    }

  if (should_retry_route(pbs_errno) == -1)
    {
    sprintf(log_buffer, "child failed and will not retry job %s",
      jobp->ji_qs.ji_jobid);

    log_err(pbs_errno, id, log_buffer);

    exit(1);
    }

  exit(2);

  /*NOTREACHED*/

  return(ROUTE_SUCCESS);
}  /* END send_job() */
int send_job_over_network_with_retries( char *job_id, char *job_destin, tlist_head &attrl, bool &attempt_to_queue_job, bool &change_substate_on_attempt_to_queue, bool &timeout, const char *script_name, bool need_to_send_job_script, bool job_has_run, unsigned long job_momaddr, unsigned short job_momport, char *stdout_path, char *stderr_path, char *chkpt_path, int type, int *my_err, int *mom_err) { int con = PBS_NET_RC_UNSET; char log_buf[LOCAL_LOG_BUF_SIZE]; int rc = LOCUTION_RETRY; for (int NumRetries = 0; NumRetries < RETRY; NumRetries++) { /* connect to receiving server with retries */ if (NumRetries > 0) { /* recycle after an error */ if (con >= 0) { svr_disconnect(con); con = PBS_NET_RC_UNSET; } /* check my_err from previous attempt */ if ((should_retry_route(*my_err) == -1) || (should_retry_route(*mom_err) == -1)) { sprintf(log_buf, "child failed in previous commit request for job %s", job_id); log_err(*my_err, __func__, log_buf); break; } sleep(1 << NumRetries); } /* make sure this is zero at the point that we're retrying */ *my_err = 0; if ((con = svr_connect(job_momaddr, job_momport, my_err, NULL, NULL)) == PBS_NET_RC_FATAL) { sprintf(log_buf, "send_job failed to host %s, %lx port %d", (job_destin[0] != '\0') ? job_destin : "unknown host", job_momaddr, job_momport); log_err(*my_err, __func__, log_buf); rc = LOCUTION_FAIL; break; } if (con == PBS_NET_RC_RETRY) { *my_err = 0; /* should retry */ continue; } if (con == PBS_LOCAL_CONNECTION) { log_err(-1, __func__, "attempting to run the job on pbs_server???"); return(PBSE_SYSTEM); } rc = send_job_over_network(job_id, con, job_destin, attrl, attempt_to_queue_job, change_substate_on_attempt_to_queue, timeout, script_name, need_to_send_job_script, job_has_run, job_momaddr, stdout_path, stderr_path, chkpt_path, type, my_err, mom_err); if (rc == LOCUTION_SUCCESS) break; } /* END for (NumRetries) */ if (con >= 0) svr_disconnect(con); return(rc); } /* END send_job_over_network_with_retries() */