void req_checkpointjob( struct batch_request *preq) { job *pjob; int rc; attribute *pattr; if ((pjob = chk_job_request(preq->rq_ind.rq_manager.rq_objname, preq)) == NULL) { return; } if (is_cloud_job(pjob)) { rc = PBSE_CLOUD_REQUEST; req_reject(rc, 0, preq, NULL, "cloud jobs cannot be checkpointed"); } pattr = &pjob->ji_wattr[(int)JOB_ATR_checkpoint]; if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) && ((pattr->at_flags & ATR_VFLAG_SET) && ((csv_find_string(pattr->at_val.at_str, "s") != NULL) || (csv_find_string(pattr->at_val.at_str, "c") != NULL) || (csv_find_string(pattr->at_val.at_str, "enabled") != NULL)))) { /* have MOM attempt checkpointing */ if ((rc = relay_to_mom(pjob->ji_qs.ji_un.ji_exect.ji_momaddr, preq, process_checkpoint_reply)) != 0) { req_reject(rc, 0, preq, NULL, NULL); } else { pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHECKPOINT_FILE; job_save(pjob, SAVEJOB_QUICK); LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer); } } else { /* Job does not have checkpointing enabled, so reject the request */ LOG_EVENT( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer); req_reject(PBSE_IVALREQ, 0, preq, NULL, "job is not checkpointable"); } } /* END req_checkpointjob() */
void hold_job( attribute *temphold, /* I */ void *j) /* I */ { long *hold_val; long old_hold; int newstate; int newsub; attribute *pattr; job *pjob = (job *)j; if (pjob == NULL) return; hold_val = &pjob->ji_wattr[JOB_ATR_hold].at_val.at_long; old_hold = *hold_val; *hold_val |= temphold->at_val.at_long; pjob->ji_wattr[JOB_ATR_hold].at_flags |= ATR_VFLAG_SET; pattr = &pjob->ji_wattr[JOB_ATR_checkpoint]; if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) && ((pattr->at_flags & ATR_VFLAG_SET) && ((csv_find_string(pattr->at_val.at_str, "s") != NULL) || (csv_find_string(pattr->at_val.at_str, "c") != NULL) || (csv_find_string(pattr->at_val.at_str, "enabled") != NULL)))) { /* TODO */ /* preq_tmp = alloc_br(preq->rq_type); */ } else if (old_hold != *hold_val) { /* indicate attributes changed */ pjob->ji_modified = 1; svr_evaljobstate(pjob, &newstate, &newsub, 0); svr_setjobstate(pjob, newstate, newsub); } }
/** * * csv_find_value * * Search a csv list for an entry that matches a specified search string. * Assuming that this is entry of the form "attribute=value", return a * pointer to the start of the value string. */ char * csv_find_value(const char *csv_str, const char *search_str) { char *cp; char *vp; cp = csv_find_string(csv_str, search_str); if (cp) { if ((vp = strchr(cp, '='))) return(vp + 1); } return(NULL); }
void *req_checkpointjob( batch_request *preq) /* I */ { job *pjob; int rc; pbs_attribute *pattr; char log_buf[LOCAL_LOG_BUF_SIZE]; batch_request *dup_req = NULL; if ((pjob = chk_job_request(preq->rq_ind.rq_manager.rq_objname, preq)) == NULL) { return(NULL); } mutex_mgr job_mutex(pjob->ji_mutex, true); pattr = &pjob->ji_wattr[JOB_ATR_checkpoint]; if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) && ((pattr->at_flags & ATR_VFLAG_SET) && ((csv_find_string(pattr->at_val.at_str, "s") != NULL) || (csv_find_string(pattr->at_val.at_str, "c") != NULL) || (csv_find_string(pattr->at_val.at_str, "enabled") != NULL)))) { /* have MOM attempt checkpointing */ if ((dup_req = duplicate_request(preq)) == NULL) { req_reject(PBSE_SYSTEM, 0, preq, NULL, "failure to allocate memory"); } /* The dup_req is freed in relay_to_mom (failure) * or in issue_Drequest (success) */ else if ((rc = relay_to_mom(&pjob, dup_req, NULL)) != PBSE_NONE) { req_reject(rc, 0, preq, NULL, NULL); free_br(dup_req); if (pjob == NULL) job_mutex.set_unlock_on_exit(false); } else { if (pjob != NULL) { pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHECKPOINT_FILE; job_save(pjob, SAVEJOB_QUICK, 0); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf); unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); pjob = NULL; } else job_mutex.set_unlock_on_exit(false); process_checkpoint_reply(dup_req); } } else { /* Job does not have checkpointing enabled, so reject the request */ log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf); req_reject(PBSE_IVALREQ, 0, preq, NULL, "job is not checkpointable"); } return(NULL); } /* END req_checkpointjob() */
int req_holdjob( batch_request *vp) /* I */ { long *hold_val; int newstate; int newsub; long old_hold; job *pjob; char *pset; int rc; pbs_attribute temphold; pbs_attribute *pattr; batch_request *preq = (struct batch_request *)vp; char log_buf[LOCAL_LOG_BUF_SIZE]; batch_request *dup_req = NULL; pjob = chk_job_request(preq->rq_ind.rq_hold.rq_orig.rq_objname, preq); if (pjob == NULL) { return(PBSE_NONE); } mutex_mgr job_mutex(pjob->ji_mutex, true); /* cannot do anything until we decode the holds to be set */ if ((rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, (const char **)&pset, &temphold)) != 0) { req_reject(rc, 0, preq, NULL, NULL); return(PBSE_NONE); } /* if other than HOLD_u is being set, must have privil */ if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0) { req_reject(rc, 0, preq, NULL, NULL); return(PBSE_NONE); } hold_val = &pjob->ji_wattr[JOB_ATR_hold].at_val.at_long; old_hold = *hold_val; *hold_val |= temphold.at_val.at_long; pjob->ji_wattr[JOB_ATR_hold].at_flags |= ATR_VFLAG_SET; sprintf(log_buf, msg_jobholdset, pset, preq->rq_user, preq->rq_host); pattr = &pjob->ji_wattr[JOB_ATR_checkpoint]; if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) && ((pattr->at_flags & ATR_VFLAG_SET) && ((csv_find_string(pattr->at_val.at_str, "s") != NULL) || (csv_find_string(pattr->at_val.at_str, "c") != NULL) || (csv_find_string(pattr->at_val.at_str, "enabled") != NULL)))) { /* have MOM attempt checkpointing */ /* ** The jobid in the request always have the server suffix attached ** which is dropped when the server attribute ** 'display_job_server_suffix' is FALSE and so will in the MOM's. ** Therefore, it must be passed as the server to the MOM so she can ** find it to hold. */ if (strncmp(pjob->ji_qs.ji_jobid, preq->rq_ind.rq_hold.rq_orig.rq_objname, PBS_MAXSVRJOBID)) snprintf(preq->rq_ind.rq_hold.rq_orig.rq_objname, sizeof(preq->rq_ind.rq_hold.rq_orig.rq_objname), "%s", pjob->ji_qs.ji_jobid); if ((dup_req = duplicate_request(preq)) == NULL) { req_reject(rc, 0, preq, NULL, "memory allocation failure"); } /* The dup_req is freed in relay_to_mom (failure) * or in issue_Drequest (success) */ else if ((rc = relay_to_mom(&pjob, dup_req, NULL)) != PBSE_NONE) { free_br(dup_req); *hold_val = old_hold; /* reset to the old value */ req_reject(rc, 0, preq, NULL, "relay to mom failed"); if (pjob == NULL) job_mutex.set_unlock_on_exit(false); } else { if (pjob != NULL) { pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_FILE; job_save(pjob, SAVEJOB_QUICK, 0); /* fill in log_buf again, since relay_to_mom changed it */ sprintf(log_buf, msg_jobholdset, pset, preq->rq_user, preq->rq_host); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf); unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL); pjob = NULL; reply_ack(preq); } else job_mutex.set_unlock_on_exit(false); process_hold_reply(dup_req); } } #ifdef ENABLE_BLCR else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) { /* * This system is configured with BLCR checkpointing to be used, * but this Running job does not have checkpointing enabled, * so we reject the request */ log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf); req_reject(PBSE_IVALREQ, 0, preq, NULL, "job not held since checkpointing is expected but not enabled for job"); } #endif else { /* everything went well, may need to update the job state */ log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf); if (old_hold != *hold_val) { /* indicate attributes changed */ pjob->ji_modified = 1; svr_evaljobstate(*pjob, newstate, newsub, 0); svr_setjobstate(pjob, newstate, newsub, FALSE); } reply_ack(preq); } return(PBSE_NONE); } /* END req_holdjob() */
void svr_shutdown( int type) /* I */ { attribute *pattr; job *pjob; job *pnxt; long *state; /* Lets start by logging shutdown and saving everything */ state = &server.sv_attr[(int)SRV_ATR_State].at_val.at_long; strcpy(log_buffer, msg_shutdown_start); if (*state == SV_STATE_SHUTIMM) { /* if already shuting down, another Immed/sig will force it */ if ((type == SHUT_IMMEDIATE) || (type == SHUT_SIG)) { *state = SV_STATE_DOWN; strcat(log_buffer, "Forced"); log_event( PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, msg_daemonname, log_buffer); return; } } if (type == SHUT_IMMEDIATE) { *state = SV_STATE_SHUTIMM; strcat(log_buffer, "Immediate"); } else if (type == SHUT_DELAY) { *state = SV_STATE_SHUTDEL; strcat(log_buffer, "Delayed"); } else if (type == SHUT_QUICK) { *state = SV_STATE_DOWN; /* set to down to brk pbsd_main loop */ strcat(log_buffer, "Quick"); } else { *state = SV_STATE_SHUTIMM; strcat(log_buffer, "By Signal"); } log_event( PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, msg_daemonname, log_buffer); if ((type == SHUT_QUICK) || (type == SHUT_SIG)) /* quick, leave jobs as are */ { return; } svr_save(&server, SVR_SAVE_QUICK); pnxt = (job *)GET_NEXT(svr_alljobs); while ((pjob = pnxt) != NULL) { pnxt = (job *)GET_NEXT(pjob->ji_alljobs); if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) { pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HOTSTART | JOB_SVFLG_HASRUN; pattr = &pjob->ji_wattr[(int)JOB_ATR_checkpoint]; if ((pattr->at_flags & ATR_VFLAG_SET) && ((csv_find_string(pattr->at_val.at_str, "s") != NULL) || (csv_find_string(pattr->at_val.at_str, "c") != NULL) || (csv_find_string(pattr->at_val.at_str, "shutdown") != NULL))) { /* do checkpoint of job */ if (shutdown_checkpoint(pjob) == 0) continue; } /* if no checkpoint (not supported, not allowed, or fails */ /* rerun if possible, else kill job */ rerun_or_kill(pjob, msg_on_shutdown); } } return; } /* END svr_shutdown() */
int process_request( struct tcp_chan *chan) /* file descriptor (socket) to get request */ { int rc = PBSE_NONE; struct batch_request *request = NULL; char log_buf[LOCAL_LOG_BUF_SIZE]; long acl_enable = FALSE; long state = SV_STATE_DOWN; time_t time_now = time(NULL); int free_request = TRUE; char tmpLine[MAXLINE]; char *auth_err = NULL; enum conn_type conn_active; unsigned short conn_socktype; unsigned short conn_authen; unsigned long conn_addr; int sfds = chan->sock; pthread_mutex_lock(svr_conn[sfds].cn_mutex); conn_active = svr_conn[sfds].cn_active; conn_socktype = svr_conn[sfds].cn_socktype; conn_authen = svr_conn[sfds].cn_authen; conn_addr = svr_conn[sfds].cn_addr; svr_conn[sfds].cn_lasttime = time_now; pthread_mutex_unlock(svr_conn[sfds].cn_mutex); if ((request = alloc_br(0)) == NULL) { snprintf(tmpLine, sizeof(tmpLine), "cannot allocate memory for request from %lu", conn_addr); req_reject(PBSE_MEM_MALLOC, 0, request, NULL, tmpLine); free_request = FALSE; rc = PBSE_SYSTEM; goto process_request_cleanup; } request->rq_conn = sfds; /* * Read in the request and decode it to the internal request structure. */ if (conn_active == FromClientDIS || conn_active == ToServerDIS) { #ifdef ENABLE_UNIX_SOCKETS if ((conn_socktype & PBS_SOCK_UNIX) && (conn_authen != PBS_NET_CONN_AUTHENTICATED)) { /* get_creds interestingly always returns 0 */ get_creds(sfds, conn_credent[sfds].username, conn_credent[sfds].hostname); } #endif /* END ENABLE_UNIX_SOCKETS */ rc = dis_request_read(chan, request); } else { char out[80]; snprintf(tmpLine, MAXLINE, "request on invalid type of connection: %d, sock type: %d, from address %s", conn_active,conn_socktype, netaddr_long(conn_addr, out)); log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_REQUEST, "process_req", tmpLine); snprintf(tmpLine, sizeof(tmpLine), "request on invalid type of connection (%d) from %s", conn_active, netaddr_long(conn_addr, out)); req_reject(PBSE_BADHOST, 0, request, NULL, tmpLine); free_request = FALSE; rc = PBSE_BADHOST; goto process_request_cleanup; } if (rc == -1) { /* FAILURE */ /* premature end of file */ rc = PBSE_PREMATURE_EOF; goto process_request_cleanup; } if ((rc == PBSE_SYSTEM) || (rc == PBSE_INTERNAL) || (rc == PBSE_SOCKET_CLOSE)) { /* FAILURE */ /* read error, likely cannot send reply so just disconnect */ /* ??? not sure about this ??? */ goto process_request_cleanup; } if (rc > 0) { /* FAILURE */ /* * request didn't decode, either garbage or unknown * request type, in either case, return reject-reply */ req_reject(rc, 0, request, NULL, "cannot decode message"); free_request = FALSE; goto process_request_cleanup; } if (get_connecthost(sfds, request->rq_host, PBS_MAXHOSTNAME) != 0) { sprintf(log_buf, "%s: %lu", pbse_to_txt(PBSE_BADHOST), conn_addr); log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_REQUEST, "", log_buf); snprintf(tmpLine, sizeof(tmpLine), "cannot determine hostname for connection from %lu", conn_addr); req_reject(PBSE_BADHOST, 0, request, NULL, tmpLine); free_request = FALSE; rc = PBSE_BADHOST; goto process_request_cleanup; } if (LOGLEVEL >= 1) { sprintf(log_buf, msg_request, reqtype_to_txt(request->rq_type), request->rq_user, request->rq_host, sfds); log_event(PBSEVENT_DEBUG2, PBS_EVENTCLASS_REQUEST, "", log_buf); } /* is the request from a host acceptable to the server */ if (conn_socktype & PBS_SOCK_UNIX) { strcpy(request->rq_host, server_name); } get_svr_attr_l(SRV_ATR_acl_host_enable, &acl_enable); if (acl_enable) { /* acl enabled, check it; always allow myself and nodes */ struct array_strings *pas = NULL; struct pbsnode *isanode; get_svr_attr_arst(SRV_ATR_acl_hosts, &pas); isanode = PGetNodeFromAddr(conn_addr); if ((isanode == NULL) && (strcmp(server_host, request->rq_host) != 0) && (acl_check_my_array_string(pas, request->rq_host, ACL_Host) == 0)) { char tmpLine[MAXLINE]; snprintf(tmpLine, sizeof(tmpLine), "request not authorized from host %s", request->rq_host); req_reject(PBSE_BADHOST, 0, request, NULL, tmpLine); free_request = FALSE; rc = PBSE_BADHOST; goto process_request_cleanup; } if (isanode != NULL) unlock_node(isanode, "process_request", NULL, LOGLEVEL); } /* * determine source (user client or another server) of request. * set the permissions granted to the client */ if (conn_authen == PBS_NET_CONN_FROM_PRIVIL) { /* request came from another server */ request->rq_fromsvr = 1; request->rq_perm = ATR_DFLAG_USRD | ATR_DFLAG_USWR | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR | ATR_DFLAG_MGRD | ATR_DFLAG_MGWR | ATR_DFLAG_SvWR; } else { /* request not from another server */ conn_credent[sfds].timestamp = time_now; request->rq_fromsvr = 0; /* * Client must be authenticated by an Authenticate User Request, if not, * reject request and close connection. -- The following is retained for * compat with old cmds -- The exception to this is of course the Connect * Request which cannot have been authenticated, because it contains the * needed ticket; so trap it here. Of course, there is no prior * authentication on the Authenticate User request either, but it comes * over a reserved port and appears from another server, hence is * automatically granted authentication. * * The above is only true with inet sockets. With unix domain sockets, the * user creds were read before the first dis_request_read call above. * We automatically granted authentication because we can trust the socket * creds. Authorization is still granted in svr_get_privilege below */ if (request->rq_type == PBS_BATCH_Connect) { req_connect(request); if (conn_socktype == PBS_SOCK_INET) { rc = PBSE_IVALREQ; req_reject(rc, 0, request, NULL, NULL); free_request = FALSE; goto process_request_cleanup; } } if (conn_socktype & PBS_SOCK_UNIX) { pthread_mutex_lock(svr_conn[sfds].cn_mutex); svr_conn[sfds].cn_authen = PBS_NET_CONN_AUTHENTICATED; pthread_mutex_unlock(svr_conn[sfds].cn_mutex); } if (ENABLE_TRUSTED_AUTH == TRUE ) rc = PBSE_NONE; /* bypass the authentication of the user--trust the client completely */ else if (munge_on) { /* If munge_on is true we will validate the connection now */ if (request->rq_type == PBS_BATCH_AltAuthenUser) { rc = req_altauthenuser(request); free_request = FALSE; goto process_request_cleanup; } else { rc = authenticate_user(request, &conn_credent[sfds], &auth_err); } } else if (conn_authen != PBS_NET_CONN_AUTHENTICATED) /* skip checking user if we did not get an authenticated credential */ rc = PBSE_BADCRED; else rc = authenticate_user(request, &conn_credent[sfds], &auth_err); if (rc != 0) { req_reject(rc, 0, request, NULL, auth_err); if (auth_err != NULL) free(auth_err); free_request = FALSE; goto process_request_cleanup; } /* * pbs_mom and checkpoint restart scripts both need the authority to do * alters and releases on checkpointable jobs. Allow manager permission * for root on the jobs execution node. */ if (((request->rq_type == PBS_BATCH_ModifyJob) || (request->rq_type == PBS_BATCH_ReleaseJob)) && (strcmp(request->rq_user, PBS_DEFAULT_ADMIN) == 0)) { job *pjob; char *dptr; int skip = FALSE; char short_host[PBS_MAXHOSTNAME+1]; /* make short host name */ strcpy(short_host, request->rq_host); if ((dptr = strchr(short_host, '.')) != NULL) { *dptr = '\0'; } if ((pjob = svr_find_job(request->rq_ind.rq_modify.rq_objname, FALSE)) != (job *)0) { if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) { if ((pjob->ji_wattr[JOB_ATR_checkpoint].at_flags & ATR_VFLAG_SET) && ((csv_find_string(pjob->ji_wattr[JOB_ATR_checkpoint].at_val.at_str, "s") != NULL) || (csv_find_string(pjob->ji_wattr[JOB_ATR_checkpoint].at_val.at_str, "c") != NULL) || (csv_find_string(pjob->ji_wattr[JOB_ATR_checkpoint].at_val.at_str, "enabled") != NULL)) && (strstr(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, short_host) != NULL)) { request->rq_perm = svr_get_privilege(request->rq_user, server_host); skip = TRUE; } } unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); } if (!skip) { request->rq_perm = svr_get_privilege(request->rq_user, request->rq_host); } } else { request->rq_perm = svr_get_privilege(request->rq_user, request->rq_host); } } /* END else (conn_authen == PBS_NET_CONN_FROM_PRIVIL) */ /* if server shutting down, disallow new jobs and new running */ get_svr_attr_l(SRV_ATR_State, &state); if (state > SV_STATE_RUN) { switch (request->rq_type) { case PBS_BATCH_AsyrunJob: case PBS_BATCH_JobCred: case PBS_BATCH_MoveJob: case PBS_BATCH_QueueJob: case PBS_BATCH_RunJob: case PBS_BATCH_StageIn: case PBS_BATCH_jobscript: req_reject(PBSE_SVRDOWN, 0, request, NULL, NULL); rc = PBSE_SVRDOWN; free_request = FALSE; goto process_request_cleanup; /*NOTREACHED*/ break; } } /* * dispatch the request to the correct processing function. * The processing function must call reply_send() to free * the request struture. */ rc = dispatch_request(sfds, request); return(rc); process_request_cleanup: if (free_request == TRUE) free_br(request); return(rc); } /* END process_request() */
void process_request( int sfds) /* file descriptor (socket) to get request */ { #ifdef PBS_MOM char *id = "process_request"; #endif int rc; struct batch_request *request = NULL; #ifndef PBS_MOM char *auth_err = NULL; #endif time_now = time(NULL); request = alloc_br(0); request->rq_conn = sfds; /* * Read in the request and decode it to the internal request structure. */ #ifndef PBS_MOM if (svr_conn[sfds].cn_active == FromClientDIS) { #ifdef ENABLE_UNIX_SOCKETS if ((svr_conn[sfds].cn_socktype & PBS_SOCK_UNIX) && (svr_conn[sfds].cn_authen != PBS_NET_CONN_AUTHENTICATED)) { get_creds(sfds, conn_credent[sfds].username, conn_credent[sfds].hostname); } #endif /* END ENABLE_UNIX_SOCKETS */ rc = dis_request_read(sfds, request); } else { LOG_EVENT( PBSEVENT_SYSTEM, PBS_EVENTCLASS_REQUEST, "process_req", "request on invalid type of connection"); close_conn(sfds); free_br(request); return; } #else /* PBS_MOM */ rc = dis_request_read(sfds, request); #endif /* PBS_MOM */ if (rc == -1) { /* FAILURE */ /* premature end of file */ close_client(sfds); free_br(request); return; } if ((rc == PBSE_SYSTEM) || (rc == PBSE_INTERNAL)) { /* FAILURE */ /* read error, likely cannot send reply so just disconnect */ /* ??? not sure about this ??? */ close_client(sfds); free_br(request); return; } if (rc > 0) { /* FAILURE */ /* * request didn't decode, either garbage or unknown * request type, in either case, return reject-reply */ req_reject(rc, 0, request, NULL, "cannot decode message"); close_client(sfds); return; } if (get_connecthost(sfds, request->rq_host, PBS_MAXHOSTNAME) != 0) { char tmpLine[1024]; sprintf(log_buffer, "%s: %lu", pbse_to_txt(PBSE_BADHOST), get_connectaddr(sfds)); LOG_EVENT(PBSEVENT_DEBUG, PBS_EVENTCLASS_REQUEST, "", log_buffer); snprintf(tmpLine, sizeof(tmpLine), "cannot determine hostname for connection from %lu", get_connectaddr(sfds)); req_reject(PBSE_BADHOST, 0, request, NULL, tmpLine); return; } if (LOGLEVEL >= 1) { sprintf( log_buffer, msg_request, reqtype_to_txt(request->rq_type), request->rq_user, request->rq_host, sfds); LOG_EVENT(PBSEVENT_DEBUG2, PBS_EVENTCLASS_REQUEST, "", log_buffer); } /* is the request from a host acceptable to the server */ #ifndef PBS_MOM if (svr_conn[sfds].cn_socktype & PBS_SOCK_UNIX) { strcpy(request->rq_host, server_name); } if (server.sv_attr[SRV_ATR_acl_host_enable].at_val.at_long) { /* acl enabled, check it; always allow myself and nodes */ struct pbsnode *isanode; isanode = PGetNodeFromAddr(get_connectaddr(sfds)); if ((isanode == NULL) && (strcmp(server_host, request->rq_host) != 0) && (acl_check( &server.sv_attr[SRV_ATR_acl_hosts], request->rq_host, ACL_Host) == 0)) { char tmpLine[1024]; snprintf(tmpLine, sizeof(tmpLine), "request not authorized from host %s", request->rq_host); req_reject(PBSE_BADHOST, 0, request, NULL, tmpLine); close_client(sfds); return; } } /* * determine source (user client or another server) of request. * set the permissions granted to the client */ if (svr_conn[sfds].cn_authen == PBS_NET_CONN_FROM_PRIVIL) { /* request came from another server */ request->rq_fromsvr = 1; request->rq_perm = ATR_DFLAG_USRD | ATR_DFLAG_USWR | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR | ATR_DFLAG_MGRD | ATR_DFLAG_MGWR | ATR_DFLAG_SvWR; } else { /* request not from another server */ request->rq_fromsvr = 0; /* * Client must be authenticated by an Authenticate User Request, if not, * reject request and close connection. -- The following is retained for * compat with old cmds -- The exception to this is of course the Connect * Request which cannot have been authenticated, because it contains the * needed ticket; so trap it here. Of course, there is no prior * authentication on the Authenticate User request either, but it comes * over a reserved port and appears from another server, hence is * automatically granted authentication. * * The above is only true with inet sockets. With unix domain sockets, the * user creds were read before the first dis_request_read call above. * We automatically granted authentication because we can trust the socket * creds. Authorization is still granted in svr_get_privilege below */ if (request->rq_type == PBS_BATCH_Connect) { req_connect(request); if (svr_conn[sfds].cn_socktype == PBS_SOCK_INET) return; } if (svr_conn[sfds].cn_socktype & PBS_SOCK_UNIX) { conn_credent[sfds].timestamp = time_now; svr_conn[sfds].cn_authen = PBS_NET_CONN_AUTHENTICATED; } if (ENABLE_TRUSTED_AUTH == TRUE ) rc = 0; /* bypass the authentication of the user--trust the client completely */ else if (munge_on) { /* If munge_on is true we will validate the connection now */ if ( request->rq_type == PBS_BATCH_AltAuthenUser) { rc = req_altauthenuser(request); if (rc == PBSE_NONE) { conn_credent[sfds].timestamp = time_now; svr_conn[sfds].cn_authen = PBS_NET_CONN_AUTHENTICATED; } return; } else if (svr_conn[sfds].cn_authen != PBS_NET_CONN_AUTHENTICATED) /* skip checking user if we did not get an authenticated credential */ rc = PBSE_BADCRED; else { rc = authenticate_user(request, &conn_credent[sfds], &auth_err); } } else if (svr_conn[sfds].cn_authen != PBS_NET_CONN_AUTHENTICATED) rc = PBSE_BADCRED; else rc = authenticate_user(request, &conn_credent[sfds], &auth_err); if (rc != 0) { req_reject(rc, 0, request, NULL, auth_err); if (auth_err != NULL) free(auth_err); close_client(sfds); return; } /* * pbs_mom and checkpoint restart scripts both need the authority to do * alters and releases on checkpointable jobs. Allow manager permission * for root on the jobs execution node. */ if (((request->rq_type == PBS_BATCH_ModifyJob) || (request->rq_type == PBS_BATCH_ReleaseJob)) && (strcmp(request->rq_user, PBS_DEFAULT_ADMIN) == 0)) { job *pjob; char *dptr; int skip = FALSE; char short_host[PBS_MAXHOSTNAME+1]; /* make short host name */ strcpy(short_host, request->rq_host); if ((dptr = strchr(short_host, '.')) != NULL) { *dptr = '\0'; } if (((pjob = find_job(request->rq_ind.rq_modify.rq_objname)) != (job *)0) && (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)) { if ((pjob->ji_wattr[JOB_ATR_checkpoint].at_flags & ATR_VFLAG_SET) && ((csv_find_string(pjob->ji_wattr[JOB_ATR_checkpoint].at_val.at_str, "s") != NULL) || (csv_find_string(pjob->ji_wattr[JOB_ATR_checkpoint].at_val.at_str, "c") != NULL) || (csv_find_string(pjob->ji_wattr[JOB_ATR_checkpoint].at_val.at_str, "enabled") != NULL)) && (strstr(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, short_host) != NULL)) { request->rq_perm = svr_get_privilege(request->rq_user, server_host); skip = TRUE; } } if (!skip) { request->rq_perm = svr_get_privilege(request->rq_user, request->rq_host); } } else { request->rq_perm = svr_get_privilege(request->rq_user, request->rq_host); } } /* END else (svr_conn[sfds].cn_authen == PBS_NET_CONN_FROM_PRIVIL) */ /* if server shutting down, disallow new jobs and new running */ if (server.sv_attr[SRV_ATR_State].at_val.at_long > SV_STATE_RUN) { switch (request->rq_type) { case PBS_BATCH_AsyrunJob: case PBS_BATCH_JobCred: case PBS_BATCH_MoveJob: case PBS_BATCH_QueueJob: case PBS_BATCH_RunJob: case PBS_BATCH_StageIn: case PBS_BATCH_jobscript: req_reject(PBSE_SVRDOWN, 0, request, NULL, NULL); return; /*NOTREACHED*/ break; } } #else /* THIS CODE FOR MOM ONLY */ { /*extern tree *okclients; */ extern void mom_server_update_receive_time_by_ip(u_long ipaddr, const char *cmd); /* check connecting host against allowed list of ok clients */ if (LOGLEVEL >= 6) { sprintf(log_buffer, "request type %s from host %s received", reqtype_to_txt(request->rq_type), request->rq_host); log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, id, log_buffer); } /* if (!tfind(svr_conn[sfds].cn_addr, &okclients)) */ if (!AVL_is_in_tree(svr_conn[sfds].cn_addr, 0, okclients)) { sprintf(log_buffer, "request type %s from host %s rejected (host not authorized)", reqtype_to_txt(request->rq_type), request->rq_host); log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, id, log_buffer); req_reject(PBSE_BADHOST, 0, request, NULL, "request not authorized"); close_client(sfds); return; } if (LOGLEVEL >= 3) { sprintf(log_buffer, "request type %s from host %s allowed", reqtype_to_txt(request->rq_type), request->rq_host); log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, id, log_buffer); } mom_server_update_receive_time_by_ip(svr_conn[sfds].cn_addr, reqtype_to_txt(request->rq_type)); } /* END BLOCK */ request->rq_fromsvr = 1; request->rq_perm = ATR_DFLAG_USRD | ATR_DFLAG_USWR | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR | ATR_DFLAG_MGRD | ATR_DFLAG_MGWR | ATR_DFLAG_SvWR | ATR_DFLAG_MOM; #endif /* END else !PBS_MOM */ /* * dispatch the request to the correct processing function. * The processing function must call reply_send() to free * the request struture. */ dispatch_request(sfds, request); return; } /* END process_request() */
void svr_shutdown( int type) /* I */ { pbs_attribute *pattr; job *pjob; long state = SV_STATE_DOWN; int iter; char log_buf[LOCAL_LOG_BUF_SIZE]; close(lockfds); save_queues(); /* Lets start by logging shutdown and saving everything */ get_svr_attr_l(SRV_ATR_State, &state); strcpy(log_buf, msg_shutdown_start); if (state == SV_STATE_SHUTIMM) { /* if already shuting down, another Immed/sig will force it */ if ((type == SHUT_IMMEDIATE) || (type == SHUT_SIG)) { state = SV_STATE_DOWN; set_svr_attr(SRV_ATR_State, &state); strcat(log_buf, "Forced"); log_event( PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, msg_daemonname, log_buf); return; } } if (type == SHUT_IMMEDIATE) { state = SV_STATE_SHUTIMM; set_svr_attr(SRV_ATR_State, &state); strcat(log_buf, "Immediate"); } else if (type == SHUT_DELAY) { state = SV_STATE_SHUTDEL; set_svr_attr(SRV_ATR_State, &state); strcat(log_buf, "Delayed"); } else if (type == SHUT_QUICK) { state = SV_STATE_DOWN; /* set to down to brk pbsd_main loop */ set_svr_attr(SRV_ATR_State, &state); strcat(log_buf, "Quick"); } else { state = SV_STATE_SHUTIMM; set_svr_attr(SRV_ATR_State, &state); strcat(log_buf, "By Signal"); } log_event( PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, msg_daemonname, log_buf); if ((type == SHUT_QUICK) || (type == SHUT_SIG)) /* quick, leave jobs as are */ { return; } svr_save(&server, SVR_SAVE_QUICK); iter = -1; while ((pjob = next_job(&alljobs,&iter)) != NULL) { if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) { pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HOTSTART | JOB_SVFLG_HASRUN; pattr = &pjob->ji_wattr[JOB_ATR_checkpoint]; if ((pattr->at_flags & ATR_VFLAG_SET) && ((csv_find_string(pattr->at_val.at_str, "s") != NULL) || (csv_find_string(pattr->at_val.at_str, "c") != NULL) || (csv_find_string(pattr->at_val.at_str, "shutdown") != NULL))) { /* do checkpoint of job */ if (shutdown_checkpoint(&pjob) == 0) { if (pjob != NULL) unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); continue; } } /* if no checkpoint (not supported, not allowed, or fails */ /* rerun if possible, else kill job */ rerun_or_kill(&pjob, msg_on_shutdown); } if (pjob != NULL) unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL); } return; } /* END svr_shutdown() */
void req_holdjob( struct batch_request *preq) { long *hold_val; int newstate; int newsub; long old_hold; job *pjob; char *pset; int rc; attribute temphold; attribute *pattr; pjob = chk_job_request(preq->rq_ind.rq_hold.rq_orig.rq_objname, preq); if (pjob == NULL) { return; } if (is_cloud_job(pjob)) { req_reject(PBSE_CLOUD_REQUEST,0,preq,NULL,NULL); } /* cannot do anything until we decode the holds to be set */ if ((rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &pset, &temphold)) != 0) { req_reject(rc, 0, preq, NULL, NULL); return; } /* if other than HOLD_u is being set, must have privil */ if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0) { req_reject(rc, 0, preq, NULL, NULL); return; } hold_val = &pjob->ji_wattr[(int)JOB_ATR_hold].at_val.at_long; old_hold = *hold_val; *hold_val |= temphold.at_val.at_long; pjob->ji_wattr[(int)JOB_ATR_hold].at_flags |= ATR_VFLAG_SET; sprintf(log_buffer, msg_jobholdset, pset, preq->rq_user, preq->rq_host); pattr = &pjob->ji_wattr[(int)JOB_ATR_checkpoint]; if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) && ((pattr->at_flags & ATR_VFLAG_SET) && ((csv_find_string(pattr->at_val.at_str, "s") != NULL) || (csv_find_string(pattr->at_val.at_str, "c") != NULL) || (csv_find_string(pattr->at_val.at_str, "enabled") != NULL)))) { /* have MOM attempt checkpointing */ if ((rc = relay_to_mom(pjob->ji_qs.ji_un.ji_exect.ji_momaddr, preq, process_hold_reply)) != 0) { *hold_val = old_hold; /* reset to the old value */ req_reject(rc, 0, preq, NULL, NULL); } else { pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_FILE; job_save(pjob, SAVEJOB_QUICK); /* fill in log_buffer again, since relay_to_mom changed it */ sprintf(log_buffer, msg_jobholdset, pset, preq->rq_user, preq->rq_host); LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer); } } #ifdef ENABLE_BLCR else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) { /* * This system is configured with BLCR checkpointing to be used, * but this Running job does not have checkpointing enabled, * so we reject the request */ LOG_EVENT( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer); req_reject(PBSE_IVALREQ, 0, preq, NULL, "job not held since checkpointing is expected but not enabled for job"); } #endif else { /* everything went well, may need to update the job state */ LOG_EVENT( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer); if (old_hold != *hold_val) { /* indicate attributes changed */ pjob->ji_modified = 1; svr_evaljobstate(pjob, &newstate, &newsub, 0); svr_setjobstate(pjob, newstate, newsub); } reply_ack(preq); } } /* END req_holdjob() */