static job *locate_new_job( int sock, /* I */ char *jobid) /* I (optional) */ { job *pj; pj = (job *)GET_NEXT(svr_newjobs); while (pj != NULL) { if ((pj->ji_qs.ji_un.ji_newt.ji_fromsock == -1) || ((pj->ji_qs.ji_un.ji_newt.ji_fromsock == sock) && (pj->ji_qs.ji_un.ji_newt.ji_fromaddr == get_connectaddr(sock)))) { if ((jobid != NULL) && (*jobid != '\0')) { if (!strncmp(pj->ji_qs.ji_jobid, jobid, PBS_MAXSVRJOBID)) { /* requested job located */ break; } } else if (pj->ji_qs.ji_un.ji_newt.ji_fromsock == -1) { /* empty job slot located */ break; } else { /* matching job slot located */ break; } } /* END if ((pj->ji_qs.ji_un.ji_newt.ji_fromsock == -1) || ...) */ pj = (job *)GET_NEXT(pj->ji_alljobs); } /* END while(pj != NULL) */ /* return job slot located (NULL on FAILURE) */ return(pj); } /* END locate_new_job() */
void process_request(int sfds) { int rc; struct batch_request *request; conn_t *conn; time_now = time(NULL); conn = get_conn(sfds); if (!conn) { log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_REQUEST, LOG_ERR, "process_request", "did not find socket in connection table"); #ifdef WIN32 (void)closesocket(sfds); #else (void)close(sfds); #endif return; } if ((request = alloc_br(0)) == NULL) { log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_REQUEST, LOG_ERR, "process_request", "Unable to allocate request structure"); close_conn(sfds); return; } request->rq_conn = sfds; /* * Read in the request and decode it to the internal request structure. */ if (get_connecthost(sfds, request->rq_host, PBS_MAXHOSTNAME)) { (void)sprintf(log_buffer, "%s: %lu", msg_reqbadhost, get_connectaddr(sfds)); log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_REQUEST, LOG_DEBUG, "", log_buffer); req_reject(PBSE_BADHOST, 0, request); return; } #ifndef PBS_MOM if (conn->cn_active == FromClientDIS) { rc = dis_request_read(sfds, request); } else { log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_REQUEST, LOG_ERR, "process_req", "request on invalid type of connection"); close_conn(sfds); free_br(request); return; } #else /* PBS_MOM */ rc = dis_request_read(sfds, request); #endif /* PBS_MOM */ if (rc == -1) { /* End of file */ close_client(sfds); free_br(request); return; } else if ((rc == PBSE_SYSTEM) || (rc == PBSE_INTERNAL)) { /* read error, likely cannot send reply so just disconnect */ /* ??? not sure about this ??? */ close_client(sfds); free_br(request); return; } else if (rc > 0) { /* * request didn't decode, either garbage or unknown * request type, in ether case, return reject-reply */ req_reject(rc, 0, request); close_client(sfds); return; } #ifndef PBS_MOM /* If the request is coming on the socket we opened to the */ /* scheduler, change the "user" from "root" to "Scheduler" */ if (find_sched_from_sock(request->rq_conn) != NULL) { strncpy(request->rq_user, PBS_SCHED_DAEMON_NAME, PBS_MAXUSER); request->rq_user[PBS_MAXUSER] = '\0'; } #endif /* PBS_MOM */ (void)sprintf(log_buffer, msg_request, request->rq_type, request->rq_user, request->rq_host, sfds); log_event(PBSEVENT_DEBUG2, PBS_EVENTCLASS_REQUEST, LOG_DEBUG, "", log_buffer); /* is the request from a host acceptable to the server */ if (request->rq_type == PBS_BATCH_AuthExternal) { rc = authenticate_external(conn, request); if (rc == 0) reply_ack(request); else if (rc == -2) req_reject(PBSE_NOSUP, 0, request); else req_reject(PBSE_BADCRED, 0, request); return; } #ifndef PBS_MOM if (server.sv_attr[(int)SRV_ATR_acl_host_enable].at_val.at_long) { /* acl enabled, check it; always allow myself */ struct pbsnode *isanode = NULL; if ((server.sv_attr[SRV_ATR_acl_host_moms_enable].at_flags & ATR_VFLAG_SET) && (server.sv_attr[(int)SRV_ATR_acl_host_moms_enable].at_val.at_long == 1)) { isanode = find_nodebyaddr(get_connectaddr(sfds)); if ((isanode != NULL) && (isanode->nd_state & INUSE_DELETED)) isanode = NULL; } if (isanode == NULL) { if ((acl_check(&server.sv_attr[(int)SRV_ATR_acl_hosts], request->rq_host, ACL_Host) == 0) && (strcasecmp(server_host, request->rq_host) != 0)) { req_reject(PBSE_BADHOST, 0, request); close_client(sfds); return; } } } /* * determine source (user client or another server) of request. * set the permissions granted to the client */ if (conn->cn_authen & PBS_NET_CONN_FROM_PRIVIL) { /* request came from another server */ request->rq_fromsvr = 1; request->rq_perm = ATR_DFLAG_USRD | ATR_DFLAG_USWR | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR | ATR_DFLAG_MGRD | ATR_DFLAG_MGWR | ATR_DFLAG_SvWR; } else { /* request not from another server */ request->rq_fromsvr = 0; /* * Client must be authenticated by a Authenticate User Request, * if not, reject request and close connection. * -- The following is retained for compat with old cmds -- * The exception to this is of course the Connect Request which * cannot have been authenticated, because it contains the * needed ticket; so trap it here. Of course, there is no * prior authentication on the Authenticate User request either, * but it comes over a reserved port and appears from another * server, hence is automatically granted authorization. */ if (request->rq_type == PBS_BATCH_Connect) { req_connect(request); return; } if ((conn->cn_authen & PBS_NET_CONN_AUTHENTICATED) ==0) { rc = PBSE_BADCRED; } else { rc = authenticate_user(request, conn); } if (rc != 0) { req_reject(rc, 0, request); if (rc == PBSE_BADCRED) close_client(sfds); return; } request->rq_perm = svr_get_privilege(request->rq_user, request->rq_host); } /* if server shutting down, disallow new jobs and new running */ if (server.sv_attr[(int)SRV_ATR_State].at_val.at_long > SV_STATE_RUN) { switch (request->rq_type) { case PBS_BATCH_AsyrunJob: case PBS_BATCH_JobCred: case PBS_BATCH_UserCred: case PBS_BATCH_UserMigrate: case PBS_BATCH_MoveJob: case PBS_BATCH_QueueJob: case PBS_BATCH_RunJob: case PBS_BATCH_StageIn: case PBS_BATCH_jobscript: req_reject(PBSE_SVRDOWN, 0, request); return; } } #else /* THIS CODE FOR MOM ONLY */ /* check connecting host against allowed list of ok clients */ if (!addrfind(conn->cn_addr)) { req_reject(PBSE_BADHOST, 0, request); close_client(sfds); return; } request->rq_fromsvr = 1; request->rq_perm = ATR_DFLAG_USRD | ATR_DFLAG_USWR | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR | ATR_DFLAG_MGRD | ATR_DFLAG_MGWR | ATR_DFLAG_SvWR | ATR_DFLAG_MOM; #endif /* * dispatch the request to the correct processing function. * The processing function must call reply_send() to free * the request struture. */ dispatch_request(sfds, request); return; }
void req_commit( struct batch_request *preq) /* I */ { job *pj; pj = locate_new_job(preq->rq_conn, preq->rq_ind.rq_commit); if (LOGLEVEL >= 6) { log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL", "committing job"); } if (pj == NULL) { req_reject(PBSE_UNKJOBID, 0, preq, NULL, NULL); return; } if (pj->ji_qs.ji_substate != JOB_SUBSTATE_TRANSICM) { log_err(errno, "req_commit", "cannot commit job in unexpected state"); req_reject(PBSE_IVALREQ, 0, preq, NULL, NULL); return; } /* move job from new job list to "all" job list, set to running state */ delete_link(&pj->ji_alljobs); append_link(&svr_alljobs, &pj->ji_alljobs, pj); /* ** Set JOB_SVFLG_HERE to indicate that this is Mother Superior. */ pj->ji_qs.ji_svrflags |= JOB_SVFLG_HERE; pj->ji_qs.ji_state = JOB_STATE_RUNNING; pj->ji_qs.ji_substate = JOB_SUBSTATE_PRERUN; pj->ji_qs.ji_un_type = JOB_UNION_TYPE_MOM; pj->ji_qs.ji_un.ji_momt.ji_svraddr = get_connectaddr(preq->rq_conn); pj->ji_qs.ji_un.ji_momt.ji_exitstat = 0; /* For MOM - start up the job (blocks) */ if (LOGLEVEL >= 6) { log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL", "starting job execution"); } start_exec(pj); if (LOGLEVEL >= 6) { log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL", "job execution started"); } /* if start request fails, reply with failure string */ if (pj->ji_qs.ji_substate == JOB_SUBSTATE_EXITING) { char tmpLine[1024]; if ((pj->ji_hosts != NULL) && (pj->ji_nodekill >= 0) && (pj->ji_hosts[pj->ji_nodekill].hn_host != NULL)) { sprintf(tmpLine, "start failed on node %s", pj->ji_hosts[pj->ji_nodekill].hn_host); } else { sprintf(tmpLine, "start failed on unknown node"); } if (LOGLEVEL >= 6) { log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL", tmpLine); } reply_text(preq, 0, tmpLine); } else { reply_jobid(preq, pj->ji_qs.ji_jobid, BATCH_REPLY_CHOICE_Commit); } job_save(pj, SAVEJOB_FULL); /* NOTE: we used to flag JOB_ATR_errpath, JOB_ATR_outpath, * JOB_ATR_session_id, and JOB_ATR_altid as modified at this point to make sure * pbs_server got these attr values. This worked fine before TORQUE modified * job launched into an async process. At 2.0.0p6, a new attribute "SEND" flag * was added to handle this process. */ return; } /* END req_commit() */
void req_quejob( struct batch_request *preq) /* ptr to the decoded request */ { char *id = "req_quejob"; char basename[PBS_JOBBASE + 1]; int created_here = 0; int index; char *jid; attribute_def *pdef; job *pj; svrattrl *psatl; int rc; int sock = preq->rq_conn; int IsCheckpoint = 0; /* set basic (user) level access permission */ resc_access_perm = ATR_DFLAG_USWR | ATR_DFLAG_Creat; if (PBSNodeCheckProlog) { check_state(1); mom_server_all_update_stat(); if (internal_state & INUSE_DOWN) { req_reject(PBSE_MOMREJECT,0,preq,NULL,NULL); return; } } if (preq->rq_fromsvr) { /* from another server - accept the extra attributes */ resc_access_perm |= ATR_DFLAG_MGWR | ATR_DFLAG_SvWR | ATR_DFLAG_MOM; jid = preq->rq_ind.rq_queuejob.rq_jid; } else { /* request must be from server */ log_err(errno, id, "request not from server"); req_reject(PBSE_IVALREQ, 0, preq, NULL, "request not received from server"); return; } /* does job already exist, check both old and new jobs */ if ((pj = find_job(jid)) == NULL) { pj = (job *)GET_NEXT(svr_newjobs); while (pj != NULL) { if (!strcmp(pj->ji_qs.ji_jobid, jid)) break; pj = (job *)GET_NEXT(pj->ji_alljobs); } } /* * New job ... * * for MOM - rather than make up a hashname, we use the name sent * to us by the server as an attribute. */ psatl = (svrattrl *)GET_NEXT(preq->rq_ind.rq_queuejob.rq_attr); while (psatl != NULL) { if (!strcmp(psatl->al_name,ATTR_hashname)) { strcpy(basename,psatl->al_value); break; } psatl = (svrattrl *)GET_NEXT(psatl->al_link); } if (pj != NULL) { /* newly queued job already exists */ if (pj->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING) { /* FAILURE - job exists and is running */ log_err(errno,id,"cannot queue new job, job exists and is running"); req_reject(PBSE_JOBEXIST,0,preq,NULL,"job is running"); return; } /* if checkpointed, then keep old and skip rest of process */ if (pj->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) { IsCheckpoint = 1; } /* END if (pj->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) */ else { /* unlink job from svr_alljobs since it will be placed on newjobs */ delete_link(&pj->ji_alljobs); } } /* END if (pj != NULL) */ else { /* if not already here, allocate job struct */ if ((pj = job_alloc()) == NULL) { /* FAILURE */ req_reject(PBSE_SYSTEM, 0, preq, NULL, "cannot allocate new job structure"); return; } } /* END else (pj != NULL) */ if (IsCheckpoint == 0) { strcpy(pj->ji_qs.ji_jobid,jid); strcpy(pj->ji_qs.ji_fileprefix,basename); pj->ji_modified = 1; pj->ji_qs.ji_svrflags = created_here; pj->ji_qs.ji_un_type = JOB_UNION_TYPE_NEW; } /* decode attributes from request into job structure */ psatl = (svrattrl *)GET_NEXT(preq->rq_ind.rq_queuejob.rq_attr); while (psatl != NULL) { if (IsCheckpoint == 1) { if (strcmp(psatl->al_name,ATTR_checkpoint_name) && strcmp(psatl->al_name,ATTR_v)) { psatl = (svrattrl *)GET_NEXT(psatl->al_link); continue; } } /* identify the attribute by name */ index = find_attr(job_attr_def,psatl->al_name,JOB_ATR_LAST); if (index < 0) { /* FAILURE */ /* didn`t recognize the name */ job_purge(pj); /* CRI - 12/20/2004 */ reply_badattr(PBSE_NOATTR,1,psatl,preq); return; } pdef = &job_attr_def[index]; /* Is attribute not writeable by manager or by a server? */ if ((pdef->at_flags & resc_access_perm) == 0) { /* FAILURE */ job_purge(pj); reply_badattr(PBSE_ATTRRO,1,psatl,preq); return; } /* decode attribute */ if (!strcmp(psatl->al_name,ATTR_v)) { rc = decode_arst_merge( &pj->ji_wattr[index], psatl->al_name, psatl->al_resc, psatl->al_value); } else { rc = pdef->at_decode( &pj->ji_wattr[index], psatl->al_name, psatl->al_resc, psatl->al_value); } if (rc != 0) { /* FAILURE */ /* all errors are fatal for MOM */ job_purge(pj); reply_badattr(rc,1,psatl,preq); return; } if (psatl->al_op == DFLT) { if (psatl->al_resc) { resource *presc; resource_def *prdef; prdef = find_resc_def(svr_resc_def,psatl->al_resc,svr_resc_size); if (prdef == NULL) { job_purge(pj); reply_badattr(rc,1,psatl, preq); return; } presc = find_resc_entry(&pj->ji_wattr[index],prdef); if (presc != NULL) presc->rs_value.at_flags |= ATR_VFLAG_DEFLT; } else { pj->ji_wattr[index].at_flags |= ATR_VFLAG_DEFLT; } } /* END if (psatl->al_op == DFLT) */ psatl = (svrattrl *)GET_NEXT(psatl->al_link); } /* END while (psatl != NULL) */ if (IsCheckpoint == 1) { pj->ji_qs.ji_substate = JOB_SUBSTATE_TRANSIN; if (reply_jobid(preq,pj->ji_qs.ji_jobid,BATCH_REPLY_CHOICE_Queue) == 0) { delete_link(&pj->ji_alljobs); append_link(&svr_newjobs,&pj->ji_alljobs,pj); pj->ji_qs.ji_un_type = JOB_UNION_TYPE_NEW; pj->ji_qs.ji_un.ji_newt.ji_fromsock = sock; pj->ji_qs.ji_un.ji_newt.ji_fromaddr = get_connectaddr(sock); pj->ji_qs.ji_un.ji_newt.ji_scriptsz = 0; /* Per Eric R., req_mvjobfile was giving error in open_std_file, showed up as fishy error message */ if (pj->ji_grpcache != NULL) { free(pj->ji_grpcache); pj->ji_grpcache = NULL; } } else { close_conn(sock); } /* SUCCESS */ return; } /* set remaining job structure elements */ pj->ji_qs.ji_state = JOB_STATE_TRANSIT; pj->ji_qs.ji_substate = JOB_SUBSTATE_TRANSIN; pj->ji_wattr[(int)JOB_ATR_mtime].at_val.at_long = (long)time_now; pj->ji_wattr[(int)JOB_ATR_mtime].at_flags |= ATR_VFLAG_SET; pj->ji_qs.ji_un_type = JOB_UNION_TYPE_NEW; pj->ji_qs.ji_un.ji_newt.ji_fromsock = sock; pj->ji_qs.ji_un.ji_newt.ji_fromaddr = get_connectaddr(sock); pj->ji_qs.ji_un.ji_newt.ji_scriptsz = 0; /* acknowledge the request with the job id */ if (reply_jobid(preq, pj->ji_qs.ji_jobid, BATCH_REPLY_CHOICE_Queue) != 0) { /* reply failed, purge the job and close the connection */ close_conn(sock); job_purge(pj); return; } /* link job into server's new jobs list request */ append_link(&svr_newjobs, &pj->ji_alljobs, pj); return; } /* END req_quejob() */
void process_request( int sfds) /* file descriptor (socket) to get request */ { #ifdef PBS_MOM char *id = "process_request"; #endif int rc; struct batch_request *request = NULL; #ifndef PBS_MOM char *auth_err = NULL; #endif time_now = time(NULL); request = alloc_br(0); request->rq_conn = sfds; /* * Read in the request and decode it to the internal request structure. */ #ifndef PBS_MOM if (svr_conn[sfds].cn_active == FromClientDIS) { #ifdef ENABLE_UNIX_SOCKETS if ((svr_conn[sfds].cn_socktype & PBS_SOCK_UNIX) && (svr_conn[sfds].cn_authen != PBS_NET_CONN_AUTHENTICATED)) { get_creds(sfds, conn_credent[sfds].username, conn_credent[sfds].hostname); } #endif /* END ENABLE_UNIX_SOCKETS */ rc = dis_request_read(sfds, request); } else { LOG_EVENT( PBSEVENT_SYSTEM, PBS_EVENTCLASS_REQUEST, "process_req", "request on invalid type of connection"); close_conn(sfds); free_br(request); return; } #else /* PBS_MOM */ rc = dis_request_read(sfds, request); #endif /* PBS_MOM */ if (rc == -1) { /* FAILURE */ /* premature end of file */ close_client(sfds); free_br(request); return; } if ((rc == PBSE_SYSTEM) || (rc == PBSE_INTERNAL)) { /* FAILURE */ /* read error, likely cannot send reply so just disconnect */ /* ??? not sure about this ??? */ close_client(sfds); free_br(request); return; } if (rc > 0) { /* FAILURE */ /* * request didn't decode, either garbage or unknown * request type, in either case, return reject-reply */ req_reject(rc, 0, request, NULL, "cannot decode message"); close_client(sfds); return; } if (get_connecthost(sfds, request->rq_host, PBS_MAXHOSTNAME) != 0) { char tmpLine[1024]; sprintf(log_buffer, "%s: %lu", pbse_to_txt(PBSE_BADHOST), get_connectaddr(sfds)); LOG_EVENT(PBSEVENT_DEBUG, PBS_EVENTCLASS_REQUEST, "", log_buffer); snprintf(tmpLine, sizeof(tmpLine), "cannot determine hostname for connection from %lu", get_connectaddr(sfds)); req_reject(PBSE_BADHOST, 0, request, NULL, tmpLine); return; } if (LOGLEVEL >= 1) { sprintf( log_buffer, msg_request, reqtype_to_txt(request->rq_type), request->rq_user, request->rq_host, sfds); LOG_EVENT(PBSEVENT_DEBUG2, PBS_EVENTCLASS_REQUEST, "", log_buffer); } /* is the request from a host acceptable to the server */ #ifndef PBS_MOM if (svr_conn[sfds].cn_socktype & PBS_SOCK_UNIX) { strcpy(request->rq_host, server_name); } if (server.sv_attr[SRV_ATR_acl_host_enable].at_val.at_long) { /* acl enabled, check it; always allow myself and nodes */ struct pbsnode *isanode; isanode = PGetNodeFromAddr(get_connectaddr(sfds)); if ((isanode == NULL) && (strcmp(server_host, request->rq_host) != 0) && (acl_check( &server.sv_attr[SRV_ATR_acl_hosts], request->rq_host, ACL_Host) == 0)) { char tmpLine[1024]; snprintf(tmpLine, sizeof(tmpLine), "request not authorized from host %s", request->rq_host); req_reject(PBSE_BADHOST, 0, request, NULL, tmpLine); close_client(sfds); return; } } /* * determine source (user client or another server) of request. * set the permissions granted to the client */ if (svr_conn[sfds].cn_authen == PBS_NET_CONN_FROM_PRIVIL) { /* request came from another server */ request->rq_fromsvr = 1; request->rq_perm = ATR_DFLAG_USRD | ATR_DFLAG_USWR | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR | ATR_DFLAG_MGRD | ATR_DFLAG_MGWR | ATR_DFLAG_SvWR; } else { /* request not from another server */ request->rq_fromsvr = 0; /* * Client must be authenticated by an Authenticate User Request, if not, * reject request and close connection. -- The following is retained for * compat with old cmds -- The exception to this is of course the Connect * Request which cannot have been authenticated, because it contains the * needed ticket; so trap it here. Of course, there is no prior * authentication on the Authenticate User request either, but it comes * over a reserved port and appears from another server, hence is * automatically granted authentication. * * The above is only true with inet sockets. With unix domain sockets, the * user creds were read before the first dis_request_read call above. * We automatically granted authentication because we can trust the socket * creds. Authorization is still granted in svr_get_privilege below */ if (request->rq_type == PBS_BATCH_Connect) { req_connect(request); if (svr_conn[sfds].cn_socktype == PBS_SOCK_INET) return; } if (svr_conn[sfds].cn_socktype & PBS_SOCK_UNIX) { conn_credent[sfds].timestamp = time_now; svr_conn[sfds].cn_authen = PBS_NET_CONN_AUTHENTICATED; } if (ENABLE_TRUSTED_AUTH == TRUE ) rc = 0; /* bypass the authentication of the user--trust the client completely */ else if (munge_on) { /* If munge_on is true we will validate the connection now */ if ( request->rq_type == PBS_BATCH_AltAuthenUser) { rc = req_altauthenuser(request); if (rc == PBSE_NONE) { conn_credent[sfds].timestamp = time_now; svr_conn[sfds].cn_authen = PBS_NET_CONN_AUTHENTICATED; } return; } else if (svr_conn[sfds].cn_authen != PBS_NET_CONN_AUTHENTICATED) /* skip checking user if we did not get an authenticated credential */ rc = PBSE_BADCRED; else { rc = authenticate_user(request, &conn_credent[sfds], &auth_err); } } else if (svr_conn[sfds].cn_authen != PBS_NET_CONN_AUTHENTICATED) rc = PBSE_BADCRED; else rc = authenticate_user(request, &conn_credent[sfds], &auth_err); if (rc != 0) { req_reject(rc, 0, request, NULL, auth_err); if (auth_err != NULL) free(auth_err); close_client(sfds); return; } /* * pbs_mom and checkpoint restart scripts both need the authority to do * alters and releases on checkpointable jobs. Allow manager permission * for root on the jobs execution node. */ if (((request->rq_type == PBS_BATCH_ModifyJob) || (request->rq_type == PBS_BATCH_ReleaseJob)) && (strcmp(request->rq_user, PBS_DEFAULT_ADMIN) == 0)) { job *pjob; char *dptr; int skip = FALSE; char short_host[PBS_MAXHOSTNAME+1]; /* make short host name */ strcpy(short_host, request->rq_host); if ((dptr = strchr(short_host, '.')) != NULL) { *dptr = '\0'; } if (((pjob = find_job(request->rq_ind.rq_modify.rq_objname)) != (job *)0) && (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)) { if ((pjob->ji_wattr[JOB_ATR_checkpoint].at_flags & ATR_VFLAG_SET) && ((csv_find_string(pjob->ji_wattr[JOB_ATR_checkpoint].at_val.at_str, "s") != NULL) || (csv_find_string(pjob->ji_wattr[JOB_ATR_checkpoint].at_val.at_str, "c") != NULL) || (csv_find_string(pjob->ji_wattr[JOB_ATR_checkpoint].at_val.at_str, "enabled") != NULL)) && (strstr(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, short_host) != NULL)) { request->rq_perm = svr_get_privilege(request->rq_user, server_host); skip = TRUE; } } if (!skip) { request->rq_perm = svr_get_privilege(request->rq_user, request->rq_host); } } else { request->rq_perm = svr_get_privilege(request->rq_user, request->rq_host); } } /* END else (svr_conn[sfds].cn_authen == PBS_NET_CONN_FROM_PRIVIL) */ /* if server shutting down, disallow new jobs and new running */ if (server.sv_attr[SRV_ATR_State].at_val.at_long > SV_STATE_RUN) { switch (request->rq_type) { case PBS_BATCH_AsyrunJob: case PBS_BATCH_JobCred: case PBS_BATCH_MoveJob: case PBS_BATCH_QueueJob: case PBS_BATCH_RunJob: case PBS_BATCH_StageIn: case PBS_BATCH_jobscript: req_reject(PBSE_SVRDOWN, 0, request, NULL, NULL); return; /*NOTREACHED*/ break; } } #else /* THIS CODE FOR MOM ONLY */ { /*extern tree *okclients; */ extern void mom_server_update_receive_time_by_ip(u_long ipaddr, const char *cmd); /* check connecting host against allowed list of ok clients */ if (LOGLEVEL >= 6) { sprintf(log_buffer, "request type %s from host %s received", reqtype_to_txt(request->rq_type), request->rq_host); log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, id, log_buffer); } /* if (!tfind(svr_conn[sfds].cn_addr, &okclients)) */ if (!AVL_is_in_tree(svr_conn[sfds].cn_addr, 0, okclients)) { sprintf(log_buffer, "request type %s from host %s rejected (host not authorized)", reqtype_to_txt(request->rq_type), request->rq_host); log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, id, log_buffer); req_reject(PBSE_BADHOST, 0, request, NULL, "request not authorized"); close_client(sfds); return; } if (LOGLEVEL >= 3) { sprintf(log_buffer, "request type %s from host %s allowed", reqtype_to_txt(request->rq_type), request->rq_host); log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, id, log_buffer); } mom_server_update_receive_time_by_ip(svr_conn[sfds].cn_addr, reqtype_to_txt(request->rq_type)); } /* END BLOCK */ request->rq_fromsvr = 1; request->rq_perm = ATR_DFLAG_USRD | ATR_DFLAG_USWR | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR | ATR_DFLAG_MGRD | ATR_DFLAG_MGWR | ATR_DFLAG_SvWR | ATR_DFLAG_MOM; #endif /* END else !PBS_MOM */ /* * dispatch the request to the correct processing function. * The processing function must call reply_send() to free * the request struture. */ dispatch_request(sfds, request); return; } /* END process_request() */
void req_commit( struct batch_request *preq) /* I */ { unsigned int momport = 0; int rc; job *pj = locate_new_job(preq->rq_conn, preq->rq_ind.rq_commit); if (LOGLEVEL >= 6) { log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL", "committing job"); } if (pj == NULL) { req_reject(PBSE_UNKJOBID, 0, preq, NULL, NULL); return; } if (pj->ji_qs.ji_substate != JOB_SUBSTATE_TRANSICM) { log_err(errno, "req_commit", (char *)"cannot commit job in unexpected state"); req_reject(PBSE_IVALREQ, 0, preq, NULL, NULL); return; } /* move job from new job list to "all" job list, set to running state */ delete_link(&pj->ji_alljobs); alljobs_list.push_back(pj); /* ** Set JOB_SVFLG_HERE to indicate that this is Mother Superior. */ pj->ji_qs.ji_svrflags |= JOB_SVFLG_HERE; pj->ji_qs.ji_state = JOB_STATE_RUNNING; pj->ji_qs.ji_substate = JOB_SUBSTATE_PRERUN; pj->ji_qs.ji_un_type = JOB_UNION_TYPE_MOM; pj->ji_qs.ji_un.ji_momt.ji_svraddr = get_connectaddr(preq->rq_conn,FALSE); pj->ji_qs.ji_un.ji_momt.ji_exitstat = 0; /* For MOM - start up the job (blocks) */ if (LOGLEVEL >= 6) log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pj->ji_qs.ji_jobid, "req_commit:starting job execution"); rc = start_exec(pj); if (LOGLEVEL >= 6) { log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pj->ji_qs.ji_jobid, "req_commit:job execution started"); } /* if start request fails, reply with failure string */ if (pj->ji_qs.ji_substate == JOB_SUBSTATE_EXITING) { char tmpLine[1024]; if ((pj->ji_hosts != NULL) && (pj->ji_nodekill >= 0) && (pj->ji_hosts[pj->ji_nodekill].hn_host != NULL)) { sprintf(tmpLine, "start failed on node %s", pj->ji_hosts[pj->ji_nodekill].hn_host); } else { sprintf(tmpLine, "start failed on unknown node"); } if (LOGLEVEL >= 6) { log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pj->ji_qs.ji_jobid, tmpLine); } reply_text(preq, rc, tmpLine); } else { reply_sid(preq, pj->ji_wattr[JOB_ATR_session_id].at_val.at_long,BATCH_REPLY_CHOICE_Text); } if (multi_mom) { momport = pbs_rm_port; } job_save(pj, SAVEJOB_FULL, momport); #ifdef NVIDIA_GPUS /* * Does this job have a gpuid assigned? * if so, then update gpu status */ if ((use_nvidia_gpu) && ((pj->ji_wattr[JOB_ATR_exec_gpus].at_flags & ATR_VFLAG_SET) != 0) && (pj->ji_wattr[JOB_ATR_exec_gpus].at_val.at_str != NULL)) { send_update_soon(); } #endif /* NVIDIA_GPUS */ /* NOTE: we used to flag JOB_ATR_errpath, JOB_ATR_outpath, * JOB_ATR_session_id, and JOB_ATR_altid as modified at this point to make sure * pbs_server got these attr values. This worked fine before TORQUE modified * job launched into an async process. At 2.0.0p6, a new pbs_attribute "SEND" flag * was added to handle this process. */ return; } /* END req_commit() */
void mom_req_quejob( batch_request *preq) /* ptr to the decoded request */ { char basename[PBS_JOBBASE + 1]; int created_here = 0; int index; char *jid; attribute_def *pdef; job *pj; svrattrl *psatl; int rc; int sock = preq->rq_conn; int IsCheckpoint = 0; /* set basic (user) level access permission */ int resc_access_perm = ATR_DFLAG_USWR | ATR_DFLAG_Creat; memset(basename, 0, sizeof(basename)); if (PBSNodeCheckProlog) { check_state(1); if (internal_state & INUSE_DOWN) { req_reject(PBSE_BADMOMSTATE, 0, preq, NULL, NULL); return; } } if (reject_job_submit == TRUE) { req_reject(-1, 0, preq, NULL, "This mom is configured not to run jobs"); return; } if (preq->rq_fromsvr) { /* from another server - accept the extra attributes */ resc_access_perm |= ATR_DFLAG_MGWR | ATR_DFLAG_SvWR | ATR_DFLAG_MOM; jid = preq->rq_ind.rq_queuejob.rq_jid; } else { /* request must be from server */ log_err(errno, __func__, (char *)"request not from server"); req_reject(PBSE_IVALREQ, 0, preq, NULL, "request not received from server"); return; } /* does job already exist, check both old and new jobs */ if ((pj = mom_find_job(jid)) == NULL) { pj = (job *)GET_NEXT(svr_newjobs); while (pj != NULL) { if (!strcmp(pj->ji_qs.ji_jobid, jid)) break; pj = (job *)GET_NEXT(pj->ji_alljobs); } } /* * New job ... * * for MOM - rather than make up a hashname, we use the name sent * to us by the server as an pbs_attribute. */ psatl = (svrattrl *)GET_NEXT(preq->rq_ind.rq_queuejob.rq_attr); while (psatl != NULL) { if (!strcmp(psatl->al_name,ATTR_hashname)) { snprintf(basename, sizeof(basename), "%s", psatl->al_value); break; } psatl = (svrattrl *)GET_NEXT(psatl->al_link); } if (basename[0] == '\0') snprintf(basename, sizeof(basename), "%s", jid); if (pj != NULL) { /* newly queued job already exists */ if (pj->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING) { /* FAILURE - job exists and is running */ log_err(errno, __func__, (char *)"cannot queue new job, job exists and is running"); req_reject(PBSE_JOBEXIST, 0, preq, NULL, "job is running"); return; } /* if checkpointed, then keep old and skip rest of process */ if (pj->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) { IsCheckpoint = 1; } /* END if (pj->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) */ else { /* reject the job. It is already working here. */ sprintf(log_buffer, "Job already exists. State: %d substate: %d", pj->ji_qs.ji_state, pj->ji_qs.ji_substate); log_err(-1, __func__, log_buffer); sprintf(log_buffer, "Job %s already on mom", pj->ji_qs.ji_jobid); req_reject(PBSE_JOBEXIST, 0, preq, NULL, log_buffer); return; } } /* END if (pj != NULL) */ else { /* if not already here, allocate job struct */ if ((pj = job_alloc()) == NULL) { /* FAILURE */ req_reject(PBSE_MEM_MALLOC, 0, preq, NULL, "cannot allocate new job structure"); return; } } /* END else (pj != NULL) */ if (IsCheckpoint == 0) { strcpy(pj->ji_qs.ji_jobid,jid); strcpy(pj->ji_qs.ji_fileprefix,basename); pj->ji_modified = 1; pj->ji_qs.ji_svrflags = created_here; pj->ji_qs.ji_un_type = JOB_UNION_TYPE_NEW; /* changing the union type overwrites the euid for the job, and if * ji_grpcache is set this potentially allows jobs to run as root. Unsetting * ji_grpcache fixes this problem --dbeer */ if (pj->ji_grpcache != NULL) { free(pj->ji_grpcache); pj->ji_grpcache = NULL; } } /* decode attributes from request into job structure */ psatl = (svrattrl *)GET_NEXT(preq->rq_ind.rq_queuejob.rq_attr); while (psatl != NULL) { if (IsCheckpoint == 1) { if (strcmp(psatl->al_name,ATTR_checkpoint_name) && strcmp(psatl->al_name,ATTR_v)) { psatl = (svrattrl *)GET_NEXT(psatl->al_link); continue; } } /* identify the pbs_attribute by name */ index = find_attr(job_attr_def,psatl->al_name,JOB_ATR_LAST); if (index < 0) { /* FAILURE */ /* didn`t recognize the name */ mom_job_purge(pj); /* CRI - 12/20/2004 */ reply_badattr(PBSE_NOATTR, 1, psatl, preq); return; } pdef = &job_attr_def[index]; /* Is pbs_attribute not writeable by manager or by a server? */ if ((pdef->at_flags & resc_access_perm) == 0) { /* FAILURE */ mom_job_purge(pj); reply_badattr(PBSE_ATTRRO, 1, psatl, preq); return; } /* decode pbs_attribute */ if (!strcmp(psatl->al_name,ATTR_v)) { rc = decode_arst_merge( &pj->ji_wattr[index], psatl->al_name, psatl->al_resc, psatl->al_value); } else { rc = pdef->at_decode( &pj->ji_wattr[index], psatl->al_name, psatl->al_resc, psatl->al_value, resc_access_perm); } if (rc != 0) { /* FAILURE */ /* all errors are fatal for MOM */ mom_job_purge(pj); reply_badattr(rc, 1, psatl, preq); return; } if (psatl->al_op == DFLT) { if (psatl->al_resc) { resource *presc; resource_def *prdef; prdef = find_resc_def(svr_resc_def,psatl->al_resc,svr_resc_size); if (prdef == NULL) { mom_job_purge(pj); reply_badattr(rc, 1, psatl, preq); return; } presc = find_resc_entry(&pj->ji_wattr[index],prdef); if (presc != NULL) presc->rs_value.at_flags |= ATR_VFLAG_DEFLT; } else { pj->ji_wattr[index].at_flags |= ATR_VFLAG_DEFLT; } } /* END if (psatl->al_op == DFLT) */ psatl = (svrattrl *)GET_NEXT(psatl->al_link); } /* END while (psatl != NULL) */ if (IsCheckpoint == 1) { pj->ji_qs.ji_substate = JOB_SUBSTATE_TRANSIN; if (reply_jobid(preq,pj->ji_qs.ji_jobid,BATCH_REPLY_CHOICE_Queue) == 0) { remove_from_job_list(pj); append_link(&svr_newjobs,&pj->ji_alljobs,pj); if (pj->ji_grpcache != NULL) { free(pj->ji_grpcache); pj->ji_grpcache = NULL; } pj->ji_qs.ji_un_type = JOB_UNION_TYPE_NEW; pj->ji_qs.ji_un.ji_newt.ji_fromsock = sock; pj->ji_qs.ji_un.ji_newt.ji_fromaddr = get_connectaddr(sock,FALSE); pj->ji_qs.ji_un.ji_newt.ji_scriptsz = 0; /* Per Eric R., req_mvjobfile was giving error in open_std_file, showed up as fishy error message */ if (pj->ji_grpcache != NULL) { free(pj->ji_grpcache); pj->ji_grpcache = NULL; } } else { close_conn(sock, FALSE); } /* SUCCESS */ return; } /* set remaining job structure elements */ pj->ji_qs.ji_state = JOB_STATE_TRANSIT; pj->ji_qs.ji_substate = JOB_SUBSTATE_TRANSIN; pj->ji_wattr[JOB_ATR_mtime].at_val.at_long = (long)time_now; pj->ji_wattr[JOB_ATR_mtime].at_flags |= ATR_VFLAG_SET; if (pj->ji_grpcache != NULL) { free(pj->ji_grpcache); pj->ji_grpcache = NULL; } pj->ji_qs.ji_un_type = JOB_UNION_TYPE_NEW; pj->ji_qs.ji_un.ji_newt.ji_fromsock = sock; pj->ji_qs.ji_un.ji_newt.ji_fromaddr = get_connectaddr(sock,FALSE); pj->ji_qs.ji_un.ji_newt.ji_scriptsz = 0; /* acknowledge the request with the job id */ if (reply_jobid(preq, pj->ji_qs.ji_jobid, BATCH_REPLY_CHOICE_Queue) != 0) { /* reply failed, purge the job and close the connection */ // call mom_job_purge first so that double-frees don't happen // when the on_close function is called mom_job_purge(pj); close_conn(sock, FALSE); return; } /* link job into server's new jobs list request */ append_link(&svr_newjobs, &pj->ji_alljobs, pj); return; } /* END mom_req_quejob() */
void *mom_process_request( void *sock_num) /* file descriptor (socket) to get request */ { int rc; struct batch_request *request = NULL; int sfds = *(int *)sock_num; struct tcp_chan *chan = NULL; time_now = time(NULL); if ((request = alloc_br(0)) == NULL) { mom_close_client(sfds); return NULL; } request->rq_conn = sfds; if ((chan = DIS_tcp_setup(sfds)) == NULL) { mom_close_client(sfds); free_br(request); return NULL; } /* Read in the request and decode it to the internal request structure. */ rc = dis_request_read(chan, request); if (rc == -1) { /* FAILURE */ /* premature end of file */ mom_close_client(chan->sock); free_br(request); DIS_tcp_cleanup(chan); return NULL; } if ((rc == PBSE_SYSTEM) || (rc == PBSE_INTERNAL) || (rc == PBSE_SOCKET_CLOSE)) { /* FAILURE */ /* read error, likely cannot send reply so just disconnect */ /* ??? not sure about this ??? */ mom_close_client(chan->sock); free_br(request); DIS_tcp_cleanup(chan); return NULL; } if (rc > 0) { /* FAILURE */ /* * request didn't decode, either garbage or unknown * request type, in either case, return reject-reply */ req_reject(rc, 0, request, NULL, "cannot decode message"); mom_close_client(chan->sock); DIS_tcp_cleanup(chan); return NULL; } if (get_connecthost(chan->sock, request->rq_host, PBS_MAXHOSTNAME) != 0) { char tmpLine[MAXLINE]; sprintf(log_buffer, "%s: %lu", pbse_to_txt(PBSE_BADHOST), get_connectaddr(chan->sock,FALSE)); log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_REQUEST, "", log_buffer); snprintf(tmpLine, sizeof(tmpLine), "cannot determine hostname for connection from %lu", get_connectaddr(chan->sock,FALSE)); req_reject(PBSE_BADHOST, 0, request, NULL, tmpLine); mom_close_client(chan->sock); DIS_tcp_cleanup(chan); return NULL; } if (LOGLEVEL >= 1) { sprintf( log_buffer, msg_request, reqtype_to_txt(request->rq_type), request->rq_user, request->rq_host, chan->sock); log_event(PBSEVENT_DEBUG2, PBS_EVENTCLASS_REQUEST, "", log_buffer); } /* is the request from a host acceptable to the server */ { /*extern tree *okclients; */ extern void mom_server_update_receive_time_by_ip(u_long ipaddr, const char *cmd); /* check connecting host against allowed list of ok clients */ if (LOGLEVEL >= 6) { sprintf(log_buffer, "request type %s from host %s received", reqtype_to_txt(request->rq_type), request->rq_host); log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buffer); } if (!AVL_is_in_tree_no_port_compare(svr_conn[chan->sock].cn_addr, 0, okclients)) { sprintf(log_buffer, "request type %s from host %s rejected (host not authorized)", reqtype_to_txt(request->rq_type), request->rq_host); log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buffer); req_reject(PBSE_BADHOST, 0, request, NULL, "request not authorized"); mom_close_client(chan->sock); DIS_tcp_cleanup(chan); return NULL; } if (LOGLEVEL >= 3) { sprintf(log_buffer, "request type %s from host %s allowed", reqtype_to_txt(request->rq_type), request->rq_host); log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buffer); } mom_server_update_receive_time_by_ip(svr_conn[chan->sock].cn_addr, reqtype_to_txt(request->rq_type)); } /* END BLOCK */ request->rq_fromsvr = 1; request->rq_perm = ATR_DFLAG_USRD | ATR_DFLAG_USWR | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR | ATR_DFLAG_MGRD | ATR_DFLAG_MGWR | ATR_DFLAG_SvWR | ATR_DFLAG_MOM; /* * dispatch the request to the correct processing function. * The processing function must call reply_send() to free * the request struture. */ mom_dispatch_request(chan->sock, request); DIS_tcp_cleanup(chan); return NULL; } /* END mom_process_request() */