void req_commit(

  struct batch_request *preq)  /* I */

  {
  job *pj;

  pj = locate_new_job(preq->rq_conn, preq->rq_ind.rq_commit);

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL",
      "committing job");
    }

  if (pj == NULL)
    {
    req_reject(PBSE_UNKJOBID, 0, preq, NULL, NULL);

    return;
    }

  if (pj->ji_qs.ji_substate != JOB_SUBSTATE_TRANSICM)
    {
    log_err(errno, "req_commit", "cannot commit job in unexpected state");

    req_reject(PBSE_IVALREQ, 0, preq, NULL, NULL);

    return;
    }

  /* move job from new job list to "all" job list, set to running state */

  delete_link(&pj->ji_alljobs);

  append_link(&svr_alljobs, &pj->ji_alljobs, pj);

  /*
  ** Set JOB_SVFLG_HERE to indicate that this is Mother Superior.
  */

  pj->ji_qs.ji_svrflags |= JOB_SVFLG_HERE;

  pj->ji_qs.ji_state    = JOB_STATE_RUNNING;
  pj->ji_qs.ji_substate = JOB_SUBSTATE_PRERUN;
  pj->ji_qs.ji_un_type  = JOB_UNION_TYPE_MOM;

  pj->ji_qs.ji_un.ji_momt.ji_svraddr  = get_connectaddr(preq->rq_conn);
  pj->ji_qs.ji_un.ji_momt.ji_exitstat = 0;

#ifdef HAVE_GLITE_LB
  svr_logjobstate(pj, JOB_STATE_RUNNING, JOB_SUBSTATE_PRERUN, preq);
#endif

  /* For MOM - start up the job (blocks) */

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pj->ji_qs.ji_jobid,
      "starting job execution");
    }

  /* force poll of other jobs to prevent memory crashes */

  last_poll_time = 0;

  start_exec(pj);

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pj->ji_qs.ji_jobid,
      "job execution started");
    }

  /* if the start request failed, reply with a failure string */

  if (pj->ji_qs.ji_substate == JOB_SUBSTATE_EXITING)
    {
    char tmpLine[1024];

    if ((pj->ji_hosts != NULL) &&
        (pj->ji_nodekill >= 0) &&
        (pj->ji_hosts[pj->ji_nodekill].hn_host != NULL))
      {
      sprintf(tmpLine, "start failed on node %s",
        pj->ji_hosts[pj->ji_nodekill].hn_host);
      }
    else
      {
      sprintf(tmpLine, "start failed on unknown node");
      }

    if (LOGLEVEL >= 6)
      {
      log_record(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pj->ji_qs.ji_jobid,
        tmpLine);
      }

    reply_text(preq, 0, tmpLine);
    }
  else
    {
    reply_jobid(preq, pj->ji_qs.ji_jobid, BATCH_REPLY_CHOICE_Commit);
    }

  if (is_cloud_job(pj) && (pj->ji_numnodes == 1))
    {
    if (cloud_exec(pj, 1) == 0)
      cloud_set_running(pj);

    if (mom_do_poll(pj) != 0)
      append_link(&mom_polljobs, &pj->ji_jobque, pj);
    }

  job_save(pj, SAVEJOB_FULL);

  /* NOTE: we used to flag JOB_ATR_errpath, JOB_ATR_outpath,
   * JOB_ATR_session_id, and JOB_ATR_altid as modified at this point to make
   * sure pbs_server got these attr values. This worked fine before TORQUE
   * moved the job launch into an async process. At 2.0.0p6, a new attribute
   * "SEND" flag was added to handle this. */

  return;
  }  /* END req_commit() */
/**
 * @brief
 *	que_purge - purge queue from system
 *	The queue is dequeued, the queue file is unlinked.
 *	If the queue contains any jobs, the purge is not allowed.
 *	Eventually the queue is deleted from the database.
 *
 * @param[in]	pque - The pointer to the queue to purge
 *
 * @return	error code
 * @retval	0 - queue purged or queue not valid
 * @retval	PBSE_OBJBUSY - queue deletion not allowed
 */

int que_purge(

  pbs_queue *pque)

  {
  pbs_db_obj_info_t  obj;
  pbs_db_que_info_t  dbque;
  pbs_db_conn_t     *conn = (pbs_db_conn_t *)svr_db_conn;

  /*
   * If the queue (pque) is not valid, then nothing to
   * do, just return 0.
   */

  if (pque == NULL)
    return (0);

  /* are there any jobs still in the queue? */

  if (pque->qu_numjobs != 0)
    {
    /*
     * If the queue still has job(s), check if the SERVER
     * is configured for history info and all the jobs in the
     * queue are history jobs. If yes, then allow queue
     * deletion, otherwise return PBSE_OBJBUSY.
     */

    if (svr_history_enable)
      {
      /* SVR histconf chk */

      job *pjob   = (job *)0;
      job *nxpjob = (job *)0;

      pjob = (job *)GET_NEXT(pque->qu_jobs);

      while (pjob)
        {
        /*
         * If it is not a history job (MOVED/FINISHED), then
         * return with PBSE_OBJBUSY error.
         */

        if ((pjob->ji_qs.ji_state != JOB_STATE_MOVED) &&
            (pjob->ji_qs.ji_state != JOB_STATE_FINISHED))
          return (PBSE_OBJBUSY);

        pjob = (job *)GET_NEXT(pjob->ji_jobque);
        }

      /*
       * All are history jobs, unlink all of them from the queue.
       * Update the number of jobs in the queue and their state
       * count as the queue is going to be purged. No job(s)
       * should point to the queue to be purged, so set the queue
       * header pointer of each job (pjob->ji_qhdr) to NULL.
       */

      pjob = (job *)GET_NEXT(pque->qu_jobs);

      while (pjob)
        {
        nxpjob = (job *)GET_NEXT(pjob->ji_jobque);

        delete_link(&pjob->ji_jobque);

        --pque->qu_numjobs;
        --pque->qu_njstate[pjob->ji_qs.ji_state];

        pjob->ji_qhdr = (pbs_queue *)0;

        pjob = nxpjob;
        }
      }
    else
      {
      return (PBSE_OBJBUSY);
      }
    }

  /* delete queue from database */

  strcpy(dbque.qu_name, pque->qu_qs.qu_name);

  obj.pbs_db_obj_type = PBS_DB_QUEUE;
  obj.pbs_db_un.pbs_db_que = &dbque;

  if (pbs_db_delete_obj(conn, &obj) != 0)
    {
    (void)sprintf(log_buffer,
      "delete of que %s from datastore failed",
      pque->qu_qs.qu_name);
    log_err(errno, "queue_purge", log_buffer);
    }

  que_free(pque);

  return (0);
  }  /* END que_purge() */
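/*
 * A minimal, self-contained sketch (hypothetical types, not part of the
 * PBS sources) of the two-pass pattern que_purge() uses above: first walk
 * the list and verify every element may be removed, touching nothing; only
 * then walk it again and unlink, always saving the successor *before* the
 * current node's links are destroyed.
 */
#include <stdlib.h>

struct node
  {
  int          busy;   /* analogous to a non-history job */
  struct node *next;
  };

/* returns 0 on success, -1 if any element is still busy (nothing changed) */
static int purge_all(struct node **head)
  {
  struct node *n;
  struct node *nx;

  for (n = *head; n != NULL; n = n->next)   /* pass 1: check only */
    if (n->busy)
      return -1;

  for (n = *head; n != NULL; n = nx)        /* pass 2: unlink */
    {
    nx = n->next;                           /* save successor first */
    free(n);
    }

  *head = NULL;

  return 0;
  }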
void req_quejob(

  struct batch_request *preq)  /* ptr to the decoded request */

  {
  char          *id = "req_quejob";
  char           basename[PBS_JOBBASE + 1];
  int            created_here = 0;
  int            index;
  char          *jid;
  attribute_def *pdef;
  job           *pj;
  svrattrl      *psatl;
  int            rc;
  int            sock = preq->rq_conn;
  int            IsCheckpoint = 0;

  /* set basic (user) level access permission */

  resc_access_perm = ATR_DFLAG_USWR | ATR_DFLAG_Creat;

  if (PBSNodeCheckProlog)
    {
    check_state(1);

    mom_server_all_update_stat();

    if (internal_state & INUSE_DOWN)
      {
      req_reject(PBSE_MOMREJECT, 0, preq, NULL, NULL);

      return;
      }
    }

  if (preq->rq_fromsvr)
    {
    /* from another server - accept the extra attributes */

    resc_access_perm |= ATR_DFLAG_MGWR | ATR_DFLAG_SvWR | ATR_DFLAG_MOM;

    jid = preq->rq_ind.rq_queuejob.rq_jid;
    }
  else
    {
    /* request must be from server */

    log_err(errno, id, "request not from server");

    req_reject(PBSE_IVALREQ, 0, preq, NULL, "request not received from server");

    return;
    }

  /* does the job already exist? check both old and new jobs */

  if ((pj = find_job(jid)) == NULL)
    {
    pj = (job *)GET_NEXT(svr_newjobs);

    while (pj != NULL)
      {
      if (!strcmp(pj->ji_qs.ji_jobid, jid))
        break;

      pj = (job *)GET_NEXT(pj->ji_alljobs);
      }
    }

  /*
   * New job ...
   *
   * for MOM - rather than make up a hashname, we use the name sent
   * to us by the server as an attribute.
   */

  psatl = (svrattrl *)GET_NEXT(preq->rq_ind.rq_queuejob.rq_attr);

  while (psatl != NULL)
    {
    if (!strcmp(psatl->al_name, ATTR_hashname))
      {
      strcpy(basename, psatl->al_value);

      break;
      }

    psatl = (svrattrl *)GET_NEXT(psatl->al_link);
    }

  if (pj != NULL)
    {
    /* newly queued job already exists */

    if (pj->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING)
      {
      /* FAILURE - job exists and is running */

      log_err(errno, id, "cannot queue new job, job exists and is running");

      req_reject(PBSE_JOBEXIST, 0, preq, NULL, "job is running");

      return;
      }

    /* if checkpointed, then keep the old job and skip the rest of the process */

    if (pj->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE)
      {
      IsCheckpoint = 1;
      }  /* END if (pj->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) */
    else
      {
      /* unlink job from svr_alljobs since it will be placed on newjobs */

      delete_link(&pj->ji_alljobs);
      }
    }    /* END if (pj != NULL) */
  else
    {
    /* if not already here, allocate job struct */

    if ((pj = job_alloc()) == NULL)
      {
      /* FAILURE */

      req_reject(PBSE_SYSTEM, 0, preq, NULL, "cannot allocate new job structure");

      return;
      }
    }  /* END else (pj != NULL) */

  if (IsCheckpoint == 0)
    {
    strcpy(pj->ji_qs.ji_jobid, jid);
    strcpy(pj->ji_qs.ji_fileprefix, basename);

    pj->ji_modified       = 1;
    pj->ji_qs.ji_svrflags = created_here;
    pj->ji_qs.ji_un_type  = JOB_UNION_TYPE_NEW;
    }

  /* decode attributes from request into job structure */

  psatl = (svrattrl *)GET_NEXT(preq->rq_ind.rq_queuejob.rq_attr);

  while (psatl != NULL)
    {
    if (IsCheckpoint == 1)
      {
      /* for a checkpoint restart, only accept the checkpoint name
       * and the variable list */

      if (strcmp(psatl->al_name, ATTR_checkpoint_name) &&
          strcmp(psatl->al_name, ATTR_v))
        {
        psatl = (svrattrl *)GET_NEXT(psatl->al_link);

        continue;
        }
      }

    /* identify the attribute by name */

    index = find_attr(job_attr_def, psatl->al_name, JOB_ATR_LAST);

    if (index < 0)
      {
      /* FAILURE */

#if 0
      /* old implementation: refuse jobs with unknown attributes */

      /* didn't recognize the name */

      job_purge(pj);  /* CRI - 12/20/2004 */

      reply_badattr(PBSE_NOATTR, 1, psatl, preq);

      return;
#endif

      /* new implementation: ignore unknown attributes */

      sprintf(log_buffer, "Unknown attribute \"%s\" received in req_quejob request",
        psatl->al_name);

      LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pj->ji_qs.ji_jobid, log_buffer);

      psatl = (svrattrl *)GET_NEXT(psatl->al_link);

      continue;
      }

    pdef = &job_attr_def[index];

    /* is the attribute not writeable by manager or by a server? */

    if ((pdef->at_flags & resc_access_perm) == 0)
      {
      /* FAILURE */

      job_purge(pj);

      reply_badattr(PBSE_ATTRRO, 1, psatl, preq);

      return;
      }

    /* decode attribute */

    if (!strcmp(psatl->al_name, ATTR_v))
      {
      rc = decode_arst_merge(
             &pj->ji_wattr[index],
             psatl->al_name,
             psatl->al_resc,
             psatl->al_value);
      }
    else
      {
      rc = pdef->at_decode(
             &pj->ji_wattr[index],
             psatl->al_name,
             psatl->al_resc,
             psatl->al_value);
      }

    if (rc != 0)
      {
      /* FAILURE - all errors other than unknown resources are fatal for MOM */

      if (rc != PBSE_UNKRESC)
        {
        job_purge(pj);

        reply_badattr(rc, 1, psatl, preq);

        return;
        }

      psatl = (svrattrl *)GET_NEXT(psatl->al_link);

      continue;
      }

    if (psatl->al_op == DFLT)
      {
      if (psatl->al_resc)
        {
#if 0
        /* don't mark resources as default on nodes; we need all
         * resources stored in the jobfile */

        resource     *presc;
        resource_def *prdef;

        prdef = find_resc_def(svr_resc_def, psatl->al_resc, svr_resc_size);

        if (prdef == NULL)
          {
          job_purge(pj);

          reply_badattr(rc, 1, psatl, preq);

          return;
          }

        presc = find_resc_entry(&pj->ji_wattr[index], prdef);

        if (presc != NULL)
          presc->rs_value.at_flags |= ATR_VFLAG_DEFLT;
#endif
        }
      else
        {
        pj->ji_wattr[index].at_flags |= ATR_VFLAG_DEFLT;
        }
      }  /* END if (psatl->al_op == DFLT) */

    psatl = (svrattrl *)GET_NEXT(psatl->al_link);
    }    /* END while (psatl != NULL) */

  if (IsCheckpoint == 1)
    {
    pj->ji_qs.ji_substate = JOB_SUBSTATE_TRANSIN;

    if (reply_jobid(preq, pj->ji_qs.ji_jobid, BATCH_REPLY_CHOICE_Queue) == 0)
      {
      delete_link(&pj->ji_alljobs);

      append_link(&svr_newjobs, &pj->ji_alljobs, pj);

      pj->ji_qs.ji_un_type = JOB_UNION_TYPE_NEW;
      pj->ji_qs.ji_un.ji_newt.ji_fromsock = sock;
      pj->ji_qs.ji_un.ji_newt.ji_fromaddr = get_connectaddr(sock);
      pj->ji_qs.ji_un.ji_newt.ji_scriptsz = 0;

      /* Per Eric R., req_mvjobfile was giving an error in open_std_file,
       * which showed up as a fishy error message */

      if (pj->ji_grpcache != NULL)
        {
        free(pj->ji_grpcache);

        pj->ji_grpcache = NULL;
        }
      }
    else
      {
      close_conn(sock);
      }

    /* SUCCESS */

    return;
    }

  /* set remaining job structure elements */

  pj->ji_qs.ji_state    = JOB_STATE_TRANSIT;
  pj->ji_qs.ji_substate = JOB_SUBSTATE_TRANSIN;

  pj->ji_wattr[(int)JOB_ATR_mtime].at_val.at_long = (long)time_now;
  pj->ji_wattr[(int)JOB_ATR_mtime].at_flags |= ATR_VFLAG_SET;

  pj->ji_qs.ji_un_type = JOB_UNION_TYPE_NEW;
  pj->ji_qs.ji_un.ji_newt.ji_fromsock = sock;
  pj->ji_qs.ji_un.ji_newt.ji_fromaddr = get_connectaddr(sock);
  pj->ji_qs.ji_un.ji_newt.ji_scriptsz = 0;

  /* acknowledge the request with the job id */

  if (reply_jobid(preq, pj->ji_qs.ji_jobid, BATCH_REPLY_CHOICE_Queue) != 0)
    {
    /* reply failed; purge the job and close the connection */

    close_conn(sock);

    job_purge(pj);

    return;
    }

  /* link job into server's new jobs list */

  append_link(&svr_newjobs, &pj->ji_alljobs, pj);

  return;
  }  /* END req_quejob() */
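/*
 * A minimal, self-contained sketch (hypothetical names, not PBS code) of
 * the table-driven decode pattern req_quejob() relies on: attributes are
 * identified by a linear name lookup in a definition table, and each
 * definition carries a decode function pointer plus permission flags that
 * are checked against the requester's access level before the value is
 * accepted.
 */
#include <stdio.h>
#include <string.h>

#define DFLAG_USWR 0x01   /* writable by user */
#define DFLAG_MGWR 0x02   /* writable by manager only */

typedef struct attr_def
  {
  const char *name;
  int         flags;
  int       (*decode)(const char *value);  /* returns 0 on success */
  } attr_def;

static int decode_str(const char *v)  { printf("str: %s\n", v); return 0; }
static int decode_long(const char *v) { long l; return (sscanf(v, "%ld", &l) == 1) ? 0 : -1; }

static attr_def table[] =
  {
  { "jobname",  DFLAG_USWR, decode_str  },
  { "priority", DFLAG_MGWR, decode_long },
  };

/* analogue of find_attr(): linear search, -1 when the name is unknown */
static int find_def(const char *name)
  {
  for (size_t i = 0; i < sizeof(table) / sizeof(table[0]); i++)
    if (strcmp(table[i].name, name) == 0)
      return (int)i;

  return -1;
  }

int main(void)
  {
  int perm = DFLAG_USWR;            /* basic user-level access */
  int idx  = find_def("jobname");

  if ((idx >= 0) && (table[idx].flags & perm))
    return table[idx].decode("myjob");

  return 1;  /* unknown attribute or permission denied */
  }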
int acct_job(

  job            *pjob,  /* I */
  dynamic_string *ds)    /* O */

  {
  int         rc;
  long        cray_enabled = FALSE;
  int         resc_access_perm = READ_ONLY;
  char        local_buf[MAXLINE * 4];
  pbs_queue  *pque;

  tlist_head  attrlist;
  svrattrl   *pal;

  if (pjob == NULL)
    {
    return(PBSE_NONE);
    }

  CLEAR_HEAD(attrlist);

  /* user */

  /* acct_job is only called from account_jobstr and account_jobend.
   * The buffer should be PBS_ACCT_MAX_RCD + 1 in size. */

  sprintf(local_buf, "user=%s ", pjob->ji_wattr[JOB_ATR_euser].at_val.at_str);

  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* group */

  sprintf(local_buf, "group=%s ", pjob->ji_wattr[JOB_ATR_egroup].at_val.at_str);

  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* account */

  if (pjob->ji_wattr[JOB_ATR_account].at_flags & ATR_VFLAG_SET)
    {
    sprintf(local_buf, "account=%s ",
      pjob->ji_wattr[JOB_ATR_account].at_val.at_str);

    if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
      return(rc);
    }

  /* job name */

  sprintf(local_buf, "jobname=%s ",
    pjob->ji_wattr[JOB_ATR_jobname].at_val.at_str);

  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  if ((pque = get_jobs_queue(&pjob)) != NULL)
    {
    /* queue name */

    sprintf(local_buf, "queue=%s ", pque->qu_qs.qu_name);

    unlock_queue(pque, __func__, NULL, LOGLEVEL);

    if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
      return(rc);
    }
  else if (pjob == NULL)
    {
    log_err(PBSE_JOBNOTFOUND, __func__, "Job lost while acquiring queue 1");

    return(PBSE_JOBNOTFOUND);
    }

  /* create time */

  sprintf(local_buf, "ctime=%ld ",
    pjob->ji_wattr[JOB_ATR_ctime].at_val.at_long);

  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* queued time */

  sprintf(local_buf, "qtime=%ld ",
    pjob->ji_wattr[JOB_ATR_qtime].at_val.at_long);

  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* eligible time - how long the job has been ready to run */

  sprintf(local_buf, "etime=%ld ",
    pjob->ji_wattr[JOB_ATR_etime].at_val.at_long);

  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* execution start time */

  sprintf(local_buf, "start=%ld ", (long)pjob->ji_qs.ji_stime);

  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* owner */

  sprintf(local_buf, "owner=%s ",
    pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str);

  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* For large clusters these strings can get pretty long, so append
   * them through the dynamic string rather than a fixed buffer */

  /* execution host name */

  append_dynamic_string(ds, "exec_host=");
  append_dynamic_string(ds, pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str);

  if ((rc = append_dynamic_string(ds, " ")) != PBSE_NONE)
    return(rc);

  get_svr_attr_l(SRV_ATR_CrayEnabled, &cray_enabled);

  if ((cray_enabled == TRUE) &&
      (pjob->ji_wattr[JOB_ATR_login_node_id].at_flags & ATR_VFLAG_SET))
    {
    append_dynamic_string(ds, "login_node=");
    append_dynamic_string(ds, pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str);

    if ((rc = append_dynamic_string(ds, " ")) != PBSE_NONE)
      return(rc);
    }

  /* now encode the job's resource_list pbs_attribute */

  job_attr_def[JOB_ATR_resource].at_encode(
    &pjob->ji_wattr[JOB_ATR_resource],
    &attrlist,
    job_attr_def[JOB_ATR_resource].at_name,
    NULL,
    ATR_ENCODE_CLIENT,
    resc_access_perm);

  while ((pal = GET_NEXT(attrlist)) != NULL)
    {
    /* exec_host can use a lot of buffer space; use the dynamic string */

    append_dynamic_string(ds, pal->al_name);

    if (pal->al_resc != NULL)
      {
      append_dynamic_string(ds, ".");
      append_dynamic_string(ds, pal->al_resc);
      }

    append_dynamic_string(ds, "=");
    append_dynamic_string(ds, pal->al_value);

    if ((rc = append_dynamic_string(ds, " ")) != PBSE_NONE)
      return(rc);

    delete_link(&pal->al_link);

    free(pal);
    }  /* END while (pal != NULL) */

#ifdef ATTR_X_ACCT

  /* x attributes */

  if (pjob->ji_wattr[JOB_SITE_ATR_x].at_flags & ATR_VFLAG_SET)
    {
    sprintf(local_buf, "x=%s ", pjob->ji_wattr[JOB_SITE_ATR_x].at_val.at_str);

    if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
      return(rc);
    }

#endif

  /* SUCCESS */

  return(PBSE_NONE);
  }  /* END acct_job() */
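/*
 * A minimal, self-contained sketch (hypothetical implementation, not the
 * TORQUE dynamic_string API) of the pattern acct_job() follows: build a
 * space-separated "key=value" accounting record in a growable buffer,
 * propagating an error code from every append instead of trusting a
 * fixed-size buffer to be large enough.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct dstr { char *buf; size_t len; size_t cap; } dstr;

static int dstr_append(dstr *d, const char *s)
  {
  size_t slen = strlen(s);
  size_t need = d->len + slen + 1;

  if (need > d->cap)
    {
    size_t ncap = (d->cap ? d->cap * 2 : 64);

    while (ncap < need)
      ncap *= 2;

    char *nb = realloc(d->buf, ncap);

    if (nb == NULL)
      return -1;                      /* allocation failure propagates up */

    d->buf = nb;
    d->cap = ncap;
    }

  memcpy(d->buf + d->len, s, slen + 1);
  d->len += slen;

  return 0;
  }

static int append_kv(dstr *d, const char *key, const char *val)
  {
  /* "key=val " - the trailing space separates record fields */
  return (dstr_append(d, key) || dstr_append(d, "=") ||
          dstr_append(d, val) || dstr_append(d, " ")) ? -1 : 0;
  }

int main(void)
  {
  dstr d = {0};

  if (append_kv(&d, "user", "alice") || append_kv(&d, "queue", "batch"))
    return 1;

  puts(d.buf);  /* -> "user=alice queue=batch " */

  free(d.buf);

  return 0;
  }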
void req_commit(

  struct batch_request *preq)  /* I */

  {
  unsigned int  momport = 0;
  int           rc;
  job          *pj = locate_new_job(preq->rq_conn, preq->rq_ind.rq_commit);

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL",
      "committing job");
    }

  if (pj == NULL)
    {
    req_reject(PBSE_UNKJOBID, 0, preq, NULL, NULL);

    return;
    }

  if (pj->ji_qs.ji_substate != JOB_SUBSTATE_TRANSICM)
    {
    log_err(errno, "req_commit", (char *)"cannot commit job in unexpected state");

    req_reject(PBSE_IVALREQ, 0, preq, NULL, NULL);

    return;
    }

  /* move job from new job list to "all" job list, set to running state */

  delete_link(&pj->ji_alljobs);

  append_link(&svr_alljobs, &pj->ji_alljobs, pj);

  /*
  ** Set JOB_SVFLG_HERE to indicate that this is Mother Superior.
  */

  pj->ji_qs.ji_svrflags |= JOB_SVFLG_HERE;

  pj->ji_qs.ji_state    = JOB_STATE_RUNNING;
  pj->ji_qs.ji_substate = JOB_SUBSTATE_PRERUN;
  pj->ji_qs.ji_un_type  = JOB_UNION_TYPE_MOM;

  pj->ji_qs.ji_un.ji_momt.ji_svraddr  = get_connectaddr(preq->rq_conn, FALSE);
  pj->ji_qs.ji_un.ji_momt.ji_exitstat = 0;

  /* For MOM - start up the job (blocks) */

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pj->ji_qs.ji_jobid,
      "req_commit:starting job execution");
    }

  rc = start_exec(pj);

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pj->ji_qs.ji_jobid,
      "req_commit:job execution started");
    }

  /* if the start request failed, reply with a failure string */

  if (pj->ji_qs.ji_substate == JOB_SUBSTATE_EXITING)
    {
    char tmpLine[1024];

    if ((pj->ji_hosts != NULL) &&
        (pj->ji_nodekill >= 0) &&
        (pj->ji_hosts[pj->ji_nodekill].hn_host != NULL))
      {
      sprintf(tmpLine, "start failed on node %s",
        pj->ji_hosts[pj->ji_nodekill].hn_host);
      }
    else
      {
      sprintf(tmpLine, "start failed on unknown node");
      }

    if (LOGLEVEL >= 6)
      {
      log_record(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pj->ji_qs.ji_jobid,
        tmpLine);
      }

    reply_text(preq, rc, tmpLine);
    }
  else
    {
    reply_sid(preq, pj->ji_wattr[JOB_ATR_session_id].at_val.at_long,
      BATCH_REPLY_CHOICE_Text);
    }

  if (multi_mom)
    {
    momport = pbs_rm_port;
    }

  job_save(pj, SAVEJOB_FULL, momport);

#ifdef NVIDIA_GPUS

  /*
   * Does this job have a gpuid assigned?
   * If so, update the gpu status.
   */

  if ((use_nvidia_gpu) &&
      ((pj->ji_wattr[JOB_ATR_exec_gpus].at_flags & ATR_VFLAG_SET) != 0) &&
      (pj->ji_wattr[JOB_ATR_exec_gpus].at_val.at_str != NULL))
    {
    send_update_soon();
    }

#endif  /* NVIDIA_GPUS */

  /* NOTE: we used to flag JOB_ATR_errpath, JOB_ATR_outpath,
   * JOB_ATR_session_id, and JOB_ATR_altid as modified at this point to make
   * sure pbs_server got these attr values. This worked fine before TORQUE
   * moved the job launch into an async process. At 2.0.0p6, a new
   * pbs_attribute "SEND" flag was added to handle this. */

  return;
  }  /* END req_commit() */
void free_br(

  struct batch_request *preq)

  {
  delete_link(&preq->rq_link);

  reply_free(&preq->rq_reply);

  if (preq->rq_extend)
    free(preq->rq_extend);

  switch (preq->rq_type)
    {
    case PBS_BATCH_QueueJob:

      free_attrlist(&preq->rq_ind.rq_queuejob.rq_attr);

      break;

    case PBS_BATCH_JobCred:

      if (preq->rq_ind.rq_jobcred.rq_data)
        free(preq->rq_ind.rq_jobcred.rq_data);

      break;

    case PBS_BATCH_MvJobFile:
    case PBS_BATCH_jobscript:

      if (preq->rq_ind.rq_jobfile.rq_data)
        free(preq->rq_ind.rq_jobfile.rq_data);

      break;

    case PBS_BATCH_HoldJob:

      freebr_manage(&preq->rq_ind.rq_hold.rq_orig);

      break;

    case PBS_BATCH_CheckpointJob:

      freebr_manage(&preq->rq_ind.rq_manager);

      break;

    case PBS_BATCH_MessJob:

      if (preq->rq_ind.rq_message.rq_text)
        free(preq->rq_ind.rq_message.rq_text);

      break;

    case PBS_BATCH_ModifyJob:
    case PBS_BATCH_AsyModifyJob:

      freebr_manage(&preq->rq_ind.rq_modify);

      break;

    case PBS_BATCH_StatusJob:
    case PBS_BATCH_StatusQue:
    case PBS_BATCH_StatusNode:
    case PBS_BATCH_StatusSvr:

      /* DIAGTODO: handle PBS_BATCH_StatusDiag */

      free_attrlist(&preq->rq_ind.rq_status.rq_attr);

      break;

    case PBS_BATCH_JobObit:

      free_attrlist(&preq->rq_ind.rq_jobobit.rq_attr);

      break;

    case PBS_BATCH_CopyFiles:
    case PBS_BATCH_DelFiles:

      freebr_cpyfile(&preq->rq_ind.rq_cpyfile);

      break;

    default:

      /* NO-OP */

      break;
    }  /* END switch (preq->rq_type) */

  free(preq);

  return;
  }  /* END free_br() */
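/*
 * A minimal, self-contained sketch (hypothetical types) of the tagged-union
 * cleanup pattern free_br() implements: the request type field selects which
 * member of the union owns heap data, so the destructor must switch on the
 * tag before freeing, and unknown tags must fall through as no-ops.
 */
#include <stdlib.h>

enum req_type { REQ_SCRIPT, REQ_MESSAGE, REQ_NOOP };

struct request
  {
  enum req_type type;   /* discriminant: selects the live union member */

  union
    {
    char *script;       /* owned when type == REQ_SCRIPT */
    char *text;         /* owned when type == REQ_MESSAGE */
    } u;
  };

static void request_free(struct request *r)
  {
  switch (r->type)
    {
    case REQ_SCRIPT:

      free(r->u.script);

      break;

    case REQ_MESSAGE:

      free(r->u.text);

      break;

    default:

      /* NO-OP: nothing heap-allocated for this tag */

      break;
    }

  free(r);  /* the container itself is freed last */
  }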
int save_attr(

  struct attribute_def *padef,            /* pbs_attribute definition array */
  pbs_attribute        *pattr,            /* ptr to pbs_attribute value array */
  int                   numattr,          /* number of attributes in array */
  int                   fds,
  char                 *buf_ptr,          /* M */
  size_t               *space_remaining,  /* O */
  size_t                buf_size)         /* I */

  {
  svrattrl   dummy;
  int        errct = 0;
  tlist_head lhead;
  int        i;
  int        resc_access_perm = ATR_DFLAG_ACCESS;
  svrattrl  *pal;
  int        rc;

  /* encode each pbs_attribute which has a value (not non-set) */

  CLEAR_HEAD(lhead);

  for (i = 0; i < numattr; i++)
    {
    if ((padef + i)->at_type != ATR_TYPE_ACL)
      {
      /* NOTE: access lists are not saved this way */

      rc = (padef + i)->at_encode(
             pattr + i,
             &lhead,
             (padef + i)->at_name,
             NULL,
             ATR_ENCODE_SAVE,
             resc_access_perm);

      if (rc < 0)
        errct++;

      (pattr + i)->at_flags &= ~ATR_VFLAG_MODIFY;

      /* now that it has been encoded, block and save it */

      while ((pal = (svrattrl *)GET_NEXT(lhead)) != NULL)
        {
        if (save_struct((char *)pal, pal->al_tsize, fds, buf_ptr,
              space_remaining, buf_size) < 0)
          errct++;

        delete_link(&pal->al_link);

        free(pal);
        }
      }
    }  /* END for (i) */

  /* indicate last of attributes by writing dummy entry */

  memset(&dummy, 0, sizeof(dummy));

  dummy.al_tsize = ENDATTRIBUTES;

  if (save_struct((char *)&dummy, sizeof(dummy), fds, buf_ptr,
        space_remaining, buf_size) < 0)
    errct++;

  if (errct != 0)
    {
    return(-1);
    }

  /* SUCCESS */

  return(0);
  }  /* END save_attr() */
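/*
 * A minimal, self-contained sketch (hypothetical record layout, not the
 * TORQUE on-disk format) of the sentinel-record technique save_attr() uses:
 * variable-size records are written one after another, each carrying its own
 * size, and the stream is terminated by a dummy record whose size field
 * holds a reserved marker so the reader knows where to stop without a
 * stored count.
 */
#include <stdio.h>
#include <string.h>

#define END_MARKER 0u  /* reserved size value meaning "no more records" */

struct rec_hdr
  {
  unsigned size;       /* payload bytes following the header */
  };

static int write_rec(FILE *f, const char *payload)
  {
  struct rec_hdr h;

  h.size = (unsigned)strlen(payload);

  if (fwrite(&h, sizeof(h), 1, f) != 1)
    return -1;

  if ((h.size != 0) && (fwrite(payload, 1, h.size, f) != h.size))
    return -1;

  return 0;
  }

int main(void)
  {
  FILE *f = tmpfile();
  struct rec_hdr h;
  char buf[256];

  if (f == NULL)
    return 1;

  write_rec(f, "user=alice");
  write_rec(f, "queue=batch");

  h.size = END_MARKER;                    /* the dummy sentinel entry */
  fwrite(&h, sizeof(h), 1, f);

  rewind(f);

  /* reader: stop at the sentinel record */
  while ((fread(&h, sizeof(h), 1, f) == 1) && (h.size != END_MARKER))
    {
    if ((h.size >= sizeof(buf)) || (fread(buf, 1, h.size, f) != h.size))
      return 1;

    buf[h.size] = '\0';
    puts(buf);
    }

  return 0;
  }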
/* delete a job array struct from memory and disk. This is used when the
 * number of jobs that belong to the array becomes zero.
 * Returns zero if there are no errors, non-zero otherwise. */

int array_delete(

  job_array *pa)

  {
  int                      i;
  char                     path[MAXPATHLEN + 1];
  char                     log_buf[LOCAL_LOG_BUF_SIZE];
  array_request_node      *rn;
  struct array_depend     *pdep;
  struct array_depend_job *pdj;

  /* first thing to do is take this out of the server's list of all arrays */

  remove_array(pa);

  /* unlock the mutex and free it */

  unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

  free(pa->ai_mutex);

  /* delete the on-disk copy of the struct */

  snprintf(path, sizeof(path), "%s%s%s",
    path_arrays, pa->ai_qs.fileprefix, ARRAY_FILE_SUFFIX);

  if (unlink(path))
    {
    sprintf(log_buf, "unable to delete %s", path);
    log_err(errno, "array_delete", log_buf);
    }

  /* clear array request linked list */

  for (rn = (array_request_node *)GET_NEXT(pa->request_tokens);
       rn != NULL;
       rn = (array_request_node *)GET_NEXT(pa->request_tokens))
    {
    delete_link(&rn->request_tokens_link);
    free(rn);
    }

  /* free the memory for the job pointers */

  for (i = 0; i < pa->ai_qs.array_size; i++)
    {
    if (pa->job_ids[i] != NULL)
      free(pa->job_ids[i]);
    }

  free(pa->job_ids);

  /* free the dependencies, if any */

  for (pdep = (struct array_depend *)GET_NEXT(pa->ai_qs.deps);
       pdep != NULL;
       pdep = (struct array_depend *)GET_NEXT(pa->ai_qs.deps))
    {
    delete_link(&pdep->dp_link);

    for (pdj = (struct array_depend_job *)GET_NEXT(pdep->dp_jobs);
         pdj != NULL;
         pdj = (struct array_depend_job *)GET_NEXT(pdep->dp_jobs))
      {
      delete_link(&pdj->dc_link);
      free(pdj);
      }

    free(pdep);
    }

  /* purge the "template" job; this also deletes the shared script file
   * for the array */

  if (pa->ai_qs.parent_id[0] != '\0')
    {
    job *pjob;

    if ((pjob = svr_find_job(pa->ai_qs.parent_id, FALSE)) != NULL)
      svr_job_purge(pjob);
    }

  /* free the memory allocated for the struct */

  free(pa);

  return(PBSE_NONE);
  }  /* END array_delete() */
/* array_recov reads in an array struct saved to disk
 * and inserts it into the server's list of arrays */

int array_recov(

  char       *path,
  job_array **new_pa)

  {
  job_array          *pa;
  array_request_node *rn;
  char                log_buf[LOCAL_LOG_BUF_SIZE];
  int                 fd;
  int                 old_version;
  int                 num_tokens;
  int                 i;
  int                 len;
  int                 rc;

  *new_pa = NULL;

  old_version = ARRAY_QS_STRUCT_VERSION;

  /* allocate the storage for the struct */

  pa = (job_array *)calloc(1, sizeof(job_array));

  if (pa == NULL)
    {
    return(PBSE_SYSTEM);
    }

  /* initialize the linked list nodes */

  CLEAR_HEAD(pa->request_tokens);

  fd = open(path, O_RDONLY, 0);

  if (fd < 0)
    {
    free(pa);

    return(PBSE_SYSTEM);
    }

  if (array_259_upgrade)
    {
    rc = read_and_convert_259_array(fd, pa, path);

    if (rc != PBSE_NONE)
      {
      free(pa);
      close(fd);

      return(rc);
      }
    }
  else
    {
    /* read the file into the struct previously allocated */

    len = read_ac_socket(fd, &(pa->ai_qs), sizeof(pa->ai_qs));

    if ((len < 0) ||
        ((len < (int)sizeof(pa->ai_qs)) &&
         (pa->ai_qs.struct_version == ARRAY_QS_STRUCT_VERSION)))
      {
      sprintf(log_buf, "error reading %s", path);
      log_err(errno, __func__, log_buf);

      free(pa);
      close(fd);

      return(PBSE_SYSTEM);
      }

    if (pa->ai_qs.struct_version != ARRAY_QS_STRUCT_VERSION)
      {
      rc = array_upgrade(pa, fd, pa->ai_qs.struct_version, &old_version);

      if (rc)
        {
        sprintf(log_buf, "Cannot upgrade array version %d to %d",
          pa->ai_qs.struct_version, ARRAY_QS_STRUCT_VERSION);
        log_err(errno, __func__, log_buf);

        free(pa);
        close(fd);

        return(rc);
        }
      }
    }

  pa->job_ids = (char **)calloc(pa->ai_qs.array_size, sizeof(char *));

  /* check to see if there is any additional info saved in the array file:
   * are there array request tokens that haven't been fully processed? */

  if (old_version > 1)
    {
    if (read_ac_socket(fd, &num_tokens, sizeof(int)) != sizeof(int))
      {
      sprintf(log_buf, "error reading token count from %s", path);
      log_err(errno, __func__, log_buf);

      free(pa);
      close(fd);

      return(PBSE_SYSTEM);
      }

    for (i = 0; i < num_tokens; i++)
      {
      rn = (array_request_node *)calloc(1, sizeof(array_request_node));

      if (read_ac_socket(fd, rn, sizeof(array_request_node)) != sizeof(array_request_node))
        {
        sprintf(log_buf, "error reading array_request_node from %s", path);
        log_err(errno, __func__, log_buf);

        free(rn);

        for (rn = (array_request_node *)GET_NEXT(pa->request_tokens);
             rn != NULL;
             rn = (array_request_node *)GET_NEXT(pa->request_tokens))
          {
          delete_link(&rn->request_tokens_link);
          free(rn);
          }

        free(pa);
        close(fd);

        return(PBSE_SYSTEM);
        }

      CLEAR_LINK(rn->request_tokens_link);

      append_link(&pa->request_tokens, &rn->request_tokens_link, (void *)rn);
      }
    }

  close(fd);

  CLEAR_HEAD(pa->ai_qs.deps);

  if (old_version != ARRAY_QS_STRUCT_VERSION)
    {
    /* resave the array struct if the version on disk is older than current */

    array_save(pa);
    }

  pa->ai_mutex = (pthread_mutex_t *)calloc(1, sizeof(pthread_mutex_t));

  pthread_mutex_init(pa->ai_mutex, NULL);

  lock_ai_mutex(pa, __func__, NULL, LOGLEVEL);

  /* link the struct into the server's list of job arrays */

  insert_array(pa);

  *new_pa = pa;

  return(PBSE_NONE);
  }  /* END array_recov() */
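/*
 * A minimal, self-contained sketch (hypothetical on-disk layout) of the
 * versioned-recovery technique array_recov() depends on: the saved struct
 * begins with a version field, so the reader can detect an older layout,
 * convert it in place before using it, and signal the caller to re-save
 * in the current format.
 */
#include <stdio.h>
#include <string.h>

#define CUR_VERSION 2

struct state_v1 { int version; int size; };                 /* old layout */
struct state    { int version; int size; char name[32]; };  /* current layout */

/* read a saved struct; returns 0 (current), 1 (upgraded, re-save), -1 (error) */
static int recover(FILE *f, struct state *out)
  {
  int version;

  /* peek at the leading version field without committing to a layout */
  if (fread(&version, sizeof(version), 1, f) != 1)
    return -1;

  rewind(f);

  if (version == CUR_VERSION)
    return (fread(out, sizeof(*out), 1, f) == 1) ? 0 : -1;

  if (version == 1)
    {
    struct state_v1 old;

    if (fread(&old, sizeof(old), 1, f) != 1)
      return -1;

    /* upgrade: carry old fields over, default the new ones */
    memset(out, 0, sizeof(*out));
    out->version = CUR_VERSION;
    out->size    = old.size;
    strcpy(out->name, "unnamed");

    return 1;  /* caller should re-save in the new format */
    }

  return -1;   /* unknown version: refuse rather than guess */
  }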
void mom_job_purge(

  job *pjob)  /* I (modified) */

  {
  job_file_delete_info *jfdi;

  jfdi = (job_file_delete_info *)calloc(1, sizeof(job_file_delete_info));

  if (jfdi == NULL)
    {
    log_err(ENOMEM, __func__, (char *)"No space to allocate info for job file deletion");

    return;
    }

#ifdef NVIDIA_GPUS

  /*
   * Did this job have a gpuid assigned?
   * If so, update the gpu status.
   */

  if (((pjob->ji_wattr[JOB_ATR_exec_gpus].at_flags & ATR_VFLAG_SET) != 0) &&
      (pjob->ji_wattr[JOB_ATR_exec_gpus].at_val.at_str != NULL))
    {
    send_update_soon();
    }

#endif  /* NVIDIA_GPUS */

  /* initialize struct information */

  if (pjob->ji_flags & MOM_HAS_TMPDIR)
    {
    jfdi->has_temp_dir = TRUE;

    pjob->ji_flags &= ~MOM_HAS_TMPDIR;
    }
  else
    jfdi->has_temp_dir = FALSE;

  strcpy(jfdi->jobid, pjob->ji_qs.ji_jobid);
  strcpy(jfdi->prefix, pjob->ji_qs.ji_fileprefix);

  if ((pjob->ji_wattr[JOB_ATR_checkpoint_dir].at_flags & ATR_VFLAG_SET) &&
      (pjob->ji_wattr[JOB_ATR_checkpoint_name].at_flags & ATR_VFLAG_SET))
    jfdi->checkpoint_dir = strdup(pjob->ji_wattr[JOB_ATR_checkpoint_dir].at_val.at_str);

  jfdi->gid = pjob->ji_qs.ji_un.ji_momt.ji_exgid;
  jfdi->uid = pjob->ji_qs.ji_un.ji_momt.ji_exuid;

  if (thread_unlink_calls == TRUE)
    enqueue_threadpool_request(delete_job_files, jfdi);
  else
    delete_job_files(jfdi);

  /* remove this job from the global queue */

  delete_link(&pjob->ji_jobque);
  delete_link(&pjob->ji_alljobs);

  if (LOGLEVEL >= 6)
    {
    sprintf(log_buffer, "removing job");

    log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);
    }

#if IBM_SP2==2  /* IBM SP PSSP 3.1 */
  unload_sp_switch(pjob);
#endif  /* IBM SP */

  mom_job_free(pjob);

  /* if no jobs are left, check whether MOM should be restarted */

  if (((job *)GET_NEXT(svr_alljobs)) == NULL)
    MOMCheckRestart();

  return;
  }  /* END mom_job_purge() */
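/*
 * A minimal, self-contained sketch (hypothetical names, not the TORQUE
 * threadpool API) of the ownership-transfer pattern mom_job_purge() uses:
 * everything the deferred work needs is copied into a heap-allocated
 * descriptor first, so the job struct itself can be freed immediately while
 * the worker (thread or direct call) owns, consumes, and frees the
 * descriptor.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct delete_info
  {
  char jobid[64];   /* copied, not pointed at, so no dangling reference */
  };

/* worker: consumes and frees its argument */
static void *delete_files(void *arg)
  {
  struct delete_info *di = arg;

  printf("deleting files for %s\n", di->jobid);

  free(di);

  return NULL;
  }

int main(void)
  {
  const int           use_thread = 1;
  struct delete_info *di = calloc(1, sizeof(*di));

  if (di == NULL)
    return 1;

  snprintf(di->jobid, sizeof(di->jobid), "%s", "123.server");

  if (use_thread)
    {
    pthread_t tid;

    if (pthread_create(&tid, NULL, delete_files, di) != 0)
      return 1;

    pthread_join(tid, NULL);
    }
  else
    {
    delete_files(di);  /* synchronous fallback */
    }

  /* the "job" that owned jobid could be freed here; di has its own copy */
  return 0;
  }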
int add_encoded_attributes(

  xmlNodePtr    *attr_node,  /* M attribute node */
  pbs_attribute *pattr)      /* M ptr to pbs_attribute value array */

  {
  tlist_head  lhead;
  int         i;
  int         resc_access_perm = ATR_DFLAG_ACCESS;
  svrattrl   *pal;
  int         rc = PBSE_NONE;
  xmlNodePtr  attributeNode = *attr_node;
  char        buf[BUFSIZE];
  xmlNodePtr  pal_xmlNode;

  CLEAR_HEAD(lhead);

  xmlNodePtr resource_list_head_node = NULL;
  xmlNodePtr resource_used_head_node = NULL;
  xmlNodePtr complete_req_head_node  = NULL;

  for (i = 0; ((i < JOB_ATR_LAST) && (rc >= 0)); i++)
    {
    if ((job_attr_def[i].at_type != ATR_TYPE_ACL) &&
        ((pattr + i)->at_flags & ATR_VFLAG_SET))
      {
      if ((i != JOB_ATR_resource) &&
          (i != JOB_ATR_resc_used) &&
          (i != JOB_ATR_req_information))
        {
        std::string value;

#ifndef PBS_MOM
        if (i == JOB_ATR_depend)
          translate_dependency_to_string(pattr + i, value);
        else
#endif
          attr_to_str(value, job_attr_def + i, pattr[i], true);

        if (value.size() == 0)
          continue;

        pal_xmlNode = xmlNewChild(attributeNode, NULL,
                        (xmlChar *)job_attr_def[i].at_name,
                        (const xmlChar *)value.c_str());

        if (pal_xmlNode)
          {
          snprintf(buf, sizeof(buf), "%u", (unsigned int)pattr[i].at_flags);
          xmlSetProp(pal_xmlNode, (const xmlChar *)AL_FLAGS_ATTR, (const xmlChar *)buf);

          (pattr + i)->at_flags &= ~ATR_VFLAG_MODIFY;
          }
        }
      else
        {
        rc = job_attr_def[i].at_encode(
               pattr + i,
               &lhead,
               job_attr_def[i].at_name,
               NULL,
               ATR_ENCODE_SAVE,
               resc_access_perm);

        if (rc < 0)
          return -1;

        (pattr + i)->at_flags &= ~ATR_VFLAG_MODIFY;

        while ((pal = (svrattrl *)GET_NEXT(lhead)) != NULL)
          {
          if (i == JOB_ATR_resource)
            {
            pal_xmlNode = add_resource_list_attribute(ATTR_l,
                            attr_node, &resource_list_head_node, pal);
            }
          else if (i == JOB_ATR_req_information)
            {
            pal_xmlNode = add_resource_list_attribute(ATTR_req_information,
                            attr_node, &complete_req_head_node, pal);
            }
          else
            {
            pal_xmlNode = add_resource_list_attribute(ATTR_used,
                            attr_node, &resource_used_head_node, pal);
            }

          if (pal_xmlNode)
            {
            snprintf(buf, sizeof(buf), "%u", (unsigned int)pal->al_flags);
            xmlSetProp(pal_xmlNode, (const xmlChar *)AL_FLAGS_ATTR, (const xmlChar *)buf);
            }

          delete_link(&pal->al_link);

          free(pal);

          if (!pal_xmlNode)
            rc = -1;
          }
        }
      }
    }

  return (0);
  }  /* END add_encoded_attributes() */
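/*
 * A minimal, self-contained libxml2 example of what the loop above does for
 * simple attributes: create one child element per attribute, store the value
 * as element text, and stash the flags word in an XML property. The element
 * and property names here ("jobname", "flags") are illustrative, not the
 * names TORQUE writes.
 */
#include <libxml/tree.h>
#include <stdio.h>

int main(void)
  {
  xmlDocPtr  doc  = xmlNewDoc(BAD_CAST "1.0");
  xmlNodePtr root = xmlNewNode(NULL, BAD_CAST "attributes");
  xmlChar   *out;
  int        len;

  xmlDocSetRootElement(doc, root);

  /* one element per attribute, value as text content */
  xmlNodePtr n = xmlNewChild(root, NULL, BAD_CAST "jobname", BAD_CAST "myjob");

  /* flags travel as an XML property on the element */
  xmlSetProp(n, BAD_CAST "flags", BAD_CAST "17");

  xmlDocDumpFormatMemory(doc, &out, &len, 1);
  printf("%s", (char *)out);

  xmlFree(out);
  xmlFreeDoc(doc);

  return 0;
  }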
void free_br(

  struct batch_request *preq)

  {
  delete_link(&preq->rq_link);

  reply_free(&preq->rq_reply);

  if (preq->rq_parentbr)
    {
    /*
     * have a parent who has the original info, so we cannot
     * free any data malloc-ed outside of the basic structure;
     * decrement the reference count in the parent and when it
     * goes to zero, reply_send() it
     */

    if (preq->rq_parentbr->rq_refct > 0)
      {
      if (--preq->rq_parentbr->rq_refct == 0)
        reply_send(preq->rq_parentbr);
      }

    if (preq->rppcmd_msgid)
      free(preq->rppcmd_msgid);

    (void)free(preq);

    return;
    }

  /*
   * IMPORTANT - free any data that is malloc-ed outside of the
   * basic batch_request structure below here so it is not freed
   * when a copy of the structure (for an Array subjob) is freed
   */

  if (preq->rq_extend)
    (void)free(preq->rq_extend);

  switch (preq->rq_type)
    {
    case PBS_BATCH_QueueJob:
      free_attrlist(&preq->rq_ind.rq_queuejob.rq_attr);
      break;

    case PBS_BATCH_JobCred:
      if (preq->rq_ind.rq_jobcred.rq_data)
        (void)free(preq->rq_ind.rq_jobcred.rq_data);
      break;

    case PBS_BATCH_UserCred:
      if (preq->rq_ind.rq_usercred.rq_data)
        (void)free(preq->rq_ind.rq_usercred.rq_data);
      break;

    case PBS_BATCH_jobscript:
      if (preq->rq_ind.rq_jobfile.rq_data)
        (void)free(preq->rq_ind.rq_jobfile.rq_data);
      break;

    case PBS_BATCH_CopyHookFile:
      if (preq->rq_ind.rq_hookfile.rq_data)
        (void)free(preq->rq_ind.rq_hookfile.rq_data);
      break;

    case PBS_BATCH_HoldJob:
      freebr_manage(&preq->rq_ind.rq_hold.rq_orig);
      break;

    case PBS_BATCH_MessJob:
      if (preq->rq_ind.rq_message.rq_text)
        (void)free(preq->rq_ind.rq_message.rq_text);
      break;

    case PBS_BATCH_RelnodesJob:
      if (preq->rq_ind.rq_relnodes.rq_node_list)
        (void)free(preq->rq_ind.rq_relnodes.rq_node_list);
      break;

    case PBS_BATCH_PySpawn:
      arrayfree(preq->rq_ind.rq_py_spawn.rq_argv);
      arrayfree(preq->rq_ind.rq_py_spawn.rq_envp);
      break;

    case PBS_BATCH_ModifyJob:
    case PBS_BATCH_ModifyResv:
      freebr_manage(&preq->rq_ind.rq_modify);
      break;

    case PBS_BATCH_RunJob:
    case PBS_BATCH_AsyrunJob:
    case PBS_BATCH_StageIn:
    case PBS_BATCH_ConfirmResv:
      if (preq->rq_ind.rq_run.rq_destin)
        (void)free(preq->rq_ind.rq_run.rq_destin);
      break;

    case PBS_BATCH_StatusJob:
    case PBS_BATCH_StatusQue:
    case PBS_BATCH_StatusNode:
    case PBS_BATCH_StatusSvr:
    case PBS_BATCH_StatusSched:
    case PBS_BATCH_StatusHook:
    case PBS_BATCH_StatusRsc:
    case PBS_BATCH_StatusResv:
      if (preq->rq_ind.rq_status.rq_id)
        free(preq->rq_ind.rq_status.rq_id);
      free_attrlist(&preq->rq_ind.rq_status.rq_attr);
      break;

    case PBS_BATCH_CopyFiles:
    case PBS_BATCH_DelFiles:
      freebr_cpyfile(&preq->rq_ind.rq_cpyfile);
      break;

    case PBS_BATCH_CopyFiles_Cred:
    case PBS_BATCH_DelFiles_Cred:
      freebr_cpyfile_cred(&preq->rq_ind.rq_cpyfile_cred);
      break;

    case PBS_BATCH_MvJobFile:
      if (preq->rq_ind.rq_jobfile.rq_data)
        free(preq->rq_ind.rq_jobfile.rq_data);
      break;

#ifndef PBS_MOM  /* Server Only */

    case PBS_BATCH_SubmitResv:
      free_attrlist(&preq->rq_ind.rq_queuejob.rq_attr);
      break;

    case PBS_BATCH_Manager:
      freebr_manage(&preq->rq_ind.rq_manager);
      break;

    case PBS_BATCH_ReleaseJob:
      freebr_manage(&preq->rq_ind.rq_release);
      break;

    case PBS_BATCH_Rescq:
    case PBS_BATCH_ReserveResc:
    case PBS_BATCH_ReleaseResc:
      free_rescrq(&preq->rq_ind.rq_rescq);
      break;

    case PBS_BATCH_DefSchReply:
      free(preq->rq_ind.rq_defrpy.rq_id);
      free(preq->rq_ind.rq_defrpy.rq_txt);
      break;

    case PBS_BATCH_SelectJobs:
    case PBS_BATCH_SelStat:
      free_attrlist(&preq->rq_ind.rq_select.rq_selattr);
      free_attrlist(&preq->rq_ind.rq_select.rq_rtnattr);
      break;

    case PBS_BATCH_PreemptJobs:
      free(preq->rq_ind.rq_preempt.ppj_list);
      break;

#endif /* PBS_MOM */
    }

  if (preq->rppcmd_msgid)
    free(preq->rppcmd_msgid);

  (void)free(preq);
  }  /* END free_br() */
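/*
 * A minimal, self-contained sketch (hypothetical types) of the parent/child
 * reference-counting scheme this free_br() implements: child copies share
 * the parent's heap data, so a child's destructor only drops the parent's
 * refcount, and whoever drops the last reference completes (here: frees)
 * the parent.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct request
  {
  struct request *parent;  /* NULL for the original request */
  int             refct;   /* outstanding children (parent only) */
  char           *data;    /* owned by the parent, shared by children */
  };

static void request_complete(struct request *parent)
  {
  printf("all children done: %s\n", parent->data);  /* stand-in for reply_send() */

  free(parent->data);
  free(parent);
  }

static void request_free(struct request *r)
  {
  if (r->parent)
    {
    /* child: never frees shared data, only drops the parent's count */
    if ((r->parent->refct > 0) && (--r->parent->refct == 0))
      request_complete(r->parent);

    free(r);

    return;
    }

  free(r->data);  /* parent freed directly (no children outstanding) */
  free(r);
  }

int main(void)
  {
  struct request *parent = calloc(1, sizeof(*parent));
  struct request *c1 = calloc(1, sizeof(*c1));
  struct request *c2 = calloc(1, sizeof(*c2));

  if ((parent == NULL) || (c1 == NULL) || (c2 == NULL))
    return 1;

  parent->data  = strdup("array job 42[]");
  parent->refct = 2;
  c1->parent = parent;
  c2->parent = parent;

  request_free(c1);
  request_free(c2);  /* last child triggers request_complete(parent) */

  return 0;
  }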