Exemple #1
0
/*
 * req_messagejob - service a Message Job batch request.
 *
 * The job named in the request is looked up with chk_job_request(); if
 * that yields no job the function simply returns.  Cloud jobs and jobs
 * that are not in the running state are rejected.  Otherwise the request
 * is relayed to MOM with post_message_req registered as the follow-up
 * handler.
 *
 * preq - the incoming batch request (I)
 */
void
req_messagejob(struct batch_request *preq)
  {
  job   *pjob;
  int    rc;

  if ((pjob = chk_job_request(preq->rq_ind.rq_message.rq_jid, preq)) == 0)
    return;

  if (is_cloud_job(pjob))
    {
    rc = PBSE_CLOUD_REQUEST;
    req_reject(rc, 0, preq, NULL, NULL);

    /* the request has been answered; stop here so it is not rejected
     * or relayed a second time further down */
    return;
    }

  /* the job must be running */

  if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
    req_reject(PBSE_BADSTATE, 0, preq, NULL, NULL);
    return;
    }

  /* pass the request on to MOM */

  if ((rc = relay_to_mom(pjob->ji_qs.ji_un.ji_exect.ji_momaddr,
                         preq, post_message_req)))
    req_reject(rc, 0, preq, NULL, NULL); /* unable to get to MOM */

  /* After MOM acts and replies to us, we pick up in post_message_req() */
  }
Exemple #2
0
/*
 * req_messagejob - service a Message Job batch request.
 *
 * Resolves the job id carried in the request, refuses array jobs and
 * jobs that are not running, and hands everything else to MOM.  When
 * the relay is accepted, post_message_req is the registered follow-up.
 *
 * preq - the incoming batch request
 */
void
req_messagejob(struct batch_request *preq)
{
	int   job_type;		/* filled in by chk_job_request() */
	job  *target;
	int   status;

	target = chk_job_request(preq->rq_ind.rq_message.rq_jid, preq, &job_type);
	if (target == 0)
		return;

	/* only plain (non-array) jobs may be messaged */
	if (job_type != IS_ARRAY_NO) {
		reply_text(preq, PBSE_NOSUP, "not supported for Array Jobs");
		return;
	}

	/* a job that is not running cannot receive a message */
	if (target->ji_qs.ji_state != JOB_STATE_RUNNING) {
		req_reject(PBSE_BADSTATE, 0, preq);
		return;
	}

	/* forward to MOM; a non-zero status means MOM could not be reached */
	status = relay_to_mom(target, preq, post_message_req);
	if (status != 0)
		req_reject(status, 0, preq);
}
/*
 * svr_stagein - start staging in a job's files, or start the job
 * directly when there is nothing to stage.
 *
 * A stage request is built with cpy_stage().  If none is produced the
 * result of svr_strtjob2() is returned.  Otherwise the job id is copied
 * into the request's rq_extra and the request is relayed to MOM with
 * post_stagein as the follow-up handler.  On a successful relay the job
 * is moved to the given state/substate and, when a client request is
 * present, it is acknowledged right away.
 *
 * Returns the svr_strtjob2() result, PBSE_SYSTEM when the job id buffer
 * cannot be allocated, or the relay_to_mom() result.
 *
 * NOTE(review): on the allocation-failure path the request obtained
 * from cpy_stage() is not released here - confirm whether that leaks.
 */
static int svr_stagein(

  job                  *pjob,     /* I */
  struct batch_request *preq,     /* I */
  int                   state,    /* I */
  int                   substate) /* I */

  {
  struct batch_request *stage_req = 0;
  int                   status;

  stage_req = cpy_stage(stage_req, pjob, JOB_ATR_stagein, STAGE_DIR_IN);

  /* nothing to stage: go straight to sending the job to mom */
  if (stage_req == NULL)
    return(svr_strtjob2(pjob, preq));

  /* keep the job id with the request for post_stagein */
  stage_req->rq_extra = malloc(PBS_MAXSVRJOBID + 1);

  if (stage_req->rq_extra == 0)
    return(PBSE_SYSTEM);

  strcpy(stage_req->rq_extra, pjob->ji_qs.ji_jobid);

  status = relay_to_mom(pjob->ji_qs.ji_un.ji_exect.ji_momaddr, stage_req, post_stagein);

  if (status != 0)
    {
    /* relay refused: give back the job id buffer */
    free(stage_req->rq_extra);

    return(status);
    }

  svr_setjobstate(pjob, state, substate);

  /* the copy may take a while, so answer the client now */
  if (preq != NULL)
    reply_ack(preq);

  return(status);
  }  /* END svr_stagein() */
Exemple #4
0
/*
 * req_checkpointjob - service a Checkpoint Job batch request.
 *
 * Cloud jobs are rejected outright.  A running job whose checkpoint
 * attribute is set and contains "s", "c" or "enabled" has the request
 * relayed to MOM (follow-up handler: process_checkpoint_reply); on a
 * successful relay the job is flagged as having a checkpoint file and
 * saved.  Any other job gets the request rejected as not checkpointable.
 *
 * preq - the incoming batch request (I)
 */
void req_checkpointjob(

  struct batch_request *preq)

  {
  job    *pjob;
  int     rc;
  attribute *pattr;

  if ((pjob = chk_job_request(preq->rq_ind.rq_manager.rq_objname, preq)) == NULL)
    {
    return;
    }

  if (is_cloud_job(pjob))
    {
    rc = PBSE_CLOUD_REQUEST;
    req_reject(rc, 0, preq, NULL, "cloud jobs cannot be checkpointed");

    /* the request has been answered; do not fall through and handle
     * (relay or reject) it a second time */
    return;
    }

  pattr = &pjob->ji_wattr[(int)JOB_ATR_checkpoint];

  if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
      ((pattr->at_flags & ATR_VFLAG_SET) &&
       ((csv_find_string(pattr->at_val.at_str, "s") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "c") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "enabled") != NULL))))
    {
    /* have MOM attempt checkpointing */

    if ((rc = relay_to_mom(pjob->ji_qs.ji_un.ji_exect.ji_momaddr,
                           preq, process_checkpoint_reply)) != 0)
      {
      req_reject(rc, 0, preq, NULL, NULL);
      }
    else
      {
      pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHECKPOINT_FILE;
      job_save(pjob, SAVEJOB_QUICK);
      LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
                pjob->ji_qs.ji_jobid, log_buffer);
      }
    }
  else
    {
    /* Job does not have checkpointing enabled, so reject the request */

    LOG_EVENT(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);

    req_reject(PBSE_IVALREQ, 0, preq, NULL, "job is not checkpointable");
    }
  }  /* END req_checkpointjob() */
/*
 * issue_signal - build a Signal Job request for a job and relay it to MOM.
 *
 * pjob_ptr - in: the job to signal; out: re-looked-up job after func has
 *            run, or NULL when relay_to_mom() left the job pointer NULL
 * signame  - name of the signal to send
 * func     - called with the new request after a successful relay
 * extra    - stored in the new request's rq_extra
 *
 * Returns PBSE_SYSTEM when the request cannot be allocated, otherwise
 * the relay_to_mom() result.
 */
int issue_signal(

  job  **pjob_ptr,
  char  *signame, /* name of the signal to send */
  void  (*func)(batch_request *),
  void  *extra) /* extra parameter to be stored in sig request */

  {
  job                  *target = *pjob_ptr;
  struct batch_request *sig_req;
  char                  saved_id[PBS_MAXSVRJOBID + 1];
  int                   status;

  /* build up a Signal Job batch request */
  sig_req = alloc_br(PBS_BATCH_SignalJob);

  if (sig_req == NULL)
    return(PBSE_SYSTEM);

  sig_req->rq_extra = extra;

  strcpy(sig_req->rq_ind.rq_signal.rq_jid, target->ji_qs.ji_jobid);

  snprintf(sig_req->rq_ind.rq_signal.rq_signame, sizeof(sig_req->rq_ind.rq_signal.rq_signame), "%s", signame);

  status = relay_to_mom(&target, sig_req, NULL);

  if ((status != PBSE_NONE) ||
      (target == NULL))
    {
    /* relay failed or the job pointer was cleared: drop the request */
    free_br(sig_req);

    if (target == NULL)
      *pjob_ptr = NULL;

    return(status);
    }

  /* remember the id, release the job, run the handler, then look the
   * job up again for the caller */
  strcpy(saved_id, target->ji_qs.ji_jobid);
  unlock_ji_mutex(target, __func__, NULL, 0);
  func(sig_req);

  *pjob_ptr = svr_find_job((char *)saved_id, TRUE);

  return(status);
  }  /* END issue_signal() */
/*
 * remove_stagein - ask MOM to delete the files that were staged in for
 * a job.
 *
 * A stage request is built with cpy_stage(), converted into a Delete
 * Files request and relayed to MOM.  On a successful relay the job's
 * StagedIn flag is cleared (if the job is still available); on failure
 * the problem is logged.  The request is released with free_br() in
 * both cases.
 *
 * pjob_ptr - in: the job; out: set to NULL when relay_to_mom() left the
 *            local job pointer NULL, so the caller does not keep using it
 */
void remove_stagein(

  job **pjob_ptr)  /* I/O */

  {

  struct batch_request *preq = 0;
  job                  *pjob = *pjob_ptr;

  preq = cpy_stage(preq, pjob, JOB_ATR_stagein, 0);

  if (preq != NULL)
    {
    /* have files to delete  */

    /* change the request type from copy to delete  */

    preq->rq_type = PBS_BATCH_DelFiles;

    preq->rq_extra = NULL;

    if (relay_to_mom(&pjob, preq, NULL) == PBSE_NONE)
      {
      if (pjob != NULL)
        pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_StagedIn;
      }
    else
      {
      /* log that we were unable to remove the files; relay_to_mom()
       * is handed &pjob and may have cleared it, so do not
       * dereference it blindly */

      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_FILE,
        (pjob != NULL) ? pjob->ji_qs.ji_jobid : "unknown job",
        "unable to remove staged in files for job");
      }

    /* propagate a cleared job pointer back to the caller */
    if (pjob == NULL)
      *pjob_ptr = NULL;

    free_br(preq);
    }

  return;
  }  /* END remove_stagein() */
Exemple #7
0
/*
 * remove_stagein - ask MOM to delete the files that were staged in for
 * a job.
 *
 * A stage request is built with cpy_stage(), converted into a Delete
 * Files request and relayed to MOM with release_req as the follow-up
 * handler.  On a successful relay the job's StagedIn flag is cleared;
 * on failure the problem is logged and the request is released here.
 *
 * pjob - the job whose staged-in files are to be removed (I)
 */
void remove_stagein(

  job *pjob)  /* I */

  {

  struct batch_request *preq = 0;

  preq = cpy_stage(preq, pjob, JOB_ATR_stagein, 0);

  if (preq != NULL)
    {
    /* have files to delete  */

    /* change the request type from copy to delete  */

    preq->rq_type = PBS_BATCH_DelFiles;

    preq->rq_extra = NULL;

    if (relay_to_mom(
          pjob,
          preq,
          release_req) == 0)
      {
      pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_StagedIn;
      }
    else
      {
      /* log that we were unable to remove the files */

      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_FILE,
        pjob->ji_qs.ji_jobid,
        "unable to remove staged in files for job");

      free_br(preq);
      }
    }

  return;
  }  /* END remove_stagein() */
/*
 * req_messagejob - service a Message Job batch request.
 *
 * The job must be running.  A copy of the request is relayed to MOM;
 * when the relay succeeds post_message_req() is called on the copy and
 * the original request is released, otherwise the original request is
 * rejected.  Always returns NULL.
 */
void *req_messagejob(
    
  batch_request *preq) /* I */

  {
  job           *pjob;
  int            status;
  batch_request *relay_copy = NULL;

  pjob = chk_job_request(preq->rq_ind.rq_message.rq_jid, preq);

  if (pjob == NULL)
    return(NULL);

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  /* only a running job can be messaged */
  if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
    req_reject(PBSE_BADSTATE, 0, preq, NULL, NULL);

    return(NULL);
    }

  status = copy_batchrequest(&relay_copy, preq, 0, -1);

  if (status != 0)
    {
    req_reject(PBSE_MEM_MALLOC, 0, preq, NULL, NULL);
    }
  else
    {
    /* hand the copy to MOM */
    status = relay_to_mom(&pjob, relay_copy, NULL);

    if (status == PBSE_NONE)
      {
      post_message_req(relay_copy);
      free_br(preq);
      }
    else
      {
      /* unable to get to MOM */
      req_reject(status, 0, preq, NULL, NULL);
      free_br(relay_copy);
      }
    }

  /* relay_to_mom() is handed &pjob; when it comes back NULL the mutex
   * manager is told not to act on exit */
  if (pjob == NULL)
    job_mutex.set_lock_on_exit(false);

  return(NULL);
  } /* END req_messagejob() */
Exemple #9
0
/*
 * shutdown_preempt_chkpt - build a Hold Job request carrying a system
 * hold (HOLD_s) for the given job and relay it to MOM, with post_chkpt
 * as the follow-up handler.
 *
 * On a successful relay a job in transit is moved to the running state,
 * the HASRUN/CHKPT/HASHOLD server flags are set and the job is saved.
 *
 * Returns 0 on success, PBSE_SYSTEM when the request cannot be
 * allocated or the hold attribute cannot be encoded, and -1 when the
 * relay to MOM fails.
 *
 * NOTE(review): the request is not released on the encode-failure
 * path - confirm whether that leaks.
 */
int
shutdown_preempt_chkpt(job *pjob)
{
	struct batch_request *phold;
	attribute temp;
	void (*func)(struct work_task *);

	phold = alloc_br(PBS_BATCH_HoldJob);
	if (phold == NULL)
		return (PBSE_SYSTEM);

	/* temporary attribute describing the hold to be encoded */
	temp.at_flags = ATR_VFLAG_SET;
	temp.at_type  = job_attr_def[(int)JOB_ATR_hold].at_type;
	temp.at_user_encoded = NULL;
	temp.at_priv_encoded = NULL;
	temp.at_val.at_long = HOLD_s;

	phold->rq_perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR;
	(void)strcpy(phold->rq_ind.rq_hold.rq_orig.rq_objname, pjob->ji_qs.ji_jobid);
	CLEAR_HEAD(phold->rq_ind.rq_hold.rq_orig.rq_attr);
	if (job_attr_def[(int)JOB_ATR_hold].at_encode(&temp,
		&phold->rq_ind.rq_hold.rq_orig.rq_attr,
		job_attr_def[(int)JOB_ATR_hold].at_name,
		NULL,
		ATR_ENCODE_CLIENT, NULL) < 0)
		return (PBSE_SYSTEM);

	phold->rq_extra = pjob;
	func = post_chkpt;

	if (relay_to_mom(pjob, phold, func) == 0) {

		if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
			svr_setjobstate(pjob, JOB_STATE_RUNNING, JOB_SUBSTATE_RUNNING);
		pjob->ji_qs.ji_svrflags |= (JOB_SVFLG_HASRUN | JOB_SVFLG_CHKPT | JOB_SVFLG_HASHOLD);
		pjob->ji_modified = 1;
		(void)job_save(pjob, SAVEJOB_QUICK);
		return (0);
	}

	/*
	 * The job's hold value is never modified in this function, so there
	 * is nothing to restore; the former restore wrote through a pointer
	 * that was always NULL.
	 */
	return (-1);
}
Exemple #10
0
/*
 * req_messagejob - service a Message Job batch request.
 *
 * vp is the batch request.  The job must be running.  A copy of the
 * request is relayed to MOM with post_message_req as the follow-up
 * handler; on a successful relay the original request is released,
 * otherwise it is rejected.  The job mutex is released before returning
 * whenever the job pointer is still set.  Always returns NULL.
 */
void *req_messagejob(

    void *vp)

{
    struct batch_request *request = (struct batch_request *)vp;
    struct batch_request *relay_copy = NULL;
    job                  *pjob;
    int                   status;

    pjob = chk_job_request(request->rq_ind.rq_message.rq_jid, request);

    if (pjob == NULL)
        return(NULL);

    /* only a running job can be messaged */
    if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
        req_reject(PBSE_BADSTATE, 0, request, NULL, NULL);

        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

        return(NULL);
    }

    status = copy_batchrequest(&relay_copy, request, 0, -1);

    if (status != 0)
    {
        req_reject(PBSE_MEM_MALLOC, 0, request, NULL, NULL);
    }
    else
    {
        /* hand the copy to MOM */
        status = relay_to_mom(&pjob, relay_copy, post_message_req);

        if (status != 0)
            req_reject(status, 0, request, NULL, NULL); /* unable to get to MOM */
        else
            free_br(request);
    }

    /* relay_to_mom() is handed &pjob, so it may be NULL by now */
    if (pjob != NULL)
        unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);

    return(NULL);
} /* END req_messagejob() */
Exemple #11
0
/*
 * shutdown_chkpt - build a Hold Job request carrying a system hold
 * (HOLD_s) for the given job and relay it to MOM, with post_chkpt as
 * the follow-up handler.
 *
 * On a successful relay a job in transit is first moved to the running
 * state; the substate is then set to RERUN, the HASRUN and CHKPT server
 * flags are set and the job is saved.
 *
 * Returns 0 on success, PBSE_SYSTEM when the request cannot be
 * allocated or the hold attribute cannot be encoded, -1 when the relay
 * fails.
 *
 * NOTE(review): the request is not released on the encode-failure
 * path - confirm whether that leaks.
 */
static int
shutdown_chkpt(job *pjob)
{
	struct batch_request *hold_req;
	attribute 	      hold_attr;

	hold_req = alloc_br(PBS_BATCH_HoldJob);
	if (hold_req == NULL)
		return (PBSE_SYSTEM);

	/* temporary attribute describing the hold to be encoded */
	hold_attr.at_flags = ATR_VFLAG_SET;
	hold_attr.at_type  = job_attr_def[(int)JOB_ATR_hold].at_type;
	hold_attr.at_user_encoded = NULL;
	hold_attr.at_priv_encoded = NULL;
	hold_attr.at_val.at_long = HOLD_s;

	hold_req->rq_perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR;
	(void)strcpy(hold_req->rq_ind.rq_hold.rq_orig.rq_objname, pjob->ji_qs.ji_jobid);
	CLEAR_HEAD(hold_req->rq_ind.rq_hold.rq_orig.rq_attr);

	if (job_attr_def[(int)JOB_ATR_hold].at_encode(&hold_attr,
		&hold_req->rq_ind.rq_hold.rq_orig.rq_attr,
		job_attr_def[(int)JOB_ATR_hold].at_name,
		(char *)0,
		ATR_ENCODE_CLIENT, NULL) < 0)
		return (PBSE_SYSTEM);

	if (relay_to_mom(pjob, hold_req, post_chkpt) != 0)
		return (-1);

	if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
		svr_setjobstate(pjob, JOB_STATE_RUNNING, JOB_SUBSTATE_RUNNING);

	pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;
	pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN;
	pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHKPT;
	pjob->ji_modified = 1;
	(void)job_save(pjob, SAVEJOB_QUICK);

	return (0);
}
Exemple #12
0
/*
 * issue_signal - build a Signal Job request for a job and relay it to MOM.
 *
 * pjob    - the job to signal
 * signame - name of the signal to send
 * func    - follow-up handler handed to relay_to_mom()
 * extra   - stored in the new request's rq_extra
 *
 * Returns PBSE_SYSTEM when the request cannot be allocated, otherwise
 * the relay_to_mom() result.
 */
int issue_signal(

  job  *pjob,
  char *signame, /* name of the signal to send */
  void (*func)(struct work_task *),
  void *extra) /* extra parameter to be stored in sig request */

  {
  int rc;

  struct batch_request *newreq;

  /* build up a Signal Job batch request */

  if ((newreq = alloc_br(PBS_BATCH_SignalJob)) == NULL)
    {
    /* FAILURE */

    return(PBSE_SYSTEM);
    }

  newreq->rq_extra = extra;

  strcpy(newreq->rq_ind.rq_signal.rq_jid, pjob->ji_qs.ji_jobid);

  /* bounded copy that always NUL-terminates, unlike strncpy() */
  snprintf(newreq->rq_ind.rq_signal.rq_signame,
           sizeof(newreq->rq_ind.rq_signal.rq_signame),
           "%s",
           signame);

  rc = relay_to_mom(
         pjob,
         newreq,
         func);

  /* func is what runs once MOM has replied */

  return(rc);
  }  /* END issue_signal() */
Exemple #13
0
/*
 * mom_cleanup_checkpoint_hold - work-task handler that asks MOM to
 * clean up a job once the job is no longer running.
 *
 * The task's wt_parm1 carries a heap-allocated job id; the task and its
 * mutex are freed here, as is the job id.  While the job is still
 * running the handler re-schedules itself one second later with a fresh
 * copy of the job id.  Once the job has left the running state a Delete
 * Job request is relayed to MOM with release_req as follow-up handler.
 * The job mutex is released before returning whenever the job pointer
 * is still set.
 */
void mom_cleanup_checkpoint_hold(

  struct work_task *ptask)

  {
  int                   relay_rc = 0;
  job                  *pjob;
  char                 *jobid;

  struct batch_request *del_req;
  char                  msg[LOCAL_LOG_BUF_SIZE];
  time_t                time_now = time(NULL);

  /* take the job id out of the task, then dispose of the task itself */
  jobid = (char *)ptask->wt_parm1;
  free(ptask->wt_mutex);
  free(ptask);

  if (jobid == NULL)
    {
    log_err(ENOMEM, __func__, "Cannot allocate memory");
    return;
    }

  pjob = svr_find_job(jobid, FALSE);

  if (pjob == NULL)
    {
    if (LOGLEVEL >= 3)
      {
      sprintf(msg, "%s:failed to find job\n", __func__);

      log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,jobid,msg);
      }

    free(jobid);
    return;
    }

  free(jobid);

  if (LOGLEVEL >= 7)
    {
    sprintf(msg,
      "checking mom cleanup job state is %s-%s\n",
      PJobState[pjob->ji_qs.ji_state],
      PJobSubState[pjob->ji_qs.ji_substate]);

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,msg);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* still running: check again in a second */
    set_task(WORK_Timed, time_now + 1, mom_cleanup_checkpoint_hold, strdup(pjob->ji_qs.ji_jobid), FALSE);
    }
  else if ((del_req = alloc_br(PBS_BATCH_DeleteJob)) == NULL)
    {
    log_err(-1, __func__, "unable to allocate DeleteJob request - big trouble!");
    }
  else
    {
    /*
     * the job is no longer running, so we have received the job obit
     * and need to request the mom to clean up after the job
     */
    strcpy(del_req->rq_ind.rq_delete.rq_objname, pjob->ji_qs.ji_jobid);

    relay_rc = relay_to_mom(&pjob, del_req, release_req);

    if (relay_rc != 0)
      {
      /* relay_to_mom() is handed &pjob, so it may be NULL by now */
      if (pjob != NULL)
        {
        snprintf(msg,sizeof(msg),
          "Unable to relay information to mom for job '%s'\n",
          pjob->ji_qs.ji_jobid);

        log_err(relay_rc, __func__, msg);

        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
        }

      return;
      }

    if ((LOGLEVEL >= 7) &&
        (pjob != NULL))
      {
      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        "requested mom cleanup");
      }
    }

  if (pjob != NULL)
    unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
  } /* END mom_cleanup_checkpoint_hold() */
Exemple #14
0
/*
 * req_modifyjob - service a Modify Job batch request.
 *
 * Steps, in order:
 *   1. run the modifyjob hooks (0 = explicit reject, 1 = explicit
 *      accept, 2 = no hook script executed);
 *   2. look up the job; single subjobs and subjob ranges are rejected;
 *   3. refuse jobs that are provisioning (unless the scheduler is the
 *      requester) or in transit;
 *   4. validate every attribute in the request: it must exist, must be
 *      alterable while running if the job is running, and resources
 *      must be known and alterable;
 *   5. apply the changes with modify_job_attr(), reset resource
 *      defaults, update/save the job state and log the change;
 *   6. if a resource of a running job was changed, relay the request to
 *      MOM (follow-up handler: post_modify_req), otherwise acknowledge.
 *
 * preq - the incoming batch request
 */
void
req_modifyjob(struct batch_request *preq)
{
	int		 add_to_am_list = 0; /* if altered during sched cycle */
	int		 bad = 0;
	int		 jt;		/* job type */
	int		 newstate;
	int		 newsubstate;
	resource_def	*outsideselect = NULL;
	job		*pjob;
	svrattrl	*plist;
	resource	*presc;
	resource_def	*prsd;
	int		 rc;
	int		 running = 0;
	int		 sendmom = 0;
	char		hook_msg[HOOK_MSG_SIZE];
	int		mod_project = 0;
	pbs_sched	*psched;

	switch (process_hooks(preq, hook_msg, sizeof(hook_msg),
			pbs_python_set_interrupt)) {
		case 0:	/* explicit reject */
			reply_text(preq, PBSE_HOOKERROR, hook_msg);
			return;
		case 1:   /* explicit accept */
			if (recreate_request(preq) == -1) { /* error */
				/* we have to reject the request, as 'preq' */
				/* may have been partly modified            */
				strcpy(hook_msg,
					"modifyjob event: rejected request");
				log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_HOOK,
					LOG_ERR, "", hook_msg);
				reply_text(preq, PBSE_HOOKERROR, hook_msg);
				return;
			}
			break;
		case 2:	/* no hook script executed - go ahead and accept event*/
			break;
		default:
			log_event(PBSEVENT_DEBUG2, PBS_EVENTCLASS_HOOK,
				LOG_INFO, "", "modifyjob event: accept req by default");
			break;
	}

	if (pseldef == NULL)  /* do one time to keep handy */
		pseldef = find_resc_def(svr_resc_def, "select", svr_resc_size);

	pjob = chk_job_request(preq->rq_ind.rq_modify.rq_objname, preq, &jt);
	if (pjob == NULL)
		return;

	if ((jt == IS_ARRAY_Single) || (jt == IS_ARRAY_Range)) {
		req_reject(PBSE_IVALREQ, 0, preq);
		return;
	}

	psched = find_sched_from_sock(preq->rq_conn);
	/* allow scheduler to modify job */
	if (psched == NULL) {
		/* provisioning job is not allowed to be modified */
		if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
			(pjob->ji_qs.ji_substate == JOB_SUBSTATE_PROVISION)) {
			req_reject(PBSE_BADSTATE, 0, preq);
			return;
		}
	}

	/* cannot be in exiting or transit, exiting has already be checked */

	if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT) {
		req_reject(PBSE_BADSTATE, 0, preq);
		return;
	}

	plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr);
	if (plist == NULL) {	/* nothing to do */
		reply_ack(preq);
		return;
	}

	/*
	 * Special checks must be made:
	 *	if during a scheduling cycle and certain attributes are altered,
	 *	   make a note of the job to prevent it from being run now;
	 *	if job is running, only certain attributes/resources can be
	 *	   altered.
	 */

	if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) {
		running = 1;
	}
	while (plist) {
		int i;

		i = find_attr(job_attr_def, plist->al_name, JOB_ATR_LAST);

		/*
		 * Unknown attribute: reject before i is used to index
		 * job_attr_def[] below.
		 */
		if (i < 0) {
			reply_badattr(PBSE_NOATTR, 1, plist, preq);
			return;
		}

		/*
		 * Is the attribute being altered one which could change
		 * scheduling (ATR_DFLAG_SCGALT set) and if a scheduling
		 * cycle is in progress, then set flag to add the job to list
		 * of jobs which cannot be run in this cycle.
		 * If the scheduler itself sends a modify job request,
		 * no need to delay the job until next cycle.
		 */
		if ((psched == NULL) && (scheduler_jobs_stat) && (job_attr_def[i].at_flags & ATR_DFLAG_SCGALT))
			add_to_am_list = 1;

		/* Is the attribute modifiable in RUN state ? */

		if ((running == 1) &&
			((job_attr_def[i].at_flags & ATR_DFLAG_ALTRUN) == 0)) {

			reply_badattr(PBSE_MODATRRUN, 1, plist, preq);
			return;
		}
		if (i == (int)JOB_ATR_resource) {

			prsd = find_resc_def(svr_resc_def, plist->al_resc,
				svr_resc_size);

			if (prsd == 0) {
				reply_badattr(PBSE_UNKRESC, 1, plist, preq);
				return;
			}

			/* is the specified resource modifiable while */
			/* the job is running                         */

			if (running) {

				if ((prsd->rs_flags & ATR_DFLAG_ALTRUN) == 0) {
					reply_badattr(PBSE_MODATRRUN, 1, plist, preq);
					return;
				}

				sendmom = 1;
			}

			/* should the resource be only in a select spec */

			if (prsd->rs_flags & ATR_DFLAG_CVTSLT && !outsideselect &&
				plist->al_atopl.value && plist->al_atopl.value[0]) {
				/* if "-lresource" is set and has non-NULL value,
				** remember as potential bad resource
				** if this appears along "select".
				*/
				outsideselect = prsd;
			}
		}
		if (strcmp(plist->al_name, ATTR_project) == 0) {
			mod_project = 1;
		} else if ((strcmp(plist->al_name, ATTR_runcount) == 0) &&
			((plist->al_flags & ATR_VFLAG_HOOK) == 0) &&
			(plist->al_value != NULL) &&
			(plist->al_value[0] != '\0') &&
			((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0) &&
		(atol(plist->al_value) < \
		    pjob->ji_wattr[(int)JOB_ATR_runcount].at_val.at_long)) {
			/* a non-manager/operator may not lower the run count */
			sprintf(log_buffer,
				"regular user %s@%s cannot decrease '%s' attribute value from %ld to %ld",
				preq->rq_user, preq->rq_host, ATTR_runcount,
				pjob->ji_wattr[(int)JOB_ATR_runcount].at_val.at_long,
				atol(plist->al_value));
			log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_ERR,
				pjob->ji_qs.ji_jobid, log_buffer);
			req_reject(PBSE_PERM, 0, preq);
			return;
		}
		plist = (svrattrl *)GET_NEXT(plist->al_link);
	}

	if (outsideselect) {
		presc = find_resc_entry(&pjob->ji_wattr[(int)JOB_ATR_resource],
			pseldef);
		if (presc &&
			((presc->rs_value.at_flags & ATR_VFLAG_DEFLT) == 0)) {
			/* select is not a default, so reject qalter */

			resc_in_err = strdup(outsideselect->rs_name);
			req_reject(PBSE_INVALJOBRESC, 0, preq);
			return;
		}

	}

	/* modify the jobs attributes */

	bad = 0;
	plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr);
	rc = modify_job_attr(pjob, plist, preq->rq_perm, &bad);
	if (rc) {
		if (pjob->ji_clterrmsg)
			reply_text(preq, rc, pjob->ji_clterrmsg);
		else
			reply_badattr(rc, bad, plist, preq);
		return;
	}

	/* If certain attributes modified and if in scheduling cycle  */
	/* then add to list of jobs which cannot be run in this cycle */

	if (add_to_am_list)
		am_jobs_add(pjob);	/* see req_runjob() */

	/* check if project attribute was requested to be modified to */
	/* be the default project value */
	if (mod_project && (pjob->ji_wattr[(int)JOB_ATR_project].at_flags & \
							ATR_VFLAG_SET)) {

		if (strcmp(pjob->ji_wattr[(int)JOB_ATR_project].at_val.at_str,
			PBS_DEFAULT_PROJECT) == 0) {
			sprintf(log_buffer, msg_defproject,
				ATTR_project, PBS_DEFAULT_PROJECT);
#ifdef NAS /* localmod 107 */
			log_event(PBSEVENT_DEBUG4, PBS_EVENTCLASS_JOB, LOG_INFO,
				pjob->ji_qs.ji_jobid, log_buffer);
#else
			log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO,
				pjob->ji_qs.ji_jobid, log_buffer);
#endif /* localmod 107 */
		}
	}

	if (pjob->ji_wattr[(int)JOB_ATR_resource].at_flags & ATR_VFLAG_MODIFY) {
		presc = find_resc_entry(&pjob->ji_wattr[(int)JOB_ATR_resource],
			pseldef);
		if (presc && (presc->rs_value.at_flags & ATR_VFLAG_DEFLT)) {
			/* changing Resource_List and select is a default   */
			/* clear "select" so it is rebuilt inset_resc_deflt */
			pseldef->rs_free(&presc->rs_value);
		}
	}

	/* Reset any defaults resource limit which might have been unset */
	if ((rc = set_resc_deflt((void *)pjob, JOB_OBJECT, NULL)) != 0) {
		req_reject(rc, 0, preq);
		return;
	}

	/* if job is not running, may need to change its state */

	if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING) {
		svr_evaljobstate(pjob, &newstate, &newsubstate, 0);
		(void)svr_setjobstate(pjob, newstate, newsubstate);
	} else {
		(void)job_save(pjob, SAVEJOB_FULL);
	}
	(void)sprintf(log_buffer, msg_manager, msg_jobmod,
		preq->rq_user, preq->rq_host);
	log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO,
		pjob->ji_qs.ji_jobid, log_buffer);

	/* if a resource limit changed for a running job, send to MOM */

	if (sendmom) {
		rc = relay_to_mom(pjob, preq, post_modify_req);
		if (rc)
			req_reject(rc, 0, preq);    /* unable to get to MOM */
		return;
	}

	reply_ack(preq);
}
Exemple #15
0
/*
 * shutdown_checkpoint - build a Hold Job request carrying a system hold
 * (HOLD_s) for a job and relay it to MOM.
 *
 * After a successful relay, if the job pointer is still set, the job is
 * marked RERUN with the HASRUN and CHECKPOINT_FILE flags, saved, and
 * its mutex released.  post_checkpoint() is then run on the request and
 * the job is looked up again for the caller.
 *
 * pjob_ptr - in: the job; out: re-looked-up job when its id was captured
 *
 * Returns PBSE_SYSTEM when the request cannot be allocated or the hold
 * attribute cannot be encoded, -1 when the relay fails, PBSE_NONE
 * otherwise.
 */
static int shutdown_checkpoint(

  job **pjob_ptr)

  {
  job                  *pjob = *pjob_ptr;
  struct batch_request *hold_req;
  pbs_attribute         hold_attr;
  char                  saved_id[PBS_MAXSVRJOBID + 1];
  int                   rc = PBSE_NONE;

  hold_req = alloc_br(PBS_BATCH_HoldJob);

  if (hold_req == NULL)
    return(PBSE_SYSTEM);

  /* temporary attribute describing the hold to be encoded */
  hold_attr.at_flags = ATR_VFLAG_SET;
  hold_attr.at_type  = job_attr_def[JOB_ATR_hold].at_type;
  hold_attr.at_val.at_long = HOLD_s;

  hold_req->rq_perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR;

  strcpy(hold_req->rq_ind.rq_hold.rq_orig.rq_objname, pjob->ji_qs.ji_jobid);

  CLEAR_HEAD(hold_req->rq_ind.rq_hold.rq_orig.rq_attr);

  if (job_attr_def[JOB_ATR_hold].at_encode(
        &hold_attr,
        &hold_req->rq_ind.rq_hold.rq_orig.rq_attr,
        job_attr_def[JOB_ATR_hold].at_name,
        NULL,
        ATR_ENCODE_CLIENT,
        0) < 0)
    {
    free_br(hold_req);
    return(PBSE_SYSTEM);
    }

  rc = relay_to_mom(&pjob, hold_req, NULL);

  if (rc != PBSE_NONE)
    {
    /* FAILURE */
    free_br(hold_req);

    return(-1);
    }

  /* empty id means "job pointer was cleared by the relay" */
  saved_id[0] = '\0';

  if (pjob != NULL)
    {
    pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;
    pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_FILE;

    if (LOGLEVEL >= 1)
      {
      log_event(
        PBSEVENT_SYSTEM | PBSEVENT_JOB | PBSEVENT_DEBUG,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        "shutting down with active checkpointable job");
      }

    job_save(pjob, SAVEJOB_QUICK, 0);
    strcpy(saved_id, pjob->ji_qs.ji_jobid);
    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
    }

  /* rc is PBSE_NONE from here on */
  post_checkpoint(hold_req);

  if (saved_id[0] != '\0')
    *pjob_ptr = svr_find_job(saved_id, TRUE);

  return(PBSE_NONE);
  }  /* END shutdown_checkpoint() */
Exemple #16
0
/*
 * req_signaljob - service a Signal Job (or asynchronous Signal Job)
 * batch request.
 *
 * The job must be running.  The pseudo signals SIG_RESUME and
 * SIG_SUSPEND additionally require operator or manager permission bits
 * in the request.  For PBS_BATCH_AsySignalJob requests a duplicate of
 * the request is acknowledged up front and the original is marked
 * rq_noreply.  A duplicate of the request is then relayed to MOM; on
 * success post_signal_req() is run on the duplicate and the original is
 * released, on failure the original is rejected.
 *
 * Returns PBSE_MEM_MALLOC when the asynchronous duplicate cannot be
 * created; every other path returns PBSE_NONE, including the paths
 * that reject the request.
 *
 * NOTE(review): the PBSE_MEM_MALLOC path returns without rejecting or
 * releasing preq here - confirm the caller deals with it.
 */
int req_signaljob(

  batch_request *preq) /* I */

  {
  job           *pjob;
  int            rc;
  char           log_buf[LOCAL_LOG_BUF_SIZE];
  batch_request *dup_req = NULL;

  /* preq free'd in error cases */
  if ((pjob = chk_job_request(preq->rq_ind.rq_signal.rq_jid, preq)) == 0)
    {
    return(PBSE_NONE);
    }

  /* constructed with 'true' as second argument - presumably "already locked"; confirm against mutex_mgr */
  mutex_mgr job_mutex(pjob->ji_mutex, true);

  /* the job must be running */

  if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
    req_reject(PBSE_BADSTATE, 0, preq, NULL, NULL);

    return(PBSE_NONE);
    }

  /* Special pseudo signals for suspend and resume require op/mgr */

  if (!strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_RESUME) ||
      !strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_SUSPEND))
    {
    if ((preq->rq_perm & (ATR_DFLAG_OPRD | ATR_DFLAG_OPWR | ATR_DFLAG_MGRD | ATR_DFLAG_MGWR)) == 0)
      {
      /* for suspend/resume, must be mgr/op */
      req_reject(PBSE_PERM, 0, preq, NULL, NULL);
      
      return(PBSE_NONE);
      }
  
    }

  /* save a heap copy of the job id in the request (for post_signal_req());
   * NOTE(review): the strdup() result is not checked for NULL */
  preq->rq_extra = strdup(pjob->ji_qs.ji_jobid);

  /* FIXME: need a race-free check for available free subnodes before
   * resuming a suspended job */

#ifdef DONOTSUSPINTJOB
  /* interactive jobs don't resume correctly so don't allow a suspend */

  if (!strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_SUSPEND) &&
      (pjob->ji_wattr[JOB_ATR_interactive].at_flags & ATR_VFLAG_SET) &&
      (pjob->ji_wattr[JOB_ATR_interactive].at_val.at_long > 0))
    {
    req_reject(PBSE_JOBTYPE, 0, preq, NULL, NULL);

    return(PBSE_NONE);
    }

#endif

  if (LOGLEVEL >= 6)
    {
    char ipstr[128];

    sprintf(log_buf, "relaying signal request to mom %s", netaddr_long(pjob->ji_qs.ji_un.ji_exect.ji_momaddr,ipstr));

    log_record(PBSEVENT_SCHED,PBS_EVENTCLASS_REQUEST,"req_signaljob",log_buf);
    }

  /* send reply for asynchronous suspend */
  if (preq->rq_type == PBS_BATCH_AsySignalJob)
    {
    /* reply_ack will free preq. We need to copy it before we call reply_ack */
    batch_request *new_preq;

    new_preq = duplicate_request(preq, -1);
    if (new_preq == NULL)
      {
      sprintf(log_buf, "failed to duplicate batch request");
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
      return(PBSE_MEM_MALLOC);
      }

    get_batch_request_id(new_preq);

    /* acknowledge via the copy and mark the original as needing no reply */
    reply_ack(new_preq);
    preq->rq_noreply = TRUE;
    }

  /* pass the request on to MOM */
  if ((dup_req = duplicate_request(preq)) == NULL)
    {
    req_reject(PBSE_SYSTEM, 0, preq, NULL, "can not allocate memory");
    }
  /* The dup_req is freed in relay_to_mom (failure)
   * or in issue_Drequest (success) */
  else 
    {
    rc = relay_to_mom(&pjob, dup_req, NULL);

    /* relay_to_mom() is handed &pjob: release the mutex now if the job
     * pointer survived, otherwise tell the manager not to unlock on exit */
    if (pjob != NULL)
      job_mutex.unlock();
    else
      job_mutex.set_unlock_on_exit(false);

    if (rc != PBSE_NONE)
      {
      free_br(dup_req);
      req_reject(rc, 0, preq, NULL, NULL);  /* unable to get to MOM */
      }
    else
      {
      post_signal_req(dup_req);
      free_br(preq);
      }
    }

  /* If successful we ack after mom replies to us, we pick up in post_signal_req() */

  return(PBSE_NONE);
  }  /* END req_signaljob() */
Exemple #17
0
/*
 * req_py_spawn - service a python spawn batch request.
 *
 * The requester must own the job.  Only a regular job or a single
 * subjob is accepted, and the target must be in the running state and
 * running substate.  The request is then relayed to MOM with
 * post_py_spawn_req as the follow-up handler; if the relay fails the
 * request is rejected right away.
 *
 * preq - the incoming batch request
 */
void
req_py_spawn(struct batch_request *preq)
{
	int	 job_type;	/* filled in by chk_job_request() */
	job	*target;
	int	 status;
	char	*jid = preq->rq_ind.rq_py_spawn.rq_jid;
	int	 sub_state;
	int	 sub_offset;

	/*
	 ** Returns job pointer for singleton job or "parent" of
	 ** an array job.
	 */
	target = chk_job_request(jid, preq, &job_type);
	if (target == NULL)
		return;

	/* the requester has to be the job owner */
	if (svr_chk_owner(preq, target) != 0) {
		req_reject(PBSE_PERM, 0, preq);
		return;
	}

	/* whole array jobs and subjob ranges are not handled */
	if ((job_type != IS_ARRAY_NO) && (job_type != IS_ARRAY_Single)) {
		reply_text(preq, PBSE_NOSUP,
			"not supported for Array Jobs or multiple sub-jobs");
		return;
	}

	if (job_type == IS_ARRAY_NO) {
		/* regular job: must be running, in the running substate */
		if ((target->ji_qs.ji_state != JOB_STATE_RUNNING) ||
			(target->ji_qs.ji_substate != JOB_SUBSTATE_RUNNING)) {
			req_reject(PBSE_BADSTATE, 0, preq);
			return;
		}
	} else {
		/* single subjob: locate it through the parent's tracking table */
		sub_offset = subjob_index_to_offset(target,
			get_index_from_jid(jid));
		if (sub_offset == -1) {
			req_reject(PBSE_UNKJOBID, 0, preq);
			return;
		}

		sub_state = get_subjob_state(target, sub_offset);
		if (sub_state == -1) {
			req_reject(PBSE_IVALREQ, 0, preq);
			return;
		}

		if (sub_state != JOB_STATE_RUNNING) {
			req_reject(PBSE_BADSTATE, 0, preq);
			return;
		}

		/* from here on work with the subjob itself */
		target = target->ji_ajtrk->tkm_tbl[sub_offset].trk_psubjob;
		if (target == NULL) {
			req_reject(PBSE_UNKJOBID, 0, preq);
			return;
		}
		if (target->ji_qs.ji_substate != JOB_SUBSTATE_RUNNING) {
			req_reject(PBSE_BADSTATE, 0, preq);
			return;
		}
	}

	/*
	 ** Pass the request on to MOM.  If this works, the function
	 ** post_py_spawn_req will be called to handle the reply.
	 ** If it fails, send the reply now.
	 */
	status = relay_to_mom(target, preq, post_py_spawn_req);
	if (status != 0)
		req_reject(status, 0, preq);	/* unable to get to MOM */
}
Exemple #18
0
/*
 * req_holdjob - service a Hold Job batch request.
 *
 * The holds to set are decoded with get_hold() and checked with
 * chk_hold_priv(), then OR-ed into the job's hold attribute.
 *
 * If the job is running and its checkpoint attribute is set and
 * contains "s", "c" or "enabled", a duplicate of the request is relayed
 * to MOM so she can attempt checkpointing; if the duplicate cannot be
 * created or the relay fails, the previous hold value is restored and
 * the request is rejected.  Otherwise (job not running, or - without
 * ENABLE_BLCR - running without checkpointing) the hold is recorded,
 * the job state is re-evaluated when the hold value changed, and the
 * request is acknowledged.
 *
 * Always returns PBSE_NONE; failures are reported to the client via
 * req_reject().
 */
int req_holdjob(

  batch_request *vp) /* I */

  {
  long          *hold_val;
  int            newstate;
  int            newsub;
  long           old_hold;
  job           *pjob;
  char          *pset;
  int            rc;
  pbs_attribute  temphold;
  pbs_attribute *pattr;
  batch_request *preq = (struct batch_request *)vp;
  char           log_buf[LOCAL_LOG_BUF_SIZE];
  batch_request *dup_req = NULL;

  pjob = chk_job_request(preq->rq_ind.rq_hold.rq_orig.rq_objname, preq);

  if (pjob == NULL)
    {
    return(PBSE_NONE);
    }

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  /* cannot do anything until we decode the holds to be set */
  if ((rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, (const char **)&pset,
                     &temphold)) != 0)
    {
    req_reject(rc, 0, preq, NULL, NULL);

    return(PBSE_NONE);
    }

  /* if other than HOLD_u is being set, must have privil */

  if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0)
    {
    req_reject(rc, 0, preq, NULL, NULL);

    return(PBSE_NONE);
    }

  hold_val = &pjob->ji_wattr[JOB_ATR_hold].at_val.at_long;

  /* remember the previous holds so they can be restored on failure */
  old_hold = *hold_val;
  *hold_val |= temphold.at_val.at_long;
  pjob->ji_wattr[JOB_ATR_hold].at_flags |= ATR_VFLAG_SET;
  sprintf(log_buf, msg_jobholdset, pset, preq->rq_user, preq->rq_host);

  pattr = &pjob->ji_wattr[JOB_ATR_checkpoint];

  if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
      ((pattr->at_flags & ATR_VFLAG_SET) &&
       ((csv_find_string(pattr->at_val.at_str, "s") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "c") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "enabled") != NULL))))
    {

    /* have MOM attempt checkpointing */

    /*
    ** The jobid in the request always have the server suffix attached
    ** which is dropped when the server attribute 
    ** 'display_job_server_suffix' is FALSE and so will in the MOM's.
    ** Therefore, it must be passed as the server to the MOM so she can
    ** find it to hold.
    */
    if (strncmp(pjob->ji_qs.ji_jobid, 
          preq->rq_ind.rq_hold.rq_orig.rq_objname, PBS_MAXSVRJOBID))
       snprintf(preq->rq_ind.rq_hold.rq_orig.rq_objname, 
          sizeof(preq->rq_ind.rq_hold.rq_orig.rq_objname), "%s", 
          pjob->ji_qs.ji_jobid);
    if ((dup_req = duplicate_request(preq)) == NULL)
      {
      /* rc is 0 at this point (chk_hold_priv succeeded), so it must not
       * be used as the reject code; also undo the hold, since nothing
       * was sent to MOM */
      *hold_val = old_hold;
      req_reject(PBSE_SYSTEM, 0, preq, NULL, "memory allocation failure");
      }
    /* The dup_req is freed in relay_to_mom (failure)
     * or in issue_Drequest (success) */
    else if ((rc = relay_to_mom(&pjob, dup_req, NULL)) != PBSE_NONE)
      {
      free_br(dup_req);
      *hold_val = old_hold;  /* reset to the old value */
      req_reject(rc, 0, preq, NULL, "relay to mom failed");

      if (pjob == NULL)
        job_mutex.set_unlock_on_exit(false);
      }
    else
      {
      if (pjob != NULL)
        {
        pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_FILE;
        
        job_save(pjob, SAVEJOB_QUICK, 0);
        
        /* fill in log_buf again, since relay_to_mom changed it */
        sprintf(log_buf, msg_jobholdset, pset, preq->rq_user, preq->rq_host);
        
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

        /* the mutex was just released by hand, so the manager must not
         * release it a second time on exit */
        job_mutex.set_unlock_on_exit(false);

        pjob = NULL;
        reply_ack(preq);
        }
      else
        job_mutex.set_unlock_on_exit(false);

      process_hold_reply(dup_req);
      }
    }
#ifdef ENABLE_BLCR
  else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * This system is configured with BLCR checkpointing to be used,
     * but this Running job does not have checkpointing enabled,
     * so we reject the request
     */

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    req_reject(PBSE_IVALREQ, 0, preq, NULL,
      "job not held since checkpointing is expected but not enabled for job");
    }
#endif
  else
    {
    /* everything went well, may need to update the job state */
    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    if (old_hold != *hold_val)
      {
      /* indicate attributes changed     */
      pjob->ji_modified = 1;

      svr_evaljobstate(*pjob, newstate, newsub, 0);

      svr_setjobstate(pjob, newstate, newsub, FALSE);
      }

    reply_ack(preq);
    }

  return(PBSE_NONE);
  }  /* END req_holdjob() */
/*
 * modify_whole_array()
 *
 * Applies the requested modification to every job still present in the
 * array. For each job whose modification must also reach its MOM, a
 * per-job copy of the request is built and relayed.
 *
 * @SEE req_modify_array PARENT
 *
 * @param pa             (I/O) array whose jobs are modified
 * @param plist          (I)   attribute list handed to modify_job()
 * @param preq           (I)   originating batch request
 * @param checkpoint_req (I)   checkpoint mode handed to modify_job()
 *
 * @return PBSE_RELAYED_TO_MOM when at least one job was relayed, the
 *         failing code when a request copy or a relay fails, otherwise
 *         the result of the last modify_job() call (0 if no job was
 *         visited).
 */
int modify_whole_array(

  job_array *pa,              /* I/O */
  svrattrl  *plist,           /* I */
  struct batch_request *preq, /* I */
  int        checkpoint_req)  /* I */

  {
  char id[] = "modify_whole_array";
  int  slot;
  int  rc = 0;
  int  relayed = 0;   /* number of jobs handed to relay_to_mom() */

  for (slot = 0; slot < pa->ai_qs.array_size; slot++)
    {
    struct batch_request *array_req = NULL;

    /* empty slot, nothing to modify */
    if (pa->jobs[slot] == NULL)
      continue;

    /* NO_MOM_RELAY keeps modify_job from calling relay_to_mom itself */
    rc = modify_job(pa->jobs[slot], plist, preq, checkpoint_req, NO_MOM_RELAY);

    if (rc != PBSE_RELAYED_TO_MOM)
      continue;

    /* modify_job left the relay to us, so build this job's own request */
    rc = copy_batchrequest(&array_req, preq, 0, slot);

    if (rc != 0)
      return(rc);

    /* one reference per relayed job; the first relay takes an extra one
     * that is dropped again after the loop */
    preq->rq_refcount++;

    if (relayed == 0)
      preq->rq_refcount++;

    relayed++;

    rc = relay_to_mom(
           pa->jobs[slot]->ji_qs.ji_un.ji_exect.ji_momaddr,
           array_req,
           post_modify_arrayreq);

    if (rc != 0)
      {
      /* unable to get to MOM */
      snprintf(log_buffer, sizeof(log_buffer),
        "Unable to relay information to mom for job '%s'\n",
        pa->jobs[slot]->ji_qs.ji_jobid);
      log_err(rc, id, log_buffer);

      return(rc);
      }
    }

  if (relayed == 0)
    return(rc);

  /* drop the extra reference taken on the first relay */
  preq->rq_refcount--;

  if (preq->rq_refcount == 0)
    free_br(preq);

  return(PBSE_RELAYED_TO_MOM);
  } /* END modify_whole_array() */
Exemple #20
0
/*
 * req_signaljob()
 *
 * Services a Signal Job request: validates the job and the caller's
 * privilege, then relays the request to the job's MOM. When the relay
 * succeeds the reply is produced later by post_signal_req().
 *
 * @param preq (I) the signal request
 */
void req_signaljob(

  struct batch_request *preq)  /* I */

  {
  job  *pjob;
  int   rc;
  char *signame;

  pjob = chk_job_request(preq->rq_ind.rq_signal.rq_jid, preq);

  if (pjob == 0)
    return;

  /* only a running job can be signalled */
  if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
    req_reject(PBSE_BADSTATE, 0, preq, NULL, NULL);

    return;
    }

  signame = preq->rq_ind.rq_signal.rq_signame;

  /* the suspend/resume pseudo signals are reserved for operators/managers */
  if ((strcmp(signame, SIG_RESUME) == 0) ||
      (strcmp(signame, SIG_SUSPEND) == 0))
    {
    if ((preq->rq_perm & (ATR_DFLAG_OPRD | ATR_DFLAG_OPWR | ATR_DFLAG_MGRD | ATR_DFLAG_MGWR)) == 0)
      {
      req_reject(PBSE_PERM, 0, preq, NULL, NULL);

      return;
      }

    /* post_signal_req() needs the job pointer */
    preq->rq_extra = pjob;
    }

  /* FIXME: need a race-free check for available free subnodes before
   * resuming a suspended job */

#ifdef DONOTSUSPINTJOB
  /* interactive jobs do not resume correctly, so refuse to suspend them */

  if ((strcmp(signame, SIG_SUSPEND) == 0) &&
      (pjob->ji_wattr[JOB_ATR_interactive].at_flags & ATR_VFLAG_SET) &&
      (pjob->ji_wattr[JOB_ATR_interactive].at_val.at_long > 0))
    {
    req_reject(PBSE_JOBTYPE, 0, preq, NULL, NULL);

    return;
    }

#endif

  if (LOGLEVEL >= 6)
    {
    sprintf(log_buffer, "relaying signal request to mom %lu",
            pjob->ji_qs.ji_un.ji_exect.ji_momaddr);

    log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, "req_signaljob", log_buffer);
    }

  /* an asynchronous signal is acknowledged before MOM is contacted */
  if (preq->rq_type == PBS_BATCH_AsySignalJob)
    reply_ack(preq);

  /* hand the request to MOM; on failure reject it here */
  rc = relay_to_mom(pjob, preq, post_signal_req);

  if (rc != 0)
    {
    req_reject(rc, 0, preq, NULL, NULL);  /* unable to get to MOM */

    return;
    }

  /* SUCCESS - once MOM replies, processing continues in post_signal_req() */

  return;
  }  /* END req_signaljob() */
Exemple #21
0
/*
 * req_holdjob - service a Hold Job request.
 *
 * Looks up the job named in the request, decodes and privilege-checks the
 * holds to be set, ORs them into the job's hold attribute and records the
 * hold in the job comment.  A running job (not in PRERUN) whose checkpoint
 * attribute string is set and does not start with 'n' has the request
 * relayed to MOM with post_hold as the callback; otherwise the job state is
 * re-evaluated when the hold value changed and the request is acknowledged
 * here.
 *
 * Every rejection path replies through req_reject() and returns.
 *
 * NOTE(review): `temphold` is not declared in this function; presumably it
 * is a file-scope attribute filled in by get_hold() - confirm.
 */
void
req_holdjob(struct batch_request *preq)
{
	long		*hold_val;
	int		 jt;		/* job type */
	int		 newstate;
	int		 newsub;
	long		 old_hold;
	job		*pjob;
	char		*pset;
	int		 rc;
	char             date[32];
	time_t           now;


	pjob = chk_job_request(preq->rq_ind.rq_hold.rq_orig.rq_objname, preq, &jt);
	if (pjob == (job *)0)
		return;
	/* only regular jobs and whole array jobs may be held */
	if ((jt != IS_ARRAY_NO) && (jt != IS_ARRAY_ArrayJob)) {
		req_reject(PBSE_IVALREQ, 0, preq);
		return;
	}
	/* a job that is being provisioned cannot be held */
	if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
		(pjob->ji_qs.ji_substate == JOB_SUBSTATE_PROVISION)) {
		req_reject(PBSE_BADSTATE, 0, preq);
		return;
	}


	/* cannot do anything until we decode the holds to be set */

	if ((rc=get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &pset)) != 0) {
		req_reject(rc, 0, preq);
		return;
	}

	/* if other than HOLD_u is being set, must have privil */

	if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0) {
		req_reject(rc, 0, preq);
		return;
	}

	/* HOLD_bad_password can only be done by root or admin */
#ifdef WIN32
	if ( (temphold.at_val.at_long & HOLD_bad_password) && \
				!isAdminPrivilege(preq->rq_user) )
#else
	if ( (temphold.at_val.at_long & HOLD_bad_password) && \
		  strcasecmp(preq->rq_user, PBS_DEFAULT_ADMIN) != 0 )
#endif
	{
		req_reject(PBSE_PERM, 0, preq);
		return;
	}

	/* remember the previous holds so they can be restored if the relay fails */
	hold_val = &pjob->ji_wattr[(int)JOB_ATR_hold].at_val.at_long;
	old_hold = *hold_val;
	*hold_val |= temphold.at_val.at_long;
	pjob->ji_wattr[(int)JOB_ATR_hold].at_flags |= ATR_VFLAG_SET | ATR_VFLAG_MODCACHE;

	/* Note the hold time in the job comment. */
	now = time(NULL);
	/* keep the first 24 characters of the ctime() text and terminate it */
	(void)strncpy(date, (const char *)ctime(&now), 24);
	date[24] = '\0';
	(void)sprintf(log_buffer, "Job held by %s on %s", preq->rq_user, date);
	job_attr_def[(int)JOB_ATR_Comment].at_decode(&pjob->ji_wattr[(int)JOB_ATR_Comment], (char *)0, (char *)0, log_buffer);

	/* log_buffer is reused: from here on it holds the "holds set" message */
	(void)sprintf(log_buffer, msg_jobholdset, pset, preq->rq_user,
		preq->rq_host);

	if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
		(pjob->ji_qs.ji_substate != JOB_SUBSTATE_PRERUN) &&
		(pjob->ji_wattr[(int)JOB_ATR_chkpnt].at_val.at_str) &&
		(*pjob->ji_wattr[(int)JOB_ATR_chkpnt].at_val.at_str != 'n')) {

		/* have MOM attempt checkpointing */

		if ((rc = relay_to_mom(pjob, preq, post_hold)) != 0) {
			*hold_val = old_hold;	/* reset to the old value */
			req_reject(rc, 0, preq);
		} else {
			/* relayed: no reply is sent here, post_hold is the registered callback */
			pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;
			pjob->ji_qs.ji_svrflags |=
				JOB_SVFLG_HASRUN | JOB_SVFLG_CHKPT;
			(void)job_save(pjob, SAVEJOB_QUICK);
			log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO,
				pjob->ji_qs.ji_jobid, log_buffer);
		}
	} else {

		/* every thing went well, may need to update the job state */

		log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO,
			pjob->ji_qs.ji_jobid, log_buffer);
		if (old_hold != *hold_val) {
			/* indicate attributes changed     */
			pjob->ji_modified = 1;
			svr_evaljobstate(pjob, &newstate, &newsub, 0);
			(void)svr_setjobstate(pjob, newstate, newsub);
		}
		reply_ack(preq);
	}
}
Exemple #22
0
/*
 * issue_signal()
 *
 * Builds a Signal Job batch request for the job referenced by *pjob_ptr and
 * relays it to the job's MOM.
 *
 * On a successful relay with the job still available, the job mutex is
 * released, func() is invoked on the new request and *pjob_ptr is replaced
 * by a fresh svr_find_job() lookup of the same job id.
 *
 * Otherwise, when extend equals RERUNFORCE, the job is requeued locally
 * (see the comments in that branch). In every remaining case the request
 * is freed and the relay result returned.
 *
 * @param pjob_ptr (I/O) job to signal; may be replaced or set to NULL
 * @param signame  (I)   name of the signal to send
 * @param func     (I)   called with the new request after the relay
 * @param extra    (I)   stored in the request's rq_extra
 * @param extend   (I)   stored in the request's rq_extend, may be NULL
 *
 * @return PBSE_SYSTEM when the request cannot be allocated or a
 *         checkpointed job is force-requeued, PBSE_JOBNOTFOUND when the job
 *         cannot be found again, PBSE_NONE on success, otherwise the
 *         relay_to_mom() result.
 *
 * NOTE(review): func is called without a NULL check - confirm every caller
 * passes a valid callback.
 */
int issue_signal(

  job        **pjob_ptr,
  const char  *signame, /* name of the signal to send */
  void       (*func)(struct batch_request *),
  void        *extra, /* extra parameter to be stored in sig request */
  char        *extend) /* Parameter to put in extended part of request */

  {
  int                   rc;
  job                  *pjob = *pjob_ptr;
  struct batch_request *newreq;
  char                  jobid[PBS_MAXSVRJOBID + 1];

  /* build up a Signal Job batch request */

  if ((newreq = alloc_br(PBS_BATCH_SignalJob)) == NULL)
    {
    /* FAILURE */

    return(PBSE_SYSTEM);
    }

  newreq->rq_extra = extra;
  newreq->rq_extend = extend;
  if (extend != NULL)
    {
    newreq->rq_extsz = strlen(extend);
    }

  /* keep a private copy of the job id so the job can be looked up again
   * once pjob is no longer usable */
  strcpy(jobid, pjob->ji_qs.ji_jobid);
  strcpy(newreq->rq_ind.rq_signal.rq_jid, pjob->ji_qs.ji_jobid);

  snprintf(newreq->rq_ind.rq_signal.rq_signame, sizeof(newreq->rq_ind.rq_signal.rq_signame), "%s", signame);

  /* The newreq is freed in relay_to_mom (failure)
   * or in issue_Drequest (success) */
  /* NOTE(review): the failure branches below still call func(newreq) or
   * free_br(newreq), which does not match the comment above - confirm who
   * owns newreq after a failed relay */
  /* pjob is passed by address and is tested for NULL afterwards, so
   * relay_to_mom() can evidently clear it */
  rc = relay_to_mom(&pjob, newreq, NULL);

  if ((rc == PBSE_NONE) &&
      (pjob != NULL))
    {
    /* relay succeeded: release the job, run the callback, then hand the
     * caller a freshly looked-up job pointer */
    strcpy(jobid, pjob->ji_qs.ji_jobid);
    unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);
    func(newreq);

    *pjob_ptr = svr_find_job((char *)jobid, TRUE);
    }
  else if ((extend != NULL) && 
      (!strcmp(extend, RERUNFORCE)))
    {
    /* relay failed (or the job went away) but a forced rerun was requested */
    if (pjob == NULL)
      {
      *pjob_ptr = svr_find_job((char *)jobid, TRUE);
      pjob = *pjob_ptr;
      }
    /* The job state is normally set when the obit arrives. But since the 
       MOM is not responding we need to set the state here */

    if (pjob != NULL)
      {
      /* Rerunning job, if not checkpointed, clear "resources_used and requeue job */
      if ((pjob->ji_qs.ji_svrflags & (JOB_SVFLG_CHECKPOINT_FILE | JOB_SVFLG_CHECKPOINT_MIGRATEABLE)) == 0)
        {
        job_attr_def[JOB_ATR_resc_used].at_free(&pjob->ji_wattr[JOB_ATR_resc_used]);
        }
      else if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE)
        {
        /* non-migratable checkpoint (cray), leave there */
        /* and just requeue the job         */

        rel_resc(pjob);

        pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN;

        svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_QUEUED, FALSE);

        pjob->ji_momhandle = -1;

        unlock_ji_mutex(pjob, __func__, "8", LOGLEVEL);

        /* NOTE(review): this path returns with the job unlocked, without
         * updating *pjob_ptr and without releasing newreq - confirm */
        return(PBSE_SYSTEM);
        }

      rel_resc(pjob); /* free resc assigned to job */

      /* Now re-queue the job */
      pjob->ji_modified = 1; /* force full job save */

      pjob->ji_momhandle = -1;
      pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_StagedIn;

      svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_QUEUED, FALSE);
      unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);
      func(newreq);

      /* the local requeue counts as success even though the relay failed */
      rc = PBSE_NONE;
      }
    else
      rc = PBSE_JOBNOTFOUND;
    }
  else
    {
    /* plain failure: drop the request and tell the caller when the job is gone */
    free_br(newreq);

    if (pjob == NULL)
      *pjob_ptr = NULL;
    }

  return(rc);
  }  /* END issue_signal() */
Exemple #23
0
/*
 * modify_job()
 *
 * Applies the attribute changes carried by preq to a single job.
 *
 * For a running job every attribute in plist must be alterable while
 * running (ATR_DFLAG_ALTRUN); a change to a resource additionally marks the
 * request as one that must be sent to the job's MOM.  The attributes that
 * are actually applied are taken from preq->rq_ind.rq_modify.rq_attr.
 *
 * @param j              (I/O) address of the job pointer; set to NULL when
 *                             modify_job_attr() reports PBSE_JOBNOTFOUND
 * @param plist          (I)   attribute list validated for a running job
 * @param preq           (I)   originating modify request
 * @param checkpoint_req (I)   CHK_HOLD / CHK_CONT trigger the checkpoint
 *                             file copy for a job in substate RUNNING
 * @param flag           (I)   NO_MOM_RELAY when the caller relays to MOM
 *                             itself
 *
 * @return PBSE_IVALREQ        the job pointer is NULL
 *         PBSE_BADSTATE       the job is in transit
 *         PBSE_MODATRRUN      attribute/resource not alterable while running
 *         PBSE_UNKRESC        unknown resource
 *         PBSE_RELAYED_TO_MOM the change has to reach MOM (relayed here, or
 *                             left to the caller under NO_MOM_RELAY)
 *         PBSE_NONE           modified, nothing was relayed for the caller
 *                             to wait on
 *         any other value     failure code of modify_job_attr(),
 *                             copy_batchrequest() or relay_to_mom()
 */
int modify_job(

  void                 **j,               /* I/O */
  svrattrl              *plist,           /* I */
  struct batch_request  *preq,            /* I */
  int                    checkpoint_req,  /* I */
  int                    flag)            /* I */

  {
  int   bad = 0;
  int   i;
  int   newstate;
  int   newsubstate;
  resource_def *prsd;
  int   rc;
  int   sendmom = 0;
  int   copy_checkpoint_files = FALSE;

  char  log_buf[LOCAL_LOG_BUF_SIZE];
  struct batch_request *dup_req = NULL;

  job *pjob = (job *)*j;
  
  if (pjob == NULL)
    {
    sprintf(log_buf, "job structure is NULL");
    log_err(PBSE_IVALREQ, __func__, log_buf);
    return(PBSE_IVALREQ);
    }

  /* cannot be in exiting or transit, exiting has already been checked */

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /* FAILURE */
    snprintf(log_buf,sizeof(log_buf),
      "Cannot modify job '%s' in transit\n",
      pjob->ji_qs.ji_jobid);

    log_err(PBSE_BADSTATE, __func__, log_buf);

    return(PBSE_BADSTATE);
    }

  if (((checkpoint_req == CHK_HOLD) || (checkpoint_req == CHK_CONT)) &&
      (pjob->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING))
    {
    /* May need to request copy of the checkpoint file from mom */

    copy_checkpoint_files = TRUE;

    if (checkpoint_req == CHK_HOLD)
      {

      sprintf(log_buf,"setting jobsubstate for %s to RERUN\n", pjob->ji_qs.ji_jobid);

      pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

      job_save(pjob, SAVEJOB_QUICK, 0);

      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

      /* remove checkpoint restart file if there is one */
      
      if (pjob->ji_wattr[JOB_ATR_restart_name].at_flags & ATR_VFLAG_SET)
        {
        cleanup_restart_file(pjob);
        }

      }
    }

  /* if job is running, special checks must be made */

  /* NOTE:  must determine if job exists down at MOM - this will occur if
            job is running, job is held, or job was held and just barely
            released (ie qhold/qrls) */

  /* COMMENTED OUT BY JOSH B IN 2.3 DUE TO MAJOR PROBLEMS w/ CUSTOMERS
   * --FIX and uncomment once we know what is really going on.
   *
   * We now know that ji_destin gets set on a qmove and that the mom does not
   * have the job at that point.
   *
  if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) ||
     ((pjob->ji_qs.ji_state == JOB_STATE_HELD) && (pjob->ji_qs.ji_destin[0] != '\0')) ||
     ((pjob->ji_qs.ji_state == JOB_STATE_QUEUED) && (pjob->ji_qs.ji_destin[0] != '\0')))
  */
  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    while (plist != NULL)
      {
      /* is the pbs_attribute modifiable in RUN state ? */

      i = find_attr(job_attr_def, plist->al_name, JOB_ATR_LAST);

      if ((i < 0) ||
          ((job_attr_def[i].at_flags & ATR_DFLAG_ALTRUN) == 0))
        {
        /* FAILURE */
        snprintf(log_buf,sizeof(log_buf),
          "Cannot modify attribute '%s' while running\n",
          plist->al_name);
        log_err(PBSE_MODATRRUN, __func__, log_buf);

        return PBSE_MODATRRUN;
        }

      /* NOTE:  only explicitly specified job attributes are routed down to MOM */

      if (i == JOB_ATR_resource)
        {
        /* is the specified resource modifiable while */
        /* the job is running                         */

        prsd = find_resc_def(svr_resc_def, plist->al_resc, svr_resc_size);

        if (prsd == NULL)
          {
          /* FAILURE */
          snprintf(log_buf,sizeof(log_buf),
            "Unknown attribute '%s'\n",
            plist->al_name);

          log_err(PBSE_UNKRESC, __func__, log_buf);

          return(PBSE_UNKRESC);
          }

        if ((prsd->rs_flags & ATR_DFLAG_ALTRUN) == 0)
          {
          /* FAILURE */
          snprintf(log_buf,sizeof(log_buf),
            "Cannot modify attribute '%s' while running\n",
            plist->al_name);
          log_err(PBSE_MODATRRUN, __func__, log_buf);

          return(PBSE_MODATRRUN);
          }

        /* a resource of a running job changed: MOM has to be told */
        sendmom = 1;
        }
/*
        else if ((i == JOB_ATR_checkpoint_name) || (i == JOB_ATR_variables))
        {
        sendmom = 1;
        }
*/

      plist = (svrattrl *)GET_NEXT(plist->al_link);
      }
    }    /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* modify the job's attributes */

  bad = 0;

  /* the list that is applied always comes from the request itself */
  plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr);

  rc = modify_job_attr(pjob, plist, preq->rq_perm, &bad);

  if (rc)
    {
    /* FAILURE */
    snprintf(log_buf,sizeof(log_buf),
      "Cannot set attributes for job '%s'\n",
      pjob->ji_qs.ji_jobid);
    log_err(rc, __func__, log_buf);

    /* tell the caller that its job pointer is no longer valid */
    if (rc == PBSE_JOBNOTFOUND)
      *j = NULL;

    return(rc);
    }

  /* Reset any defaults resource limit which might have been unset */

  set_resc_deflt(pjob, NULL, FALSE);

  /* if job is not running, may need to change its state */

  if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
    svr_evaljobstate(pjob, &newstate, &newsubstate, 0);

    svr_setjobstate(pjob, newstate, newsubstate, FALSE);
    }
  else
    {
    job_save(pjob, SAVEJOB_FULL, 0);
    }

  sprintf(log_buf, msg_manager, msg_jobmod, preq->rq_user, preq->rq_host);

  log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

  /* if a resource limit changed for a running job, send to MOM */

  if (sendmom)
    {
    /* if the NO_MOM_RELAY flag is set the calling function will call
       relay_to_mom so we do not need to do it here */
    if (flag != NO_MOM_RELAY)
      {
      /* The last number is unused unless this is an array */
      if ((rc = copy_batchrequest(&dup_req, preq, 0, -1)) != 0)
        {
        /* Nothing was relayed, so reporting PBSE_RELAYED_TO_MOM here would
         * leave the caller waiting for a MOM reply that never arrives */
        snprintf(log_buf,sizeof(log_buf),
          "Unable to copy batch request for job '%s'\n",
          pjob->ji_qs.ji_jobid);

        log_err(rc, __func__, log_buf);

        return(rc);
        }

      /* The dup_req is freed in relay_to_mom (failure)
       * or in issue_Drequest (success) */
      if ((rc = relay_to_mom(&pjob, dup_req, post_modify_req)))
        {
        if (pjob != NULL)
          {
          snprintf(log_buf,sizeof(log_buf),
            "Unable to relay information to mom for job '%s'\n",
            pjob->ji_qs.ji_jobid);
          
          log_err(rc, __func__, log_buf);
          }

        return(rc); /* unable to get to MOM */
        }
      }

    return(PBSE_RELAYED_TO_MOM);
    }

  if (copy_checkpoint_files)
    {
    struct batch_request *momreq = 0;
    momreq = cpy_checkpoint(momreq, pjob, JOB_ATR_checkpoint_name, CKPT_DIR_OUT);

    if (momreq != NULL)
      {
      /* have files to copy */
      momreq->rq_extra = strdup(pjob->ji_qs.ji_jobid);

      /* The momreq is freed in relay_to_mom (failure)
       * or in issue_Drequest (success) */
      if (checkpoint_req == CHK_HOLD)
        {
        rc = relay_to_mom(&pjob, momreq, chkpt_xfr_hold);
        }
      else
        {
        rc = relay_to_mom(&pjob, momreq, chkpt_xfr_done);
        }

      if (rc != 0)
        {
        /* the relay failure is only logged; the modification itself
         * has already been applied */
        if (pjob != NULL)
          {
          snprintf(log_buf,sizeof(log_buf),
            "Unable to relay information to mom for job '%s'\n",
            pjob->ji_qs.ji_jobid);
          
          log_err(rc, __func__, log_buf);
          }

        return(PBSE_NONE);  /* come back when mom replies */
        }
      }
    else
      {
      log_err(-1, __func__, "Failed to get batch request");
      }
    }

  return(PBSE_NONE);
  } /* END modify_job() */
Exemple #24
0
/*
 * req_checkpointjob()
 *
 * Services a Checkpoint Job request.  The request is relayed to MOM only
 * when the job is running and its checkpoint attribute contains "s", "c"
 * or "enabled"; otherwise it is rejected with PBSE_IVALREQ.
 *
 * The job mutex is held through a mutex_mgr.  Its unlock-on-exit is
 * switched off whenever relay_to_mom() hands back a NULL job pointer.
 *
 * @param preq (I) the checkpoint request
 *
 * @return always NULL
 */
void *req_checkpointjob(

  batch_request *preq) /* I */

  {
  job           *pjob;
  int            rc;
  pbs_attribute *pattr;
  char           log_buf[LOCAL_LOG_BUF_SIZE];
  batch_request *dup_req = NULL;

  if ((pjob = chk_job_request(preq->rq_ind.rq_manager.rq_objname, preq)) == NULL)
    {
    return(NULL);
    }

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  /* never hand an unwritten buffer to log_event() */
  log_buf[0] = '\0';

  pattr = &pjob->ji_wattr[JOB_ATR_checkpoint];

  if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
      ((pattr->at_flags & ATR_VFLAG_SET) &&
       ((csv_find_string(pattr->at_val.at_str, "s") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "c") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "enabled") != NULL))))
    {
    /* have MOM attempt checkpointing */

    if ((dup_req = duplicate_request(preq)) == NULL)
      {
      req_reject(PBSE_SYSTEM, 0, preq, NULL, "failure to allocate memory");
      }

    /* The dup_req is freed in relay_to_mom (failure)
     * or in issue_Drequest (success) */
    else if ((rc = relay_to_mom(&pjob, dup_req, NULL)) != PBSE_NONE)
      {
      req_reject(rc, 0, preq, NULL, NULL);
      free_br(dup_req);

      /* the job is gone, so there is no mutex left to release */
      if (pjob == NULL)
        job_mutex.set_unlock_on_exit(false);
      }
    else
      {
      if (pjob != NULL)
        {
        pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHECKPOINT_FILE;
        
        job_save(pjob, SAVEJOB_QUICK, 0);

        snprintf(log_buf, sizeof(log_buf), "%s",
          "checkpoint request relayed to MOM");
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

        /* release the job explicitly before the reply is processed */
        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
        pjob = NULL;
        }
      else
        job_mutex.set_unlock_on_exit(false);

      process_checkpoint_reply(dup_req);
      }
    }
  else
    {
    /* Job does not have checkpointing enabled, so reject the request */

    snprintf(log_buf, sizeof(log_buf), "%s",
      "checkpoint request rejected: job is not checkpointable");
    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    req_reject(PBSE_IVALREQ, 0, preq, NULL, "job is not checkpointable");
    }

  return(NULL);
  }  /* END req_checkpointjob() */
Exemple #25
0
/*
 * modify_whole_array()
 *
 * Walks every job id recorded in the array, applies the modification to
 * each job that can still be found and relays a per-job copy of the request
 * to MOM where modify_job() asks for it.  Ids whose job no longer exists
 * are released and cleared from the array.
 *
 * @SEE req_modify_array PARENT
 *
 * @param pa             (I/O) array whose jobs are modified
 * @param plist          (I)   attribute list handed to modify_job()
 * @param preq           (I)   originating batch request
 * @param checkpoint_req (I)   checkpoint mode handed to modify_job()
 *
 * @return PBSE_RELAYED_TO_MOM when at least one job was relayed, the
 *         failing code when a request copy or a relay fails, otherwise
 *         the result of the last modify_job() call (0 if none was made).
 */ 
int modify_whole_array(

  job_array *pa,              /* I/O */
  svrattrl  *plist,           /* I */
  struct batch_request *preq, /* I */
  int        checkpoint_req)  /* I */

  {
  char  log_buf[LOCAL_LOG_BUF_SIZE];
  job  *pjob;
  int   slot;
  int   relayed = 0;   /* number of jobs handed to relay_to_mom() */
  int   rc = 0;

  for (slot = 0; slot < pa->ai_qs.array_size; slot++)
    {
    struct batch_request *array_req = NULL;

    if (pa->job_ids[slot] == NULL)
      continue;

    pjob = svr_find_job(pa->job_ids[slot], FALSE);

    if (pjob == NULL)
      {
      /* the job no longer exists, so forget its id */
      free(pa->job_ids[slot]);
      pa->job_ids[slot] = NULL;

      continue;
      }

    /* NO_MOM_RELAY keeps modify_job from calling relay_to_mom itself */
    rc = modify_job((void **)&pjob, plist, preq, checkpoint_req, NO_MOM_RELAY);

    if (rc == PBSE_RELAYED_TO_MOM)
      {
      /* modify_job left the relay to us, so build this job's own request */
      rc = copy_batchrequest(&array_req, preq, 0, slot);

      if (rc != 0)
        {
        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

        return(rc);
        }

      /* one reference per relayed job; the first relay takes an extra one
       * that is dropped again after the loop */
      preq->rq_refcount++;

      if (relayed == 0)
        preq->rq_refcount++;

      relayed++;

      /* The array_req is freed in relay_to_mom (failure)
       * or in issue_Drequest (success) */
      rc = relay_to_mom(&pjob, array_req, post_modify_arrayreq);

      if (rc != 0)
        {
        /* unable to get to MOM */
        if (pjob != NULL)
          {
          snprintf(log_buf, sizeof(log_buf),
            "Unable to relay information to mom for job '%s'\n",
            pjob->ji_qs.ji_jobid);
          log_err(rc, __func__, log_buf);
          unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
          }

        return(rc);
        }
      }

    /* the relay may have taken the job away from us */
    if (pjob != NULL)
      unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
    } /* END foreach job in array */

  if (relayed == 0)
    return(rc);

  /* drop the extra reference taken on the first relay */
  preq->rq_refcount--;

  if (preq->rq_refcount == 0)
    free_br(preq);

  return(PBSE_RELAYED_TO_MOM);
  } /* END modify_whole_array() */
Exemple #26
0
int modify_array_range(

  job_array *pa,              /* I/O */
  char      *range,           /* I */
  svrattrl  *plist,           /* I */
  struct batch_request *preq, /* I */
  int        checkpoint_req)  /* I */

  {
  char id[] = "modify_array_range";
  tlist_head tl;
  int i, rc;
  int mom_relay = 0;

  array_request_node *rn;
  array_request_node *to_free;
  
  CLEAR_HEAD(tl);
  
  if (parse_array_request(range,&tl) > 0)
    {
    /* don't hold the jobs if range error */
    
    return(FAILURE);
    }
  else 
    {
    /* hold just that range from the array */
    rn = (array_request_node*)GET_NEXT(tl);
    
    while (rn != NULL)
      {
      for (i = rn->start; i <= rn->end; i++)
        {
        if ((i >= pa->ai_qs.array_size) ||
            (pa->jobs[i] == NULL))
          continue;
        
        rc = modify_job(pa->jobs[i],plist,preq,checkpoint_req, NO_MOM_RELAY);

        if (rc == PBSE_RELAYED_TO_MOM)
          {
          struct batch_request *array_req = NULL;
          
          /* We told modify_job not to call relay_to_mom so we need to contact the mom */
          rc = copy_batchrequest(&array_req, preq, 0, i);
          if (rc != 0)
            {
            return(rc);
            }
          
          preq->rq_refcount++;
          if (mom_relay == 0)
            {
            preq->rq_refcount++;
            }
          mom_relay++;
          if ((rc = relay_to_mom(
                      pa->jobs[i],
                      array_req,
                      post_modify_arrayreq)))
            {  
            snprintf(log_buffer,sizeof(log_buffer),
              "Unable to relay information to mom for job '%s'\n",
              pa->jobs[i]->ji_qs.ji_jobid);
            log_err(rc,id,log_buffer);
          
            return(rc); /* unable to get to MOM */
            }
        
          }  
        }
      
      /* release mem */
      to_free = rn;
      rn = (array_request_node*)GET_NEXT(rn->request_tokens_link);
      free(to_free);
      }
    }

  if (mom_relay)
    {
    preq->rq_refcount--;
    if (preq->rq_refcount == 0)
      {
      free_br(preq);
      }
    return(PBSE_RELAYED_TO_MOM);
    }

  return(PBSE_NONE);
  } /* END modify_array_range() */
Exemple #27
0
/*
 * req_holdjob()
 *
 * Services a Hold Job request: decodes and privilege-checks the holds to
 * be set and ORs them into the job's hold attribute.  A running job whose
 * checkpoint attribute contains "s", "c" or "enabled" has the request
 * relayed to MOM with process_hold_reply as the callback.  Any other job
 * (unless ENABLE_BLCR rejects it) has its state re-evaluated when the hold
 * value changed and is acknowledged here.
 *
 * Every rejection replies through req_reject() and returns immediately, so
 * a rejected request is never touched again.
 *
 * @param preq (I) the hold request
 */
void req_holdjob(

  struct batch_request *preq)

  {
  long  *hold_val;
  int   newstate;
  int   newsub;
  long   old_hold;
  job    *pjob;
  char    *pset;
  int     rc;
  attribute temphold;
  attribute *pattr;

  pjob = chk_job_request(preq->rq_ind.rq_hold.rq_orig.rq_objname, preq);

  if (pjob == NULL)
    {
    return;
    }

  if (is_cloud_job(pjob))
    {
    req_reject(PBSE_CLOUD_REQUEST,0,preq,NULL,NULL);

    /* the request has been answered; do not go on to place the hold */
    return;
    }

  /* cannot do anything until we decode the holds to be set */

  if ((rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &pset,
                     &temphold)) != 0)
    {
    req_reject(rc, 0, preq, NULL, NULL);
    return;
    }

  /* if other than HOLD_u is being set, must have privil */

  if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0)
    {
    req_reject(rc, 0, preq, NULL, NULL);
    return;
    }

  /* remember the previous holds so they can be restored if the relay fails */
  hold_val = &pjob->ji_wattr[(int)JOB_ATR_hold].at_val.at_long;

  old_hold = *hold_val;
  *hold_val |= temphold.at_val.at_long;
  pjob->ji_wattr[(int)JOB_ATR_hold].at_flags |= ATR_VFLAG_SET;
  sprintf(log_buffer, msg_jobholdset, pset, preq->rq_user,
          preq->rq_host);

  pattr = &pjob->ji_wattr[(int)JOB_ATR_checkpoint];

  if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
      ((pattr->at_flags & ATR_VFLAG_SET) &&
       ((csv_find_string(pattr->at_val.at_str, "s") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "c") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "enabled") != NULL))))
    {

    /* have MOM attempt checkpointing */

    if ((rc = relay_to_mom(pjob->ji_qs.ji_un.ji_exect.ji_momaddr,
                           preq, process_hold_reply)) != 0)
      {
      *hold_val = old_hold;  /* reset to the old value */
      req_reject(rc, 0, preq, NULL, NULL);
      }
    else
      {
      pjob->ji_qs.ji_svrflags |=
        JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_FILE;
      job_save(pjob, SAVEJOB_QUICK);
      
      /* fill in log_buffer again, since relay_to_mom changed it */
      
      sprintf(log_buffer, msg_jobholdset, pset, preq->rq_user,
          preq->rq_host);
          
      LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid, log_buffer);
      }
    }
#ifdef ENABLE_BLCR
  else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * This system is configured with BLCR checkpointing to be used,
     * but this Running job does not have checkpointing enabled,
     * so we reject the request
     */

    LOG_EVENT(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);

    req_reject(PBSE_IVALREQ, 0, preq, NULL,
        "job not held since checkpointing is expected but not enabled for job");
    }
#endif
  else
    {
    /* everything went well, may need to update the job state */

    LOG_EVENT(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);

    if (old_hold != *hold_val)
      {
      /* indicate attributes changed     */

      pjob->ji_modified = 1;

      svr_evaljobstate(pjob, &newstate, &newsub, 0);

      svr_setjobstate(pjob, newstate, newsub);
      }

    reply_ack(preq);
    }
  }  /* END req_holdjob() */
/*
 * req_signaljob()
 *
 * Services a Signal Job request: validates the job and the caller's
 * privilege, relays a copy of the request to the job's MOM and, when the
 * relay succeeds, passes that copy to post_signal_req().
 *
 * The job returned by chk_job_request() is released with unlock_ji_mutex()
 * on every path on which it is still available.
 *
 * @param vp (I) the request, a struct batch_request *
 *
 * @return always PBSE_NONE; failures are reported to the client through
 *         req_reject()
 */
int req_signaljob(

  void *vp)  /* I */

  {
  struct batch_request *preq = (struct batch_request *)vp;
  job                  *pjob;
  int                   rc;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  struct batch_request *dup_req = NULL;

  /* preq free'd in error cases */
  if ((pjob = chk_job_request(preq->rq_ind.rq_signal.rq_jid, preq)) == 0)
    {
    return(PBSE_NONE);
    }

  /* the job must be running */

  if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
    req_reject(PBSE_BADSTATE, 0, preq, NULL, NULL);

    unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL);
    return(PBSE_NONE);
    }

  /* Special pseudo signals for suspend and resume require op/mgr */

  if (!strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_RESUME) ||
      !strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_SUSPEND))
    {
    if ((preq->rq_perm & (ATR_DFLAG_OPRD | ATR_DFLAG_OPWR | ATR_DFLAG_MGRD | ATR_DFLAG_MGWR)) == 0)
      {
      /* for suspend/resume, must be mgr/op */
      req_reject(PBSE_PERM, 0, preq, NULL, NULL);
      
      unlock_ji_mutex(pjob, __func__, (char *)"2", LOGLEVEL);
      return(PBSE_NONE);
      }
  
    }

  /* save job ptr for post_signal_req() */
  /* (what is stored is a heap copy of the job id, not the job pointer) */
  /* NOTE(review): the strdup() result is not checked for NULL - confirm
   * that post_signal_req() tolerates a NULL rq_extra */
  preq->rq_extra = strdup(pjob->ji_qs.ji_jobid);

  /* FIXME: need a race-free check for available free subnodes before
   * resuming a suspended job */

#ifdef DONOTSUSPINTJOB
  /* interactive jobs don't resume correctly so don't allow a suspend */

  if (!strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_SUSPEND) &&
      (pjob->ji_wattr[JOB_ATR_interactive].at_flags & ATR_VFLAG_SET) &&
      (pjob->ji_wattr[JOB_ATR_interactive].at_val.at_long > 0))
    {
    req_reject(PBSE_JOBTYPE, 0, preq, NULL, NULL);

    unlock_ji_mutex(pjob, __func__, (char *)"3", LOGLEVEL);
    return(PBSE_NONE);
    }

#endif

  if (LOGLEVEL >= 6)
    {
    sprintf(log_buf, "relaying signal request to mom %lu", pjob->ji_qs.ji_un.ji_exect.ji_momaddr);

    log_record(PBSEVENT_SCHED,PBS_EVENTCLASS_REQUEST,"req_signaljob",log_buf);
    }

  /* send reply for asynchronous suspend */
  if (preq->rq_type == PBS_BATCH_AsySignalJob)
    {
    reply_ack(preq);
    /* NOTE(review): preq is written to after reply_ack() - confirm that
     * reply_ack() leaves the request allocated */
    preq->rq_noreply = TRUE;
    }

  /* pass the request on to MOM */

  /* MOM is sent a copy; the original request stays with this function */
  if ((rc = copy_batchrequest(&dup_req, preq, 0, -1)) != 0)
    {
    req_reject(rc, 0, preq, NULL, "can not allocate memory");
    unlock_ji_mutex(pjob, __func__, (char *)"4", LOGLEVEL);
    }
  /* The dup_req is freed in relay_to_mom (failure)
   * or in issue_Drequest (success) */
  else 
    {
    /* pjob is passed by address and tested for NULL afterwards, so
     * relay_to_mom() can evidently clear it */
    rc = relay_to_mom(&pjob, dup_req, NULL);

    if (pjob != NULL)
      unlock_ji_mutex(pjob, __func__, (char *)"4", LOGLEVEL);

    if (rc != PBSE_NONE)
      {
      free_br(dup_req);
      req_reject(rc, 0, preq, NULL, NULL);  /* unable to get to MOM */
      }
    else
      {
      /* the copy carries on into post_signal_req(); the original is released */
      post_signal_req(dup_req);
      free_br(preq);
      }
    }

  /* If successful we ack after mom replies to us, we pick up in post_signal_req() */

  return(PBSE_NONE);
  }  /* END req_signaljob() */
Exemple #29
0
/*
 * shutdown_checkpoint()
 *
 * Builds a Hold Job request carrying a system hold (HOLD_s) for the given
 * job and relays it to the job's MOM with post_checkpoint as the callback.
 * When the relay succeeds the job is put in substate RERUN, flagged as
 * having run with a checkpoint file, and saved.
 *
 * @param pjob (I) the job to checkpoint
 *
 * @return 0           the request was relayed and the job updated
 *         PBSE_SYSTEM the request could not be allocated or encoded
 *         -1          the request could not be relayed to MOM
 */
static int shutdown_checkpoint(

  job *pjob)

  {

  struct batch_request *phold;
  attribute        temp;

  phold = alloc_br(PBS_BATCH_HoldJob);

  if (phold == NULL)
    {
    return(PBSE_SYSTEM);
    }

  /* describe the system hold in a temporary attribute for encoding */
  temp.at_flags = ATR_VFLAG_SET;

  temp.at_type  = job_attr_def[(int)JOB_ATR_hold].at_type;
  temp.at_val.at_long = HOLD_s;

  phold->rq_perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR;

  strcpy(phold->rq_ind.rq_hold.rq_orig.rq_objname, pjob->ji_qs.ji_jobid);

  CLEAR_HEAD(phold->rq_ind.rq_hold.rq_orig.rq_attr);

  if (job_attr_def[(int)JOB_ATR_hold].at_encode(
        &temp,
        &phold->rq_ind.rq_hold.rq_orig.rq_attr,
        job_attr_def[(int)JOB_ATR_hold].at_name,
        NULL,
        ATR_ENCODE_CLIENT) < 0)
    {
    /* the request was never handed to anyone else, so release it here */
    free_br(phold);

    return(PBSE_SYSTEM);
    }

  /* NOTE(review): ownership of phold after a failed relay is not visible
   * from here, so it is deliberately left untouched on that path - confirm */
  if (relay_to_mom(pjob->ji_qs.ji_un.ji_exect.ji_momaddr, phold, post_checkpoint) != 0)
    {
    /* FAILURE */

    return(-1);
    }

  pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

  pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_FILE;

  if (LOGLEVEL >= 1)
    {
    log_event(
      PBSEVENT_SYSTEM | PBSEVENT_JOB | PBSEVENT_DEBUG,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      "shutting down with active checkpointable job");
    }

  job_save(pjob, SAVEJOB_QUICK);

  return(0);
  }  /* END shutdown_checkpoint() */
Exemple #30
0
/*
 * modify_array_range - apply an attribute modification to a range of
 * sub-jobs of a job array.
 *
 * The range string is parsed into a list of start/end nodes.  For every
 * index in every node that is inside the array and still has a job id,
 * the job is looked up and modify_job() is called with NO_MOM_RELAY.  When
 * modify_job() reports PBSE_RELAYED_TO_MOM, a copy of the request is made
 * for that index and relayed to MOM from here, and post_modify_arrayreq()
 * is invoked on the copy.  Range nodes are freed as they are consumed.
 *
 * @param pa              (I/O) the job array
 * @param range           (I)   range specification to parse
 * @param plist           (I)   attribute list passed to modify_job()
 * @param preq            (I)   originating batch request
 * @param checkpoint_req  (I)   passed through to modify_job()
 *
 * @return FAILURE if the range cannot be parsed,
 *         the error code of copy_batchrequest()/relay_to_mom() if either
 *         fails (processing stops at that job),
 *         PBSE_RELAYED_TO_MOM if at least one request was relayed to MOM,
 *         PBSE_NONE otherwise.
 */
int modify_array_range(

  job_array *pa,              /* I/O */
  char      *range,           /* I */
  svrattrl  *plist,           /* I */
  struct batch_request *preq, /* I */
  int        checkpoint_req)  /* I */

  {
  char                log_buf[LOCAL_LOG_BUF_SIZE];
  tlist_head          tl;
  int                 i;
  int                 rc;
  int                 mom_relay = 0;
  job                *pjob;

  array_request_node *rn;
  array_request_node *to_free;
  
  CLEAR_HEAD(tl);
  
  if (parse_array_request(range,&tl) > 0)
    {
    /* don't hold the jobs if range error */
    
    return(FAILURE);
    }
  else 
    {
    /* hold just that range from the array */
    rn = (array_request_node*)GET_NEXT(tl);
    
    while (rn != NULL)
      {
      for (i = rn->start; i <= rn->end; i++)
        {
        /* skip indices outside the array or already without a job id */
        if ((i >= pa->ai_qs.array_size) ||
            (pa->job_ids[i] == NULL))
          continue;

        if ((pjob = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
          {
          /* job no longer exists: drop the stale id */
          free(pa->job_ids[i]);
          pa->job_ids[i] = NULL;
          }
        else
          {
          /* release the array while the job is modified, then
           * re-acquire it through the job */
          pthread_mutex_unlock(pa->ai_mutex);
          rc = modify_job((void **)&pjob, plist, preq, checkpoint_req, NO_MOM_RELAY);
          pa = get_jobs_array(&pjob);
          
          if (pjob != NULL)
            {
            if (rc == PBSE_RELAYED_TO_MOM)
              {
              struct batch_request *array_req = NULL;
              
              /* We told modify_job not to call relay_to_mom so we need to contact the mom */
              if ((rc = copy_batchrequest(&array_req, preq, 0, i)) != PBSE_NONE)
                {
                /* the job is still locked here; every other path that
                 * leaves with a non-NULL pjob unlocks it, so do the same
                 * before bailing out */
                unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

                goto release_and_fail;
                }
              
              preq->rq_refcount++;
              if (mom_relay == 0)
                {
                preq->rq_refcount++;
                }
              mom_relay++;
              
              /* The array_req is freed in relay_to_mom (failure)
               * or in issue_Drequest (success) */
              
              if ((rc = relay_to_mom(&pjob, array_req, NULL)))
                {
                /* relay_to_mom() receives &pjob, and req_signaljob()
                 * re-checks the pointer after the same call, so do not
                 * assume the job is still there */
                snprintf(log_buf,sizeof(log_buf),
                  "Unable to relay information to mom for job '%s'\n",
                  (pjob != NULL) ? pjob->ji_qs.ji_jobid : "(unknown)");
                log_err(rc, __func__, log_buf);
                
                if (pjob != NULL)
                  unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
                
                goto release_and_fail; /* unable to get to MOM */
                }
              else
                {
                if (pjob != NULL)
                  unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);

                post_modify_arrayreq(array_req);
                }
              }
            else
              unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
            }
          else
            {
            /* NOTE(review): pa comes from get_jobs_array() above; if that
             * can return NULL when the job has vanished, this dereference
             * and the next loop iteration need a guard - confirm */
            pa->job_ids[i] = NULL;
            }

          }
        }
      
      /* release mem */
      to_free = rn;
      rn = (array_request_node*)GET_NEXT(rn->request_tokens_link);
      free(to_free);
      }
    }

  if (mom_relay)
    {
    /* drop the extra reference taken before the first relay */
    preq->rq_refcount--;
    if (preq->rq_refcount == 0)
      {
      free_br(preq);
      }
    return(PBSE_RELAYED_TO_MOM);
    }

  return(PBSE_NONE);

release_and_fail:

  /* error exit from inside the loop: free the current range node and
   * every node still queued behind it so the parsed list is not leaked */

  while (rn != NULL)
    {
    to_free = rn;
    rn = (array_request_node*)GET_NEXT(rn->request_tokens_link);
    free(to_free);
    }

  return(rc);
  } /* END modify_array_range() */