Example #1
void
req_messagejob(struct batch_request *preq)
  {
  job   *pjob;
  int    rc;

  if ((pjob = chk_job_request(preq->rq_ind.rq_message.rq_jid, preq)) == 0)
    return;
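
  /* NOTE: chk_job_request() issues req_reject() itself on failure, which
   * is why the early return above sends no reply of its own */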

  if (is_cloud_job(pjob))
    {
    rc = PBSE_CLOUD_REQUEST;
    req_reject(rc, 0, preq, NULL, NULL);

    return;
    }

  /* the job must be running */

  if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
    req_reject(PBSE_BADSTATE, 0, preq, NULL, NULL);
    return;
    }

  /* pass the request on to MOM */

  if ((rc = relay_to_mom(pjob->ji_qs.ji_un.ji_exect.ji_momaddr,
                         preq, post_message_req)))
    req_reject(rc, 0, preq, NULL, NULL); /* unable to get to MOM */

  /* After MOM acts and replies to us, we pick up in post_message_req() */
  }
Example #2
void
req_messagejob(struct batch_request *preq)
{
	int               jt;            /* job type */
	job		 *pjob;
	int		  rc;

	if ((pjob = chk_job_request(preq->rq_ind.rq_message.rq_jid, preq, &jt)) == 0)
		return;
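	/* jt classifies the target: IS_ARRAY_NO means a plain job, as
	 * opposed to an Array Job, a single subjob, or a range of subjobs */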

	if (jt != IS_ARRAY_NO) {
		reply_text(preq, PBSE_NOSUP, "not supported for Array Jobs");
		return;
	}

	/* the job must be running */

	if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING) {
		req_reject(PBSE_BADSTATE, 0, preq);
		return;
	}

	/* pass the request on to MOM */

	rc = relay_to_mom(pjob, preq, post_message_req);
	if (rc)
		req_reject(rc, 0, preq);	/* unable to get to MOM */

	/* After MOM acts and replies to us, we pick up in post_message_req() */
}
Example #3
int req_releasejob(

  void *vp) /* ptr to the decoded request   */

  {
  job  *pjob;
  int   rc;
  struct batch_request *preq = (struct batch_request *)vp; 

  pjob = chk_job_request(preq->rq_ind.rq_release.rq_objname, preq);

  if (pjob == NULL)
    {
    return(PBSE_NONE);
    }

  if ((rc = release_job(preq,pjob)) != 0)
    {
    req_reject(rc,0,preq,NULL,NULL);
    }
  else
    {
    reply_ack(preq);
    }

  unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL);

  return(PBSE_NONE);
  }  /* END req_releasejob() */
Example #4
int req_releasejob(

  batch_request *vp) /* I */

  {
  job           *pjob;
  int            rc;
  batch_request *preq = (batch_request *)vp;

  pjob = chk_job_request(preq->rq_ind.rq_release.rq_objname, preq);

  if (pjob == NULL)
    {
    return(PBSE_NONE);
    }

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  if ((rc = release_job(preq, pjob, NULL)) != 0)
    {
    req_reject(rc,0,preq,NULL,NULL);
    }
  else
    {
    reply_ack(preq);
    }

  return(PBSE_NONE);
  }  /* END req_releasejob() */
Example #5
void *req_modifyjob(

  batch_request *preq) /* I */

  {
  job       *pjob;
  svrattrl  *plist;
  char       log_buf[LOCAL_LOG_BUF_SIZE];

  pjob = chk_job_request(preq->rq_ind.rq_modify.rq_objname, preq);

  if (pjob == NULL)
    {
    return(NULL);
    }

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr);

  if (plist == NULL)
    {
    /* nothing to do */
    reply_ack(preq);

    /* SUCCESS */
    return(NULL);
    }

  job_mutex.unlock();
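  /* the job mutex is dropped before the (possibly slow) modify work;
   * modify_job_work() presumably re-resolves and re-locks the job by id */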

  /* If async modify, reply now; otherwise reply is handled later */
  if (preq->rq_type == PBS_BATCH_AsyModifyJob)
    {
    /* reply_ack will free preq. We need to copy it before we call reply_ack */
    batch_request *new_preq;

    new_preq = duplicate_request(preq, -1);
    if (new_preq == NULL)
      {
      sprintf(log_buf, "failed to duplicate batch request");
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
      return(NULL);
      }

    get_batch_request_id(new_preq);
    reply_ack(preq);

    new_preq->rq_noreply = TRUE; /* set for no more replies */

    enqueue_threadpool_request((void *(*)(void *))modify_job_work, new_preq);
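
    /* new_preq is now owned by the worker thread; the original preq was
     * consumed above by reply_ack() */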
    } 
  else
    modify_job_work(preq);
  
  return(NULL);
  }  /* END req_modifyjob() */
Example #6
void req_checkpointjob(

  struct batch_request *preq)

  {
  job    *pjob;
  int     rc;
  attribute *pattr;

  if ((pjob = chk_job_request(preq->rq_ind.rq_manager.rq_objname, preq)) == NULL)
    {
    return;
    }

  if (is_cloud_job(pjob))
    {
    rc = PBSE_CLOUD_REQUEST;
    req_reject(rc, 0, preq, NULL, "cloud jobs cannot be checkpointed");

    return;
    }

  pattr = &pjob->ji_wattr[(int)JOB_ATR_checkpoint];

  if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
      ((pattr->at_flags & ATR_VFLAG_SET) &&
       ((csv_find_string(pattr->at_val.at_str, "s") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "c") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "enabled") != NULL))))
    {
    /* have MOM attempt checkpointing */

    if ((rc = relay_to_mom(pjob->ji_qs.ji_un.ji_exect.ji_momaddr,
                           preq, process_checkpoint_reply)) != 0)
      {
      req_reject(rc, 0, preq, NULL, NULL);
      }
    else
      {
      pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHECKPOINT_FILE;
      job_save(pjob, SAVEJOB_QUICK);

      sprintf(log_buffer, "checkpoint requested and relayed to mom");

      LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
                pjob->ji_qs.ji_jobid, log_buffer);
      }
    }
  else
    {
    /* Job does not have checkpointing enabled, so reject the request */

    sprintf(log_buffer, "job is not checkpointable");

    LOG_EVENT(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);

    req_reject(PBSE_IVALREQ, 0, preq, NULL, "job is not checkpointable");
    }
  }  /* END req_checkpointjob() */
Example #7
void *req_messagejob(
    
  batch_request *preq) /* I */

  {
  job           *pjob;
  int            rc;
  batch_request *dup_req = NULL;

  if ((pjob = chk_job_request(preq->rq_ind.rq_message.rq_jid, preq)) == NULL)
    return(NULL);

  mutex_mgr job_mutex(pjob->ji_mutex, true);
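  /* RAII-style guard: ji_mutex is released when this function returns,
   * unless set_unlock_on_exit(false) is called below */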

  /* the job must be running */

  if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
    req_reject(PBSE_BADSTATE, 0, preq, NULL, NULL);
    
    return(NULL);
    }

  if ((rc = copy_batchrequest(&dup_req, preq, 0, -1)) != 0)
    {
    req_reject(PBSE_MEM_MALLOC, 0, preq, NULL, NULL);
    }
  /* pass the request on to MOM */
  /* The dup_req is freed in relay_to_mom (failure)
   * or in issue_Drequest (success) */
  else if ((rc = relay_to_mom(&pjob, dup_req, NULL)) != PBSE_NONE)
    {
    req_reject(rc, 0, preq, NULL, NULL); /* unable to get to MOM */
    free_br(dup_req);
    }
  else
    {
    post_message_req(dup_req);
    free_br(preq);
    }

  /* After MOM acts and replies to us, we pick up in post_message_req() */
  if (pjob == NULL)
    job_mutex.set_unlock_on_exit(false);

  return(NULL);
  } /* END req_messagejob() */
Example #8
void *req_modifyjob(

  batch_request *preq) /* I */

  {
  job       *pjob;
  svrattrl  *plist;

  pjob = chk_job_request(preq->rq_ind.rq_modify.rq_objname, preq);

  if (pjob == NULL)
    {
    return(NULL);
    }

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr);

  if (plist == NULL)
    {
    /* nothing to do */
    reply_ack(preq);

    /* SUCCESS */
    return(NULL);
    }

  job_mutex.unlock();

  /* If async modify, reply now; otherwise reply is handled later */
  if (preq->rq_type == PBS_BATCH_AsyModifyJob)
    {
    reply_ack(preq);

    preq->rq_noreply = TRUE; /* set for no more replies */

    enqueue_threadpool_request((void *(*)(void *))modify_job_work, preq);
    }
  else
    modify_job_work(preq);
  
  return(NULL);
  }  /* END req_modifyjob() */
Example #9
void *req_messagejob(

    void *vp)

{
    struct batch_request *preq = (struct batch_request *)vp;
    job                  *pjob;
    int                   rc;
    struct batch_request *dup_req = NULL;

    if ((pjob = chk_job_request(preq->rq_ind.rq_message.rq_jid, preq)) == NULL)
        return(NULL);

    /* the job must be running */

    if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
        req_reject(PBSE_BADSTATE, 0, preq, NULL, NULL);

        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

        return(NULL);
    }

    if ((rc = copy_batchrequest(&dup_req, preq, 0, -1)) != 0)
    {
        req_reject(PBSE_MEM_MALLOC, 0, preq, NULL, NULL);
    }
    /* pass the request on to MOM */
    /* The dup_req is freed in relay_to_mom (failure)
     * or in issue_Drequest (success) */
    else if ((rc = relay_to_mom(&pjob, dup_req, post_message_req)) != 0)
        req_reject(rc, 0, preq, NULL, NULL); /* unable to get to MOM */
    else
        free_br(preq);

    /* After MOM acts and replies to us, we pick up in post_message_req() */
    if (pjob != NULL)
        unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);

    return(NULL);
} /* END req_messagejob() */
Example #10
void *req_modifyjob(

  void *vp) /* I */

  {
  job  *pjob;
  svrattrl *plist;
  int   rc;
  int   checkpoint_req = FALSE;
  struct batch_request *preq = (struct batch_request *)vp;

  pjob = chk_job_request(preq->rq_ind.rq_modify.rq_objname, preq);

  if (pjob == NULL)
    {
    return(NULL);
    }

  plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr);

  if (plist == NULL)
    {
    /* nothing to do */

    reply_ack(preq);

    /* SUCCESS */
    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

    return(NULL);
    }

  /* If async modify, reply now; otherwise reply is handled later */
  if (preq->rq_type == PBS_BATCH_AsyModifyJob)
    {
    reply_ack(preq);

    preq->rq_noreply = TRUE; /* set for no more replies */
    }
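  /* NOTE: in the async case the ack has already gone out; with rq_noreply
   * set, the reject/reply calls below presumably become no-ops */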

  /* pbs_mom sets the extend string to trigger copying of checkpoint files */

  if (preq->rq_extend != NULL)
    {
    if (strcmp(preq->rq_extend,CHECKPOINTHOLD) == 0)
      {
      checkpoint_req = CHK_HOLD;
      }
    else if (strcmp(preq->rq_extend,CHECKPOINTCONT) == 0)
      {
      checkpoint_req = CHK_CONT;
      }
    }

  if ((rc = modify_job((void **)&pjob, plist, preq, checkpoint_req, 0)) != 0)
    {
    if ((rc == PBSE_MODATRRUN) ||
        (rc == PBSE_UNKRESC))
      reply_badattr(rc,1,plist,preq);
    else if ( rc == PBSE_RELAYED_TO_MOM )
      {
      unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
      
      return(NULL);
      }
    else
      req_reject(rc,0,preq,NULL,NULL);
    }
  else
    reply_ack(preq);

  unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
  
  return(NULL);
  }  /* END req_modifyjob() */
Example #11
void
req_modifyjob(struct batch_request *preq)
{
	int		 add_to_am_list = 0; /* if altered during sched cycle */
	int		 bad = 0;
	int		 jt;		/* job type */
	int		 newstate;
	int		 newsubstate;
	resource_def	*outsideselect = NULL;
	job		*pjob;
	svrattrl	*plist;
	resource	*presc;
	resource_def	*prsd;
	int		 rc;
	int		 running = 0;
	int		 sendmom = 0;
	char		hook_msg[HOOK_MSG_SIZE];
	int		mod_project = 0;
	pbs_sched	*psched;

	switch (process_hooks(preq, hook_msg, sizeof(hook_msg),
			pbs_python_set_interrupt)) {
		case 0:	/* explicit reject */
			reply_text(preq, PBSE_HOOKERROR, hook_msg);
			return;
		case 1:   /* explicit accept */
			if (recreate_request(preq) == -1) { /* error */
				/* we have to reject the request, as 'preq' */
				/* may have been partly modified            */
				strcpy(hook_msg,
					"modifyjob event: rejected request");
				log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_HOOK,
					LOG_ERR, "", hook_msg);
				reply_text(preq, PBSE_HOOKERROR, hook_msg);
				return;
			}
			break;
		case 2:	/* no hook script executed - go ahead and accept event*/
			break;
		default:
			log_event(PBSEVENT_DEBUG2, PBS_EVENTCLASS_HOOK,
				LOG_INFO, "", "modifyjob event: accept req by default");
	}

	if (pseldef == NULL)  /* do one time to keep handy */
		pseldef = find_resc_def(svr_resc_def, "select", svr_resc_size);

	pjob = chk_job_request(preq->rq_ind.rq_modify.rq_objname, preq, &jt);
	if (pjob == NULL)
		return;

	if ((jt == IS_ARRAY_Single) || (jt == IS_ARRAY_Range)) {
		req_reject(PBSE_IVALREQ, 0, preq);
		return;
	}

	psched = find_sched_from_sock(preq->rq_conn);
	/* allow scheduler to modify job */
	if (psched == NULL) {
		/* provisioning job is not allowed to be modified */
		if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
			(pjob->ji_qs.ji_substate == JOB_SUBSTATE_PROVISION)) {
			req_reject(PBSE_BADSTATE, 0, preq);
			return;
		}
	}

	/* cannot be in exiting or transit; exiting has already been checked */

	if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT) {
		req_reject(PBSE_BADSTATE, 0, preq);
		return;
	}

	plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr);
	if (plist == NULL) {	/* nothing to do */
		reply_ack(preq);
		return;
	}

	/*
	 * Special checks must be made:
	 *	if during a scheduling cycle and certain attributes are altered,
	 *	   make a note of the job to prevent it from being run now;
	 *	if job is running, only certain attributes/resources can be
	 *	   altered.
	 */

	if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) {
		running = 1;
	}
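
	/* walk the attribute list once, validating each entry before any
	 * modification is applied */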
	while (plist) {
		int i;

		i = find_attr(job_attr_def, plist->al_name, JOB_ATR_LAST);

		/* Is the attribute known at all? Check before indexing job_attr_def */

		if (i < 0) {
			reply_badattr(PBSE_NOATTR, 1, plist, preq);
			return;
		}

		/*
		 * Is the attribute being altered one which could change
		 * scheduling (ATR_DFLAG_SCGALT set)? If so, and a scheduling
		 * cycle is in progress, set a flag to add the job to the list
		 * of jobs which cannot be run in this cycle.
		 * If the scheduler itself sends the modify job request, there
		 * is no need to delay the job until the next cycle.
		 */
		if ((psched == NULL) && (scheduler_jobs_stat) &&
			(job_attr_def[i].at_flags & ATR_DFLAG_SCGALT))
			add_to_am_list = 1;

		/* Is the attribute modifiable in RUN state ? */

		if ((running == 1) &&
			((job_attr_def[i].at_flags & ATR_DFLAG_ALTRUN) == 0)) {

			reply_badattr(PBSE_MODATRRUN, 1, plist, preq);
			return;
		}
		if (i == (int)JOB_ATR_resource) {

			prsd = find_resc_def(svr_resc_def, plist->al_resc,
				svr_resc_size);

			if (prsd == 0) {
				reply_badattr(PBSE_UNKRESC, 1, plist, preq);
				return;
			}

			/* is the specified resource modifiable while */
			/* the job is running                         */

			if (running) {

				if ((prsd->rs_flags & ATR_DFLAG_ALTRUN) == 0) {
					reply_badattr(PBSE_MODATRRUN, 1, plist, preq);
					return;
				}

				sendmom = 1;
			}

			/* should the resource be only in a select spec */

			if (prsd->rs_flags & ATR_DFLAG_CVTSLT && !outsideselect &&
				plist->al_atopl.value && plist->al_atopl.value[0]) {
				/* if "-lresource" is set and has non-NULL value,
				** remember as potential bad resource
				** if this appears along "select".
				*/
				outsideselect = prsd;
			}
		}
		if (strcmp(plist->al_name, ATTR_project) == 0) {
			mod_project = 1;
		} else if ((strcmp(plist->al_name, ATTR_runcount) == 0) &&
			((plist->al_flags & ATR_VFLAG_HOOK) == 0) &&
			(plist->al_value != NULL) &&
			(plist->al_value[0] != '\0') &&
			((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0) &&
		(atol(plist->al_value) < \
		    pjob->ji_wattr[(int)JOB_ATR_runcount].at_val.at_long)) {
			sprintf(log_buffer,
				"regular user %s@%s cannot decrease '%s' attribute value from %ld to %ld",
				preq->rq_user, preq->rq_host, ATTR_runcount,
				pjob->ji_wattr[(int)JOB_ATR_runcount].at_val.at_long,
				atol(plist->al_value));
			log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_ERR,
				pjob->ji_qs.ji_jobid, log_buffer);
			req_reject(PBSE_PERM, 0, preq);
			return;
		}
		plist = (svrattrl *)GET_NEXT(plist->al_link);
	}

	if (outsideselect) {
		presc = find_resc_entry(&pjob->ji_wattr[(int)JOB_ATR_resource],
			pseldef);
		if (presc &&
			((presc->rs_value.at_flags & ATR_VFLAG_DEFLT) == 0)) {
			/* select is not a default, so reject qalter */

			resc_in_err = strdup(outsideselect->rs_name);
			req_reject(PBSE_INVALJOBRESC, 0, preq);
			return;
		}

	}

	/* modify the jobs attributes */

	bad = 0;
	plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr);
	rc = modify_job_attr(pjob, plist, preq->rq_perm, &bad);
	if (rc) {
		if (pjob->ji_clterrmsg)
			reply_text(preq, rc, pjob->ji_clterrmsg);
		else
			reply_badattr(rc, bad, plist, preq);
		return;
	}

	/* If certain attributes modified and if in scheduling cycle  */
	/* then add to list of jobs which cannot be run in this cycle */

	if (add_to_am_list)
		am_jobs_add(pjob);	/* see req_runjob() */

	/* check if project attribute was requested to be modified to */
	/* be the default project value */
	if (mod_project && (pjob->ji_wattr[(int)JOB_ATR_project].at_flags & \
							ATR_VFLAG_SET)) {

		if (strcmp(pjob->ji_wattr[(int)JOB_ATR_project].at_val.at_str,
			PBS_DEFAULT_PROJECT) == 0) {
			sprintf(log_buffer, msg_defproject,
				ATTR_project, PBS_DEFAULT_PROJECT);
#ifdef NAS /* localmod 107 */
			log_event(PBSEVENT_DEBUG4, PBS_EVENTCLASS_JOB, LOG_INFO,
				pjob->ji_qs.ji_jobid, log_buffer);
#else
			log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO,
				pjob->ji_qs.ji_jobid, log_buffer);
#endif /* localmod 107 */
		}
	}

	if (pjob->ji_wattr[(int)JOB_ATR_resource].at_flags & ATR_VFLAG_MODIFY) {
		presc = find_resc_entry(&pjob->ji_wattr[(int)JOB_ATR_resource],
			pseldef);
		if (presc && (presc->rs_value.at_flags & ATR_VFLAG_DEFLT)) {
			/* changing Resource_List and select is a default   */
			/* clear "select" so it is rebuilt in set_resc_deflt */
			pseldef->rs_free(&presc->rs_value);
		}
	}

	/* Reset any defaults resource limit which might have been unset */
	if ((rc = set_resc_deflt((void *)pjob, JOB_OBJECT, NULL)) != 0) {
		req_reject(rc, 0, preq);
		return;
	}

	/* if job is not running, may need to change its state */

	if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING) {
		svr_evaljobstate(pjob, &newstate, &newsubstate, 0);
		(void)svr_setjobstate(pjob, newstate, newsubstate);
	} else {
		(void)job_save(pjob, SAVEJOB_FULL);
	}
	(void)sprintf(log_buffer, msg_manager, msg_jobmod,
		preq->rq_user, preq->rq_host);
	log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO,
		pjob->ji_qs.ji_jobid, log_buffer);

	/* if a resource limit changed for a running job, send to MOM */

	if (sendmom) {
		rc = relay_to_mom(pjob, preq, post_modify_req);
		if (rc)
			req_reject(rc, 0, preq);    /* unable to get to MOM */
		return;
	}

	reply_ack(preq);
}
Example #12
void req_deletejob(

  struct batch_request *preq)  /* I */

  {
  job              *pjob;

  struct work_task *pwtold;

  struct work_task *pwtnew;
  struct work_task *pwtcheck;

  int               rc;
  char             *sigt = "SIGTERM";

  char             *Msg = NULL;

  /* check if we are getting a purgecomplete from scheduler */
  if ((preq->rq_extend != NULL) && 
        !strncmp(preq->rq_extend,PURGECOMP,strlen(PURGECOMP)))
    {

    /*
     * purge_completed_jobs will respond with either an ack or reject
     */
    purge_completed_jobs(preq);

    return;
    }

  /* The way this is implemented, if the user enters the command "qdel -p <jobid>",
   * they can then delete jobs other than their own since the authorization
   * checks are made below in chk_job_request. This should probably be fixed.
   */

  if (forced_jobpurge(preq) != 0)
    {
    return;
    }

  /* NOTE:  should support rq_objname={<JOBID>|ALL|<name:<JOBNAME>} */

  /* NYI */

  pjob = chk_job_request(preq->rq_ind.rq_delete.rq_objname, preq);

  if (pjob == NULL)
    {
    /* NOTE:  chk_job_request() will issue req_reject() */

    return;
    }

  if (preq->rq_extend != NULL)
    {
    if (strncmp(preq->rq_extend, deldelaystr, strlen(deldelaystr)) &&
        strncmp(preq->rq_extend, delasyncstr, strlen(delasyncstr)) &&
        strncmp(preq->rq_extend, delpurgestr, strlen(delpurgestr)))
      {
      /* have text message in request extension, add it */

      Msg = preq->rq_extend;

      /*
       * Message capability is only for operators and managers.
       * Check if request is authorized
      */

      if ((preq->rq_perm & (ATR_DFLAG_OPRD | ATR_DFLAG_OPWR |
                            ATR_DFLAG_MGRD | ATR_DFLAG_MGWR)) == 0)
        {
        req_reject(PBSE_PERM, 0, preq, NULL,
                   "must have operator or manager privilege to use -m parameter");
        return;
        }
      }
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /*
     * Find pid of router from existing work task entry,
     * then establish another work task on same child.
     * Next, signal the router and wait for its completion;
     */

    pwtold = (struct work_task *)GET_NEXT(pjob->ji_svrtask);

    while (pwtold != NULL)
      {
      if ((pwtold->wt_type == WORK_Deferred_Child) ||
          (pwtold->wt_type == WORK_Deferred_Cmp))
        {
        pwtnew = set_task(
                   pwtold->wt_type,
                   pwtold->wt_event,
                   post_delete_route,
                   preq);

        if (pwtnew != NULL)
          {
          /*
           * reset type in case the SIGCHLD came
           * in during the set_task;  it makes
           * sure that next_task() will find the
           * new entry.
           */

          pwtnew->wt_type = pwtold->wt_type;
          pwtnew->wt_aux = pwtold->wt_aux;

          kill((pid_t)pwtold->wt_event, SIGTERM);

          pjob->ji_qs.ji_substate = JOB_SUBSTATE_ABORT;

          return; /* all done for now */
          }
        else
          {
          req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

          return;
          }
        }

      pwtold = (struct work_task *)GET_NEXT(pwtold->wt_linkobj);
      }

    /* should never get here ...  */

    log_err(-1, "req_delete", "Did not find work task for router");

    req_reject(PBSE_INTERNAL, 0, preq, NULL, NULL);

    return;
    }

  if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN1 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN2 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN3 )
    {
    /* If JOB_SUBSTATE_PRERUN being sent to MOM, wait till she gets it going */
    /* retry in one second                            */
    /* If JOB_SUBSTATE_RERUN, RERUN1, RERUN2 or RERUN3 the
       job is being requeued. Wait until finished */

    static time_t  cycle_check_when = 0;
    static char    cycle_check_jid[PBS_MAXSVRJOBID + 1];
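    /* NOTE: a single static slot means only one PRERUN/RERUN job is
     * tracked at a time; deletes of a different job restart the window */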

    if (cycle_check_when != 0)
      {
      if (!strcmp(pjob->ji_qs.ji_jobid, cycle_check_jid) &&
          (time_now - cycle_check_when > 10))
        {
        /* state not updated after 10 seconds */

        /* did the mom ever get it? delete it anyway... */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;

        goto jump;
        }

      if (time_now - cycle_check_when > 20)
        {
        /* give up after 20 seconds */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;
        }
      }    /* END if (cycle_check_when != 0) */

    if (cycle_check_when == 0)
      {
      /* new PRERUN job located */

      cycle_check_when = time_now;
      strcpy(cycle_check_jid, pjob->ji_qs.ji_jobid);
      }

    sprintf(log_buffer, "job cannot be deleted, state=PRERUN, requeuing delete request");

    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);

    pwtnew = set_task(
               WORK_Timed,
               time_now + 1,
               post_delete_route,
               preq);

    if (pwtnew == 0)
      req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

    return;
    }  /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */

jump:

  /*
   * Log delete and if requesting client is not job owner, send mail.
   */

  sprintf(log_buffer, "requestor=%s@%s",
          preq->rq_user,
          preq->rq_host);


  /* NOTE:  should annotate accounting record with extend message (NYI) */

  account_record(PBS_ACCT_DEL, pjob, log_buffer);

  sprintf(log_buffer, msg_manager,
          msg_deletejob,
          preq->rq_user,
          preq->rq_host);

  log_event(
    PBSEVENT_JOB,
    PBS_EVENTCLASS_JOB,
    pjob->ji_qs.ji_jobid,
    log_buffer);

  /* NOTE:  should incorporate job delete message */

  if (Msg != NULL)
    {
    /* have text message in request extension, add it */

    strcat(log_buffer, "\n");
    strcat(log_buffer, Msg);
    }

  if ((svr_chk_owner(preq, pjob) != 0) &&
      !has_job_delete_nanny(pjob))
    {
    /* only send email if owner did not delete job and job deleted
       has not been previously attempted */

    svr_mailowner(pjob, MAIL_DEL, MAIL_FORCE, log_buffer);
    /*
     * If we sent mail and already sent the extra message
     * then reset message so we don't trigger a redundant email
     * in job_abt()
    */

    if (Msg != NULL)
      {
      Msg = NULL;
      }
    }

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, change restart comment if failed */

    change_restart_comment_if_needed(pjob);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * setup a nanny task to make sure the job is actually deleted (see the
     * comments at job_delete_nanny()).
     */

    if (has_job_delete_nanny(pjob))
      {
      req_reject(PBSE_IVALREQ, 0, preq, NULL, "job cancel in progress");

      return;
      }

    apply_job_delete_nanny(pjob, time_now + 60);

    /* check if we are getting a asynchronous delete */

    if ((preq->rq_extend != NULL) &&
          !strncmp(preq->rq_extend,DELASYNC,strlen(DELASYNC)))
      {
      struct batch_request *preq_tmp = NULL;
      /*
       * Respond with an ack now instead of after MOM processing
       * Create a new batch request and fill it in. It will be freed by reply_ack
       */

      snprintf(log_buffer,sizeof(log_buffer), "Deleting job asynchronously");
      log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buffer);

      preq_tmp = alloc_br(PBS_BATCH_DeleteJob);

      if (preq_tmp == NULL)
        {
        req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

        return;
        }

      preq_tmp->rq_perm = preq->rq_perm;
      preq_tmp->rq_ind.rq_manager.rq_cmd = preq->rq_ind.rq_manager.rq_cmd;
      preq_tmp->rq_ind.rq_manager.rq_objtype = preq->rq_ind.rq_manager.rq_objtype;
      preq_tmp->rq_fromsvr = preq->rq_fromsvr;
      preq_tmp->rq_extsz = preq->rq_extsz;
      preq_tmp->rq_conn = preq->rq_conn;
      memcpy(preq_tmp->rq_ind.rq_manager.rq_objname,
          preq->rq_ind.rq_manager.rq_objname, PBS_MAXSVRJOBID + 1);
      memcpy(preq_tmp->rq_user, preq->rq_user, PBS_MAXUSER + 1);
      memcpy(preq_tmp->rq_host, preq->rq_host, PBS_MAXHOSTNAME + 1);

      reply_ack(preq_tmp);
      preq->rq_noreply = TRUE; /* set for no more replies */
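      /* with rq_noreply set, any later reply or reject on preq is
       * presumably suppressed, since the client already has its ack */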
      }
  
    /* make a cleanup task if set */
    if ((server.sv_attr[SRV_ATR_JobForceCancelTime].at_flags & ATR_VFLAG_SET) &&
        (server.sv_attr[SRV_ATR_JobForceCancelTime].at_val.at_long > 0))
      {
      pwtcheck = set_task(
        WORK_Timed,
        time_now + server.sv_attr[SRV_ATR_JobForceCancelTime].at_val.at_long,
        ensure_deleted,
        preq);
    
      if (pwtcheck != NULL)
        append_link(&pjob->ji_svrtask, &pwtcheck->wt_linkobj, pwtcheck);
      }

    /*
     * Send signal request to MOM.  The server will automagically
     * pick up and "finish" off the client request when MOM replies.
     */

    if ((rc = issue_signal(pjob, sigt, post_delete_mom1, preq)))
      {
      /* can't send to MOM */

      req_reject(rc, 0, preq, NULL, NULL);
      }

    /* normally will ack reply when mom responds */

    sprintf(log_buffer, msg_delrunjobsig,
            sigt);

    LOG_EVENT(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);

    return;
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* make a cleanup task if set */
  if ((server.sv_attr[SRV_ATR_JobForceCancelTime].at_flags & ATR_VFLAG_SET) &&
      (server.sv_attr[SRV_ATR_JobForceCancelTime].at_val.at_long > 0))
    {
    pwtcheck = set_task(
        WORK_Timed,
        time_now + server.sv_attr[SRV_ATR_JobForceCancelTime].at_val.at_long,
        ensure_deleted,
        preq);
    
    if (pwtcheck != NULL)
      append_link(&pjob->ji_svrtask, &pwtcheck->wt_linkobj, pwtcheck);
    }

  /* if configured, and this job didn't have a slot limit hold, free a job
   * held with the slot limit hold */
  if ((server.sv_attr[SRV_ATR_MoabArrayCompatible].at_val.at_long != FALSE) &&
      ((pjob->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) == FALSE))
    {
    if ((pjob->ji_arraystruct != NULL) &&
        (pjob->ji_is_array_template == FALSE))
      {
      int        i;
      int        newstate;
      int        newsub;
      job       *tmp;
      job_array *pa = pjob->ji_arraystruct;

      for (i = 0; i < pa->ai_qs.array_size; i++)
        {
        if (pa->jobs[i] == NULL)
          continue;

        tmp = (job *)pa->jobs[i];

        if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l)
          {
          tmp->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;
              
          if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
            {
            tmp->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
            }
          
          svr_evaljobstate(tmp, &newstate, &newsub, 1);
          svr_setjobstate(tmp, newstate, newsub);
          job_save(tmp, SAVEJOB_FULL, 0);

          break;
          }
        }
      }
    } /* END MoabArrayCompatible check */

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, do end job processing */
    
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING);

    pjob->ji_momhandle = -1;

    /* force new connection */

    pwtnew = set_task(WORK_Immed, 0, on_job_exit, (void *)pjob);

    if (pwtnew)
      {
      append_link(&pjob->ji_svrtask, &pwtnew->wt_linkobj, pwtnew);
      }
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in file, should remove them */

    remove_stagein(pjob);

    job_abt(&pjob, Msg);
    }
  else
    {
    /*
     * the job is not transitting (though it may have been) and
     * is not running, so put in into a complete state.
     */

    struct work_task *ptask;
    struct pbs_queue *pque;
    int  KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE);

    if ((pque = pjob->ji_qhdr) != NULL)
      {
      pque->qu_numcompleted++;

      KeepSeconds = attr_ifelse_long(
                      &pque->qu_attr[QE_ATR_KeepCompleted],
                      &server.sv_attr[SRV_ATR_KeepCompleted],
                      0);
      }
    ptask = set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit, pjob);

    if (ptask != NULL)
      {
      append_link(&pjob->ji_svrtask, &ptask->wt_linkobj, ptask);
      }
    }  /* END else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0) */

  reply_ack(preq);

  return;
  }  /* END req_deletejob() */
Example #13
void delay_and_send_sig_kill(
    
  batch_request *preq_sig)

  {
  int                   delay = 0;
  job                  *pjob;

  pbs_queue            *pque;

  batch_request        *preq_clt = NULL;  /* original client request */
  int                   rc;
  time_t                time_now = time(NULL);
  char    log_buf[LOCAL_LOG_BUF_SIZE];

  if (preq_sig == NULL)
    return;

  rc = preq_sig->rq_reply.brp_code;

  if (preq_sig->rq_extend != NULL)
    {
    preq_clt = get_remove_batch_request(preq_sig->rq_extend);
    }

  /* the client request has been handled another way, nothing left to do */
  if (preq_clt == NULL)
    return;

  if ((pjob = chk_job_request(preq_clt->rq_ind.rq_rerun, preq_clt)) == NULL)
    {
    /* job has gone away, chk_job_request() calls req_reject() on failure */
    return;
    }

  mutex_mgr pjob_mutex = mutex_mgr(pjob->ji_mutex, true);

  if (rc)
    {
    /* mom rejected request */

    if (rc == PBSE_UNKJOBID)
      {
      /* MOM claims no knowledge, so just purge it */
      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        "MOM rejected signal during rerun");

      /* removed the resources assigned to job */

      free_nodes(pjob);

      set_resc_assigned(pjob, DECR);

      unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

      svr_job_purge(pjob);

      reply_ack(preq_clt);
      }
    else
      {
      pjob_mutex.unlock();
      req_reject(rc, 0, preq_clt, NULL, NULL);
      }

    return;
    }

  // Apply the user delay first so it takes precedence.
  if (pjob->ji_wattr[JOB_ATR_user_kill_delay].at_flags & ATR_VFLAG_SET)
    delay = pjob->ji_wattr[JOB_ATR_user_kill_delay].at_val.at_long;

  if ((pque = get_jobs_queue(&pjob)) != NULL)
    {
    mutex_mgr pque_mutex = mutex_mgr(pque->qu_mutex, true);
    mutex_mgr server_mutex = mutex_mgr(server.sv_attr_mutex, false);

    if (delay == 0)
      {
      delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay],
                             &server.sv_attr[SRV_ATR_KillDelay],
                             0);
      }
    }
  else
    {
    /* the queue lookup failed; something went wrong */
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "jobid %s returned a null queue", pjob->ji_qs.ji_jobid);
    req_reject(PBSE_UNKQUE, 0, preq_clt, NULL, log_buf);
    return;
    }

  pjob_mutex.unlock();
  reply_ack(preq_clt);
  set_task(WORK_Timed, delay + time_now, send_sig_kill, strdup(pjob->ji_qs.ji_jobid), FALSE);
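  /* the jobid is strdup()'d because the job mutex is no longer held;
   * send_sig_kill presumably looks the job up again and frees the copy */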
  } // END delay_and_send_sig_kill()
Example #14
int req_movejob(

  batch_request *req) /* I */

  {
  job       *jobp;
  char       log_buf[LOCAL_LOG_BUF_SIZE];
  int        local_errno = 0;

  jobp = chk_job_request(req->rq_ind.rq_move.rq_jid, req);

  if (jobp == NULL)
    {
    return(PBSE_NONE);
    }

  mutex_mgr job_mutex(jobp->ji_mutex, true);

  if (LOGLEVEL >= 7)
    {
    sprintf(log_buf, "%s", jobp->ji_qs.ji_jobid);
    LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
    }
  
  if ((jobp->ji_qs.ji_state != JOB_STATE_QUEUED) &&
      (jobp->ji_qs.ji_state != JOB_STATE_HELD) &&
      (jobp->ji_qs.ji_state != JOB_STATE_WAITING))
    {
#ifndef NDEBUG
    sprintf(log_buf, "%s %d %s", pbse_to_txt(PBSE_BADSTATE), jobp->ji_qs.ji_state, __func__);

    log_event(PBSEVENT_DEBUG,PBS_EVENTCLASS_JOB,jobp->ji_qs.ji_jobid,log_buf);
#endif /* NDEBUG */

    req_reject(PBSE_BADSTATE, 0, req, NULL, NULL);

    return(PBSE_NONE);
    }

  /*
   * svr_movejob() does the real work, handles both local and
   * network moves
   */
  
  /* We have found that sometimes the destination queue and the 
     parent queue are the same. If so we do not need to do
     anything else */
  if (strcmp(jobp->ji_qs.ji_queue, req->rq_ind.rq_move.rq_destin) == 0)
    {
    sprintf(log_buf, "Job %s already in queue %s", jobp->ji_qs.ji_jobid, jobp->ji_qs.ji_queue);
    if (LOGLEVEL >= 7)
      {
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
      }
    
    req_reject(PBSE_JOB_ALREADY_IN_QUEUE, 0, req, NULL, log_buf);
    return(PBSE_NONE);
    }

  switch (svr_movejob(jobp, req->rq_ind.rq_move.rq_destin, &local_errno, req))
    {

    case 0:

      /* success */
      snprintf(log_buf, sizeof(log_buf), "%s", msg_movejob);
      snprintf(log_buf + strlen(log_buf), sizeof(log_buf) - strlen(log_buf), msg_manager,
        req->rq_ind.rq_move.rq_destin, req->rq_user, req->rq_host);

      log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,jobp->ji_qs.ji_jobid,log_buf);

      reply_ack(req);

      break;

    case -1:

    case 1:

      /* fail */

      /* NOTE:  can pass detailed response to requestor (NYI) */

      req_reject(local_errno, 0, req, NULL, NULL);

      break;

    case 2:

      /* deferred, will be handled by    */
      /* post_movejob() when the child completes */

      /* NO-OP */

      break;
    }  /* END switch (svr_movejob(jobp,req->rq_ind.rq_move.rq_destin,req)) */

  return(PBSE_NONE);
  }  /* END req_movejob() */
Example #15
void
req_relnodesjob(struct batch_request *preq)
{
	int             jt;		/* job type */
	job		*pjob;
	int		rc;
	char		*jid;
	int		i, offset;
	char		*nodeslist = NULL;
	char		msg[LOG_BUF_SIZE];

	if (preq == NULL)
		return;

	jid = preq->rq_ind.rq_relnodes.rq_jid;
	if (jid == NULL)
		return;

	/*
	 ** Returns job pointer for singleton job or "parent" of
	 ** an array job.
	 */
	pjob = chk_job_request(jid, preq, &jt);
	if (pjob == NULL) {
		return;
	}

	if (jt == IS_ARRAY_NO) {		/* a regular job is okay */
		/* the job must be running */
		if ((pjob->ji_qs.ji_state != JOB_STATE_RUNNING) ||
			(pjob->ji_qs.ji_substate !=
			JOB_SUBSTATE_RUNNING)) {
			req_reject(PBSE_BADSTATE, 0, preq);
			return;
		}
	}
	else if (jt == IS_ARRAY_Single) {	/* a single subjob is okay */

		offset = subjob_index_to_offset(pjob,
			get_index_from_jid(jid));
		if (offset == -1) {
			req_reject(PBSE_UNKJOBID, 0, preq);
			return;
		}

		i = get_subjob_state(pjob, offset);
		if (i == -1) {
			req_reject(PBSE_IVALREQ, 0, preq);
			return;
		}

		if (i != JOB_STATE_RUNNING) {
			req_reject(PBSE_BADSTATE, 0, preq);
			return;
		}
		if ((pjob = pjob->ji_ajtrk->tkm_tbl[offset].trk_psubjob) == NULL) {
			req_reject(PBSE_UNKJOBID, 0, preq);
			return;
		}
		if (pjob->ji_qs.ji_substate != JOB_SUBSTATE_RUNNING) {
			req_reject(PBSE_BADSTATE, 0, preq);
			return;
		}
	} else {
		reply_text(preq, PBSE_NOSUP,
			"not supported for Array Jobs or multiple sub-jobs");
		return;
	}

	nodeslist = preq->rq_ind.rq_relnodes.rq_node_list;

	if ((nodeslist != NULL) && (nodeslist[0] == '\0')) {
		nodeslist = NULL;
	}
	rc = free_sister_vnodes(pjob, nodeslist, msg, LOG_BUF_SIZE, preq);

	if (rc != 0) {
		reply_text(preq, PBSE_SYSTEM, msg);
	}
}
Example #16
void
req_movejob(struct batch_request *req)
{
	int      jt;            /* job type */
	job	*jobp;
	char	hook_msg[HOOK_MSG_SIZE];

	switch (process_hooks(req, hook_msg, sizeof(hook_msg),
			pbs_python_set_interrupt)) {
		case 0:	/* explicit reject */
			reply_text(req, PBSE_HOOKERROR, hook_msg);
			return;
		case 1:   /* explicit accept */
			if (recreate_request(req) == -1) { /* error */
				/* we have to reject the request, as 'req' */
				/* may have been partly modified           */
				strcpy(hook_msg,
					"movejob event: rejected request");
				log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_HOOK,
					LOG_ERR, "", hook_msg);
				reply_text(req, PBSE_HOOKERROR, hook_msg);
				return;
			}
			break;
		case 2:	/* no hook script executed - go ahead and accept event*/
			break;
		default:
			log_event(PBSEVENT_DEBUG2, PBS_EVENTCLASS_HOOK,
				LOG_INFO, "", "movejob event: accept req by default");
	}

	jobp = chk_job_request(req->rq_ind.rq_move.rq_jid, req, &jt);

	if (jobp == NULL)
		return;

	if ((jt != IS_ARRAY_NO) && (jt != IS_ARRAY_ArrayJob)) {
		req_reject(PBSE_IVALREQ, 0, req);
		return;
	}

	if (jobp->ji_qs.ji_state != JOB_STATE_QUEUED &&
		jobp->ji_qs.ji_state != JOB_STATE_HELD &&
		jobp->ji_qs.ji_state != JOB_STATE_WAITING) {
#ifndef NDEBUG
		(void)sprintf(log_buffer, "(%s) %s, state=%d",
			__func__, msg_badstate, jobp->ji_qs.ji_state);
		log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_DEBUG,
			jobp->ji_qs.ji_jobid, log_buffer);
#endif /* NDEBUG */
		req_reject(PBSE_BADSTATE, 0, req);
		return;
	}

	if (jt != IS_ARRAY_NO) {
		/* cannot move Subjob and can only move array job if */
		/* no subjobs are running			     */
		if ((jt != IS_ARRAY_ArrayJob) ||
			(jobp->ji_ajtrk->tkm_subjsct[JOB_STATE_RUNNING] != 0)) {
			req_reject(PBSE_IVALREQ, 0, req);
			return;
		}
	}

	/*
	 * svr_movejob() does the real work, handles both local and
	 * network moves
	 */

	switch (svr_movejob(jobp, req->rq_ind.rq_move.rq_destin, req)) {
		case 0:			/* success */
			(void)strcpy(log_buffer, msg_movejob);
			(void)sprintf(log_buffer+strlen(log_buffer),
				msg_manager, req->rq_ind.rq_move.rq_destin,
				req->rq_user, req->rq_host);
			log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO,
				jobp->ji_qs.ji_jobid, log_buffer);
			reply_ack(req);
			break;
		case -1:
		case 1:			/* fail */
			if (jobp->ji_clterrmsg)
				reply_text(req, pbs_errno, jobp->ji_clterrmsg);
			else
				req_reject(pbs_errno, 0, req);
			break;
		case 2:			/* deferred, will be handled by 	   */
			/* post_movejob() when the child completes */
			break;
	}
	return;
}
Example #17
void
req_orderjob(struct batch_request *req)
{
	int      jt1, jt2;            /* job type */
	job	*pjob;
	job	*pjob1;
	job	*pjob2;
	long	 rank;
	int	 rc;
	char	 tmpqn[PBS_MAXQUEUENAME+1];

	if ((pjob1=chk_job_request(req->rq_ind.rq_move.rq_jid, req, &jt1)) == NULL)
		return;
	if ((pjob2=chk_job_request(req->rq_ind.rq_move.rq_destin, req, &jt2)) == NULL)
		return;
	if ((jt1 == IS_ARRAY_Single) || (jt2 == IS_ARRAY_Single) ||
		(jt1 == IS_ARRAY_Range)  || (jt2 == IS_ARRAY_Range)) {
		/* can only move regular or Array Job, not Subjobs */
		req_reject(PBSE_IVALREQ, 0, req);
		return;
	}

	if (((pjob = pjob1)->ji_qs.ji_state == JOB_STATE_RUNNING) ||
		((pjob = pjob2)->ji_qs.ji_state == JOB_STATE_RUNNING) ||
		((pjob = pjob1)->ji_qs.ji_state == JOB_STATE_BEGUN)   ||
		((pjob = pjob2)->ji_qs.ji_state == JOB_STATE_BEGUN)) {
#ifndef NDEBUG
		(void)sprintf(log_buffer, "(%s) %s, state=%d",
			__func__, msg_badstate, pjob->ji_qs.ji_state);
		log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_DEBUG,
			pjob->ji_qs.ji_jobid, log_buffer);
#endif	/* NDEBUG */
		req_reject(PBSE_BADSTATE, 0, req);
		return;
	} else if (pjob1->ji_qhdr != pjob2->ji_qhdr) {

		/* Jobs are in different queues */

		if ((rc = svr_chkque(pjob1, pjob2->ji_qhdr,
			get_hostPart(pjob1->ji_wattr[(int)JOB_ATR_job_owner].at_val.at_str),
			MOVE_TYPE_Order)) ||
			(rc = svr_chkque(pjob2, pjob1->ji_qhdr,
			get_hostPart(pjob2->ji_wattr[(int)JOB_ATR_job_owner].at_val.at_str),
			MOVE_TYPE_Order))) {
			req_reject(rc, 0, req);
			return;
		}
	}

	/* now swap the order of the two jobs in the queue lists */

	rank = pjob1->ji_wattr[(int)JOB_ATR_qrank].at_val.at_long;
	pjob1->ji_wattr[(int)JOB_ATR_qrank].at_val.at_long =
		pjob2->ji_wattr[(int)JOB_ATR_qrank].at_val.at_long;
	pjob1->ji_wattr[(int)JOB_ATR_qrank].at_flags |= ATR_VFLAG_MODCACHE;
	pjob2->ji_wattr[(int)JOB_ATR_qrank].at_val.at_long = rank;
	pjob2->ji_wattr[(int)JOB_ATR_qrank].at_flags |= ATR_VFLAG_MODCACHE;
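	/* the qrank swap exchanges the jobs' ordering keys; ATR_VFLAG_MODCACHE
	 * presumably marks the cached attribute values as stale */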

	if (pjob1->ji_qhdr != pjob2->ji_qhdr) {
		(void)strcpy(tmpqn, pjob1->ji_qs.ji_queue);
		(void)strcpy(pjob1->ji_qs.ji_queue, pjob2->ji_qs.ji_queue);
		(void)strcpy(pjob2->ji_qs.ji_queue, tmpqn);
		svr_dequejob(pjob1);
		svr_dequejob(pjob2);
		(void)svr_enquejob(pjob1);
		(void)svr_enquejob(pjob2);

	} else {
		swap_link(&pjob1->ji_jobque,  &pjob2->ji_jobque);
		swap_link(&pjob1->ji_alljobs, &pjob2->ji_alljobs);
	}

	/* need to update disk copy of both jobs to save new order */

	(void)job_save(pjob1, SAVEJOB_FULL);
	(void)job_save(pjob2, SAVEJOB_FULL);

	reply_ack(req);
}
Example #18
void req_movejob(

  struct batch_request *req)

  {
#ifndef NDEBUG
  char *id = "req_movejob";
#endif
  job *jobp;

  jobp = chk_job_request(req->rq_ind.rq_move.rq_jid, req);

  if (jobp == NULL)
    {
    return;
    }

  if ((jobp->ji_qs.ji_state != JOB_STATE_QUEUED) &&
      (jobp->ji_qs.ji_state != JOB_STATE_HELD) &&
      (jobp->ji_qs.ji_state != JOB_STATE_WAITING))
    {
#ifndef NDEBUG
    sprintf(log_buffer, "%s %d %s",
            pbse_to_txt(PBSE_BADSTATE),
            jobp->ji_qs.ji_state,
            id);

    log_event(
      PBSEVENT_DEBUG,
      PBS_EVENTCLASS_JOB,
      jobp->ji_qs.ji_jobid,
      log_buffer);
#endif /* NDEBUG */

    req_reject(PBSE_BADSTATE, 0, req, NULL, NULL);

    return;
    }

  /*
   * svr_movejob() does the real work, handles both local and
   * network moves
   */

  switch (svr_movejob(jobp, req->rq_ind.rq_move.rq_destin, req))
    {

    case ROUTE_SUCCESS:

      /* success */

      strcpy(log_buffer, msg_movejob);

      sprintf(log_buffer + strlen(log_buffer), msg_manager,
              req->rq_ind.rq_move.rq_destin,
              req->rq_user,
              req->rq_host);

      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        jobp->ji_qs.ji_jobid,
        log_buffer);

      reply_ack(req);

      break;

    case ROUTE_PERM_FAILURE:

    case ROUTE_RETRY:

      /* fail */

      /* NOTE:  can pass detailed response to requestor (NYI) */

      req_reject(pbs_errno, 0, req, NULL, NULL);

      break;

    case ROUTE_DEFERRED:

      /* deferred, will be handled by    */
      /* post_movejob() when the child completes */

      /* NO-OP */

      break;
    }  /* END switch (svr_movejob(jobp,req->rq_ind.rq_move.rq_destin,req)) */

  return;
  }  /* END req_movejob() */
Example #19
void *req_checkpointjob(

  batch_request *preq) /* I */

  {
  job           *pjob;
  int            rc;
  pbs_attribute *pattr;
  char           log_buf[LOCAL_LOG_BUF_SIZE];
  batch_request *dup_req = NULL;

  if ((pjob = chk_job_request(preq->rq_ind.rq_manager.rq_objname, preq)) == NULL)
    {
    return(NULL);
    }

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  pattr = &pjob->ji_wattr[JOB_ATR_checkpoint];

  if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
      ((pattr->at_flags & ATR_VFLAG_SET) &&
       ((csv_find_string(pattr->at_val.at_str, "s") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "c") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "enabled") != NULL))))
    {
    /* have MOM attempt checkpointing */

    if ((dup_req = duplicate_request(preq)) == NULL)
      {
      req_reject(PBSE_SYSTEM, 0, preq, NULL, "failure to allocate memory");
      }

    /* The dup_req is freed in relay_to_mom (failure)
     * or in issue_Drequest (success) */
    else if ((rc = relay_to_mom(&pjob, dup_req, NULL)) != PBSE_NONE)
      {
      req_reject(rc, 0, preq, NULL, NULL);
      free_br(dup_req);
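      /* relay_to_mom() takes &pjob and may clear it; if the job is gone,
       * the guard must not try to unlock a mutex it no longer holds */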

      if (pjob == NULL)
        job_mutex.set_unlock_on_exit(false);
      }
    else
      {
      if (pjob != NULL)
        {
        pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHECKPOINT_FILE;
        
        job_save(pjob, SAVEJOB_QUICK, 0);

        snprintf(log_buf, sizeof(log_buf), "checkpoint requested and relayed to mom");
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
        pjob = NULL;
        }
      else
        job_mutex.set_unlock_on_exit(false);

      process_checkpoint_reply(dup_req);
      }
    }
  else
    {
    /* Job does not have checkpointing enabled, so reject the request */

    snprintf(log_buf, sizeof(log_buf), "job is not checkpointable");
    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    req_reject(PBSE_IVALREQ, 0, preq, NULL, "job is not checkpointable");
    }

  return(NULL);
  }  /* END req_checkpointjob() */
Example #20
int req_holdjob(

  batch_request *vp) /* I */

  {
  long          *hold_val;
  int            newstate;
  int            newsub;
  long           old_hold;
  job           *pjob;
  char          *pset;
  int            rc;
  pbs_attribute  temphold;
  pbs_attribute *pattr;
  batch_request *preq = (struct batch_request *)vp;
  char           log_buf[LOCAL_LOG_BUF_SIZE];
  batch_request *dup_req = NULL;

  pjob = chk_job_request(preq->rq_ind.rq_hold.rq_orig.rq_objname, preq);

  if (pjob == NULL)
    {
    return(PBSE_NONE);
    }

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  /* cannot do anything until we decode the holds to be set */
  if ((rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, (const char **)&pset,
                     &temphold)) != 0)
    {
    req_reject(rc, 0, preq, NULL, NULL);

    return(PBSE_NONE);
    }

  /* if other than HOLD_u is being set, must have privil */

  if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0)
    {
    req_reject(rc, 0, preq, NULL, NULL);

    return(PBSE_NONE);
    }

  hold_val = &pjob->ji_wattr[JOB_ATR_hold].at_val.at_long;

  old_hold = *hold_val;
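  /* old_hold lets a failed relay to MOM below restore the prior value */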
  *hold_val |= temphold.at_val.at_long;
  pjob->ji_wattr[JOB_ATR_hold].at_flags |= ATR_VFLAG_SET;
  sprintf(log_buf, msg_jobholdset, pset, preq->rq_user, preq->rq_host);

  pattr = &pjob->ji_wattr[JOB_ATR_checkpoint];

  if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
      ((pattr->at_flags & ATR_VFLAG_SET) &&
       ((csv_find_string(pattr->at_val.at_str, "s") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "c") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "enabled") != NULL))))
    {

    /* have MOM attempt checkpointing */

    /*
    ** The jobid in the request always has the server suffix attached,
    ** which is dropped when the server attribute
    ** 'display_job_server_suffix' is FALSE, as it will be in the MOM's.
    ** Therefore, the jobid as the server knows it must be passed to the
    ** MOM so she can find the job to hold.
    */
    if (strncmp(pjob->ji_qs.ji_jobid, 
          preq->rq_ind.rq_hold.rq_orig.rq_objname, PBS_MAXSVRJOBID))
       snprintf(preq->rq_ind.rq_hold.rq_orig.rq_objname, 
          sizeof(preq->rq_ind.rq_hold.rq_orig.rq_objname), "%s", 
          pjob->ji_qs.ji_jobid);
    if ((dup_req = duplicate_request(preq)) == NULL)
      {
      req_reject(PBSE_MEM_MALLOC, 0, preq, NULL, "memory allocation failure");
      }
    /* The dup_req is freed in relay_to_mom (failure)
     * or in issue_Drequest (success) */
    else if ((rc = relay_to_mom(&pjob, dup_req, NULL)) != PBSE_NONE)
      {
      free_br(dup_req);
      *hold_val = old_hold;  /* reset to the old value */
      req_reject(rc, 0, preq, NULL, "relay to mom failed");

      if (pjob == NULL)
        job_mutex.set_unlock_on_exit(false);
      }
    else
      {
      if (pjob != NULL)
        {
        pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_FILE;
        
        job_save(pjob, SAVEJOB_QUICK, 0);
        
        /* fill in log_buf again, since relay_to_mom changed it */
        sprintf(log_buf, msg_jobholdset, pset, preq->rq_user, preq->rq_host);
        
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
        pjob = NULL;
        reply_ack(preq);
        }
      else
        job_mutex.set_unlock_on_exit(false);

      process_hold_reply(dup_req);
      }
    }
#ifdef ENABLE_BLCR
  else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * This system is configured with BLCR checkpointing to be used,
     * but this Running job does not have checkpointing enabled,
     * so we reject the request
     */

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    req_reject(PBSE_IVALREQ, 0, preq, NULL,
      "job not held since checkpointing is expected but not enabled for job");
    }
#endif
  else
    {
    /* everything went well, may need to update the job state */
    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    if (old_hold != *hold_val)
      {
      /* indicate attributes changed     */
      pjob->ji_modified = 1;

      svr_evaljobstate(*pjob, newstate, newsub, 0);

      svr_setjobstate(pjob, newstate, newsub, FALSE);
      }

    reply_ack(preq);
    }

  return(PBSE_NONE);
  }  /* END req_holdjob() */
Example #21
int req_rerunjob(
   
  struct batch_request *preq)

  {
  int     rc = PBSE_NONE;
  job    *pjob;

  int     Force;
  int     MgrRequired = TRUE;
  char    log_buf[LOCAL_LOG_BUF_SIZE];

  /* check if requestor is admin, job owner, etc */

  if ((pjob = chk_job_request(preq->rq_ind.rq_rerun, preq)) == 0)
    {
    /* FAILURE */

    /* chk_job_request calls req_reject() */

    rc = PBSE_SYSTEM;
    return rc; /* This needs to be fixed to return an accurate error */
    }

  /* the job must be running or completed */

  if (pjob->ji_qs.ji_state >= JOB_STATE_EXITING)
    {
    if (pjob->ji_wattr[JOB_ATR_checkpoint_name].at_flags & ATR_VFLAG_SET)
      {
      /* allow end-users to rerun checkpointed jobs */

      MgrRequired = FALSE;
      }
    }
  else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* job is running */

    /* NO-OP */
    }
  else
    {
    /* FAILURE - job is in bad state */
    rc = PBSE_BADSTATE;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s is in a bad state",
        preq->rq_ind.rq_rerun);
    req_reject(rc, 0, preq, NULL, log_buf);
    unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
    return rc;
    }

  if ((MgrRequired == TRUE) &&
      ((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0))
    {
    /* FAILURE */

    rc = PBSE_PERM;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
        "additional permissions required (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)");
    req_reject(rc, 0, preq, NULL, log_buf);
    unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
    return rc;
    }

  /* the job must be rerunnable */

  if (pjob->ji_wattr[JOB_ATR_rerunable].at_val.at_long == 0)
    {
    /* NOTE:  should force override this constraint? maybe (???) */
    /*          no, the user is saying that the job will break, and
                IEEE Std 1003.1 specifically says rerun is to be rejected
                if rerunable==FALSE -garrick */

    rc = PBSE_NORERUN;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s not rerunnable",
        preq->rq_ind.rq_rerun);
    req_reject(rc, 0, preq, NULL, log_buf);
    unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
    return rc;
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* ask MOM to kill off the job if it is running */
    static const char *rerun = "rerun";
    char              *extra = strdup(rerun);

    rc = issue_signal(&pjob, "SIGKILL", post_rerun, extra);
    }
  else
    { 
    if (pjob->ji_wattr[JOB_ATR_hold].at_val.at_long == HOLD_n)
      {
      svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_QUEUED, FALSE);
      }
    else
      {
      svr_setjobstate(pjob, JOB_STATE_HELD, JOB_SUBSTATE_HELD, FALSE);
      }

    /* reset some job attributes */
    
    pjob->ji_wattr[JOB_ATR_comp_time].at_flags &= ~ATR_VFLAG_SET;
    pjob->ji_wattr[JOB_ATR_reported].at_flags &= ~ATR_VFLAG_SET;

    set_statechar(pjob);

    rc = -1;
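    /* rc == -1 marks the "requeued locally, nothing sent to MOM" path,
     * handled by the switch below */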
    }

  if (preq->rq_extend && !strncasecmp(preq->rq_extend, RERUNFORCE, strlen(RERUNFORCE)))
    Force = 1;
  else
    Force = 0;

  switch (rc)
    {

    case -1:

      /* completed job was requeued */

      /* clear out job completion time if there is one */
      break;

    case 0:

      /* requeue request successful */

      if (pjob != NULL)
        pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

      break;

    case PBSE_SYSTEM: /* This may not be accurate...*/

      rc = PBSE_MEM_MALLOC;
      snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Cannot allocate memory");
      req_reject(rc, 0, preq, NULL, log_buf);

      if (pjob != NULL)
        unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);

      return rc;

    default:

      if (Force == 0)
        {
        rc = PBSE_MOMREJECT;
        snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Rejected by mom");
        req_reject(rc, 0, preq, NULL, log_buf);
        if (pjob != NULL)
          unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);
        return rc;
        }
      else
        {
        int           newstate;
        int           newsubst;
        unsigned int  dummy;
        char         *tmp;
        long          cray_enabled = FALSE;
       
        if (pjob != NULL)
          {
          get_svr_attr_l(SRV_ATR_CrayEnabled, &cray_enabled);

          if ((cray_enabled == TRUE) &&
              (pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str != NULL))
            tmp = parse_servername(pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str, &dummy);
          else
            tmp = parse_servername(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, &dummy);
          
          /* Cannot communicate with MOM, forcibly requeue job.
             This is a relatively disgusting thing to do */
          
          sprintf(log_buf, "rerun req to %s failed (rc=%d), forcibly requeueing job",
            tmp, rc);

          free(tmp);
  
          log_event(
            PBSEVENT_ERROR | PBSEVENT_ADMIN | PBSEVENT_JOB,
            PBS_EVENTCLASS_JOB,
            pjob->ji_qs.ji_jobid,
            log_buf);
          
          log_err(-1, __func__, log_buf);
          
          strcat(log_buf, ", previous output files may be lost");
  
          svr_mailowner(pjob, MAIL_OTHER, MAIL_FORCE, log_buf);
  
          svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_RERUN3, FALSE);
  
          rel_resc(pjob); /* free resc assigned to job */
          
          if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_HOTSTART) == 0)
            {
            /* in case of server shutdown, don't clear exec_host */
            /* will use it on hotstart when next comes up        */
            
            job_attr_def[JOB_ATR_exec_host].at_free(&pjob->ji_wattr[JOB_ATR_exec_host]);
  
            job_attr_def[JOB_ATR_session_id].at_free(&pjob->ji_wattr[JOB_ATR_session_id]);
            
            job_attr_def[JOB_ATR_exec_gpus].at_free(&pjob->ji_wattr[JOB_ATR_exec_gpus]);          
            }
          
          pjob->ji_modified = 1;    /* force full job save */
          
          pjob->ji_momhandle = -1;
          pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_StagedIn;
          
          svr_evaljobstate(pjob, &newstate, &newsubst, 0);
          svr_setjobstate(pjob, newstate, newsubst, FALSE);
          }
        }

      break;
    }  /* END switch (rc) */

  /* So job has run and is to be rerun (not restarted) */
  if (pjob == NULL)
    {
    rc = PBSE_JOB_RERUN;
    }
  else
    {
    pjob->ji_qs.ji_svrflags = (pjob->ji_qs.ji_svrflags &
        ~(JOB_SVFLG_CHECKPOINT_FILE |JOB_SVFLG_CHECKPOINT_MIGRATEABLE |
          JOB_SVFLG_CHECKPOINT_COPIED)) | JOB_SVFLG_HASRUN;
    
    sprintf(log_buf, msg_manager, msg_jobrerun, preq->rq_user, preq->rq_host);
    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    reply_ack(preq);
  
    /* note in accounting file */
    account_record(PBS_ACCT_RERUN, pjob, NULL);
    unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL);
    }

  return rc;
  }  /* END req_rerunjob() */
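
The RERUNFORCE handling above is driven by the extend string supplied by the client. A minimal client-side sketch follows; the "force" literal and the pbs_rerunjob() signature are assumptions taken from the IFL headers, so treat this as illustrative rather than canonical.

#include "pbs_ifl.h"

/* Hedged sketch: qrerun -f is assumed to map onto pbs_rerunjob() with
 * the RERUNFORCE extend string ("force" in this codebase's headers);
 * the server detects it with strncasecmp() as shown above. */
int
force_rerun(int conn, char *jobid)
  {
  return(pbs_rerunjob(conn, jobid, (char *)"force"));
  }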
Example #22
0
void *req_modifyarray(

  void *vp) /* I */

  {
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  job_array            *pa;
  job                  *pjob = NULL;
  svrattrl             *plist;
  int                   checkpoint_req = FALSE;
  char                 *array_spec = NULL;
  char                 *pcnt = NULL;
  int                   rc = 0;
  int                   rc2 = 0;
  struct batch_request *preq = (struct batch_request *)vp;

  pa = get_array(preq->rq_ind.rq_modify.rq_objname);

  if (pa == NULL)
    {
    req_reject(PBSE_UNKARRAYID, 0, preq, NULL, "unable to find array");
    return(NULL);
    }

  plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr);

  /* If async modify, reply now; otherwise reply is handled later */
  if (preq->rq_type == PBS_BATCH_AsyModifyJob)
    {
    reply_ack(preq);

    preq->rq_noreply = TRUE; /* set for no more replies */
    }

  /* pbs_mom sets the extend string to trigger copying of checkpoint files */

  if (preq->rq_extend != NULL)
    {
    if (strcmp(preq->rq_extend,CHECKPOINTHOLD) == 0)
      {
      checkpoint_req = CHK_HOLD;
      }
    else if (strcmp(preq->rq_extend,CHECKPOINTCONT) == 0)
      {
      checkpoint_req = CHK_CONT;
      }
    }

  /* find if an array range was specified */
  if ((preq->rq_extend != NULL) && 
      ((array_spec = strstr(preq->rq_extend,ARRAY_RANGE)) != NULL))
    {
    /* move array spec past ARRAY_RANGE= */
    char *equals = strchr(array_spec,'=');
    if (equals != NULL)
      {
      array_spec = equals + 1;
      }

    if ((pcnt = strchr(array_spec,'%')) != NULL)
      {
      int slot_limit = atoi(pcnt+1);
      pa->ai_qs.slot_limit = slot_limit;
      }
    }

  if ((array_spec != NULL) &&
      (pcnt != array_spec))
    {
    if (pcnt != NULL)
      *pcnt = '\0';

    /* there is more than just a slot given, modify that range */
    rc = modify_array_range(pa,array_spec,plist,preq,checkpoint_req);

    if ((rc != 0) && 
       (rc != PBSE_RELAYED_TO_MOM))
      {
      pthread_mutex_unlock(pa->ai_mutex);
      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "%s: unlocked ai_mutex", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
        }

      req_reject(PBSE_IVALREQ,0,preq,NULL,"Error reading array range");
  
      return(NULL);
      }

    if (pcnt != NULL)
      *pcnt = '%';

    if (rc == PBSE_RELAYED_TO_MOM)
      {
      pthread_mutex_unlock(pa->ai_mutex);
      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "%s: unlocked ai_mutex", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
        }

      return(NULL);
      }
    }
  else 
    {
    rc = modify_whole_array(pa,plist,preq,checkpoint_req);

    if ((rc != 0) && 
        (rc != PBSE_RELAYED_TO_MOM))
      {
      pthread_mutex_unlock(pa->ai_mutex);
      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "%s: unlocked ai_mutex", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
        }

      req_reject(PBSE_IVALREQ,0,preq,NULL,"Error altering the array");
      return(NULL);
      }

    /* we modified the job array. We now need to update the job */
    pjob = chk_job_request(preq->rq_ind.rq_modify.rq_objname, preq);

    if (pjob == NULL)
      {
      /* chk_job_request() has already rejected the request */
      pthread_mutex_unlock(pa->ai_mutex);
      return(NULL);
      }

    rc2 = modify_job((void **)&pjob, plist, preq, checkpoint_req, NO_MOM_RELAY);

    if ((rc2 != 0) && 
        (rc != PBSE_RELAYED_TO_MOM))
      {
      /* Two operations produce a return code here: one from
         modify_whole_array() and one from modify_job(). If either
         fails, return that error. Some elements of the array may have
         been updated while others were not, but at least the user
         will know something went wrong. */
      pthread_mutex_unlock(pa->ai_mutex);
      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "%s: unlocked ai_mutex", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }
      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

      req_reject(rc2, 0, preq, NULL, NULL);
      return(NULL);
      }

    if (rc == PBSE_RELAYED_TO_MOM)
      {
      pthread_mutex_unlock(pa->ai_mutex);
      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "%s: unlocked ai_mutex", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }
      unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
      
      return(NULL);
      }

    unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
    }

  /* SUCCESS */
  if (LOGLEVEL >= 7)
    {
    sprintf(log_buf, "%s: unlocked ai_mutex", __func__);
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pa->ai_qs.parent_id, log_buf);
    }
  pthread_mutex_unlock(pa->ai_mutex);

  reply_ack(preq);

  return(NULL);
  } /* END req_modifyarray() */
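
For reference, the extend strings parsed by req_modifyarray() take a few distinct shapes. The literal macro values below are assumptions (they are defined in the checkpoint and array headers, not in this excerpt):

/* Assumed extend-string shapes handled above:
 *
 *   CHECKPOINTHOLD, e.g. "checkpoint_hold"    -> checkpoint_req = CHK_HOLD
 *   CHECKPOINTCONT, e.g. "checkpoint_cont"    -> checkpoint_req = CHK_CONT
 *   ARRAY_RANGE,    e.g. "array_range=1-10%3" -> modify subjobs 1-10 and
 *                                                set pa->ai_qs.slot_limit = 3
 */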
Example #23
0
void
req_py_spawn(struct batch_request *preq)
{
	int             jt;		/* job type */
	job		*pjob;
	int		rc;
	char		*jid = preq->rq_ind.rq_py_spawn.rq_jid;
	int		i, offset;

	/*
	 ** Returns job pointer for singleton job or "parent" of
	 ** an array job.
	 */
	pjob = chk_job_request(jid, preq, &jt);
	if (pjob == NULL)
		return;

	/* see if requestor is the job owner */
	if (svr_chk_owner(preq, pjob) != 0) {
		req_reject(PBSE_PERM, 0, preq);
		return;
	}

	if (jt == IS_ARRAY_NO) {		/* a regular job is okay */
		/* the job must be running */
		if ((pjob->ji_qs.ji_state != JOB_STATE_RUNNING) ||
			(pjob->ji_qs.ji_substate !=
			JOB_SUBSTATE_RUNNING)) {
			req_reject(PBSE_BADSTATE, 0, preq);
			return;
		}
	}
	else if (jt == IS_ARRAY_Single) {	/* a single subjob is okay */

		offset = subjob_index_to_offset(pjob,
			get_index_from_jid(jid));
		if (offset == -1) {
			req_reject(PBSE_UNKJOBID, 0, preq);
			return;
		}

		i = get_subjob_state(pjob, offset);
		if (i == -1) {
			req_reject(PBSE_IVALREQ, 0, preq);
			return;
		}

		if (i != JOB_STATE_RUNNING) {
			req_reject(PBSE_BADSTATE, 0, preq);
			return;
		}
		if ((pjob = pjob->ji_ajtrk->tkm_tbl[offset].trk_psubjob) == NULL) {
			req_reject(PBSE_UNKJOBID, 0, preq);
			return;
		}
		if (pjob->ji_qs.ji_substate != JOB_SUBSTATE_RUNNING) {
			req_reject(PBSE_BADSTATE, 0, preq);
			return;
		}
	} else {
		reply_text(preq, PBSE_NOSUP,
			"not supported for Array Jobs or multiple sub-jobs");
		return;
	}

	/*
	 ** Pass the request on to MOM.  If this works, the function
	 ** post_py_spawn_req will be called to handle the reply.
	 ** If it fails, send the reply now.
	 */
	rc = relay_to_mom(pjob, preq, post_py_spawn_req);
	if (rc)
		req_reject(rc, 0, preq);	/* unable to get to MOM */
}
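
For context on the subjob handling above, the job-ID forms that chk_job_request() distinguishes via its jt output are assumed to follow standard PBS array notation:

/* Assumed job-ID forms and their jt classification:
 *   "123.server"      -> IS_ARRAY_NO      (regular job)
 *   "123[].server"    -> array parent     (rejected here)
 *   "123[5].server"   -> IS_ARRAY_Single  (index 5 is mapped to a
 *                                          table offset before use)
 *   "123[2-4].server" -> subjob range     (rejected here)
 */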
Example #24
0
void
req_releasejob(struct batch_request *preq)
{
	int              jt;            /* job type */
	int		 newstate;
	int		 newsub;
	long		 old_hold;
	job		*pjob;
	char		*pset;
	int		 rc;


	pjob = chk_job_request(preq->rq_ind.rq_release.rq_objname, preq, &jt);
	if (pjob == (job *)0)
		return;

	if ((jt != IS_ARRAY_NO) && (jt != IS_ARRAY_ArrayJob)) {
		req_reject(PBSE_IVALREQ, 0, preq);
		return;
	}

	/* cannot do anything until we decode the holds to be set */

	if ((rc=get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &pset)) != 0) {
		req_reject(rc, 0, preq);
		return;
	}

	/* if other than HOLD_u is being released, must have privilege */

	if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0) {
		req_reject(rc, 0, preq);
		return;
	}

	/* all ok so far, unset the hold */

	old_hold = pjob->ji_wattr[(int)JOB_ATR_hold].at_val.at_long;
	rc = job_attr_def[(int)JOB_ATR_hold].
		at_set(&pjob->ji_wattr[(int)JOB_ATR_hold],
		&temphold, DECR);
	if (rc) {
		req_reject(rc, 0, preq);
		return;
	}

	/* everything went well; if holds changed, update the job state */

#ifndef NAS /* localmod 105 Always reset etime on release */
	if (old_hold != pjob->ji_wattr[(int)JOB_ATR_hold].at_val.at_long) {
#endif /* localmod 105 */
#ifdef NAS /* localmod 105 */
		{
			attribute *etime = &pjob->ji_wattr[(int)JOB_ATR_etime];
			etime->at_val.at_long = time_now;
			etime->at_flags |= ATR_VFLAG_SET|ATR_VFLAG_MODCACHE;
#endif /* localmod 105 */
		pjob->ji_modified = 1;	/* indicates attributes changed    */
		svr_evaljobstate(pjob, &newstate, &newsub, 0);
		(void)svr_setjobstate(pjob, newstate, newsub); /* saves job */
	}
	if (pjob->ji_wattr[(int)JOB_ATR_hold].at_val.at_long == 0)
		job_attr_def[(int)JOB_ATR_Comment].at_free(&pjob->ji_wattr[(int)JOB_ATR_Comment]);
	(void)sprintf(log_buffer, msg_jobholdrel, pset, preq->rq_user,
		preq->rq_host);
	log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO,
		pjob->ji_qs.ji_jobid, log_buffer);
	reply_ack(preq);
}

/**
 * @brief
 * 		get_hold - search a list of attributes (svrattrl) for the hold-types
 * 		attribute.  This is used by the Hold Job and Release Job request,
 *		therefore it is an error if the hold-types attribute is not present,
 *		or there is more than one.
 *
 *		Decode the hold attribute into temphold.
 *
 * @param[in]	phead	- pbs list head.
 * @param[out]	pset	- RETURN: pointer to the hold-type value string.
 *
 * @return	error code
 */

static int
get_hold(pbs_list_head *phead, char **pset)
{
	int		 have_one = 0;
	struct svrattrl *holdattr = (struct svrattrl*)0;
	struct svrattrl *pal;

	pal = (struct svrattrl *)GET_NEXT((*phead));
	while (pal) {
		if (!strcasecmp(pal->al_name, job_attr_def[(int)JOB_ATR_hold].at_name)) {
			holdattr = pal;
			*pset    = pal->al_value;
			have_one++;
		} else {
			return (PBSE_IVALREQ);
		}
		pal = (struct svrattrl *)GET_NEXT(pal->al_link);
	}
	if (have_one != 1)
		return (PBSE_IVALREQ);

	/* decode into temporary attribute structure */

	clear_attr(&temphold, &job_attr_def[(int)JOB_ATR_hold]);
	return (job_attr_def[(int)JOB_ATR_hold].at_decode(
		&temphold,
		holdattr->al_name,
		(char *)0,
		holdattr->al_value));
}
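
get_hold() decodes into a file-scope attribute shared with req_holdjob() and req_releasejob(). Its declaration is not part of this excerpt, so the following is an assumed sketch:

/* Assumed file-scope scratch attribute: filled by get_hold() and
 * consumed by the hold/release request handlers above and below. */
static attribute temphold;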
Example #25
0
void
req_holdjob(struct batch_request *preq)
{
	long		*hold_val;
	int		 jt;		/* job type */
	int		 newstate;
	int		 newsub;
	long		 old_hold;
	job		*pjob;
	char		*pset;
	int		 rc;
	char             date[32];
	time_t           now;


	pjob = chk_job_request(preq->rq_ind.rq_hold.rq_orig.rq_objname, preq, &jt);
	if (pjob == (job *)0)
		return;
	if ((jt != IS_ARRAY_NO) && (jt != IS_ARRAY_ArrayJob)) {
		req_reject(PBSE_IVALREQ, 0, preq);
		return;
	}
	if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
		(pjob->ji_qs.ji_substate == JOB_SUBSTATE_PROVISION)) {
		req_reject(PBSE_BADSTATE, 0, preq);
		return;
	}


	/* cannot do anything until we decode the holds to be set */

	if ((rc=get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &pset)) != 0) {
		req_reject(rc, 0, preq);
		return;
	}

	/* if other than HOLD_u is being set, must have privilege */

	if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0) {
		req_reject(rc, 0, preq);
		return;
	}

	/* HOLD_bad_password can only be done by root or admin */
#ifdef WIN32
	if ( (temphold.at_val.at_long & HOLD_bad_password) && \
				!isAdminPrivilege(preq->rq_user) )
#else
	if ( (temphold.at_val.at_long & HOLD_bad_password) && \
		  strcasecmp(preq->rq_user, PBS_DEFAULT_ADMIN) != 0 )
#endif
	{
		req_reject(PBSE_PERM, 0, preq);
		return;
	}

	hold_val = &pjob->ji_wattr[(int)JOB_ATR_hold].at_val.at_long;
	old_hold = *hold_val;
	*hold_val |= temphold.at_val.at_long;
	pjob->ji_wattr[(int)JOB_ATR_hold].at_flags |= ATR_VFLAG_SET | ATR_VFLAG_MODCACHE;

	/* Note the hold time in the job comment. */
	now = time(NULL);
	(void)strncpy(date, (const char *)ctime(&now), 24);
	date[24] = '\0';
	(void)sprintf(log_buffer, "Job held by %s on %s", preq->rq_user, date);
	job_attr_def[(int)JOB_ATR_Comment].at_decode(&pjob->ji_wattr[(int)JOB_ATR_Comment], (char *)0, (char *)0, log_buffer);

	(void)sprintf(log_buffer, msg_jobholdset, pset, preq->rq_user,
		preq->rq_host);

	if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
		(pjob->ji_qs.ji_substate != JOB_SUBSTATE_PRERUN) &&
		(pjob->ji_wattr[(int)JOB_ATR_chkpnt].at_val.at_str) &&
		(*pjob->ji_wattr[(int)JOB_ATR_chkpnt].at_val.at_str != 'n')) {

		/* have MOM attempt checkpointing */

		if ((rc = relay_to_mom(pjob, preq, post_hold)) != 0) {
			*hold_val = old_hold;	/* reset to the old value */
			req_reject(rc, 0, preq);
		} else {
			pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;
			pjob->ji_qs.ji_svrflags |=
				JOB_SVFLG_HASRUN | JOB_SVFLG_CHKPT;
			(void)job_save(pjob, SAVEJOB_QUICK);
			log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO,
				pjob->ji_qs.ji_jobid, log_buffer);
		}
	} else {

		/* everything went well; may need to update the job state */

		log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO,
			pjob->ji_qs.ji_jobid, log_buffer);
		if (old_hold != *hold_val) {
			/* indicate attributes changed     */
			pjob->ji_modified = 1;
			svr_evaljobstate(pjob, &newstate, &newsub, 0);
			(void)svr_setjobstate(pjob, newstate, newsub);
		}
		reply_ack(preq);
	}
}
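
On the client side these requests originate from qhold/qrls. A hedged sketch using the IFL call; the pbs_holdjob() signature and the USER_HOLD macro are assumptions from pbs_ifl.h:

#include "pbs_ifl.h"

/* Hedged sketch: set a user hold ("u") on a job; other hold types
 * ("o", "s") require operator/manager privilege per chk_hold_priv(). */
int
hold_job(int conn, char *jobid)
{
	return (pbs_holdjob(conn, jobid, (char *)USER_HOLD, NULL));
}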
Example #26
0
int req_signaljob(

  void *vp)  /* I */

  {
  struct batch_request *preq = (struct batch_request *)vp;
  job                  *pjob;
  int                   rc;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  struct batch_request *dup_req = NULL;

  /* preq is freed in error cases */
  if ((pjob = chk_job_request(preq->rq_ind.rq_signal.rq_jid, preq)) == 0)
    {
    return(PBSE_NONE);
    }

  /* the job must be running */

  if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
    req_reject(PBSE_BADSTATE, 0, preq, NULL, NULL);

    unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL);
    return(PBSE_NONE);
    }

  /* Special pseudo signals for suspend and resume require op/mgr */

  if (!strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_RESUME) ||
      !strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_SUSPEND))
    {
    if ((preq->rq_perm & (ATR_DFLAG_OPRD | ATR_DFLAG_OPWR | ATR_DFLAG_MGRD | ATR_DFLAG_MGWR)) == 0)
      {
      /* for suspend/resume, must be mgr/op */
      req_reject(PBSE_PERM, 0, preq, NULL, NULL);
      
      unlock_ji_mutex(pjob, __func__, (char *)"2", LOGLEVEL);
      return(PBSE_NONE);
      }
  
    }

  /* save job ptr for post_signal_req() */
  preq->rq_extra = strdup(pjob->ji_qs.ji_jobid);

  /* FIXME: need a race-free check for available free subnodes before
   * resuming a suspended job */

#ifdef DONOTSUSPINTJOB
  /* interactive jobs don't resume correctly so don't allow a suspend */

  if (!strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_SUSPEND) &&
      (pjob->ji_wattr[JOB_ATR_interactive].at_flags & ATR_VFLAG_SET) &&
      (pjob->ji_wattr[JOB_ATR_interactive].at_val.at_long > 0))
    {
    req_reject(PBSE_JOBTYPE, 0, preq, NULL, NULL);

    unlock_ji_mutex(pjob, __func__, (char *)"3", LOGLEVEL);
    return(PBSE_NONE);
    }

#endif

  if (LOGLEVEL >= 6)
    {
    sprintf(log_buf, "relaying signal request to mom %lu", pjob->ji_qs.ji_un.ji_exect.ji_momaddr);

    log_record(PBSEVENT_SCHED,PBS_EVENTCLASS_REQUEST,"req_signaljob",log_buf);
    }

  /* send reply for asynchronous suspend */
  if (preq->rq_type == PBS_BATCH_AsySignalJob)
    {
    reply_ack(preq);
    preq->rq_noreply = TRUE;
    }

  /* pass the request on to MOM */

  if ((rc = copy_batchrequest(&dup_req, preq, 0, -1)) != 0)
    {
    req_reject(rc, 0, preq, NULL, "cannot allocate memory");
    unlock_ji_mutex(pjob, __func__, (char *)"4", LOGLEVEL);
    }
  /* The dup_req is freed in relay_to_mom (failure)
   * or in issue_Drequest (success) */
  else 
    {
    rc = relay_to_mom(&pjob, dup_req, NULL);

    if (pjob != NULL)
      unlock_ji_mutex(pjob, __func__, (char *)"4", LOGLEVEL);

    if (rc != PBSE_NONE)
      {
      free_br(dup_req);
      req_reject(rc, 0, preq, NULL, NULL);  /* unable to get to MOM */
      }
    else
      {
      post_signal_req(dup_req);
      free_br(preq);
      }
    }

  /* If successful, we ack after MOM replies to us; we pick up in post_signal_req() */

  return(PBSE_NONE);
  }  /* END req_signaljob() */
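
A companion client-side sketch for the pseudo-signals above; the "suspend"/"resume" strings behind SIG_SUSPEND and SIG_RESUME and the pbs_sigjob() signature are assumptions from the headers:

#include "pbs_ifl.h"

/* Hedged sketch: suspend a running job; the server path above rejects
 * this unless the requester has operator/manager permission. */
int
suspend_job(int conn, char *jobid)
  {
  return(pbs_sigjob(conn, jobid, (char *)"suspend", NULL));
  }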
Example #27
0
int req_orderjob(

  struct batch_request *vp) /* I */

  {
  job                  *pjob;
  job                  *pjob1;
  job                  *pjob2;
  int                   rank;
  int                   rc = 0;
  char                  tmpqn[PBS_MAXQUEUENAME+1];
  struct batch_request *req = (struct batch_request *)vp;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  pbs_queue            *pque1;
  pbs_queue            *pque2;

  if ((pjob1 = chk_job_request(req->rq_ind.rq_move.rq_jid, req)) == NULL)
    {
    return(PBSE_NONE);
    }

  mutex_mgr job1_mutex(pjob1->ji_mutex, true);

  if ((pjob2 = chk_job_request(req->rq_ind.rq_move.rq_destin, req)) == NULL)
    {
    return(PBSE_NONE);
    }

  mutex_mgr job2_mutex(pjob2->ji_mutex, true);

  if (((pjob = pjob1)->ji_qs.ji_state == JOB_STATE_RUNNING) ||
      ((pjob = pjob2)->ji_qs.ji_state == JOB_STATE_RUNNING))
    {
#ifndef NDEBUG
    sprintf(log_buf, "%s %d %s",
            pbse_to_txt(PBSE_BADSTATE),
            pjob->ji_qs.ji_state,
            __func__);

    log_event(
      PBSEVENT_DEBUG,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buf);
#endif /* NDEBUG */

    req_reject(PBSE_BADSTATE, 0, req, NULL, NULL);

    return(PBSE_NONE);
    }
  else if ((pjob1->ji_qhdr == NULL) || (pjob2->ji_qhdr == NULL))
    {
    req_reject(PBSE_BADSTATE, 0, req, NULL, "One of the jobs does not have a queue");
    return(PBSE_NONE);
    }
  else if (pjob1->ji_qhdr != pjob2->ji_qhdr)
    {
    /* jobs are in different queues */
    int ok = FALSE;

    if ((pque2 = get_jobs_queue(&pjob2)) == NULL)
      {
      rc = PBSE_BADSTATE;
      job2_mutex.set_lock_on_exit(false);
      }
    else
      {
      mutex_mgr pque2_mutex = mutex_mgr(pque2->qu_mutex, true);
      if ((rc = svr_chkque(pjob1, pque2, get_variable(pjob1, pbs_o_host), MOVE_TYPE_Order, NULL)) == PBSE_NONE)
        {
        pque2_mutex.unlock();

        if ((pque1 = get_jobs_queue(&pjob1)) == NULL)
          {
          rc = PBSE_BADSTATE;
          job1_mutex.set_lock_on_exit(false);
          }
        else if (pjob1 != NULL)
          {
          mutex_mgr pque1_mutex = mutex_mgr(pque1->qu_mutex, true);
          if ((rc = svr_chkque(pjob2, pque1, get_variable(pjob2, pbs_o_host), MOVE_TYPE_Order, NULL)) == PBSE_NONE)
            {
            ok = TRUE;
            }
          }
        }
      }

    if (ok == FALSE)
      {
      req_reject(rc, 0, req, NULL, NULL);

      return(PBSE_NONE);
      }
    }

  /* now swap the order of the two jobs in the queue lists */
  rank = pjob1->ji_wattr[JOB_ATR_qrank].at_val.at_long;

  pjob1->ji_wattr[JOB_ATR_qrank].at_val.at_long =
    pjob2->ji_wattr[JOB_ATR_qrank].at_val.at_long;

  pjob2->ji_wattr[JOB_ATR_qrank].at_val.at_long = rank;

  if (pjob1->ji_qhdr != pjob2->ji_qhdr)
    {
    strcpy(tmpqn, pjob1->ji_qs.ji_queue);
    strcpy(pjob1->ji_qs.ji_queue, pjob2->ji_qs.ji_queue);
    strcpy(pjob2->ji_qs.ji_queue, tmpqn);

    svr_dequejob(pjob1, FALSE);
    svr_dequejob(pjob2, FALSE);

    if (svr_enquejob(pjob1, FALSE, -1) == PBSE_JOB_RECYCLED)
      {
      pjob1 = NULL;
      job1_mutex.set_lock_on_exit(false);
      }

    if (svr_enquejob(pjob2, FALSE, -1) == PBSE_JOB_RECYCLED)
      {
      pjob2 = NULL;
      job2_mutex.set_lock_on_exit(false);
      }
    }
  else
    {
    if ((pque1 = get_jobs_queue(&pjob1)) != NULL)
      {
      mutex_mgr pque1_mutex = mutex_mgr(pque1->qu_mutex, true);
      swap_jobs(pque1->qu_jobs,pjob1,pjob2);
      swap_jobs(NULL,pjob1,pjob2);
      }
    }

  /* need to update disk copy of both jobs to save new order */
  if (pjob1 != NULL)
    {
    job_save(pjob1, SAVEJOB_FULL, 0);
    }

  if (pjob2 != NULL)
    {
    job_save(pjob2, SAVEJOB_FULL, 0);
    }

  /* SUCCESS */
  reply_ack(req);

  return(PBSE_NONE);
  }  /* END req_orderjob() */
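
The request above backs the qorder client command. A brief usage note (the job IDs are placeholders):

/* qorder swaps the queue positions (JOB_ATR_qrank) of two jobs:
 *
 *   qorder 123.server 456.server
 *
 * If the jobs sit in different queues, they are also swapped between
 * queues, subject to svr_chkque() admission checks in both directions. */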
Example #28
0
void *modify_array_work(

  void *vp)

  {
  batch_request *preq = (batch_request *)vp;
  svrattrl      *plist;
  int            rc = 0;
  char          *pcnt = NULL;
  char          *array_spec = NULL;
  int            checkpoint_req = FALSE;
  job           *pjob = NULL;
  job_array     *pa;

  pa = get_array(preq->rq_ind.rq_modify.rq_objname);

  if (pa == NULL)
    {
    req_reject(PBSE_UNKARRAYID, 0, preq, NULL, "unable to find array");
    return(NULL);
    }

  mutex_mgr array_mutex(pa->ai_mutex, true);

  /* pbs_mom sets the extend string to trigger copying of checkpoint files */
  if (preq->rq_extend != NULL)
    {
    if (strcmp(preq->rq_extend,CHECKPOINTHOLD) == 0)
      {
      checkpoint_req = CHK_HOLD;
      }
    else if (strcmp(preq->rq_extend,CHECKPOINTCONT) == 0)
      {
      checkpoint_req = CHK_CONT;
      }
    }

  /* find if an array range was specified */
  if ((preq->rq_extend != NULL) && 
      ((array_spec = strstr(preq->rq_extend,ARRAY_RANGE)) != NULL))
    {
    /* move array spec past ARRAY_RANGE= */
    char *equals = strchr(array_spec,'=');
    if (equals != NULL)
      {
      array_spec = equals + 1;
      }

    if ((pcnt = strchr(array_spec,'%')) != NULL)
      {
      int slot_limit = atoi(pcnt+1);
      pa->ai_qs.slot_limit = slot_limit;
      }
    }
  
  plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr);

  if ((array_spec != NULL) &&
      (pcnt != array_spec))
    {
    if (pcnt != NULL)
      *pcnt = '\0';

    /* there is more than just a slot given, modify that range */
    rc = modify_array_range(pa,array_spec,plist,preq,checkpoint_req);

    if (pcnt != NULL)
      *pcnt = '%';

    if ((rc != 0) && 
        (rc != PBSE_RELAYED_TO_MOM))
      {
      req_reject(PBSE_IVALREQ,0,preq,NULL,"Error reading array range");
      return(NULL);
      }
    else if (rc != PBSE_RELAYED_TO_MOM)
      {
      /* if relayed to MOM, the reply is sent after MOM responds */
      reply_ack(preq);
      }

    return(NULL);
    }
  else 
    {
    rc = modify_whole_array(pa,plist,preq,checkpoint_req);

    if ((rc != 0) && 
        (rc != PBSE_RELAYED_TO_MOM))
      {
      req_reject(PBSE_IVALREQ, 0, preq, NULL, "At least one array element did not modify successfully. Use qstat -f to verify changes");
      return(NULL);
      }

    /* we modified the job array. We now need to update the job */
    if ((pjob = chk_job_request(preq->rq_ind.rq_modify.rq_objname, preq)) == NULL)
      return(NULL);

    mutex_mgr job_mutex = mutex_mgr(pjob->ji_mutex, true);

    /* modify_job will reply to preq and free it */
    modify_job((void **)&pjob, plist, preq, checkpoint_req, NO_MOM_RELAY);
    }

  return(NULL);
  } /* END modify_array_work() */
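
Compared with Example #22, the early returns above lean on the mutex_mgr RAII wrapper rather than repeating manual unlock-and-log sequences. A minimal sketch of the interface as used throughout these examples:

// Sketch of the mutex_mgr pattern seen in these examples:
//   mutex_mgr array_mutex(pa->ai_mutex, true);  // wraps an already-held lock
//   array_mutex.unlock();                       // explicit early release
//   array_mutex.set_lock_on_exit(false);        // suppress the auto-unlock
// The destructor releases the mutex on every other return path.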
Example #29
0
int req_rerunjob(
   
  batch_request *preq)

  {
  int     rc = PBSE_NONE;
  job    *pjob;

  int     MgrRequired = TRUE;
  char    log_buf[LOCAL_LOG_BUF_SIZE];

  /* check if requestor is admin, job owner, etc */
  if (!strcasecmp(preq->rq_ind.rq_rerun, "all"))
    {
    return(handle_requeue_all(preq));
    }
  
  if ((pjob = chk_job_request(preq->rq_ind.rq_rerun, preq)) == 0)
    {
    /* FAILURE */

    /* chk_job_request calls req_reject() */

    rc = PBSE_SYSTEM;
    return rc; /* This needs to be fixed to return an accurate error */
    }

  mutex_mgr pjob_mutex = mutex_mgr(pjob->ji_mutex, true);

  /* the job must be running or completed */

  if (pjob->ji_qs.ji_state >= JOB_STATE_EXITING)
    {
    if (pjob->ji_wattr[JOB_ATR_checkpoint_name].at_flags & ATR_VFLAG_SET)
      {
      /* allow end-users to rerun checkpointed jobs */

      MgrRequired = FALSE;
      }
    }
  else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* job is running */

    /* NO-OP */
    }
  else if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED)
    {
    // If we are already queued, then there is nothing to do.
    rc = PBSE_NONE;
    reply_ack(preq);
    return(rc);
    }
  else
    {
    /* FAILURE - job is in bad state */
    rc = PBSE_BADSTATE;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s is in a bad state",
        preq->rq_ind.rq_rerun);
    req_reject(rc, 0, preq, NULL, log_buf);
    return rc;
    }

  if ((MgrRequired == TRUE) &&
      ((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0))
    {
    /* FAILURE */

    rc = PBSE_PERM;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
        "additional permissions required (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)");
    req_reject(rc, 0, preq, NULL, log_buf);
    return rc;
    }

  /* the job must be rerunnable */

  if (pjob->ji_wattr[JOB_ATR_rerunable].at_val.at_long == 0)
    {
    /* NOTE:  should force override this constraint? maybe (???) */
    /*          no, the user is saying that the job will break, and
                IEEE Std 1003.1 specifically says rerun is to be rejected
                if rerunable==FALSE -garrick */

    rc = PBSE_NORERUN;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s not rerunnable",
        preq->rq_ind.rq_rerun);
    req_reject(rc, 0, preq, NULL, log_buf);
    return rc;
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* ask MOM to kill off the job if it is running */
    int                 delay = 0;
    pbs_queue          *pque;
  
    // Apply the user delay first so it takes precedence.
    if (pjob->ji_wattr[JOB_ATR_user_kill_delay].at_flags & ATR_VFLAG_SET)
      delay = pjob->ji_wattr[JOB_ATR_user_kill_delay].at_val.at_long;

    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      mutex_mgr pque_mutex = mutex_mgr(pque->qu_mutex, true);
      mutex_mgr server_mutex = mutex_mgr(server.sv_attr_mutex, false);

      if (delay == 0)
        {
        delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay],
                               &server.sv_attr[SRV_ATR_KillDelay],
                               0);
        }
      }
    else
      {
      /* why is pque NULL? Something went wrong */
      snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "jobid %s returned a null queue", pjob->ji_qs.ji_jobid);
      req_reject(PBSE_UNKQUE, 0, preq, NULL, log_buf);
      return(PBSE_UNKQUE);
      }
    
    pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

    if (delay != 0)
      {
      static const char *rerun = "rerun";
      char               *extra = strdup(rerun);

      get_batch_request_id(preq);
      /* If a qrerun -f is given, requeue the job regardless of the outcome of issue_signal */
      if ((preq->rq_extend) && 
          (!strncasecmp(preq->rq_extend, RERUNFORCE, strlen(RERUNFORCE))))
        {
        std::string extend = RERUNFORCE;
        batch_request *dup = new batch_request(*preq);
        get_batch_request_id(dup);
        rc = issue_signal(&pjob, "SIGTERM", delay_and_send_sig_kill, extra, strdup(dup->rq_id.c_str()));

        if (rc == PBSE_NORELYMOM)
          {
          dup->rq_reply.brp_code = PBSE_NORELYMOM;
          pjob_mutex.unlock();
          post_rerun(dup);

          pjob = svr_find_job(preq->rq_ind.rq_signal.rq_jid, FALSE);
          if (pjob == NULL)
            {
            delete dup;
            return(PBSE_NONE);
            }

          pjob_mutex.set_lock_state(true);
          rc = PBSE_NONE;
          }

        delete dup;
        }
      else
        {
        rc = issue_signal(&pjob, "SIGTERM", delay_and_send_sig_kill, extra, strdup(preq->rq_id.c_str()));
        if (rc != PBSE_NONE)
          {
          /* can't send to MOM */
          req_reject(rc, 0, preq, NULL, NULL);
          }

        return(rc);
        }
      }
    else
      {
      static const char *rerun = "rerun";
      char               *extra = strdup(rerun);

      /* If a qrerun -f is given, requeue the job regardless of the outcome of issue_signal */
      if (preq->rq_extend && !strncasecmp(preq->rq_extend, RERUNFORCE, strlen(RERUNFORCE)))
        {
        std::string extend = RERUNFORCE;
        rc = issue_signal(&pjob, "SIGKILL", post_rerun, extra, strdup(extend.c_str()));
        if (rc == PBSE_NORELYMOM)
          rc = PBSE_NONE;
        }
      else
        rc = issue_signal(&pjob, "SIGKILL", post_rerun, extra, NULL);
      }
    }
  else
    { 
    if (pjob->ji_wattr[JOB_ATR_hold].at_val.at_long == HOLD_n)
      {
      svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_QUEUED, FALSE);
      }
    else
      {
      svr_setjobstate(pjob, JOB_STATE_HELD, JOB_SUBSTATE_HELD, FALSE);
      }

    /* reset some job attributes */
    
    pjob->ji_wattr[JOB_ATR_comp_time].at_flags &= ~ATR_VFLAG_SET;
    pjob->ji_wattr[JOB_ATR_reported].at_flags &= ~ATR_VFLAG_SET;

    set_statechar(pjob);

    rc = -1;
    }

  /* finalize_rerunjob will return with pjob->ji_mutex unlocked */
  pjob_mutex.set_unlock_on_exit(false);
  return finalize_rerunjob(preq,pjob,rc);
  }
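
A short design note on the two kill paths above (a hedged reading of the code, not documented behavior):

/* With a nonzero kill delay the server sends SIGTERM and lets
 * delay_and_send_sig_kill() issue the follow-up SIGKILL after the
 * delay; with no delay it sends SIGKILL at once with post_rerun() as
 * the reply handler. Under RERUNFORCE the job is requeued even when
 * MOM is unreachable (PBSE_NORELYMOM is downgraded to PBSE_NONE). */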
Example #30
0
void req_orderjob(

  struct batch_request *req)  /* I */

  {
#ifndef NDEBUG
  char *id = "req_orderjob";
#endif
  job *pjob;
  job *pjob1;
  job *pjob2;
  int  rank;
  int  rc;
  char  tmpqn[PBS_MAXQUEUENAME+1];

  if ((pjob1 = chk_job_request(req->rq_ind.rq_move.rq_jid, req)) == NULL)
    {
    return;
    }

  if ((pjob2 = chk_job_request(req->rq_ind.rq_move.rq_destin, req)) == NULL)
    {
    return;
    }

  if (((pjob = pjob1)->ji_qs.ji_state == JOB_STATE_RUNNING) ||
      ((pjob = pjob2)->ji_qs.ji_state == JOB_STATE_RUNNING))
    {
#ifndef NDEBUG
    sprintf(log_buffer, "%s %d %s",
            pbse_to_txt(PBSE_BADSTATE),
            pjob->ji_qs.ji_state,
            id);

    log_event(
      PBSEVENT_DEBUG,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);
#endif /* NDEBUG */

    req_reject(PBSE_BADSTATE, 0, req, NULL, NULL);

    return;
    }
  else if (pjob1->ji_qhdr != pjob2->ji_qhdr)
    {
    /* jobs are in different queues */

    if ((rc = svr_chkque(
                pjob1,
                pjob2->ji_qhdr,
                get_variable(pjob1, pbs_o_host),
                MOVE_TYPE_Order,
                NULL)) ||
        (rc = svr_chkque(
                pjob2,
                pjob1->ji_qhdr,
                get_variable(pjob2, pbs_o_host),
                MOVE_TYPE_Order,
                NULL)))
      {
      req_reject(rc, 0, req, NULL, NULL);

      return;
      }
    }

  /* now swap the order of the two jobs in the queue lists */

  rank = pjob1->ji_wattr[(int)JOB_ATR_qrank].at_val.at_long;

  pjob1->ji_wattr[(int)JOB_ATR_qrank].at_val.at_long =
    pjob2->ji_wattr[(int)JOB_ATR_qrank].at_val.at_long;

  pjob2->ji_wattr[(int)JOB_ATR_qrank].at_val.at_long = rank;

  if (pjob1->ji_qhdr != pjob2->ji_qhdr)
    {
    (void)strcpy(tmpqn, pjob1->ji_qs.ji_queue);
    (void)strcpy(pjob1->ji_qs.ji_queue, pjob2->ji_qs.ji_queue);
    (void)strcpy(pjob2->ji_qs.ji_queue, tmpqn);
    svr_dequejob(pjob1);
    svr_dequejob(pjob2);
    (void)svr_enquejob(pjob1);
    (void)svr_enquejob(pjob2);

    }
  else
    {
    swap_link(&pjob1->ji_jobque,  &pjob2->ji_jobque);
    swap_link(&pjob1->ji_alljobs, &pjob2->ji_alljobs);
    }

  /* need to update disk copy of both jobs to save new order */

  job_save(pjob1, SAVEJOB_FULL);

  job_save(pjob2, SAVEJOB_FULL);

  reply_ack(req);

  /* SUCCESS */

  return;
  }  /* END req_orderjob() */