예제 #1
0
static void
post_chkpt(struct work_task *ptask)
{
	job		     *pjob;
	struct batch_request *preq;

	preq = (struct batch_request *)ptask->wt_parm1;
	pjob = find_job(preq->rq_ind.rq_hold.rq_orig.rq_objname);
	if (!preq || !pjob)
		return;
	if (preq->rq_reply.brp_code == 0) {
		/* checkpointed ok */
		if (preq->rq_reply.brp_auxcode) { /* chkpt can be moved */
			pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHKPT;
			pjob->ji_qs.ji_svrflags |= JOB_SVFLG_ChkptMig;
			pjob->ji_modified = 1;
			(void)job_save(pjob, SAVEJOB_QUICK);
		}
		account_record(PBS_ACCT_CHKPNT, pjob, (char *)0);
	} else {
		/* need to try rerun if possible or just abort the job */
		if (preq->rq_reply.brp_code != PBSE_CKPBSY) {
			pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHKPT;
			pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING;
			pjob->ji_modified = 1;
			(void)job_save(pjob, SAVEJOB_QUICK);
			if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
				rerun_or_kill(pjob, msg_on_shutdown);
		}
	}

	release_req(ptask);
}
예제 #2
0
파일: job_recov.c 프로젝트: hocks/torque
job *job_recov(

  char *filename) /* I */   /* pathname to job save file */

  {
  job  *pj;
  char  namebuf[MAXPATHLEN];
  char  log_buf[LOCAL_LOG_BUF_SIZE];
  int   rc;

  pj = job_alloc(); /* allocate & initialize job structure space */

  if (pj == NULL)
    {
    /* FAILURE - cannot alloc memory */

    return(NULL);
    }

  snprintf(namebuf, MAXPATHLEN, "%s%s", path_jobs, filename); /* job directory path, filename */
  size_t logBufLen = sizeof(log_buf);

  if ((rc = job_recov_xml(namebuf, &pj, log_buf, logBufLen)) && rc == PBSE_INVALID_SYNTAX)
    rc = job_recov_binary(namebuf, &pj, log_buf, logBufLen);

  if (rc == PBSE_NONE)
    rc = set_array_job_ids(&pj, log_buf, logBufLen);

  if (rc != PBSE_NONE) 
    {
    if (rc == -1) 
      {
      log_err(errno, __func__, log_buf);

#ifndef PBS_MOM
      unlock_ji_mutex(pj, __func__, "1", LOGLEVEL);
      free(pj->ji_mutex);
#endif
      free((char *)pj);
      } /* sometime pjob is freed by abt_job() */
    return(NULL);
    }
  
  
  pj->ji_commit_done = 1;

  /* all done recovering the job */

#ifdef PBS_MOM
  job_save(pj, SAVEJOB_FULL, (multi_mom == 0)?0:pbs_rm_port);
#else
  job_save(pj, SAVEJOB_FULL, 0);
#endif

  return(pj);
  }  /* END job_recov() */
예제 #3
0
파일: req_holdjob.c 프로젝트: CESNET/torque
void req_checkpointjob(

  struct batch_request *preq)

  {
  job    *pjob;
  int     rc;
  attribute *pattr;

  if ((pjob = chk_job_request(preq->rq_ind.rq_manager.rq_objname, preq)) == NULL)
    {
    return;
    }

  if (is_cloud_job(pjob))
    {
    rc = PBSE_CLOUD_REQUEST;
    req_reject(rc, 0, preq, NULL, "cloud jobs cannot be checkpointed");
    }

  pattr = &pjob->ji_wattr[(int)JOB_ATR_checkpoint];

  if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
      ((pattr->at_flags & ATR_VFLAG_SET) &&
       ((csv_find_string(pattr->at_val.at_str, "s") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "c") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "enabled") != NULL))))
    {
    /* have MOM attempt checkpointing */

    if ((rc = relay_to_mom(pjob->ji_qs.ji_un.ji_exect.ji_momaddr,
                           preq, process_checkpoint_reply)) != 0)
      {
      req_reject(rc, 0, preq, NULL, NULL);
      }
    else
      {
      pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHECKPOINT_FILE;
      job_save(pjob, SAVEJOB_QUICK);
      LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
                pjob->ji_qs.ji_jobid, log_buffer);
      }
    }
  else
    {
    /* Job does not have checkpointing enabled, so reject the request */

    LOG_EVENT(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);

    req_reject(PBSE_IVALREQ, 0, preq, NULL, "job is not checkpointable");
    }
  }  /* END req_checkpointjob() */
예제 #4
0
파일: array_func.c 프로젝트: altair4/pbspro
/**
 * @brief
 * 		chk_array_doneness - check if all subjobs are expired and if so,
 *		purge the Array Job itself
 *
 * @param[in,out]	parent - pointer to parent job.
 *
 *	@return	void
 */
void
chk_array_doneness(job *parent)
{
	char acctbuf[40];
	int e;
	int i;
	struct ajtrkhd	*ptbl = parent->ji_ajtrk;

	if (ptbl == NULL)
		return;

	if (ptbl->tkm_flags & TKMFLG_NO_DELETE)
		return;	/* delete of subjobs in progress, don't array */

	if (ptbl->tkm_subjsct[JOB_STATE_QUEUED] + ptbl->tkm_subjsct[JOB_STATE_RUNNING] + ptbl->tkm_subjsct[JOB_STATE_EXITING] == 0) {

		/* Array Job all done, do simple eoj processing */

		for (e=i=0; i<ptbl->tkm_ct; ++i) {
			if (ptbl->tkm_tbl[i].trk_error > 0)
				e = 1;
			else if (ptbl->tkm_tbl[i].trk_error < 0) {
				e = 2;
				break;
			}
		}
		parent->ji_qs.ji_un_type = JOB_UNION_TYPE_EXEC;
		parent->ji_qs.ji_un.ji_exect.ji_momaddr = 0;
		parent->ji_qs.ji_un.ji_exect.ji_momport = 0;
		parent->ji_qs.ji_un.ji_exect.ji_exitstat = e;

		check_block(parent, "");
		if (parent->ji_qs.ji_state == JOB_STATE_BEGUN) {
			/* if BEGUN, issue 'E' account record */
			sprintf(acctbuf, msg_job_end_stat, e);
			account_job_update(parent, PBS_ACCT_LAST);
			account_jobend(parent, acctbuf, PBS_ACCT_END);

			svr_mailowner(parent, MAIL_END, MAIL_NORMAL, acctbuf);
		}
		if (parent->ji_wattr[(int)JOB_ATR_depend].at_flags & ATR_VFLAG_SET)
			(void)depend_on_term(parent);

		/*
		 * Check if the history of the finished job can be saved or it needs to be purged .
		 */
		svr_saveorpurge_finjobhist(parent);
	} else {
		(void)job_save(parent, SAVEJOB_FULL);
	}
}
예제 #5
0
int
shutdown_preempt_chkpt(job *pjob)
{
	struct batch_request *phold;
	attribute temp;
	void (*func)(struct work_task *);

	long *hold_val = NULL;
	long old_hold = 0;

	phold = alloc_br(PBS_BATCH_HoldJob);
	if (phold == NULL)
		return (PBSE_SYSTEM);

	temp.at_flags = ATR_VFLAG_SET;
	temp.at_type  = job_attr_def[(int)JOB_ATR_hold].at_type;
	temp.at_user_encoded = NULL;
	temp.at_priv_encoded = NULL;
	temp.at_val.at_long = HOLD_s;

	phold->rq_perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR;
	(void)strcpy(phold->rq_ind.rq_hold.rq_orig.rq_objname, pjob->ji_qs.ji_jobid);
	CLEAR_HEAD(phold->rq_ind.rq_hold.rq_orig.rq_attr);
	if (job_attr_def[(int)JOB_ATR_hold].at_encode(&temp,
		&phold->rq_ind.rq_hold.rq_orig.rq_attr,
		job_attr_def[(int)JOB_ATR_hold].at_name,
		NULL,
		ATR_ENCODE_CLIENT, NULL) < 0)
		return (PBSE_SYSTEM);

	phold->rq_extra = pjob;
	func = post_chkpt;

	if (relay_to_mom(pjob, phold, func) == 0) {

		if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
			svr_setjobstate(pjob, JOB_STATE_RUNNING, JOB_SUBSTATE_RUNNING);
		pjob->ji_qs.ji_svrflags |= (JOB_SVFLG_HASRUN | JOB_SVFLG_CHKPT | JOB_SVFLG_HASHOLD);
		pjob->ji_modified = 1;
		(void)job_save(pjob, SAVEJOB_QUICK);
		return (0);
	} else {
		*hold_val = old_hold;	/* reset to the old value */
		return (-1);
	}
}
예제 #6
0
int attempt_to_queue_job_on_mom(

  char          *job_id,
  int            con,
  char          *job_destin,
  bool          &change_substate_on_attempt_to_queue,
  tlist_head    &attrl,
  bool          &timeout,
  bool           need_to_send_job_script,
  bool           job_has_run,
  unsigned long  job_momaddr,
  const char    *script_name,
  char          *stdout_path,
  char          *stderr_path,
  char          *chkpt_path,
  int            type,
  int           *my_err)

  {
  job  *pjob = NULL;
  int   rc;

  if (update_substate_if_needed(job_id, change_substate_on_attempt_to_queue) != PBSE_NONE)
    return(LOCUTION_FAIL);

  if ((rc = queue_job_on_mom(con, my_err, job_id, job_destin, attrl, timeout, type)) != PBSE_NONE)
    return(rc);

  if ((rc = send_job_script_if_needed(con, need_to_send_job_script, script_name, job_id)) != PBSE_NONE)
    return(rc);

  if ((rc = send_files_if_needed(con, job_id, type, job_has_run, job_momaddr, stdout_path, stderr_path, chkpt_path)) != PBSE_NONE)
    return(rc);

  if ((pjob = svr_find_job(job_id, TRUE)) != NULL)
    {
    pjob->ji_qs.ji_substate = JOB_SUBSTATE_TRNOUTCM;      
    job_save(pjob, SAVEJOB_QUICK, 0);
    unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);
    }
  else
    return(LOCUTION_FAIL);

  return(LOCUTION_SUCCESS);
  } /* END attempt_to_queue_job_on_mom() */
예제 #7
0
static int
shutdown_chkpt(job *pjob)
{
	struct batch_request *phold;
	attribute 	      temp;

	phold = alloc_br(PBS_BATCH_HoldJob);
	if (phold == (struct batch_request *)0)
		return (PBSE_SYSTEM);

	temp.at_flags = ATR_VFLAG_SET;
	temp.at_type  = job_attr_def[(int)JOB_ATR_hold].at_type;
	temp.at_user_encoded = NULL;
	temp.at_priv_encoded = NULL;
	temp.at_val.at_long = HOLD_s;

	phold->rq_perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR;
	(void)strcpy(phold->rq_ind.rq_hold.rq_orig.rq_objname, pjob->ji_qs.ji_jobid);
	CLEAR_HEAD(phold->rq_ind.rq_hold.rq_orig.rq_attr);
	if (job_attr_def[(int)JOB_ATR_hold].at_encode(&temp,
		&phold->rq_ind.rq_hold.rq_orig.rq_attr,
		job_attr_def[(int)JOB_ATR_hold].at_name,
		(char *)0,
		ATR_ENCODE_CLIENT, NULL) < 0)
		return (PBSE_SYSTEM);

	if (relay_to_mom(pjob, phold, post_chkpt) == 0) {

		if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
			svr_setjobstate(pjob, JOB_STATE_RUNNING, JOB_SUBSTATE_RUNNING);

		pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;
		pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN;
		pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHKPT;
		pjob->ji_modified = 1;
		(void)job_save(pjob, SAVEJOB_QUICK);
		return (0);
	} else
		return (-1);
}
예제 #8
0
int update_substate_if_needed(

  char *job_id,
  bool &change_substate_on_attempt_to_queue)

  {
  if (change_substate_on_attempt_to_queue == true)
    {
    job *pjob = svr_find_job(job_id, TRUE);

    if (pjob != NULL)
      {
      mutex_mgr job_mutex(pjob->ji_mutex, true);
      pjob->ji_qs.ji_substate = JOB_SUBSTATE_TRNOUT;
      job_save(pjob, SAVEJOB_QUICK, 0);
      }
    else
      {
      return(PBSE_JOB_RECYCLED);
      }
    }

  return(PBSE_NONE);
  } /* END update_substate_if_needed() */
예제 #9
0
void stat_update(
    
  struct batch_request *preq,
  struct stat_cntl     *cntl)

  {
  job                  *pjob;
  struct batch_reply   *preply;
  struct brp_status    *pstatus;
  svrattrl             *sattrl;
  int                   oldsid;
  int                   bad = 0;
  time_t                time_now = time(NULL);
  char                 *msg_ptr = NULL;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];

  preply = &preq->rq_reply;

  if (preply->brp_un.brp_txt.brp_str != NULL)
    {
    msg_ptr = strstr(preply->brp_un.brp_txt.brp_str, PBS_MSG_EQUAL);
  
    if (msg_ptr != NULL)
      msg_ptr += strlen(PBS_MSG_EQUAL);
    }

  if (preply->brp_choice == BATCH_REPLY_CHOICE_Status)
    {
    pstatus = (struct brp_status *)GET_NEXT(preply->brp_un.brp_status);

    while (pstatus != NULL)
      {
      if ((pjob = svr_find_job(pstatus->brp_objname, FALSE)) != NULL)
        {
        mutex_mgr job_mutex(pjob->ji_mutex, true);

        sattrl = (svrattrl *)GET_NEXT(pstatus->brp_attr);

        oldsid = pjob->ji_wattr[JOB_ATR_session_id].at_val.at_long;

        modify_job_attr(
          pjob,
          sattrl,
          ATR_DFLAG_MGWR | ATR_DFLAG_SvWR,
          &bad);

        if (oldsid != pjob->ji_wattr[JOB_ATR_session_id].at_val.at_long)
          {
          /* first save since running job (or the sid has changed), */
          /* must save session id    */

          job_save(pjob, SAVEJOB_FULL, 0);
          }

#ifdef USESAVEDRESOURCES
        else
          {
          /* save so we can recover resources used */
          job_save(pjob, SAVEJOB_FULL, 0);
          }
#endif    /* USESAVEDRESOURCES */

        pjob->ji_momstat = time_now;
        }

      pstatus = (struct brp_status *)GET_NEXT(pstatus->brp_stlink);
      }  /* END while (pstatus != NULL) */
    }    /* END if (preply->brp_choice == BATCH_REPLY_CHOICE_Status) */
  else if ((preply->brp_choice == BATCH_REPLY_CHOICE_Text) &&
           (preply->brp_code == PBSE_UNKJOBID) &&
           (msg_ptr != NULL) &&
           (!strcmp(msg_ptr,  preq->rq_ind.rq_status.rq_id)))
    {
    /* we sent a stat request, but mom says it doesn't know anything about
       the job */
    if ((pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE)) != NULL)
      {
      /* job really isn't running any more - mom doesn't know anything about it
         this can happen if a diskless node reboots and the mom_priv/jobs
         directory is cleared, set its state to queued so job_abt doesn't
         think it is still running */
      mutex_mgr job_mutex(pjob->ji_mutex, true);
      
      snprintf(log_buf, sizeof(log_buf),
        "mother superior no longer recognizes %s as a valid job, aborting. Last reported time was %ld",
        preq->rq_ind.rq_status.rq_id, pjob->ji_last_reported_time);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
      
      svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_ABORT, FALSE);
      rel_resc(pjob);
      job_mutex.set_unlock_on_exit(false);
      job_abt(&pjob, "Job does not exist on node");

      /* TODO, if the job is rerunnable we should set its state back to queued */
      }
    }
  else
    {
    snprintf(log_buf, sizeof(log_buf),
      "Poll job request failed for job %s", preq->rq_ind.rq_status.rq_id);
    log_err(preply->brp_code, __func__, log_buf);
    }
  
  cntl->sc_conn = -1;

  if (cntl->sc_post)
    cntl->sc_post(cntl); /* continue where we left off */

  /* If sc_post has a value it is:
   * req_stat_job_step2
   * if so, it expects cntl to be free'd after the call
   */
  free(cntl); /* a bit of a kludge but its saves an extra func */

  return;
  }  /* END stat_update() */
예제 #10
0
파일: req_modify.c 프로젝트: dhill12/test
int modify_job(

  void                 **j,               /* O */
  svrattrl              *plist,           /* I */
  struct batch_request  *preq,            /* I */
  int                    checkpoint_req,  /* I */
  int                    flag)            /* I */

  {
  int   bad = 0;
  int   i;
  int   newstate;
  int   newsubstate;
  resource_def *prsd;
  int   rc;
  int   sendmom = 0;
  int   copy_checkpoint_files = FALSE;

  char  log_buf[LOCAL_LOG_BUF_SIZE];
  struct batch_request *dup_req = NULL;

  job *pjob = (job *)*j;
  
  if (pjob == NULL)
    {
    sprintf(log_buf, "job structure is NULL");
    log_err(PBSE_IVALREQ, __func__, log_buf);
    return(PBSE_IVALREQ);
    }

  /* cannot be in exiting or transit, exiting has already been checked */

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /* FAILURE */
    snprintf(log_buf,sizeof(log_buf),
      "Cannot modify job '%s' in transit\n",
      pjob->ji_qs.ji_jobid);

    log_err(PBSE_BADSTATE, __func__, log_buf);

    return(PBSE_BADSTATE);
    }

  if (((checkpoint_req == CHK_HOLD) || (checkpoint_req == CHK_CONT)) &&
      (pjob->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING))
    {
    /* May need to request copy of the checkpoint file from mom */

    copy_checkpoint_files = TRUE;

    if (checkpoint_req == CHK_HOLD)
      {

      sprintf(log_buf,"setting jobsubstate for %s to RERUN\n", pjob->ji_qs.ji_jobid);

      pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

      job_save(pjob, SAVEJOB_QUICK, 0);

      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

      /* remove checkpoint restart file if there is one */
      
      if (pjob->ji_wattr[JOB_ATR_restart_name].at_flags & ATR_VFLAG_SET)
        {
        cleanup_restart_file(pjob);
        }

      }
    }

  /* if job is running, special checks must be made */

  /* NOTE:  must determine if job exists down at MOM - this will occur if
            job is running, job is held, or job was held and just barely
            released (ie qhold/qrls) */

  /* COMMENTED OUT BY JOSH B IN 2.3 DUE TO MAJOR PROBLEMS w/ CUSTOMERS
   * --FIX and uncomment once we know what is really going on.
   *
   * We now know that ji_destin gets set on a qmove and that the mom does not
   * have the job at that point.
   *
  if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) ||
     ((pjob->ji_qs.ji_state == JOB_STATE_HELD) && (pjob->ji_qs.ji_destin[0] != '\0')) ||
     ((pjob->ji_qs.ji_state == JOB_STATE_QUEUED) && (pjob->ji_qs.ji_destin[0] != '\0')))
  */
  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    while (plist != NULL)
      {
      /* is the pbs_attribute modifiable in RUN state ? */

      i = find_attr(job_attr_def, plist->al_name, JOB_ATR_LAST);

      if ((i < 0) ||
          ((job_attr_def[i].at_flags & ATR_DFLAG_ALTRUN) == 0))
        {
        /* FAILURE */
        snprintf(log_buf,sizeof(log_buf),
          "Cannot modify attribute '%s' while running\n",
          plist->al_name);
        log_err(PBSE_MODATRRUN, __func__, log_buf);

        return PBSE_MODATRRUN;
        }

      /* NOTE:  only explicitly specified job attributes are routed down to MOM */

      if (i == JOB_ATR_resource)
        {
        /* is the specified resource modifiable while */
        /* the job is running                         */

        prsd = find_resc_def(svr_resc_def, plist->al_resc, svr_resc_size);

        if (prsd == NULL)
          {
          /* FAILURE */
          snprintf(log_buf,sizeof(log_buf),
            "Unknown attribute '%s'\n",
            plist->al_name);

          log_err(PBSE_UNKRESC, __func__, log_buf);

          return(PBSE_UNKRESC);
          }

        if ((prsd->rs_flags & ATR_DFLAG_ALTRUN) == 0)
          {
          /* FAILURE */
          snprintf(log_buf,sizeof(log_buf),
            "Cannot modify attribute '%s' while running\n",
            plist->al_name);
          log_err(PBSE_MODATRRUN, __func__, log_buf);

          return(PBSE_MODATRRUN);
          }

        sendmom = 1;
        }
/*
        else if ((i == JOB_ATR_checkpoint_name) || (i == JOB_ATR_variables))
        {
        sendmom = 1;
        }
*/

      plist = (svrattrl *)GET_NEXT(plist->al_link);
      }
    }    /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* modify the job's attributes */

  bad = 0;

  plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr);

  rc = modify_job_attr(pjob, plist, preq->rq_perm, &bad);

  if (rc)
    {
    /* FAILURE */
    snprintf(log_buf,sizeof(log_buf),
      "Cannot set attributes for job '%s'\n",
      pjob->ji_qs.ji_jobid);
    log_err(rc, __func__, log_buf);

    if (rc == PBSE_JOBNOTFOUND)
      *j = NULL;

    return(rc);
    }

  /* Reset any defaults resource limit which might have been unset */

  set_resc_deflt(pjob, NULL, FALSE);

  /* if job is not running, may need to change its state */

  if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
    svr_evaljobstate(pjob, &newstate, &newsubstate, 0);

    svr_setjobstate(pjob, newstate, newsubstate, FALSE);
    }
  else
    {
    job_save(pjob, SAVEJOB_FULL, 0);
    }

  sprintf(log_buf, msg_manager, msg_jobmod, preq->rq_user, preq->rq_host);

  log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

  /* if a resource limit changed for a running job, send to MOM */

  if (sendmom)
    {
    /* if the NO_MOM_RELAY flag is set the calling function will call
       relay_to_mom so we do not need to do it here */
    if (flag != NO_MOM_RELAY)
      {
      /* The last number is unused unless this is an array */
      if ((rc = copy_batchrequest(&dup_req, preq, 0, -1)) != 0)
        {
        }
      /* The dup_req is freed in relay_to_mom (failure)
       * or in issue_Drequest (success) */
      else if ((rc = relay_to_mom(&pjob, dup_req, post_modify_req)))
        {
        if (pjob != NULL)
          {
          snprintf(log_buf,sizeof(log_buf),
            "Unable to relay information to mom for job '%s'\n",
            pjob->ji_qs.ji_jobid);
          
          log_err(rc, __func__, log_buf);
          }

        return(rc); /* unable to get to MOM */
        }
      }

    return(PBSE_RELAYED_TO_MOM);
    }

  if (copy_checkpoint_files)
    {
    struct batch_request *momreq = 0;
    momreq = cpy_checkpoint(momreq, pjob, JOB_ATR_checkpoint_name, CKPT_DIR_OUT);

    if (momreq != NULL)
      {
      /* have files to copy */
      momreq->rq_extra = strdup(pjob->ji_qs.ji_jobid);

      /* The momreq is freed in relay_to_mom (failure)
       * or in issue_Drequest (success) */
      if (checkpoint_req == CHK_HOLD)
        {
        rc = relay_to_mom(&pjob, momreq, chkpt_xfr_hold);
        }
      else
        {
        rc = relay_to_mom(&pjob, momreq, chkpt_xfr_done);
        }

      if (rc != 0)
        {
        if (pjob != NULL)
          {
          snprintf(log_buf,sizeof(log_buf),
            "Unable to relay information to mom for job '%s'\n",
            pjob->ji_qs.ji_jobid);
          
          log_err(rc, __func__, log_buf);
          }

        return(PBSE_NONE);  /* come back when mom replies */
        }
      }
    else
      {
      log_err(-1, __func__, "Failed to get batch request");
      }
    }

  return(PBSE_NONE);
  } /* END modify_job() */
예제 #11
0
void stat_update(
    
  struct batch_request *preq,
  struct stat_cntl     *cntl)

  {
  job                  *pjob;
  struct batch_reply   *preply;
  struct brp_status    *pstatus;
  svrattrl             *sattrl;
  int                   oldsid;
  int                   bad = 0;
  time_t                time_now = time(NULL);

  preply = &preq->rq_reply;

  if (preply->brp_choice == BATCH_REPLY_CHOICE_Status)
    {
    pstatus = (struct brp_status *)GET_NEXT(preply->brp_un.brp_status);

    while (pstatus != NULL)
      {
      if ((pjob = svr_find_job(pstatus->brp_objname, FALSE)) != NULL)
        {
        sattrl = (svrattrl *)GET_NEXT(pstatus->brp_attr);

        oldsid = pjob->ji_wattr[JOB_ATR_session_id].at_val.at_long;

        modify_job_attr(
          pjob,
          sattrl,
          ATR_DFLAG_MGWR | ATR_DFLAG_SvWR,
          &bad);

        if (oldsid != pjob->ji_wattr[JOB_ATR_session_id].at_val.at_long)
          {
          /* first save since running job (or the sid has changed), */
          /* must save session id    */

          job_save(pjob, SAVEJOB_FULL, 0);
          }

#ifdef USESAVEDRESOURCES
        else
          {
          /* save so we can recover resources used */
          job_save(pjob, SAVEJOB_FULL, 0);
          }
#endif    /* USESAVEDRESOURCES */

        pjob->ji_momstat = time_now;

        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
        }

      pstatus = (struct brp_status *)GET_NEXT(pstatus->brp_stlink);
      }  /* END while (pstatus != NULL) */
    }    /* END if (preply->brp_choice == BATCH_REPLY_CHOICE_Status) */
  else
    {
    if (preply->brp_code == PBSE_UNKJOBID)
      {
      /* we sent a stat request, but mom says it doesn't know anything about
         the job */
      if ((pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE)) != NULL)
        {
        /* job really isn't running any more - mom doesn't know anything about it
           this can happen if a diskless node reboots and the mom_priv/jobs
           directory is cleared, set its state to queued so job_abt doesn't
           think it is still running */
        svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_ABORT, FALSE);
        rel_resc(pjob);
        job_abt(&pjob, "Job does not exist on node");

        /* TODO, if the job is rerunnable we should set its state back to queued */

        }
      }
    }
  cntl->sc_conn = -1;

  /* MUTSU - Unlock job here? */
  if (cntl->sc_post)
    cntl->sc_post(cntl); /* continue where we left off */

  /* If sc_post has a value it is:
   * req_stat_job_step2
   * if so, it expects cntl to be free'd after the call
   */
  free(cntl); /* a bit of a kludge but its saves an extra func */

  return;
  }  /* END stat_update() */
예제 #12
0
void scan_for_terminated(void)

  {
  static char id[] = "scan_for_terminated";

  int  exiteval = 0;
  pid_t  pid;
  job *pjob;
  task *ptask = NULL;
  int  statloc;
  unsigned int momport = 0;

  int    tcount;

  if (LOGLEVEL >= 7)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      id,
      "entered");
    }

  /* update the latest intelligence about the running jobs;         */
  /* must be done before we reap the zombies, else we lose the info */

  termin_child = 0;

  if (mom_get_sample() == PBSE_NONE)
    {
    pjob = (job *)GET_PRIOR(svr_alljobs);

    while (pjob != NULL)
      {
      mom_set_use(pjob);

      pjob = (job *)GET_PRIOR(pjob->ji_alljobs);
      }
    }

  /* Now figure out which task(s) have terminated (are zombies) */

  /* NOTE:  does a job's tasks include its epilog? */

  while ((pid = waitpid(-1, &statloc, WNOHANG)) > 0)
    {
    pjob = (job *)GET_PRIOR(svr_alljobs);

    while (pjob != NULL)
      {
      /*
       * see if process was a child doing a special
       * function for MOM
       */

      if (LOGLEVEL >= 7)
        {
        snprintf(log_buffer, 1024, "checking job w/subtask pid=%d (child pid=%d)",
          pjob->ji_momsubt,
          pid);

        LOG_EVENT(
          PBSEVENT_DEBUG,
          PBS_EVENTCLASS_JOB,
          pjob->ji_qs.ji_jobid,
          log_buffer);
        }

      if (pid == pjob->ji_momsubt)
        {
        if (LOGLEVEL >= 7)
          {
          snprintf(log_buffer, 1024, "found match with job subtask for pid=%d",
            pid);

          LOG_EVENT(
            PBSEVENT_DEBUG,
            PBS_EVENTCLASS_JOB,
            pjob->ji_qs.ji_jobid,
            log_buffer);
          }

        break;
        }

      /* look for task */

      ptask = (task *)GET_NEXT(pjob->ji_tasks);

      /* locate task with associated process id */

      tcount = 0;

      while (ptask != NULL)
        {
        if (ptask->ti_qs.ti_sid == pid)
          {
          if (LOGLEVEL >= 7)
            {
            snprintf(log_buffer, 1024, "found match with job task %d for pid=%d",
              tcount,
              pid);

            LOG_EVENT(
              PBSEVENT_DEBUG,
              PBS_EVENTCLASS_JOB,
              pjob->ji_qs.ji_jobid,
              log_buffer);
            }

          break;
          }

        ptask = (task *)GET_NEXT(ptask->ti_jobtask);

        tcount++;
        }  /* END while (ptask) */

      if (ptask != NULL)
        {
        /* pid match located - break out of job loop */

        break;
        }

      pjob = (job *)GET_PRIOR(pjob->ji_alljobs);
      }  /* END while (pjob != NULL) */

    if (pjob == NULL)
      {
      if (LOGLEVEL >= 1)
        {
        sprintf(log_buffer, "pid %d not tracked, exitcode=%d",
          pid,
          statloc);

        log_record(
          PBSEVENT_JOB,
          PBS_EVENTCLASS_JOB,
          id,
          log_buffer);
        }

      continue;
      }  /* END if (pjob == NULL) */

    if (WIFEXITED(statloc))
      exiteval = WEXITSTATUS(statloc);
    else if (WIFSIGNALED(statloc))
      exiteval = WTERMSIG(statloc) + 0x100;
    else
      exiteval = 1;

    if (pid == pjob->ji_momsubt)
      {
      /* PID matches job mom subtask */

      /* NOTE:  both ji_momsubt and ji_mompost normally set in routine
                preobit_reply() after epilog child is successfully forked */

      if (pjob->ji_mompost != NULL)
        {
        if (pjob->ji_mompost(pjob, exiteval) == 0)
          {
          /* success */

          pjob->ji_mompost = NULL;
          }

        }  /* END if (pjob->ji_mompost != NULL) */
      else
        {
        log_record(
          PBSEVENT_JOB,
          PBS_EVENTCLASS_JOB,
          pjob->ji_qs.ji_jobid,
          "job has no postprocessing routine registered");
        }

      /* clear mom sub-task */

      pjob->ji_momsubt = 0;

      if(multi_mom)
        {
        momport = pbs_rm_port;
        }

      job_save(pjob, SAVEJOB_QUICK, momport);

      continue;
      }  /* END if (pid == pjob->ji_momsubt) */

    /* what happens if mom PID is reaped before subtask? */

    if (LOGLEVEL >= 2)
      {
      sprintf(log_buffer, "pid %d harvested for job %s, task %d, exitcode=%d",
              pid,
              pjob->ji_qs.ji_jobid,
              ptask->ti_qs.ti_task,
              exiteval);

      log_record(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        id,
        log_buffer);
      }

    /* where is job purged?  How do we keep job from progressing in state until the obit is sent? */

    kill_task(ptask, SIGKILL, 0);

    ptask->ti_qs.ti_exitstat = exiteval;

    ptask->ti_qs.ti_status   = TI_STATE_EXITED;

    task_save(ptask);

    sprintf(log_buffer, "%s: job %s task %d terminated, sid=%d",
            id,
            pjob->ji_qs.ji_jobid,
            ptask->ti_qs.ti_task,
            ptask->ti_qs.ti_sid);

    LOG_EVENT(
      PBSEVENT_DEBUG,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);

    exiting_tasks = 1;
    }  /* END while ((pid = waitpid(-1,&statloc,WNOHANG)) > 0) */

  return;
  }  /* END scan_for_terminated() */
예제 #13
0
void
req_modifyjob(struct batch_request *preq)
{
	int		 add_to_am_list = 0; /* if altered during sched cycle */
	int		 bad = 0;
	int		 jt;		/* job type */
	int		 newstate;
	int		 newsubstate;
	resource_def	*outsideselect = NULL;
	job		*pjob;
	svrattrl	*plist;
	resource	*presc;
	resource_def	*prsd;
	int		 rc;
	int		 running = 0;
	int		 sendmom = 0;
	char		hook_msg[HOOK_MSG_SIZE];
	int		mod_project = 0;
	pbs_sched	*psched;

	switch (process_hooks(preq, hook_msg, sizeof(hook_msg),
			pbs_python_set_interrupt)) {
		case 0:	/* explicit reject */
			reply_text(preq, PBSE_HOOKERROR, hook_msg);
			return;
		case 1:   /* explicit accept */
			if (recreate_request(preq) == -1) { /* error */
				/* we have to reject the request, as 'preq' */
				/* may have been partly modified            */
				strcpy(hook_msg,
					"modifyjob event: rejected request");
				log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_HOOK,
					LOG_ERR, "", hook_msg);
				reply_text(preq, PBSE_HOOKERROR, hook_msg);
				return;
			}
			break;
		case 2:	/* no hook script executed - go ahead and accept event*/
			break;
		default:
			log_event(PBSEVENT_DEBUG2, PBS_EVENTCLASS_HOOK,
				LOG_INFO, "", "modifyjob event: accept req by default");
	}

	if (pseldef == NULL)  /* do one time to keep handy */
		pseldef = find_resc_def(svr_resc_def, "select", svr_resc_size);

	pjob = chk_job_request(preq->rq_ind.rq_modify.rq_objname, preq, &jt);
	if (pjob == NULL)
		return;

	if ((jt == IS_ARRAY_Single) || (jt == IS_ARRAY_Range)) {
		req_reject(PBSE_IVALREQ, 0, preq);
		return;
	}

	psched = find_sched_from_sock(preq->rq_conn);
	/* allow scheduler to modify job */
	if (psched == NULL) {
		/* provisioning job is not allowed to be modified */
		if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
			(pjob->ji_qs.ji_substate == JOB_SUBSTATE_PROVISION)) {
			req_reject(PBSE_BADSTATE, 0, preq);
			return;
		}
	}

	/* cannot be in exiting or transit, exiting has already be checked */

	if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT) {
		req_reject(PBSE_BADSTATE, 0, preq);
		return;
	}

	plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr);
	if (plist == NULL) {	/* nothing to do */
		reply_ack(preq);
		return;
	}

	/*
	 * Special checks must be made:
	 *	if during a scheduling cycle and certain attributes are altered,
	 *	   make a note of the job to prevent it from being run now;
	 *	if job is running, only certain attributes/resources can be
	 *	   altered.
	 */

	if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) {
		running = 1;
	}
	while (plist) {
		int i;

		i = find_attr(job_attr_def, plist->al_name, JOB_ATR_LAST);

		/*
		 * Is the attribute being altered one which could change
		 * scheduling (ATR_DFLAG_SCGALT set) and if a scheduling
		 * cycle is in progress, then set flag to add the job to list
		 * of jobs which cannot be run in this cycle.
		 * If the scheduler itself sends a modify job request,
		 * no need to delay the job until next cycle.
		 */
		if ((psched == NULL) && (scheduler_jobs_stat) && (job_attr_def[i].at_flags & ATR_DFLAG_SCGALT))
			add_to_am_list = 1;

		/* Is the attribute modifiable in RUN state ? */

		if (i < 0) {
			reply_badattr(PBSE_NOATTR, 1, plist, preq);
			return;
		}
		if ((running == 1) &&
			((job_attr_def[i].at_flags & ATR_DFLAG_ALTRUN) == 0)) {

			reply_badattr(PBSE_MODATRRUN, 1, plist, preq);
			return;
		}
		if (i == (int)JOB_ATR_resource) {

			prsd = find_resc_def(svr_resc_def, plist->al_resc,
				svr_resc_size);

			if (prsd == 0) {
				reply_badattr(PBSE_UNKRESC, 1, plist, preq);
				return;
			}

			/* is the specified resource modifiable while */
			/* the job is running                         */

			if (running) {

				if ((prsd->rs_flags & ATR_DFLAG_ALTRUN) == 0) {
					reply_badattr(PBSE_MODATRRUN, 1, plist, preq);
					return;
				}

				sendmom = 1;
			}

			/* should the resource be only in a select spec */

			if (prsd->rs_flags & ATR_DFLAG_CVTSLT && !outsideselect &&
				plist->al_atopl.value && plist->al_atopl.value[0]) {
				/* if "-lresource" is set and has non-NULL value,
				** remember as potential bad resource
				** if this appears along "select".
				*/
				outsideselect = prsd;
			}
		}
		if (strcmp(plist->al_name, ATTR_project) == 0) {
			mod_project = 1;
		} else if ((strcmp(plist->al_name, ATTR_runcount) == 0) &&
			((plist->al_flags & ATR_VFLAG_HOOK) == 0) &&
			(plist->al_value != NULL) &&
			(plist->al_value[0] != '\0') &&
			((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0) &&
		(atol(plist->al_value) < \
		    pjob->ji_wattr[(int)JOB_ATR_runcount].at_val.at_long)) {
			sprintf(log_buffer,
				"regular user %s@%s cannot decrease '%s' attribute value from %ld to %ld",
				preq->rq_user, preq->rq_host, ATTR_runcount,
				pjob->ji_wattr[(int)JOB_ATR_runcount].at_val.at_long,
				atol(plist->al_value));
			log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_ERR,
				pjob->ji_qs.ji_jobid, log_buffer);
			req_reject(PBSE_PERM, 0, preq);
			return;
		}
		plist = (svrattrl *)GET_NEXT(plist->al_link);
	}

	if (outsideselect) {
		presc = find_resc_entry(&pjob->ji_wattr[(int)JOB_ATR_resource],
			pseldef);
		if (presc &&
			((presc->rs_value.at_flags & ATR_VFLAG_DEFLT) == 0)) {
			/* select is not a default, so reject qalter */

			resc_in_err = strdup(outsideselect->rs_name);
			req_reject(PBSE_INVALJOBRESC, 0, preq);
			return;
		}

	}

	/* modify the jobs attributes */

	bad = 0;
	plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr);
	rc = modify_job_attr(pjob, plist, preq->rq_perm, &bad);
	if (rc) {
		if (pjob->ji_clterrmsg)
			reply_text(preq, rc, pjob->ji_clterrmsg);
		else
			reply_badattr(rc, bad, plist, preq);
		return;
	}

	/* If certain attributes modified and if in scheduling cycle  */
	/* then add to list of jobs which cannot be run in this cycle */

	if (add_to_am_list)
		am_jobs_add(pjob);	/* see req_runjob() */

	/* check if project attribute was requested to be modified to */
	/* be the default project value */
	if (mod_project && (pjob->ji_wattr[(int)JOB_ATR_project].at_flags & \
							ATR_VFLAG_SET)) {

		if (strcmp(pjob->ji_wattr[(int)JOB_ATR_project].at_val.at_str,
			PBS_DEFAULT_PROJECT) == 0) {
			sprintf(log_buffer, msg_defproject,
				ATTR_project, PBS_DEFAULT_PROJECT);
#ifdef NAS /* localmod 107 */
			log_event(PBSEVENT_DEBUG4, PBS_EVENTCLASS_JOB, LOG_INFO,
				pjob->ji_qs.ji_jobid, log_buffer);
#else
			log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO,
				pjob->ji_qs.ji_jobid, log_buffer);
#endif /* localmod 107 */
		}
	}

	if (pjob->ji_wattr[(int)JOB_ATR_resource].at_flags & ATR_VFLAG_MODIFY) {
		presc = find_resc_entry(&pjob->ji_wattr[(int)JOB_ATR_resource],
			pseldef);
		if (presc && (presc->rs_value.at_flags & ATR_VFLAG_DEFLT)) {
			/* changing Resource_List and select is a default   */
			/* clear "select" so it is rebuilt inset_resc_deflt */
			pseldef->rs_free(&presc->rs_value);
		}
	}

	/* Reset any defaults resource limit which might have been unset */
	if ((rc = set_resc_deflt((void *)pjob, JOB_OBJECT, NULL)) != 0) {
		req_reject(rc, 0, preq);
		return;
	}

	/* if job is not running, may need to change its state */

	if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING) {
		svr_evaljobstate(pjob, &newstate, &newsubstate, 0);
		(void)svr_setjobstate(pjob, newstate, newsubstate);
	} else {
		(void)job_save(pjob, SAVEJOB_FULL);
	}
	(void)sprintf(log_buffer, msg_manager, msg_jobmod,
		preq->rq_user, preq->rq_host);
	log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO,
		pjob->ji_qs.ji_jobid, log_buffer);

	/* if a resource limit changed for a running job, send to MOM */

	if (sendmom) {
		rc = relay_to_mom(pjob, preq, post_modify_req);
		if (rc)
			req_reject(rc, 0, preq);    /* unable to get to MOM */
		return;
	}

	reply_ack(preq);
}
예제 #14
0
void post_signal_req(

  batch_request *preq)

  {
  char                 *jobid;
  job                  *pjob;

  char                  log_buf[LOCAL_LOG_BUF_SIZE];

  /* request has been handled elsewhere */
  if (preq == NULL)
    return;

  preq->rq_conn = preq->rq_orgconn;  /* restore client socket */

  if (preq->rq_reply.brp_code)
    {
    log_event(
      PBSEVENT_DEBUG,
      PBS_EVENTCLASS_REQUEST,
      preq->rq_ind.rq_signal.rq_jid,
      pbse_to_txt(PBSE_MOMREJECT));

    errno = 0;

    req_reject(preq->rq_reply.brp_code, 0, preq, NULL, NULL);
    }
  else
    {
    if ((jobid = preq->rq_extra) == NULL)
      {
      log_err(ENOMEM, __func__, (char *)"Cannot allocate memory! FAILURE");
      return;
      }

    if ((pjob = svr_find_job(jobid, FALSE)) != NULL)
      {
      if (strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_SUSPEND) == 0)
        {
        if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_Suspend) == 0)
          {
          pjob->ji_qs.ji_svrflags |= JOB_SVFLG_Suspend;
          
          set_statechar(pjob);
          
          job_save(pjob, SAVEJOB_QUICK, 0);
          
          /* release resources allocated to suspended job - NORWAY */
          
          free_nodes(pjob);
          }
        }
      else if (strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_RESUME) == 0)
        {
        if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_Suspend)
          {
          /* re-allocate assigned node to resumed job - NORWAY */
          
          set_old_nodes(pjob);
          
          pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_Suspend;
          
          set_statechar(pjob);
          
          job_save(pjob, SAVEJOB_QUICK, 0);
          }
        }
    
      unlock_ji_mutex(pjob, __func__, (char *)"5", LOGLEVEL);
      }
    else
      {
      /* job is gone */
      snprintf(log_buf,sizeof(log_buf),
        "Cannot find job '%s', assuming success",
        jobid);
      log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, __func__, log_buf);
      }

    free(jobid);

    reply_ack(preq);
    }

  return;
  }  /* END post_signal_req() */
예제 #15
0
파일: req_delete.c 프로젝트: dhill12/test
void purge_completed_jobs(

  struct batch_request *preq)  /* I */

  {
  job          *pjob;
  char         *time_str;
  time_t        purge_time = 0;
  int           iter;
  char          log_buf[LOCAL_LOG_BUF_SIZE];

  /* get the time to purge the jobs that completed before */
  time_str = preq->rq_extend;
  time_str += strlen(PURGECOMP);
  purge_time = strtol(time_str,NULL,10);
  
  /*
    * Clean unreported capability is only for operators and managers.
    * Check if request is authorized
  */

  if ((preq->rq_perm & (ATR_DFLAG_OPRD|ATR_DFLAG_OPWR|
                    ATR_DFLAG_MGRD|ATR_DFLAG_MGWR)) == 0)
    {
    req_reject(PBSE_PERM,0,preq,NULL,
      "must have operator or manager privilege to use -c parameter");
    return;
    }
    
  reply_ack(preq);

  if (LOGLEVEL >= 4)
    {
    sprintf(log_buf,"Received purge completed jobs command, purge time is %ld (%s)",
      (long)purge_time, preq->rq_extend);

    log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_REQUEST, __func__, log_buf);
    }

  iter = -1;

  while ((pjob = next_job(&alljobs,&iter)) != NULL) 
    {
    if ((pjob->ji_qs.ji_substate == JOB_SUBSTATE_COMPLETE) &&
        (pjob->ji_wattr[JOB_ATR_comp_time].at_val.at_long <= purge_time) &&
        ((pjob->ji_wattr[JOB_ATR_reported].at_flags & ATR_VFLAG_SET) != 0) &&
        (pjob->ji_wattr[JOB_ATR_reported].at_val.at_long == 0))
      {
      if (LOGLEVEL >= 4)
        {
        sprintf(log_buf,"Reported job is COMPLETED (%ld), setting reported to TRUE",
          pjob->ji_wattr[JOB_ATR_comp_time].at_val.at_long);
        
        log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);
        }
      
      pjob->ji_wattr[JOB_ATR_reported].at_val.at_long = 1;
      pjob->ji_wattr[JOB_ATR_reported].at_flags = ATR_VFLAG_SET | ATR_VFLAG_MODIFY;
          
      job_save(pjob, SAVEJOB_FULL, 0); 
      }

    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
    }


  return;
  } /* END purge_completed_jobs() */
예제 #16
0
파일: mom_start.c 프로젝트: msbritt/torque
void scan_for_terminated(void) /* linux */

  {
  int           exiteval = 0;
  pid_t         pid;
  job          *pjob = NULL;
  task         *ptask = NULL;
  int           statloc;
  unsigned int  momport = 0;

#ifdef USESAVEDRESOURCES
  int           update_stats = TRUE;
#endif /* USESAVEDRESOURCES */

  int           tcount;

  if (LOGLEVEL >= 9)
    {
    log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, "entered");
    }

  /* update the latest intelligence about the running jobs;         */
  /* must be done before we reap the zombies, else we lose the info */

  termin_child = 0;

  if (mom_get_sample() == PBSE_NONE)
    {
    std::list<job *>::reverse_iterator iter;

    // get a list of jobs in start time order, first to last
    for (iter = alljobs_list.rbegin(); iter != alljobs_list.rend(); iter++)
      {
      pjob = *iter;

      if ((pjob->ji_stats_done == true) || 
          (pjob->ji_qs.ji_state < JOB_STATE_RUNNING))
        continue;

#ifdef USESAVEDRESOURCES
      ptask = (task *)GET_NEXT(pjob->ji_tasks);

      /*
       ** check task with associated process id to see if we are recovering
       ** after a mom restart where process completed while we were gone
        */
      
      while (ptask != NULL)
        {
        if (ptask->ti_flags & TI_FLAGS_RECOVERY)
          {
          if (LOGLEVEL >= 7)
            {
            snprintf(log_buffer, sizeof(log_buffer), "Found match for recovering job task for sid=%d",
              ptask->ti_qs.ti_sid);

            log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);
            }
          
          update_stats = FALSE;
          break;
          }

        ptask = (task *)GET_NEXT(ptask->ti_jobtask);
        
        }  /* END while (ptask) */
      
      if (update_stats)
        {
        mom_set_use(pjob);
        }
#else

      mom_set_use(pjob);

#endif /* USESAVEDRESOURCES */
      }
    }

  /* Now figure out which task(s) have terminated (are zombies) */

  /* NOTE:  does a job's tasks include its epilog? */

  while ((pid = waitpid(-1, &statloc, WNOHANG)) > 0)
    {
    std::list<job *>::reverse_iterator iter;

    if (LOGLEVEL >= 8)
      {
      sprintf(log_buffer, "Child exited with pid: %d", pid);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buffer);
      }

    // get a list of jobs in start time order, first to last
    for (iter = alljobs_list.rbegin(); iter != alljobs_list.rend(); iter++)
      {
      pjob = *iter;

      /*
       * see if process was a child doing a special
       * function for MOM
       */

      if (pjob->ji_momsubt != 0)
        {
        if (LOGLEVEL >= 9)
          {
          snprintf(log_buffer, sizeof(log_buffer),
            "Checking to see if exiting child pid '%d' is a match for special mom task with pid=%d",
            pid, pjob->ji_momsubt);

          log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);
          }

        if (pid == pjob->ji_momsubt)
          {
          if (LOGLEVEL >= 9)
            {
            snprintf(log_buffer, sizeof(log_buffer),
              "The exiting child is a match of special subtask with pid=%d for job %s",
              pid, pjob->ji_qs.ji_jobid);

            log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);
            }

          break;
          }
        }

      /* look for task */

      ptask = (task *)GET_NEXT(pjob->ji_tasks);

      /* locate task with associated process id */

      tcount = 0;

      while (ptask != NULL)
        {
        if ((ptask->ti_qs.ti_sid == pid) &&
            (ptask->ti_qs.ti_status != TI_STATE_EXITED))
          {
          if (LOGLEVEL >= 7)
            {
            snprintf(log_buffer, sizeof(log_buffer),
              "Exiting child matches job task %d for pid=%d",
              tcount,
              pid);

            log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);
            }

          break;
          }

        ptask = (task *)GET_NEXT(ptask->ti_jobtask);

        tcount++;
        }  /* END while (ptask) */

      // make sure the task is the top level task for the job to mark the job done
      if ((ptask != NULL) &&
          (ptask->ti_qs.ti_parenttask == TM_NULL_TASK))
        {
        /* pid match located - break out of job loop */
        pjob->ji_stats_done = true;

        break;
        }

      }  /* END while (pjob != NULL) */

    if (WIFEXITED(statloc))
      exiteval = WEXITSTATUS(statloc);
    else if (WIFSIGNALED(statloc))
      exiteval = WTERMSIG(statloc) + 0x100;
    else
      exiteval = 1;

    if (pjob == NULL)
      {
      if (LOGLEVEL >= 1)
        {
        sprintf(log_buffer, "Child pid %d is not part of a job, statloc=%d, exitval=%d",
          pid,
          statloc,
          exiteval);

        log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buffer);
        }

      continue;
      }  /* END if (pjob == NULL) */

    if (pid == pjob->ji_momsubt)
      {
      /* PID matches job mom subtask */

      /* NOTE:  both ji_momsubt and ji_mompost normally set in routine
                preobit_reply() after epilog child is successfully forked */

      if (pjob->ji_mompost != NULL)
        {
        if (pjob->ji_mompost(pjob, exiteval) == 0)
          {
          /* success */

          pjob->ji_mompost = NULL;
          }

        }  /* END if (pjob->ji_mompost != NULL) */
      else if (LOGLEVEL >= 8) // This is a debug statement
        {
        log_record(
          PBSEVENT_JOB,
          PBS_EVENTCLASS_JOB,
          pjob->ji_qs.ji_jobid,
          "Job has no postprocessing routine registered");
        }

      /* clear mom sub-task */

      pjob->ji_momsubt = 0;

      if (multi_mom)
        {
        momport = pbs_rm_port;
        }

      job_save(pjob, SAVEJOB_QUICK, momport);

      continue;
      }  /* END if (pid == pjob->ji_momsubt) */

    if (ptask == NULL)
      continue;

    /* what happens if mom PID is reaped before subtask? */

    if (LOGLEVEL >= 2)
      {
      sprintf(log_buffer, "pid %d harvested for job %s, task %d, exitcode=%d",
              pid,
              pjob->ji_qs.ji_jobid,
              ptask->ti_qs.ti_task,
              exiteval);

      log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buffer);
      }

    /* where is job purged?  How do we keep job from progressing in state until the obit is sent? */

    kill_task(pjob, ptask, SIGKILL, 0);

    ptask->ti_qs.ti_exitstat = exiteval;

    ptask->ti_qs.ti_status   = TI_STATE_EXITED;

    task_save(ptask);

    sprintf(log_buffer, "%s: job %s task %d terminated, sid=%d",
      __func__,
      pjob->ji_qs.ji_jobid,
      ptask->ti_qs.ti_task,
      ptask->ti_qs.ti_sid);

    log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);

    exiting_tasks = 1;
    }  /* END while ((pid = waitpid(-1,&statloc,WNOHANG)) > 0) */

  return;
  }  /* END scan_for_terminated() */
예제 #17
0
/**
 * @brief	wait_action
 *		Wait for a task that has terminated or a socket that is ready to read.
 *		Mark any terminated task as Exiting and do network processing on
 *		any ready socket.
 *
 * @return	void
 */
void
wait_action(void)
{
	static	char	id[] = "wait_action";
	int		rc = 0;
	int		hNum = 0;
	HANDLE		hArray[MAXIMUM_WAIT_OBJECTS+1] = {INVALID_HANDLE_VALUE};
	HANDLE		hProc = INVALID_HANDLE_VALUE;
	extern	HANDLE	hStop;	/* mutex: quit when released */
	int		ecode = -1;
	job		*pjob = NULL;
	task		*ptask = NULL;
	int		waittime = 500;
	extern	int	mom_run_state;
	struct work_task *p_wtask = NULL;
	HANDLE		  pid = INVALID_HANDLE_VALUE;

	/* Check for non job-related tasks like periodic hook tasks */
	while (1) {

		if ((pid = waitpid((HANDLE)-1, &ecode, WNOHANG)) == (HANDLE)-1) {
			if (errno == EINTR) {
				continue;
			} else {
				break;
			}

		} else if (pid == 0) {
			break;
		}

		p_wtask = (struct work_task *)GET_NEXT(task_list_event);
		while (p_wtask) {
			if ((p_wtask->wt_type == WORK_Deferred_Child) &&
				((HANDLE)p_wtask->wt_event == pid)) {
				p_wtask->wt_type = WORK_Deferred_Cmp;
				p_wtask->wt_aux = (int)ecode;	/* exit status */
				svr_delay_entry++;	/* see next_task() */
			}
			p_wtask = (struct work_task *)GET_NEXT(p_wtask->wt_linkall);
		}
	}

	for (;;) {
		hNum = 0;
		if (mom_run_state && hStop != NULL)	/* add mutex to array */
			hArray[hNum++] = hStop;

		pjob = (job *)GET_NEXT(svr_alljobs);
		while (pjob) {
			/*
			 * see if process was a child doing a special
			 * function for MOM
			 */
			if ((pjob->ji_momsubt != NULL) &&
				(pjob->ji_momsubt != INVALID_HANDLE_VALUE) &&
				(pjob->ji_mompost != NULL)) {
				hArray[hNum++] = pjob->ji_momsubt;
			}

			/*
			 * process tasks
			 */
			ptask = (task *)GET_NEXT(pjob->ji_tasks);
			while (ptask) {
				if ((ptask->ti_hProc != NULL) &&
					(ptask->ti_hProc != INVALID_HANDLE_VALUE))
					hArray[hNum++] = ptask->ti_hProc;
				if (hNum > MAXIMUM_WAIT_OBJECTS)
					break;
				ptask = (task *)GET_NEXT(ptask->ti_jobtask);
			}
			if (hNum > MAXIMUM_WAIT_OBJECTS) {
				DBPRT(("%s: %d more than MAX\n", id, hNum))
				hNum = MAXIMUM_WAIT_OBJECTS;
				break;
			}
			pjob = (job *)GET_NEXT(pjob->ji_alljobs);
		}

		if (hNum == 0)		/* nothing to wait for */
			break;

		rc = WaitForMultipleObjects(hNum, hArray,
			FALSE, waittime);
		if (rc == WAIT_TIMEOUT)	/* nobody is done */
			break;
		else if (rc == WAIT_FAILED) {
			log_err(-1, id, "WaitForMultipleObjects");
			break;
		}

		waittime = 0;		/* only wait the first time */
		rc -= WAIT_OBJECT_0;	/* which object was it? */
		assert(0 <= rc && rc < hNum);

		if (rc == 0 && mom_run_state && hStop != NULL) {		/* got mutex */
			mom_run_state = 0;				/* shutdown */
			continue;
		}
		/*
		 **	It was a process finishing.  Find which one.
		 */
		hProc = hArray[rc];

		rc = GetExitCodeProcess(hProc, &ecode);
		if (rc == 0) {
			log_err(-1, id, "GetExitCodeProcess");
			ecode = 99;
		} else if (rc == STILL_ACTIVE)	/* shouldn't happen */
			break;
		CloseHandle(hProc);

		/* find which process finished */
		pjob = (job *)GET_NEXT(svr_alljobs);
		while (pjob) {
			if (pjob->ji_momsubt == hProc)
				break;

			ptask = (task *)GET_NEXT(pjob->ji_tasks);
			while (ptask) {
				if (ptask->ti_hProc == hProc)
					break;
				ptask = (task *)GET_NEXT(ptask->ti_jobtask);
			}
			if (ptask)
				break;
			pjob = (job *)GET_NEXT(pjob->ji_alljobs);
		}
		assert(pjob != NULL);

		if (pjob->ji_momsubt == hProc) {
			pjob->ji_momsubt = NULL;
			if (pjob->ji_mompost) {
				pjob->ji_mompost(pjob, ecode);

				/* After epilogue, get rid of any HOSTFILE */
				if (pjob->ji_mompost == send_obit) {
					char	file[MAXPATHLEN+1];

					(void)sprintf(file, "%s/aux/%s",
						pbs_conf.pbs_home_path,
						pjob->ji_qs.ji_jobid);
					(void)unlink(file);
				}
				pjob->ji_mompost = 0;
			}
			(void)job_save(pjob, SAVEJOB_QUICK);
			continue;
		}

		DBPRT(("%s: task %d pid %d exit value %d\n", id,
			ptask->ti_qs.ti_task, ptask->ti_qs.ti_sid,
			ecode))
		ptask->ti_hProc = NULL;
		ptask->ti_qs.ti_exitstat = ecode;
		ptask->ti_qs.ti_status = TI_STATE_EXITED;
		ptask->ti_qs.ti_sid = 0;
		(void)task_save(ptask);
		sprintf(log_buffer, "task %d terminated", ptask->ti_qs.ti_task);
		log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_DEBUG,
			pjob->ji_qs.ji_jobid, log_buffer);

		exiting_tasks = 1;
	}

	connection_idlecheck();
}
예제 #18
0
/**
 * update_array_values()
 *
 * updates internal bookeeping values for job arrays
 * @param pa - array to update
 * @param pjob - the pjob that an event happened on
 * @param event - code for what event just happened
 */
void update_array_values(

  job_array            *pa,        /* I */
  void                 *j,         /* I */
  int                   old_state, /* I */
  enum ArrayEventsEnum  event)     /* I */

  {
  job *pjob = (job *)j;
  int exit_status;

  switch (event)
    {
    case aeQueue:

      /* NYI, nothing needs to be done for this yet */

      break;

    case aeRun:

      if (old_state != JOB_STATE_RUNNING)
        {
        pa->ai_qs.jobs_running++;
        pa->ai_qs.num_started++;
        }

      break;

    case aeTerminate:

      exit_status = pjob->ji_qs.ji_un.ji_exect.ji_exitstat;
      if (old_state == JOB_STATE_RUNNING)
        {
        if (pa->ai_qs.jobs_running > 0)
          pa->ai_qs.jobs_running--;
        }

      if (exit_status == 0)
        {
        pa->ai_qs.num_successful++;
        pa->ai_qs.jobs_done++;
        }
      else
        {
        pa->ai_qs.num_failed++;
        pa->ai_qs.jobs_done++;
        }

      array_save(pa);

      /* update slot limit hold if necessary */
      if (server.sv_attr[SRV_ATR_MoabArrayCompatible].at_val.at_long != FALSE)
        {
        /* only need to update if the job wasn't previously held */
        if ((pjob->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) == FALSE)
          {
          int  i;
          int  newstate;
          int  newsub;
          job *pj;

          /* find the first held job and release its hold */
          for (i = 0; i < pa->ai_qs.array_size; i++)
            {
            if (pa->jobs[i] == NULL)
              continue;

            pj = (job *)pa->jobs[i];

            if (pj->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l)
              {
              pj->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;

              if (pj->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
                {
                pj->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
                }
             
              svr_evaljobstate(pj, &newstate, &newsub, 1);
              svr_setjobstate(pj, newstate, newsub);
              job_save(pj, SAVEJOB_FULL, 0);

              break;
              }
            }
          }
        }

      break;

    default:

      /* log error? */

      break;
    }

  set_array_depend_holds(pa);
  array_save(pa);

  } /* END update_array_values() */
예제 #19
0
void
req_orderjob(struct batch_request *req)
{
	int      jt1, jt2;            /* job type */
	job	*pjob;
	job	*pjob1;
	job	*pjob2;
	long	 rank;
	int	 rc;
	char	 tmpqn[PBS_MAXQUEUENAME+1];

	if ((pjob1=chk_job_request(req->rq_ind.rq_move.rq_jid, req, &jt1)) == NULL)
		return;
	if ((pjob2=chk_job_request(req->rq_ind.rq_move.rq_destin, req, &jt2)) == NULL)
		return;
	if ((jt1 == IS_ARRAY_Single) || (jt2 == IS_ARRAY_Single) ||
		(jt1 == IS_ARRAY_Range)  || (jt2 == IS_ARRAY_Range)) {
		/* can only move regular or Array Job, not Subjobs */
		req_reject(PBSE_IVALREQ, 0, req);
		return;
	}

	if (((pjob = pjob1)->ji_qs.ji_state == JOB_STATE_RUNNING) ||
		((pjob = pjob2)->ji_qs.ji_state == JOB_STATE_RUNNING) ||
		((pjob = pjob1)->ji_qs.ji_state == JOB_STATE_BEGUN)   ||
		((pjob = pjob2)->ji_qs.ji_state == JOB_STATE_BEGUN)) {
#ifndef NDEBUG
		(void)sprintf(log_buffer, "(%s) %s, state=%d",
			__func__, msg_badstate, pjob->ji_qs.ji_state);
		log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_DEBUG,
			pjob->ji_qs.ji_jobid, log_buffer);
#endif	/* NDEBUG */
		req_reject(PBSE_BADSTATE, 0, req);
		return;
	} else if (pjob1->ji_qhdr != pjob2->ji_qhdr) {

		/* Jobs are in different queues */

		if ((rc = svr_chkque(pjob1, pjob2->ji_qhdr,
			get_hostPart(pjob1->ji_wattr[(int)JOB_ATR_job_owner].at_val.at_str),
			MOVE_TYPE_Order)) ||
			(rc = svr_chkque(pjob2, pjob1->ji_qhdr,
			get_hostPart(pjob2->ji_wattr[(int)JOB_ATR_job_owner].at_val.at_str),
			MOVE_TYPE_Order))) {
			req_reject(rc, 0, req);
			return;
		}
	}

	/* now swap the order of the two jobs in the queue lists */

	rank = pjob1->ji_wattr[(int)JOB_ATR_qrank].at_val.at_long;
	pjob1->ji_wattr[(int)JOB_ATR_qrank].at_val.at_long =
		pjob2->ji_wattr[(int)JOB_ATR_qrank].at_val.at_long;
	pjob1->ji_wattr[(int)JOB_ATR_qrank].at_flags |= ATR_VFLAG_MODCACHE;
	pjob2->ji_wattr[(int)JOB_ATR_qrank].at_val.at_long = rank;
	pjob2->ji_wattr[(int)JOB_ATR_qrank].at_flags |= ATR_VFLAG_MODCACHE;

	if (pjob1->ji_qhdr != pjob2->ji_qhdr) {
		(void)strcpy(tmpqn, pjob1->ji_qs.ji_queue);
		(void)strcpy(pjob1->ji_qs.ji_queue, pjob2->ji_qs.ji_queue);
		(void)strcpy(pjob2->ji_qs.ji_queue, tmpqn);
		svr_dequejob(pjob1);
		svr_dequejob(pjob2);
		(void)svr_enquejob(pjob1);
		(void)svr_enquejob(pjob2);

	} else {
		swap_link(&pjob1->ji_jobque,  &pjob2->ji_jobque);
		swap_link(&pjob1->ji_alljobs, &pjob2->ji_alljobs);
	}

	/* need to update disk copy of both jobs to save new order */

	(void)job_save(pjob1, SAVEJOB_FULL);
	(void)job_save(pjob2, SAVEJOB_FULL);

	reply_ack(req);
}
예제 #20
0
void *req_checkpointjob(

  batch_request *preq) /* I */

  {
  job           *pjob;
  int            rc;
  pbs_attribute *pattr;
  char           log_buf[LOCAL_LOG_BUF_SIZE];
  batch_request *dup_req = NULL;

  if ((pjob = chk_job_request(preq->rq_ind.rq_manager.rq_objname, preq)) == NULL)
    {
    return(NULL);
    }

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  pattr = &pjob->ji_wattr[JOB_ATR_checkpoint];

  if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
      ((pattr->at_flags & ATR_VFLAG_SET) &&
       ((csv_find_string(pattr->at_val.at_str, "s") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "c") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "enabled") != NULL))))
    {
    /* have MOM attempt checkpointing */

    if ((dup_req = duplicate_request(preq)) == NULL)
      {
      req_reject(PBSE_SYSTEM, 0, preq, NULL, "failure to allocate memory");
      }

    /* The dup_req is freed in relay_to_mom (failure)
     * or in issue_Drequest (success) */
    else if ((rc = relay_to_mom(&pjob, dup_req, NULL)) != PBSE_NONE)
      {
      req_reject(rc, 0, preq, NULL, NULL);
      free_br(dup_req);

      if (pjob == NULL)
        job_mutex.set_unlock_on_exit(false);
      }
    else
      {
      if (pjob != NULL)
        {
        pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHECKPOINT_FILE;
        
        job_save(pjob, SAVEJOB_QUICK, 0);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
        pjob = NULL;
        }
      else
        job_mutex.set_unlock_on_exit(false);

      process_checkpoint_reply(dup_req);
      }
    }
  else
    {
    /* Job does not have checkpointing enabled, so reject the request */

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    req_reject(PBSE_IVALREQ, 0, preq, NULL, "job is not checkpointable");
    }

  return(NULL);
  }  /* END req_checkpointjob() */
예제 #21
0
int req_holdjob(

  batch_request *vp) /* I */

  {
  long          *hold_val;
  int            newstate;
  int            newsub;
  long           old_hold;
  job           *pjob;
  char          *pset;
  int            rc;
  pbs_attribute  temphold;
  pbs_attribute *pattr;
  batch_request *preq = (struct batch_request *)vp;
  char           log_buf[LOCAL_LOG_BUF_SIZE];
  batch_request *dup_req = NULL;

  pjob = chk_job_request(preq->rq_ind.rq_hold.rq_orig.rq_objname, preq);

  if (pjob == NULL)
    {
    return(PBSE_NONE);
    }

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  /* cannot do anything until we decode the holds to be set */
  if ((rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, (const char **)&pset,
                     &temphold)) != 0)
    {
    req_reject(rc, 0, preq, NULL, NULL);

    return(PBSE_NONE);
    }

  /* if other than HOLD_u is being set, must have privil */

  if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0)
    {
    req_reject(rc, 0, preq, NULL, NULL);

    return(PBSE_NONE);
    }

  hold_val = &pjob->ji_wattr[JOB_ATR_hold].at_val.at_long;

  old_hold = *hold_val;
  *hold_val |= temphold.at_val.at_long;
  pjob->ji_wattr[JOB_ATR_hold].at_flags |= ATR_VFLAG_SET;
  sprintf(log_buf, msg_jobholdset, pset, preq->rq_user, preq->rq_host);

  pattr = &pjob->ji_wattr[JOB_ATR_checkpoint];

  if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
      ((pattr->at_flags & ATR_VFLAG_SET) &&
       ((csv_find_string(pattr->at_val.at_str, "s") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "c") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "enabled") != NULL))))
    {

    /* have MOM attempt checkpointing */

    /*
    ** The jobid in the request always have the server suffix attached
    ** which is dropped when the server attribute 
    ** 'display_job_server_suffix' is FALSE and so will in the MOM's.
    ** Therefore, it must be passed as the server to the MOM so she can
    ** find it to hold.
    */
    if (strncmp(pjob->ji_qs.ji_jobid, 
          preq->rq_ind.rq_hold.rq_orig.rq_objname, PBS_MAXSVRJOBID))
       snprintf(preq->rq_ind.rq_hold.rq_orig.rq_objname, 
          sizeof(preq->rq_ind.rq_hold.rq_orig.rq_objname), "%s", 
          pjob->ji_qs.ji_jobid);
    if ((dup_req = duplicate_request(preq)) == NULL)
      {
      req_reject(rc, 0, preq, NULL, "memory allocation failure");
      }
    /* The dup_req is freed in relay_to_mom (failure)
     * or in issue_Drequest (success) */
    else if ((rc = relay_to_mom(&pjob, dup_req, NULL)) != PBSE_NONE)
      {
      free_br(dup_req);
      *hold_val = old_hold;  /* reset to the old value */
      req_reject(rc, 0, preq, NULL, "relay to mom failed");

      if (pjob == NULL)
        job_mutex.set_unlock_on_exit(false);
      }
    else
      {
      if (pjob != NULL)
        {
        pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_FILE;
        
        job_save(pjob, SAVEJOB_QUICK, 0);
        
        /* fill in log_buf again, since relay_to_mom changed it */
        sprintf(log_buf, msg_jobholdset, pset, preq->rq_user, preq->rq_host);
        
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
        pjob = NULL;
        reply_ack(preq);
        }
      else
        job_mutex.set_unlock_on_exit(false);

      process_hold_reply(dup_req);
      }
    }
#ifdef ENABLE_BLCR
  else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * This system is configured with BLCR checkpointing to be used,
     * but this Running job does not have checkpointing enabled,
     * so we reject the request
     */

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    req_reject(PBSE_IVALREQ, 0, preq, NULL,
      "job not held since checkpointing is expected but not enabled for job");
    }
#endif
  else
    {
    /* everything went well, may need to update the job state */
    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    if (old_hold != *hold_val)
      {
      /* indicate attributes changed     */
      pjob->ji_modified = 1;

      svr_evaljobstate(*pjob, newstate, newsub, 0);

      svr_setjobstate(pjob, newstate, newsub, FALSE);
      }

    reply_ack(preq);
    }

  return(PBSE_NONE);
  }  /* END req_holdjob() */
예제 #22
0
파일: job_recov.c 프로젝트: CESNET/torque
job *job_recov(

  char *filename) /* I */   /* pathname to job save file */

  {
  int  fds;
  job *pj;
  char *pn;
  char  namebuf[MAXPATHLEN];
  int    qs_upgrade;
#ifndef PBS_MOM
  char   parent_id[PBS_MAXSVRJOBID + 1];
  job_array *pa;
#endif

  qs_upgrade = FALSE;

  pj = job_alloc(); /* allocate & initialize job structure space */

  if (pj == NULL)
    {
    /* FAILURE - cannot alloc memory */

    return(NULL);
    }

  strcpy(namebuf, path_jobs); /* job directory path */

  strcat(namebuf, filename);

  fds = open(namebuf, O_RDONLY, 0);

  if (fds < 0)
    {
    sprintf(log_buffer, "unable to open %s",
            namebuf);

    log_err(errno, "job_recov", log_buffer);

    free((char *)pj);

    /* FAILURE - cannot open job file */

    return(NULL);
    }

  /* read in job quick save sub-structure */

  if (read(fds, (char *)&pj->ji_qs, quicksize) != (ssize_t)quicksize &&
      pj->ji_qs.qs_version == PBS_QS_VERSION)
    {
    sprintf(log_buffer, "Unable to read %s",
            namebuf);

    log_err(errno, "job_recov", log_buffer);

    free((char *)pj);

    close(fds);

    return(NULL);
    }

  /* is ji_qs the version we expect? */

  if (pj->ji_qs.qs_version != PBS_QS_VERSION)
    {
    /* ji_qs is older version */
    sprintf(log_buffer,
            "%s appears to be from an old version. Attempting to convert.\n",
            namebuf);
    log_err(-1, "job_recov", log_buffer);

    if (job_qs_upgrade(pj, fds, namebuf, pj->ji_qs.qs_version) != 0)
      {
      sprintf(log_buffer, "unable to upgrade %s\n", namebuf);

      log_err(-1, "job_recov", log_buffer);

      free((char *)pj);

      close(fds);

      return(NULL);
      }

    qs_upgrade = TRUE;
    }  /* END if (pj->ji_qs.qs_version != PBS_QS_VERSION) */

  /* Does file name match the internal name? */
  /* This detects ghost files */

  pn = strrchr(namebuf, (int)'/') + 1;

  if (strncmp(pn, pj->ji_qs.ji_fileprefix, strlen(pj->ji_qs.ji_fileprefix)) != 0)
    {
    /* mismatch, discard job */

    sprintf(log_buffer, "Job Id %s does not match file name for %s",
            pj->ji_qs.ji_jobid,
            namebuf);

    log_err(-1, "job_recov", log_buffer);

    free((char *)pj);

    close(fds);

    return(NULL);
    }

  /* read in working attributes */

  if (recov_attr(
        fds,
        pj,
        job_attr_def,
        pj->ji_wattr,
        (int)JOB_ATR_LAST,
        (int)JOB_ATR_UNKN,
        TRUE) != 0)
    {
    sprintf(log_buffer, "unable to recover %s (file is likely corrupted)",
            namebuf);

    log_err(-1, "job_recov", log_buffer);

    job_free(pj);

    close(fds);

    return(NULL);
    }

#ifdef PBS_MOM
  /* read in tm sockets and ips */

  if (recov_tmsock(fds, pj) != 0)
    {
    sprintf(log_buffer, "warning: tmsockets not recovered from %s (written by an older pbs_mom?)",
            namebuf);

    log_err(-1, "job_recov", log_buffer);
    }

  if (recov_roottask(fds, pj) != 0)
    {
    sprintf(log_buffer, "warning: root task not recovered from %s (written by an older pbs_mom?)",
            namebuf);

    log_err(-1, "job_recov", log_buffer);
    }

  if (recov_jobflags(fds, pj) != 0)
    {
    sprintf(log_buffer, "warning: job flags not recovered from %s (written by an older pbs_mom?)", namebuf);

    log_err(-1, "job_recov", log_buffer);
    }

#else /* PBS_MOM */

  if (pj->ji_wattr[(int)JOB_ATR_job_array_request].at_flags & ATR_VFLAG_SET)
    {
    /* job is part of an array.  We need to put a link back to the server
    job array struct for this array. We also have to link this job into
    the linked list of jobs belonging to the array. */

    array_get_parent_id(pj->ji_qs.ji_jobid, parent_id);
    pa = get_array(parent_id);

    if (strcmp(parent_id, pj->ji_qs.ji_jobid) == 0)
      {
      pj->ji_isparent = TRUE;
      }
    else
      {
      if (pa == NULL)
        {
        /* couldn't find array struct, it must not have been recovered,
        treat job as indepentent job?  perhaps we should delete the job
        XXX_JOB_ARRAY: should I unset this?*/
        pj->ji_wattr[(int)JOB_ATR_job_array_request].at_flags &= ~ATR_VFLAG_SET;
        }
      else
        {
        CLEAR_LINK(pj->ji_arrayjobs);
        append_link(&pa->array_alljobs, &pj->ji_arrayjobs, (void*)pj);
        pj->ji_arraystruct = pa;
        pa->jobs_recovered++;
        }
      }
    }

#endif

  close(fds);

  /* all done recovering the job */

  if (qs_upgrade == TRUE)
    {
    job_save(pj, SAVEJOB_FULL);
    }

  return(pj);
  }  /* END job_recov() */
예제 #23
0
int setup_array_struct(job *pjob)
  {
  job_array *pa;

  /* struct work_task *wt; */
  array_request_node *rn;
  int bad_token_count;
  int array_size;
  int rc;

  /* setup a link to this job array in the servers all_arrays list */
  pa = (job_array *)calloc(1,sizeof(job_array));

  pa->ai_qs.struct_version = ARRAY_QS_STRUCT_VERSION;
  
  pa->template_job = pjob;

  /*pa->ai_qs.array_size  = pjob->ji_wattr[(int)JOB_ATR_job_array_size].at_val.at_long;*/

  strcpy(pa->ai_qs.parent_id, pjob->ji_qs.ji_jobid);
  strcpy(pa->ai_qs.fileprefix, pjob->ji_qs.ji_fileprefix);
  strncpy(pa->ai_qs.owner, pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str, PBS_MAXUSER + PBS_MAXSERVERNAME + 2);
  strncpy(pa->ai_qs.submit_host, get_variable(pjob, pbs_o_host), PBS_MAXSERVERNAME);

  pa->ai_qs.num_cloned = 0;
  CLEAR_LINK(pa->all_arrays);
  CLEAR_HEAD(pa->request_tokens);
  append_link(&svr_jobarrays, &pa->all_arrays, (void*)pa);

 if (job_save(pjob, SAVEJOB_FULL, 0) != 0)
    {
    job_purge(pjob);


    if (LOGLEVEL >= 6)
      {
      log_record(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        (pjob != NULL) ? pjob->ji_qs.ji_jobid : "NULL",
        "cannot save job");
      }

    return 1;
    }

  if ((rc = set_slot_limit(pjob->ji_wattr[JOB_ATR_job_array_request].at_val.at_str, pa)))
    {
    array_delete(pa);

    snprintf(log_buffer,sizeof(log_buffer),
      "Array %s requested a slot limit above the max limit %ld, rejecting\n",
      pa->ai_qs.parent_id,
      server.sv_attr[SRV_ATR_MaxSlotLimit].at_val.at_long);
    log_event(PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      pa->ai_qs.parent_id,
      log_buffer);

    return(INVALID_SLOT_LIMIT);
    }

  pa->ai_qs.jobs_running = 0;
  pa->ai_qs.num_started = 0;
  pa->ai_qs.num_failed = 0;
  pa->ai_qs.num_successful = 0;
  
  bad_token_count =

    parse_array_request(pjob->ji_wattr[JOB_ATR_job_array_request].at_val.at_str,
                        &(pa->request_tokens));

  /* get the number of elements that should be allocated in the array */
  rn = (array_request_node *)GET_NEXT(pa->request_tokens);
  array_size = 0;
  pa->ai_qs.num_jobs = 0;
  while (rn != NULL) 
    {
    if (rn->end > array_size)
      array_size = rn->end;
    /* calculate the actual number of jobs (different from array size) */
    pa->ai_qs.num_jobs += rn->end - rn->start + 1;

    rn = (array_request_node *)GET_NEXT(rn->request_tokens_link);
    }

  /* size of array is the biggest index + 1 */
  array_size++; 

  if (server.sv_attr[SRV_ATR_MaxArraySize].at_flags & ATR_VFLAG_SET)
    {
    int max_array_size = server.sv_attr[SRV_ATR_MaxArraySize].at_val.at_long;
    if (max_array_size < pa->ai_qs.num_jobs)
      {
      array_delete(pa);

      return(ARRAY_TOO_LARGE);
      }
    }

  /* initialize the array */
  pa->jobs = malloc(array_size * sizeof(job *));
  memset(pa->jobs,0,array_size * sizeof(job *));

  /* remember array_size */
  pa->ai_qs.array_size = array_size;

  CLEAR_HEAD(pa->ai_qs.deps);

  array_save(pa);

  if (bad_token_count > 0)
    {
    array_delete(pa);
    return 2;
    }

  return 0;

  }
예제 #24
0
int send_job(

  job       *jobp,
  pbs_net_t  hostaddr, /* host address, host byte order */
  int        port, /* service port, host byte order */
  int        move_type, /* move, route, or execute */
  void (*post_func)(struct work_task *),     /* after move */
  void      *data)  /* ptr to optional batch_request to be put */
                    /* in the work task structure */

  {
  tlist_head  attrl;
  enum conn_type cntype = ToServerDIS;
  int    con;
  char  *destin = jobp->ji_qs.ji_destin;
  int    encode_type;
  int    i;
  int    NumRetries;

  char  *id = "send_job";

  attribute *pattr;

  pid_t  pid;

  struct attropl *pqjatr;      /* list (single) of attropl for quejob */
  char  *safail = "sigaction failed\n";
  char  *spfail = "sigprocmask failed\n";
  char   script_name[MAXPATHLEN + 1];
  sigset_t  child_set, all_set;

  struct  sigaction child_action;

  struct work_task *ptask;

  mbool_t        Timeout = FALSE;

  char          *pc;

  sigemptyset(&child_set);
  sigaddset(&child_set, SIGCHLD);
  sigfillset(&all_set);

  /* block SIGCHLD until work task is established */

  if (sigprocmask(SIG_BLOCK, &child_set, NULL) == -1)
    {
    log_err(errno,id,spfail);

    pbs_errno = PBSE_SYSTEM;

    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      jobp->ji_qs.ji_jobid,
      "cannot set signal mask");

    return(ROUTE_PERM_FAILURE);
    }

  if (LOGLEVEL >= 6)
    {
    sprintf(log_buffer,"about to send job - type=%d",
      move_type);
 
    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      jobp->ji_qs.ji_jobid,
      "forking in send_job");
    }

  pid = fork();

  if (pid == -1)
    {
    /* error on fork */

    log_err(errno, id, "fork failed\n");

    if (sigprocmask(SIG_UNBLOCK, &child_set, NULL) == -1)
      log_err(errno, id, spfail);

    pbs_errno = PBSE_SYSTEM;

    return(ROUTE_PERM_FAILURE);
    }

  if (pid != 0)
    {
    /* The parent (main server) */

    /* create task to monitor job startup */

    /* CRI:   need way to report to scheduler job is starting, not started */

    ptask = set_task(WORK_Deferred_Child, pid, post_func, jobp);

    if (ptask == NULL)
      {
      log_err(errno, id, msg_err_malloc);

      return(ROUTE_PERM_FAILURE);
      }

    ptask->wt_parm2 = data;

    append_link(
      &((job *)jobp)->ji_svrtask,
      &ptask->wt_linkobj,
      ptask);

    /* now can unblock SIGCHLD */

    if (sigprocmask(SIG_UNBLOCK, &child_set, NULL) == -1)
      log_err(errno, id, spfail);

    if (LOGLEVEL >= 1)
      {
      extern long   DispatchTime[];
      extern job   *DispatchJob[];
      extern char  *DispatchNode[];

      extern time_t time_now;

      struct pbsnode *NP;

      /* record job dispatch time */

      int jindex;

      for (jindex = 0;jindex < 20;jindex++)
        {
        if (DispatchJob[jindex] == NULL)
          {
          DispatchTime[jindex] = time_now;

          DispatchJob[jindex] = jobp;

          if ((NP = PGetNodeFromAddr(hostaddr)) != NULL)
            DispatchNode[jindex] = NP->nd_name;
          else
            DispatchNode[jindex] = NULL;

          break;
          }
        }
      }

    /* SUCCESS */

    return(ROUTE_DEFERRED);
    }  /* END if (pid != 0) */

  /*
   * the child process
   *
   * set up signal catcher for error return
   */

  rpp_terminate();

  child_action.sa_handler = net_move_die;

  sigfillset(&child_action.sa_mask);

  child_action.sa_flags = 0;

  if (sigaction(SIGHUP, &child_action, NULL))
    log_err(errno, id, safail);

  if (sigaction(SIGINT, &child_action, NULL))
    log_err(errno, id, safail);

  if (sigaction(SIGQUIT, &child_action, NULL))
    log_err(errno, id, safail);

  /* signal handling is set, now unblock */

  if (sigprocmask(SIG_UNBLOCK, &child_set, NULL) == -1)
    log_err(errno, id, spfail);

  /* encode job attributes to be moved */

  CLEAR_HEAD(attrl);

  /* select attributes/resources to send based on move type */

  if (move_type == MOVE_TYPE_Exec)
    {
    /* moving job to MOM - ie job start */

    resc_access_perm = ATR_DFLAG_MOM;
    encode_type = ATR_ENCODE_MOM;
    cntype = ToServerDIS;
    }
  else
    {
    /* moving job to alternate server? */

    resc_access_perm =
      ATR_DFLAG_USWR |
      ATR_DFLAG_OPWR |
      ATR_DFLAG_MGWR |
      ATR_DFLAG_SvRD;

    encode_type = ATR_ENCODE_SVR;

    /* clear default resource settings */

    svr_dequejob(jobp);
    }

  pattr = jobp->ji_wattr;

  for (i = 0;i < JOB_ATR_LAST;i++)
    {
    if (((job_attr_def + i)->at_flags & resc_access_perm) ||
      ((strncmp((job_attr_def + i)->at_name,"session_id",10) == 0) &&
      (jobp->ji_wattr[JOB_ATR_checkpoint_name].at_flags & ATR_VFLAG_SET)))
      {
      (job_attr_def + i)->at_encode(
        pattr + i,
        &attrl,
        (job_attr_def + i)->at_name,
        NULL,
        encode_type);
      }
    }    /* END for (i) */

  attrl_fixlink(&attrl);

  /* put together the job script file name */

  strcpy(script_name, path_jobs);

  if (jobp->ji_wattr[JOB_ATR_job_array_request].at_flags & ATR_VFLAG_SET)
    {
    strcat(script_name, jobp->ji_arraystruct->ai_qs.fileprefix);
    }
  else
    {
    strcat(script_name, jobp->ji_qs.ji_fileprefix);
    }

  strcat(script_name, JOB_SCRIPT_SUFFIX);


  pbs_errno = 0;
  con = -1;

  for (NumRetries = 0;NumRetries < RETRY;NumRetries++)
    {
    int rc;

    /* connect to receiving server with retries */

    if (NumRetries > 0)
      {
      /* recycle after an error */

      if (con >= 0)
        svr_disconnect(con);

      /* check pbs_errno from previous attempt */

      if (should_retry_route(pbs_errno) == -1)
        {
        sprintf(log_buffer, "child failed in previous commit request for job %s",
                jobp->ji_qs.ji_jobid);

        log_err(pbs_errno, id, log_buffer);

        exit(1); /* fatal error, don't retry */
        }

      sleep(1 << NumRetries);
      }

    /* NOTE:  on node hangs, svr_connect is successful */

    if ((con = svr_connect(hostaddr, port, 0, cntype)) == PBS_NET_RC_FATAL)
      {
      sprintf(log_buffer, "send_job failed to %lx port %d",
        hostaddr,
        port);

      log_err(pbs_errno, id, log_buffer);

      exit(1);
      }

    if (con == PBS_NET_RC_RETRY)
      {
      pbs_errno = 0; /* should retry */

      continue;
      }

    /*
     * if the job is substate JOB_SUBSTATE_TRNOUTCM which means
     * we are recovering after being down or a late failure, we
     * just want to send the "ready-to-commit/commit"
     */

    if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUTCM)
      {
      if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUT)
        {
        jobp->ji_qs.ji_substate = JOB_SUBSTATE_TRNOUT;

        job_save(jobp, SAVEJOB_QUICK);
        }

      pqjatr = &((svrattrl *)GET_NEXT(attrl))->al_atopl;

      if ((pc = PBSD_queuejob(
                  con,
                  jobp->ji_qs.ji_jobid,
                  destin,
                  pqjatr,
                  NULL)) == NULL)
        {
        if ((pbs_errno == PBSE_EXPIRED) || (pbs_errno == PBSE_READ_REPLY_TIMEOUT))
          {
          /* queue job timeout based on pbs_tcp_timeout */

          Timeout = TRUE;
          }

        if ((pbs_errno == PBSE_JOBEXIST) && (move_type == MOVE_TYPE_Exec))
          {
          /* already running, mark it so */

          log_event(
            PBSEVENT_ERROR,
            PBS_EVENTCLASS_JOB,
            jobp->ji_qs.ji_jobid,
            "MOM reports job already running");

          exit(0);
          }

        sprintf(log_buffer, "send of job to %s failed error = %d",
          destin,
          pbs_errno);

        log_event(
          PBSEVENT_JOB,
          PBS_EVENTCLASS_JOB,
          jobp->ji_qs.ji_jobid,
          log_buffer);

        continue;
        }  /* END if ((pc = PBSD_queuejob() == NULL) */

      free(pc);

      if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT)
        {
        if (PBSD_jscript(con, script_name, jobp->ji_qs.ji_jobid) != 0)
          continue;
        }

      /* XXX may need to change the logic below, if we are sending the job to
         a mom on the same host and the mom and server are not sharing the same
         spool directory, then we still need to move the file */

      if ((move_type == MOVE_TYPE_Exec) &&
          (jobp->ji_qs.ji_svrflags & JOB_SVFLG_HASRUN) &&
          (hostaddr != pbs_server_addr))
        {
        /* send files created on prior run */

        if ((move_job_file(con,jobp,StdOut) != 0) ||
            (move_job_file(con,jobp,StdErr) != 0) ||
            (move_job_file(con,jobp,Checkpoint) != 0))
          {
          continue;
          }
        }

      /* ignore signals */

      if (sigprocmask(SIG_BLOCK, &all_set, NULL) == -1)
        log_err(errno, id, "sigprocmask\n");

      jobp->ji_qs.ji_substate = JOB_SUBSTATE_TRNOUTCM;

      job_save(jobp, SAVEJOB_QUICK);
      }
    else
      {
      /* ignore signals */

      if (sigprocmask(SIG_BLOCK, &all_set, NULL) == -1)
        log_err(errno, id, "sigprocmask\n");
      }

    if (PBSD_rdytocmt(con, jobp->ji_qs.ji_jobid) != 0)
      {
      if (sigprocmask(SIG_UNBLOCK, &all_set, NULL) == -1)
        log_err(errno, id, "sigprocmask\n");

      continue;
      }


    if ((rc = PBSD_commit(con, jobp->ji_qs.ji_jobid)) != 0)
      {
      int errno2;

      /* NOTE:  errno is modified by log_err */

      errno2 = errno;

      sprintf(log_buffer, "send_job commit failed, rc=%d (%s)",
              rc,
              (connection[con].ch_errtxt != NULL) ? connection[con].ch_errtxt : "N/A");

      log_ext(errno2, id, log_buffer, LOG_WARNING);

      /* if failure occurs, pbs_mom should purge job and pbs_server should set *
         job state to idle w/error msg */

      if (errno2 == EINPROGRESS)
        {
        /* request is still being processed */

        /* increase tcp_timeout in qmgr? */

        Timeout = TRUE;

        /* do we need a continue here? */

        sprintf(log_buffer, "child commit request timed-out for job %s, increase tcp_timeout?",
                jobp->ji_qs.ji_jobid);

        log_ext(errno2, id, log_buffer, LOG_WARNING);

        /* don't retry on timeout--break out and report error! */

        break;
        }
      else
        {
        sprintf(log_buffer, "child failed in commit request for job %s",
                jobp->ji_qs.ji_jobid);

        log_ext(errno2, id, log_buffer, LOG_CRIT);

        /* FAILURE */

        exit(1);
        }
      }    /* END if ((rc = PBSD_commit(con,jobp->ji_qs.ji_jobid)) != 0) */

    svr_disconnect(con);

    /* child process is done */

    /* SUCCESS */

    exit(0);
    }  /* END for (NumRetries) */

  if (con >= 0)
    svr_disconnect(con);

  if (Timeout == TRUE)
    {
    /* 10 indicates that job migrate timed out, server will mark node down *
          and abort the job - see post_sendmom() */

    sprintf(log_buffer, "child timed-out attempting to start job %s",
            jobp->ji_qs.ji_jobid);

    log_ext(pbs_errno, id, log_buffer, LOG_WARNING);

    exit(10);
    }

  if (should_retry_route(pbs_errno) == -1)
    {
    sprintf(log_buffer, "child failed and will not retry job %s",
      jobp->ji_qs.ji_jobid);

    log_err(pbs_errno, id, log_buffer);

    exit(1);
    }

  exit(2);

  /*NOTREACHED*/

  return(ROUTE_SUCCESS);
  }  /* END send_job() */
예제 #25
0
static int local_move(

  job                  *jobp,
  struct batch_request *req)

  {
  char   *id = "local_move";
  pbs_queue *qp;
  char   *destination = jobp->ji_qs.ji_destin;
  int    mtype;

  /* search for destination queue */

  if ((qp = find_queuebyname(destination)) == NULL)
    {
    sprintf(log_buffer, "queue %s does not exist\n",
            destination);

    log_err(-1, id, log_buffer);

    pbs_errno = PBSE_UNKQUE;

    return(ROUTE_PERM_FAILURE);
    }

  /*
   * if being moved at specific request of administrator, then
   * checks on queue availability, etc. are skipped;
   * otherwise all checks are enforced.
   */

  if (req == 0)
    {
    mtype = MOVE_TYPE_Route; /* route */
    }
  else if (req->rq_perm & (ATR_DFLAG_MGRD | ATR_DFLAG_MGWR))
    {
    mtype = MOVE_TYPE_MgrMv; /* privileged move */
    }
  else
    {
    mtype = MOVE_TYPE_Move; /* non-privileged move */
    }

  if ((pbs_errno = svr_chkque(
                     jobp,
                     qp,
                     get_variable(jobp, pbs_o_host), mtype, NULL)))
    {
    /* should this queue be retried? */

    return(should_retry_route(pbs_errno));
    }

  /* dequeue job from present queue, update destination and */
  /* queue_rank for new queue and enqueue into destination  */

  svr_dequejob(jobp);

  strcpy(jobp->ji_qs.ji_queue, destination);

  jobp->ji_wattr[JOB_ATR_qrank].at_val.at_long = ++queue_rank;

  pbs_errno = svr_enquejob(jobp);

  if (pbs_errno != 0)
    {
    return(ROUTE_PERM_FAILURE); /* should never ever get here */
    }

  jobp->ji_lastdest = 0; /* reset in case of another route */

  job_save(jobp, SAVEJOB_FULL);

  return(ROUTE_SUCCESS);
  }  /* END local_move() */
예제 #26
0
void req_rdytocommit(

  struct batch_request *preq)  /* I */

  {
  job *pj;
  int  sock = preq->rq_conn;

  int  OrigState;
  int  OrigSState;
  char OrigSChar;
  long OrigFlags;

  pj = locate_new_job(sock, preq->rq_ind.rq_rdytocommit);

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL",
      "ready to commit job");
    }

  if (pj == NULL)
    {
    log_err(errno, "req_rdytocommit", "unknown job id");

    req_reject(PBSE_UNKJOBID, 0, preq, NULL, NULL);

    /* FAILURE */

    return;
    }

  if (pj->ji_qs.ji_substate != JOB_SUBSTATE_TRANSIN)
    {
    log_err(errno, "req_rdytocommit", "cannot commit job in unexpected state");

    req_reject(PBSE_IVALREQ, 0, preq, NULL, NULL);

    /* FAILURE */

    return;
    }

  OrigState  = pj->ji_qs.ji_state;

  OrigSState = pj->ji_qs.ji_substate;
  OrigSChar  = pj->ji_wattr[(int)JOB_ATR_state].at_val.at_char;
  OrigFlags  = pj->ji_wattr[(int)JOB_ATR_state].at_flags;

  pj->ji_qs.ji_state    = JOB_STATE_TRANSIT;
  pj->ji_qs.ji_substate = JOB_SUBSTATE_TRANSICM;
  pj->ji_wattr[(int)JOB_ATR_state].at_val.at_char = 'T';
  pj->ji_wattr[(int)JOB_ATR_state].at_flags |= ATR_VFLAG_SET;

  if (job_save(pj, SAVEJOB_NEW) == -1)
    {
    char tmpLine[1024];

    sprintf(tmpLine, "cannot save job - errno=%d - %s",
            errno,
            strerror(errno));

    log_err(errno, "req_rdytocommit", tmpLine);

    /* commit failed, backoff state changes */

    pj->ji_qs.ji_state    = OrigState;
    pj->ji_qs.ji_substate = OrigSState;
    pj->ji_wattr[(int)JOB_ATR_state].at_val.at_char = OrigSChar;
    pj->ji_wattr[(int)JOB_ATR_state].at_flags = OrigFlags;

    req_reject(PBSE_SYSTEM, 0, preq, NULL, tmpLine);

    /* FAILURE */

    return;
    }

  /* acknowledge the request with the job id */

  if (reply_jobid(preq, pj->ji_qs.ji_jobid, BATCH_REPLY_CHOICE_RdytoCom) != 0)
    {
    /* reply failed, purge the job and close the connection */

    sprintf(log_buffer, "cannot report jobid - errno=%d - %s",
            errno,
            strerror(errno));

    log_err(errno, "req_rdytocommit", log_buffer);

    close_conn(sock);

    job_purge(pj);

    /* FAILURE */

    return;
    }

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL",
      "ready to commit job completed");
    }

  return;
  }  /* END req_rdytocommit() */
예제 #27
0
int req_orderjob(

  struct batch_request *vp) /* I */

  {
  job                  *pjob;
  job                  *pjob1;
  job                  *pjob2;
  int                   rank;
  int                   rc = 0;
  char                  tmpqn[PBS_MAXQUEUENAME+1];
  struct batch_request *req = (struct batch_request *)vp;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  pbs_queue            *pque1;
  pbs_queue            *pque2;

  if ((pjob1 = chk_job_request(req->rq_ind.rq_move.rq_jid, req)) == NULL)
    {
    return(PBSE_NONE);
    }

  mutex_mgr job1_mutex(pjob1->ji_mutex, true);

  if ((pjob2 = chk_job_request(req->rq_ind.rq_move.rq_destin, req)) == NULL)
    {
    return(PBSE_NONE);
    }

  mutex_mgr job2_mutex(pjob2->ji_mutex, true);

  if (((pjob = pjob1)->ji_qs.ji_state == JOB_STATE_RUNNING) ||
      ((pjob = pjob2)->ji_qs.ji_state == JOB_STATE_RUNNING))
    {
#ifndef NDEBUG
    sprintf(log_buf, "%s %d",
            pbse_to_txt(PBSE_BADSTATE),
            pjob->ji_qs.ji_state);

    strcat(log_buf, __func__);

    log_event(
      PBSEVENT_DEBUG,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buf);
#endif /* NDEBUG */

    req_reject(PBSE_BADSTATE, 0, req, NULL, NULL);

    return(PBSE_NONE);
    }
  else if ((pjob1->ji_qhdr == NULL) || (pjob2->ji_qhdr == NULL))
    {
    req_reject(PBSE_BADSTATE, 0, req, NULL, "One of the jobs does not have a queue");
    return(PBSE_NONE);
    }
  else if (pjob1->ji_qhdr != pjob2->ji_qhdr)
    {
    /* jobs are in different queues */
    int ok = FALSE;

    if ((pque2 = get_jobs_queue(&pjob2)) == NULL)
      {
      rc = PBSE_BADSTATE;
      job2_mutex.set_lock_on_exit(false);
      }
    else
      {
      mutex_mgr pque2_mutex = mutex_mgr(pque2->qu_mutex, true);
      if ((rc = svr_chkque(pjob1, pque2, get_variable(pjob1, pbs_o_host), MOVE_TYPE_Order, NULL)) == PBSE_NONE)
        {
        pque2_mutex.unlock();

        if ((pque1 = get_jobs_queue(&pjob1)) == NULL)
          {
          rc = PBSE_BADSTATE;
          job1_mutex.set_lock_on_exit(false);
          }
        else if (pjob1 != NULL)
          {
          mutex_mgr pque1_mutex = mutex_mgr(pque1->qu_mutex, true);
          if ((rc = svr_chkque(pjob2, pque1, get_variable(pjob2, pbs_o_host), MOVE_TYPE_Order, NULL)) == PBSE_NONE)
            {
            ok = TRUE;
            }
          }
        }
      }

    if (ok == FALSE)
      {
      req_reject(rc, 0, req, NULL, NULL);

      return(PBSE_NONE);
      }
    }

  /* now swap the order of the two jobs in the queue lists */
  rank = pjob1->ji_wattr[JOB_ATR_qrank].at_val.at_long;

  pjob1->ji_wattr[JOB_ATR_qrank].at_val.at_long =
    pjob2->ji_wattr[JOB_ATR_qrank].at_val.at_long;

  pjob2->ji_wattr[JOB_ATR_qrank].at_val.at_long = rank;

  if (pjob1->ji_qhdr != pjob2->ji_qhdr)
    {
    strcpy(tmpqn, pjob1->ji_qs.ji_queue);
    strcpy(pjob1->ji_qs.ji_queue, pjob2->ji_qs.ji_queue);
    strcpy(pjob2->ji_qs.ji_queue, tmpqn);

    svr_dequejob(pjob1, FALSE);
    svr_dequejob(pjob2, FALSE);

    if (svr_enquejob(pjob1, FALSE, -1) == PBSE_JOB_RECYCLED)
      {
      pjob1 = NULL;
      job1_mutex.set_lock_on_exit(false);
      }

    if (svr_enquejob(pjob2, FALSE, -1) == PBSE_JOB_RECYCLED)
      {
      pjob2 = NULL;
      job2_mutex.set_lock_on_exit(false);
      }
    }
  else
    {
    if ((pque1 = get_jobs_queue(&pjob1)) != NULL)
      {
      mutex_mgr pque1_mutex = mutex_mgr(pque1->qu_mutex, true);
      swap_jobs(pque1->qu_jobs,pjob1,pjob2);
      swap_jobs(NULL,pjob1,pjob2);
      }
    }

  /* need to update disk copy of both jobs to save new order */
  if (pjob1 != NULL)
    {
    job_save(pjob1, SAVEJOB_FULL, 0);
    }

  if (pjob2 != NULL)
    {
    job_save(pjob2, SAVEJOB_FULL, 0);
    }

  /* SUCCESS */
  reply_ack(req);

  return(PBSE_NONE);
  }  /* END req_orderjob() */
예제 #28
0
void req_commit(

  struct batch_request *preq)  /* I */

  {
  job   *pj;

  pj = locate_new_job(preq->rq_conn, preq->rq_ind.rq_commit);

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL",
      "committing job");
    }

  if (pj == NULL)
    {
    req_reject(PBSE_UNKJOBID, 0, preq, NULL, NULL);

    return;
    }

  if (pj->ji_qs.ji_substate != JOB_SUBSTATE_TRANSICM)
    {
    log_err(errno, "req_commit", "cannot commit job in unexpected state");

    req_reject(PBSE_IVALREQ, 0, preq, NULL, NULL);

    return;
    }

  /* move job from new job list to "all" job list, set to running state */

  delete_link(&pj->ji_alljobs);

  append_link(&svr_alljobs, &pj->ji_alljobs, pj);

  /*
  ** Set JOB_SVFLG_HERE to indicate that this is Mother Superior.
  */

  pj->ji_qs.ji_svrflags |= JOB_SVFLG_HERE;

  pj->ji_qs.ji_state = JOB_STATE_RUNNING;

  pj->ji_qs.ji_substate = JOB_SUBSTATE_PRERUN;

  pj->ji_qs.ji_un_type = JOB_UNION_TYPE_MOM;

  pj->ji_qs.ji_un.ji_momt.ji_svraddr = get_connectaddr(preq->rq_conn);

  pj->ji_qs.ji_un.ji_momt.ji_exitstat = 0;

  /* For MOM - start up the job (blocks) */

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL",
      "starting job execution");
    }

  start_exec(pj);

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL",
      "job execution started");
    }

  /* if start request fails, reply with failure string */

  if (pj->ji_qs.ji_substate == JOB_SUBSTATE_EXITING)
    {
    char tmpLine[1024];

    if ((pj->ji_hosts != NULL) &&
        (pj->ji_nodekill >= 0) &&
        (pj->ji_hosts[pj->ji_nodekill].hn_host != NULL))
      {
      sprintf(tmpLine, "start failed on node %s",
              pj->ji_hosts[pj->ji_nodekill].hn_host);
      }
    else
      {
      sprintf(tmpLine, "start failed on unknown node");
      }

    if (LOGLEVEL >= 6)
      {
      log_record(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL",
        tmpLine);
      }

    reply_text(preq, 0, tmpLine);
    }
  else
    {
    reply_jobid(preq, pj->ji_qs.ji_jobid, BATCH_REPLY_CHOICE_Commit);
    }

  job_save(pj, SAVEJOB_FULL);

  /* NOTE: we used to flag JOB_ATR_errpath, JOB_ATR_outpath,
   * JOB_ATR_session_id, and JOB_ATR_altid as modified at this point to make sure
   * pbs_server got these attr values.  This worked fine before TORQUE modified
   * job launched into an async process.  At 2.0.0p6, a new attribute "SEND" flag
   * was added to handle this process. */

  return;
  }  /* END req_commit() */
예제 #29
0
파일: req_delete.c 프로젝트: dhill12/test
int execute_job_delete(

  job                  *pjob,            /* M */
  char                 *Msg,             /* I */
  struct batch_request *preq)            /* I */

  {
  struct work_task *pwtnew;

  int               rc;
  char             *sigt = "SIGTERM";

  int               has_mutex = TRUE;
  char              log_buf[LOCAL_LOG_BUF_SIZE];
  time_t            time_now = time(NULL);
  long              force_cancel = FALSE;
  long              array_compatible = FALSE;

  chk_job_req_permissions(&pjob,preq);

  if (pjob == NULL)
    {
    /* preq is rejected in chk_job_req_permissions here */
    return(-1);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /* see note in req_delete - not sure this is possible still,
     * but the deleted code is irrelevant now. I will leave this
     * part --dbeer */
    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

    return(-1);
    }

  if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN1 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN2 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN3 )
    {
    /* If JOB_SUBSTATE_PRERUN being sent to MOM, wait till she gets it going */
    /* retry in one second                            */
    /* If JOB_SUBSTATE_RERUN, RERUN1, RERUN2 or RERUN3 the
       job is being requeued. Wait until finished */

    static time_t  cycle_check_when = 0;
    static char    cycle_check_jid[PBS_MAXSVRJOBID + 1];

    if (cycle_check_when != 0)
      {
      if (!strcmp(pjob->ji_qs.ji_jobid, cycle_check_jid) &&
          (time_now - cycle_check_when > 10))
        {
        /* state not updated after 10 seconds */

        /* did the mom ever get it? delete it anyways... */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;

        goto jump;
        }

      if (time_now - cycle_check_when > 20)
        {
        /* give up after 20 seconds */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;
        }
      }    /* END if (cycle_check_when != 0) */

    if (cycle_check_when == 0)
      {
      /* new PRERUN job located */

      cycle_check_when = time_now;
      strcpy(cycle_check_jid, pjob->ji_qs.ji_jobid);
      }

    sprintf(log_buf, "job cannot be deleted, state=PRERUN, requeuing delete request");

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    pwtnew = set_task(WORK_Timed,time_now + 1,post_delete_route,preq,FALSE);
    
    unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);

    if (pwtnew == NULL)
      {
      req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

      return(-1);
      }
    else
      {
      return(ROUTE_DELETE);
      }
    }  /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */

jump:

  /*
   * Log delete and if requesting client is not job owner, send mail.
   */

  sprintf(log_buf, "requestor=%s@%s", preq->rq_user, preq->rq_host);


  /* NOTE:  should annotate accounting record with extend message (NYI) */
  account_record(PBS_ACCT_DEL, pjob, log_buf);

  sprintf(log_buf, msg_manager, msg_deletejob, preq->rq_user, preq->rq_host);

  log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

  /* NOTE:  should incorporate job delete message */

  if (Msg != NULL)
    {
    /* have text message in request extension, add it */
    strcat(log_buf, "\n");
    strcat(log_buf, Msg);
    }

  if ((svr_chk_owner(preq, pjob) != 0) &&
      (pjob->ji_has_delete_nanny == FALSE))
    {
    /* only send email if owner did not delete job and job deleted
       has not been previously attempted */

    svr_mailowner(pjob, MAIL_DEL, MAIL_FORCE, log_buf);
    /*
     * If we sent mail and already sent the extra message
     * then reset message so we don't trigger a redundant email
     * in job_abt()
    */

    if (Msg != NULL)
      {
      Msg = NULL;
      }
    }

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, change restart comment if failed */

    change_restart_comment_if_needed(pjob);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * setup a nanny task to make sure the job is actually deleted (see the
     * comments at job_delete_nanny()).
     */

    if (pjob->ji_has_delete_nanny == TRUE)
      {
      unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

      req_reject(PBSE_IVALREQ, 0, preq, NULL, "job cancel in progress");

      return(-1);
      }

    apply_job_delete_nanny(pjob, time_now + 60);

    /*
     * Send signal request to MOM.  The server will automagically
     * pick up and "finish" off the client request when MOM replies.
     */
    get_batch_request_id(preq);

    if ((rc = issue_signal(&pjob, sigt, post_delete_mom1, strdup(preq->rq_id))))
      {
      /* cant send to MOM */

      req_reject(rc, 0, preq, NULL, NULL);
      }

    /* normally will ack reply when mom responds */
    if (pjob != NULL)
      {
      sprintf(log_buf, msg_delrunjobsig, sigt);
      log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);
  
      unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
      }

    return(-1);
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* make a cleanup task if set */
  get_svr_attr_l(SRV_ATR_JobForceCancelTime, &force_cancel);
  if (force_cancel > 0)
    {
    char *dup_jobid = strdup(pjob->ji_qs.ji_jobid);
 
    set_task(WORK_Timed, time_now + force_cancel, ensure_deleted, dup_jobid, FALSE);    
    }

  /* if configured, and this job didn't have a slot limit hold, free a job
   * held with the slot limit hold */
  get_svr_attr_l(SRV_ATR_MoabArrayCompatible, &array_compatible);
  if ((array_compatible != FALSE) &&
      ((pjob->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) == FALSE))
    {
    if ((pjob->ji_arraystruct != NULL) &&
        (pjob->ji_is_array_template == FALSE))
      {
      int        i;
      int        newstate;
      int        newsub;
      job       *tmp;
      job_array *pa = get_jobs_array(&pjob);

      if (pjob == NULL)
        return(-1);

      for (i = 0; i < pa->ai_qs.array_size; i++)
        {
        if (pa->job_ids[i] == NULL)
          continue;

        if (!strcmp(pa->job_ids[i], pjob->ji_qs.ji_jobid))
          continue;

        if ((tmp = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
          {
          free(pa->job_ids[i]);
          pa->job_ids[i] = NULL;
          }
        else
          {
          if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l)
            {
            tmp->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;
            
            if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
              {
              tmp->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
              }
            
            svr_evaljobstate(tmp, &newstate, &newsub, 1);
            svr_setjobstate(tmp, newstate, newsub, FALSE);
            job_save(tmp, SAVEJOB_FULL, 0);

            unlock_ji_mutex(tmp, __func__, "5", LOGLEVEL);
            
            break;
            }

          unlock_ji_mutex(tmp, __func__, "6", LOGLEVEL);
          }
        }

      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "%s: unlocking ai_mutex", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }
      pthread_mutex_unlock(pa->ai_mutex);
      }
    } /* END MoabArrayCompatible check */

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, do end job processing */
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE);

    /* force new connection */
    pjob->ji_momhandle = -1;

    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "calling on_job_exit from %s", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      }

    set_task(WORK_Immed, 0, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in file, should remove them */

    remove_stagein(&pjob);

    if (pjob != NULL)
      job_abt(&pjob, Msg);

    has_mutex = FALSE;
    }
  else
    {
    /*
     * the job is not transitting (though it may have been) and
     * is not running, so put in into a complete state.
     */
    struct pbs_queue *pque;
    int  KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE, FALSE);

    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      pque->qu_numcompleted++;

      unlock_queue(pque, __func__, NULL, LOGLEVEL);
      
      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "calling on_job_exit from %s", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }
    
      pthread_mutex_lock(server.sv_attr_mutex);
      KeepSeconds = attr_ifelse_long(
                    &pque->qu_attr[QE_ATR_KeepCompleted],
                    &server.sv_attr[SRV_ATR_KeepCompleted],
                    0);
      pthread_mutex_unlock(server.sv_attr_mutex);
      }
    else
      KeepSeconds = 0;

    if (pjob != NULL)
      {
      set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
      }
    else
      has_mutex = FALSE;
    }  /* END else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0) */

  if (has_mutex == TRUE)
    unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL);

  return(PBSE_NONE);
  } /* END execute_job_delete() */
예제 #30
0
void
req_register(struct batch_request *preq)
{
	int		   made;
	attribute	  *pattr;
	struct depend	  *pdep;
	struct depend_job *pdj;
	job		  *pjob;
	char		  *ps;
	struct work_task  *ptask;
	int		   rc = 0;
	int		   revtype;
	int		   type;
	int		   savetype = SAVEJOB_FULL;

	/*  make sure request is from a server */

	if (!preq->rq_fromsvr) {
#ifdef NAS /* localmod 109 */
		sprintf(log_buffer, "Dependency request not from server");
		log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_INFO,
			preq->rq_ind.rq_register.rq_parent, log_buffer);
#endif /* localmod 109 */
		req_reject(PBSE_IVALREQ, 0, preq);
		return;
	}

	/* find the "parent" job specified in the request */

	if ((pjob = find_job(preq->rq_ind.rq_register.rq_parent)) == NULL) {

		/*
		 * job not found... if server is initializing, it may not
		 * yet recovered, that is not an error.
		 */

		if (server.sv_attr[(int)SRV_ATR_State].at_val.at_long != SV_STATE_INIT) {
			log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_INFO,
				preq->rq_ind.rq_register.rq_parent,
				msg_unkjobid);
			req_reject(PBSE_UNKJOBID, 0, preq);
		} else {
			reply_ack(preq);
		}
		return;
	}

	pattr = &pjob->ji_wattr[(int)JOB_ATR_depend];
	type = preq->rq_ind.rq_register.rq_dependtype;
	pjob->ji_modified = 1;

	/* more of the server:port fix kludge */

	ps = strchr(preq->rq_ind.rq_register.rq_child, (int)'@');
	if (ps != NULL) {
		(void)strcpy(preq->rq_ind.rq_register.rq_svr, ps+1);
		*ps = '\0';
	} else {
		(void)strcpy(preq->rq_ind.rq_register.rq_svr, preq->rq_host);
	}

	if (pjob->ji_qs.ji_state == JOB_STATE_MOVED) {
		snprintf(log_buffer, sizeof(log_buffer), "Parent %s%s", msg_movejob, pjob->ji_qs.ji_destin);
		log_event(PBSEVENT_DEBUG|PBSEVENT_SYSTEM|PBSEVENT_ERROR,
			PBS_EVENTCLASS_REQUEST, LOG_INFO,
			preq->rq_ind.rq_register.rq_child, log_buffer);
		req_reject(PBSE_JOB_MOVED, 0, preq);
		return;
	}
	switch (preq->rq_ind.rq_register.rq_op) {

			/*
			 * Register a dependency
			 */

		case JOB_DEPEND_OP_REGISTER:
			switch (type) {

				case JOB_DEPEND_TYPE_AFTERSTART:
					if (pjob->ji_qs.ji_substate >= JOB_SUBSTATE_RUNNING) {
						/* job already running, setup task to send	*/
						/* release back to child and continue with	*/
						/* registration process 			*/
						ptask = set_task(WORK_Immed, 0, post_run_depend,
							(void *)pjob);
						if (ptask)
							append_link(&pjob->ji_svrtask,
								&ptask->wt_linkobj, ptask);
					}
					/* fall through to complete registration */
				case JOB_DEPEND_TYPE_AFTERANY:
				case JOB_DEPEND_TYPE_AFTEROK:
				case JOB_DEPEND_TYPE_AFTERNOTOK:
					rc = register_dep(pattr, preq, type, &made);
					break;

				case JOB_DEPEND_TYPE_BEFORESTART:
				case JOB_DEPEND_TYPE_BEFOREANY:
				case JOB_DEPEND_TYPE_BEFOREOK:
				case JOB_DEPEND_TYPE_BEFORENOTOK:

					/*
					 * Check job owner for permission, use the real
					 * job owner, not the sending server's name.
					 */

					(void)strcpy(preq->rq_user,
						preq->rq_ind.rq_register.rq_owner);
					if (svr_chk_owner(preq, pjob)) {
						rc = PBSE_PERM;		/* not same user */
					} else {
						/* ok owner, see if job has "on" */
						pdep = find_depend(JOB_DEPEND_TYPE_ON, pattr);
						if (pdep == 0) {
							/* on "on", see if child already registered */
							revtype = type ^ (JOB_DEPEND_TYPE_BEFORESTART -
								JOB_DEPEND_TYPE_AFTERSTART);
							pdep = find_depend(revtype, pattr);
							if (pdep == 0) {
								/* no "on" and no prior - return error */
								rc = PBSE_BADDEPEND;
							} else {
								pdj = find_dependjob(pdep,
									preq->rq_ind.rq_register.rq_child);
								if (pdj) {
									/* has prior register, update it */
									(void)strcpy(pdj->dc_svr,
										preq->rq_ind.rq_register.rq_svr);
								}
							}
						} else if ((rc=register_dep(pattr, preq, type, &made)) == 0) {
							if (made) {	/* first time registered */
								if (--pdep->dp_numexp <= 0)
									del_depend(pdep);
							}
						}
					}
					break;

				default:
#ifdef NAS /* localmod 109 */
					sprintf(log_buffer, "Unknown dep. op: %d", preq->rq_ind.rq_register.rq_op);
					log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_INFO,
						preq->rq_ind.rq_register.rq_parent, log_buffer);
#endif /* localmod 109 */
					rc = PBSE_IVALREQ;
					break;
			}
			break;

			/*
			 * Release a dependency so job might run
			 */

		case JOB_DEPEND_OP_RELEASE:
			switch (type) {

				case JOB_DEPEND_TYPE_BEFORESTART:
				case JOB_DEPEND_TYPE_BEFOREANY:
				case JOB_DEPEND_TYPE_BEFOREOK:
				case JOB_DEPEND_TYPE_BEFORENOTOK:

					/* predecessor sent release-reduce "on", */
					/* see if this job can now run 		 */

					type ^= (JOB_DEPEND_TYPE_BEFORESTART -
						JOB_DEPEND_TYPE_AFTERSTART);
					if ((pdep = find_depend(type, pattr)) != NULL) {
						pdj = find_dependjob(pdep,
							preq->rq_ind.rq_register.rq_child);
						if (pdj) {
							del_depend_job(pdj);
							pattr->at_flags |= ATR_VFLAG_MODIFY | ATR_VFLAG_MODCACHE;
							savetype = SAVEJOB_FULLFORCE;
							(void)sprintf(log_buffer, msg_registerrel,
								preq->rq_ind.rq_register.rq_child);
							log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
								LOG_INFO,
								pjob->ji_qs.ji_jobid, log_buffer);

							if (GET_NEXT(pdep->dp_jobs) == 0) {
								/* no more dependencies of this type */
								del_depend(pdep);
								set_depend_hold(pjob, pattr);
							}
							break;
						}
#ifdef NAS /* localmod 109 */
						sprintf(log_buffer, "Dep.rls. job not found: %d/%s", type, preq->rq_ind.rq_register.rq_child);
					} else {
						sprintf(log_buffer, "Dep.rls. type not found: %d", type);
#endif /* localmod 109 */
					}
#ifdef NAS /* localmod 109 */
					log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_INFO,
						preq->rq_ind.rq_register.rq_parent, log_buffer);
#endif /* localmod 109 */
					rc = PBSE_IVALREQ;
					break;

			}

			break;

		case JOB_DEPEND_OP_READY:
			rc = PBSE_NOSYNCMSTR;
			break;

		case JOB_DEPEND_OP_DELETE:
			(void)sprintf(log_buffer, msg_registerdel,
				preq->rq_ind.rq_register.rq_child);
			log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO,
				pjob->ji_qs.ji_jobid, log_buffer);
			job_abt(pjob, log_buffer);
			break;

		case JOB_DEPEND_OP_UNREG:
			unregister_dep(pattr, preq);
			set_depend_hold(pjob, pattr);
			break;


		default:
			sprintf(log_buffer, msg_illregister,
				preq->rq_ind.rq_register.rq_parent);
			log_event(PBSEVENT_DEBUG|PBSEVENT_SYSTEM|PBSEVENT_ERROR,
				PBS_EVENTCLASS_REQUEST, LOG_INFO,
				preq->rq_host, log_buffer);
			rc = PBSE_IVALREQ;
			break;;
	}

	if (rc) {
		pjob->ji_modified = 0;
		req_reject(rc, 0, preq);
	} else {
		/* If this is an array job, forcibly save it to ensure
		 * dependencies are recorded.
		 */
		if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_ArrayJob)
			savetype = SAVEJOB_FULLFORCE;
		if (pjob->ji_modified)
			(void)job_save(pjob, savetype);
		reply_ack(preq);
	}
	return;
}