Пример #1
0
/**
 * post_chkpt - handle the reply to a checkpoint request carried by a
 * work task.
 *
 * @param ptask  work task whose wt_parm1 holds the batch request
 *
 * Reply code 0 (checkpoint succeeded):
 *   - if the reply's aux code is non-zero, the job's CHKPT flag is replaced
 *     by ChkptMig and the job is saved;
 *   - a PBS_ACCT_CHKPNT accounting record is written.
 * Any other reply code except PBSE_CKPBSY:
 *   - the CHKPT flag is cleared, the substate is set back to RUNNING, the
 *     job is saved, and a job in RUNNING state is handed to rerun_or_kill().
 *
 * The work task is passed to release_req() at the end of the normal path.
 */
static void
post_chkpt(struct work_task *ptask)
{
	job		     *pjob;
	struct batch_request *preq;

	preq = (struct batch_request *)ptask->wt_parm1;

	/* validate the request BEFORE reading the job name out of it */
	if (!preq)
		return;

	pjob = find_job(preq->rq_ind.rq_hold.rq_orig.rq_objname);
	if (!pjob) {
		/*
		 * NOTE(review): as in the original code, the request is not
		 * passed to release_req() on this path - confirm ownership.
		 */
		return;
	}

	if (preq->rq_reply.brp_code == 0) {
		/* checkpointed ok */
		if (preq->rq_reply.brp_auxcode) { /* chkpt can be moved */
			pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHKPT;
			pjob->ji_qs.ji_svrflags |= JOB_SVFLG_ChkptMig;
			pjob->ji_modified = 1;
			(void)job_save(pjob, SAVEJOB_QUICK);
		}
		account_record(PBS_ACCT_CHKPNT, pjob, (char *)0);
	} else {
		/* need to try rerun if possible or just abort the job */
		if (preq->rq_reply.brp_code != PBSE_CKPBSY) {
			pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHKPT;
			pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING;
			pjob->ji_modified = 1;
			(void)job_save(pjob, SAVEJOB_QUICK);
			if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
				rerun_or_kill(pjob, msg_on_shutdown);
		}
	}

	release_req(ptask);
}
Пример #2
0
/*
 * post_checkpoint - finish processing of an answered checkpoint request.
 *
 * preq: the answered request. A NULL pointer is ignored; otherwise the
 *       request is freed with free_br() before returning.
 *
 * The job named in the request is looked up with svr_find_job().
 *   reply code == 0 and aux code != 0:
 *       the job's CHECKPOINT_FILE flag is cleared and HASRUN plus
 *       CHECKPOINT_MIGRATEABLE are set.
 *   reply code != 0:
 *       CHECKPOINT_FILE is cleared, the substate is set back to RUNNING,
 *       and a job in RUNNING state is handed to rerun_or_kill().
 * If a job pointer is still held at the end, its mutex is unlocked.
 */
void post_checkpoint(

  batch_request *preq)

  {
  job *pjob;

  if (preq == NULL)
    return;

  pjob = svr_find_job(preq->rq_ind.rq_hold.rq_orig.rq_objname, FALSE);

  if (preq->rq_reply.brp_code != 0)
    {
    /* checkpoint failed: rerun the job if possible, otherwise abort it */
    if (pjob != NULL)
      {
      pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING;
      pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHECKPOINT_FILE;

      /* rerun_or_kill() is given &pjob, so pjob is re-checked below */
      if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
        rerun_or_kill(&pjob, msg_on_shutdown);
      }
    }
  else if ((pjob != NULL) && (preq->rq_reply.brp_auxcode))
    {
    /* checkpointed ok and the checkpoint can be moved */
    pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHECKPOINT_FILE;
    pjob->ji_qs.ji_svrflags |=
      JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_MIGRATEABLE;
    }

  free_br(preq);

  if (pjob != NULL)
    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
  }  /* END post_checkpoint() */
Пример #3
0
static void post_checkpoint(

  struct work_task *ptask)

  {
  job                  *pjob;

  struct batch_request *preq;

  preq = (struct batch_request *)ptask->wt_parm1;
  pjob = find_job(preq->rq_ind.rq_hold.rq_orig.rq_objname);

  if (preq->rq_reply.brp_code == 0)
    {
    /* checkpointed ok */
    if (preq->rq_reply.brp_auxcode) /* checkpoint can be moved */
      {
      pjob->ji_qs.ji_svrflags =
        (pjob->ji_qs.ji_svrflags & ~JOB_SVFLG_CHECKPOINT_FILE) |
        JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_MIGRATEABLE;

      }
    }
  else
    {
    /* need to try rerun if possible or just abort the job */

    if (pjob)
      {
      pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHECKPOINT_FILE;
      pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING;

      if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
        rerun_or_kill(pjob, msg_on_shutdown);
      }
    }

  release_req(ptask);
  }  /* END post_checkpoint() */
Пример #4
0
void
svr_shutdown(int type)
{
	attribute	  *pattr;
	job		  *pjob;
	job		  *pnxt;
	long		 *state;
	int		  wait_for_secondary = 0;

	/* Lets start by logging shutdown and saving everything */

	state = &server.sv_attr[(int)SRV_ATR_State].at_val.at_long;
	(void)strcpy(log_buffer, msg_shutdown_start);

	if (*state == SV_STATE_SHUTIMM) {

		/* if already shuting down, another Immed/sig will force it */

		if ((type == SHUT_IMMEDIATE) || (type == SHUT_SIG)) {
			*state = SV_STATE_DOWN;
			(void)strcat(log_buffer, "Forced");
			log_event(PBSEVENT_SYSTEM|PBSEVENT_ADMIN|PBSEVENT_DEBUG,
				PBS_EVENTCLASS_SERVER, LOG_NOTICE,
				msg_daemonname, log_buffer);
			return;
		}
	}

	/* in failover environments, need to communicate with Secondary */
	/* and for these two where the Primary is going down, mark to   */
	/* wait for the acknowledgement from the Secondary              */

	if (type & SHUT_WHO_SECDRY) {
		if (failover_send_shutdown(FAILOVER_SecdShutdown) == 0)
			wait_for_secondary = 1;
	} else if (type & SHUT_WHO_IDLESECDRY) {
		if (failover_send_shutdown(FAILOVER_SecdGoInactive) == 0)
			wait_for_secondary = 1;
	}

	/* what is the manner of our demise? */

	type = type & SHUT_MASK;
	if (type == SHUT_IMMEDIATE) {
		*state = SV_STATE_SHUTIMM;
		(void)strcat(log_buffer, "Immediate");

	} else if (type == SHUT_DELAY) {
		*state = SV_STATE_SHUTDEL;
		(void)strcat(log_buffer, "Delayed");

	} else if (type == SHUT_QUICK) {
		*state = SV_STATE_DOWN; /* set to down to brk pbsd_main loop */
		(void)strcat(log_buffer, "Quick");

	} else {
		*state = SV_STATE_DOWN;
		(void)strcat(log_buffer, "By Signal");
		type = SHUT_QUICK;
	}
	log_event(PBSEVENT_SYSTEM|PBSEVENT_ADMIN|PBSEVENT_DEBUG,
		PBS_EVENTCLASS_SERVER, LOG_NOTICE, msg_daemonname, log_buffer);

	if (wait_for_secondary)
		*state |= SV_STATE_PRIMDLY; /* wait for reply from Secondary */

	if (type == SHUT_QUICK) /* quick, leave jobs as are */
		return;

	svr_save_db(&server, SVR_SAVE_QUICK);

	pnxt = (job *)GET_NEXT(svr_alljobs);
	while ((pjob = pnxt) != (job *)0) {
		pnxt = (job *)GET_NEXT(pjob->ji_alljobs);

		if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) {

			pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HOTSTART;
			pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN;
			pattr = &pjob->ji_wattr[(int)JOB_ATR_chkpnt];
			if ((pattr->at_val.at_str) &&
				(*pattr->at_val.at_str != 'n')) {
				/* do checkpoint of job */

				if (shutdown_chkpt(pjob) == 0)
					continue;
			}

			/* if not checkpoint (not supported, not allowed, or fails */
			/* rerun if possible, else kill job			   */

			rerun_or_kill(pjob, msg_on_shutdown);
		}
	}
	return;
}
Пример #5
0
void svr_shutdown(

  int type) /* I */

  {
  attribute *pattr;
  job     *pjob;
  job     *pnxt;
  long     *state;

  /* Lets start by logging shutdown and saving everything */

  state = &server.sv_attr[(int)SRV_ATR_State].at_val.at_long;

  strcpy(log_buffer, msg_shutdown_start);

  if (*state == SV_STATE_SHUTIMM)
    {
    /* if already shuting down, another Immed/sig will force it */

    if ((type == SHUT_IMMEDIATE) || (type == SHUT_SIG))
      {
      *state = SV_STATE_DOWN;

      strcat(log_buffer, "Forced");

      log_event(
        PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_DEBUG,
        PBS_EVENTCLASS_SERVER,
        msg_daemonname,
        log_buffer);

      return;
      }
    }

  if (type == SHUT_IMMEDIATE)
    {
    *state = SV_STATE_SHUTIMM;

    strcat(log_buffer, "Immediate");
    }
  else if (type == SHUT_DELAY)
    {
    *state = SV_STATE_SHUTDEL;

    strcat(log_buffer, "Delayed");
    }
  else if (type == SHUT_QUICK)
    {
    *state = SV_STATE_DOWN; /* set to down to brk pbsd_main loop */

    strcat(log_buffer, "Quick");
    }
  else
    {
    *state = SV_STATE_SHUTIMM;

    strcat(log_buffer, "By Signal");
    }

  log_event(

    PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_DEBUG,
    PBS_EVENTCLASS_SERVER,
    msg_daemonname,
    log_buffer);

  if ((type == SHUT_QUICK) || (type == SHUT_SIG)) /* quick, leave jobs as are */
    {
    return;
    }

  svr_save(&server, SVR_SAVE_QUICK);

  pnxt = (job *)GET_NEXT(svr_alljobs);

  while ((pjob = pnxt) != NULL)
    {
    pnxt = (job *)GET_NEXT(pjob->ji_alljobs);

    if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
      {
      pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HOTSTART | JOB_SVFLG_HASRUN;

      pattr = &pjob->ji_wattr[(int)JOB_ATR_checkpoint];

      if ((pattr->at_flags & ATR_VFLAG_SET) &&
          ((csv_find_string(pattr->at_val.at_str, "s") != NULL) ||
           (csv_find_string(pattr->at_val.at_str, "c") != NULL) ||
           (csv_find_string(pattr->at_val.at_str, "shutdown") != NULL)))
        {
        /* do checkpoint of job */

        if (shutdown_checkpoint(pjob) == 0)
          continue;
        }

      /* if no checkpoint (not supported, not allowed, or fails */
      /* rerun if possible, else kill job */

      rerun_or_kill(pjob, msg_on_shutdown);
      }
    }

  return;
  }  /* END svr_shutdown() */
Пример #6
0
/*
 * svr_shutdown - start shutting the server down.
 *
 * type: requested manner of shutdown (SHUT_IMMEDIATE, SHUT_DELAY,
 *       SHUT_QUICK; anything else is logged as "By Signal").
 *
 * The lock file descriptor is closed and the queues are saved first. If the
 * server state is already SHUTIMM and another immediate/signal request
 * arrives, the state is forced to DOWN, logged, and we return. Otherwise
 * the server state is set from the type and the event is logged. For
 * SHUT_QUICK and SHUT_SIG nothing more is done. For the other types the
 * server is saved and each job in RUNNING state is flagged
 * HOTSTART|HASRUN, then either checkpointed via shutdown_checkpoint() or
 * handed to rerun_or_kill(). Any job pointer still held after processing
 * has its mutex unlocked.
 */
void svr_shutdown(

  int type) /* I */

  {
  pbs_attribute *ckpt_attr;
  job           *cur;
  const char    *manner;
  long           state = SV_STATE_DOWN;
  int            iter;
  char           log_buf[LOCAL_LOG_BUF_SIZE];

  close(lockfds);

  save_queues();

  /* Lets start by logging shutdown and saving everything */
  get_svr_attr_l(SRV_ATR_State, &state);

  strcpy(log_buf, msg_shutdown_start);

  /* if already shutting down, another Immed/sig will force it */
  if ((state == SV_STATE_SHUTIMM) &&
      ((type == SHUT_IMMEDIATE) || (type == SHUT_SIG)))
    {
    state = SV_STATE_DOWN;
    set_svr_attr(SRV_ATR_State, &state);

    strcat(log_buf, "Forced");

    log_event(
      PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_DEBUG,
      PBS_EVENTCLASS_SERVER,
      msg_daemonname,
      log_buf);

    return;
    }

  if (type == SHUT_DELAY)
    {
    state = SV_STATE_SHUTDEL;
    manner = "Delayed";
    }
  else if (type == SHUT_QUICK)
    {
    state = SV_STATE_DOWN; /* set to down to brk pbsd_main loop */
    manner = "Quick";
    }
  else
    {
    /* immediate and by-signal requests both enter the SHUTIMM state */
    state = SV_STATE_SHUTIMM;
    manner = (type == SHUT_IMMEDIATE) ? "Immediate" : "By Signal";
    }

  set_svr_attr(SRV_ATR_State, &state);

  strcat(log_buf, manner);

  log_event(
    PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_DEBUG,
    PBS_EVENTCLASS_SERVER,
    msg_daemonname,
    log_buf);

  if ((type == SHUT_QUICK) || (type == SHUT_SIG)) /* quick, leave jobs as are */
    return;

  svr_save(&server, SVR_SAVE_QUICK);

  iter = -1;

  for (cur = next_job(&alljobs, &iter);
       cur != NULL;
       cur = next_job(&alljobs, &iter))
    {
    if (cur->ji_qs.ji_state == JOB_STATE_RUNNING)
      {
      cur->ji_qs.ji_svrflags |= JOB_SVFLG_HOTSTART;
      cur->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN;

      ckpt_attr = &cur->ji_wattr[JOB_ATR_checkpoint];

      /* checkpoint the job when its checkpoint attribute is set and */
      /* lists "s", "c" or "shutdown" */

      if ((ckpt_attr->at_flags & ATR_VFLAG_SET) &&
          ((csv_find_string(ckpt_attr->at_val.at_str, "s") != NULL) ||
           (csv_find_string(ckpt_attr->at_val.at_str, "c") != NULL) ||
           (csv_find_string(ckpt_attr->at_val.at_str, "shutdown") != NULL)) &&
          (shutdown_checkpoint(&cur) == 0))
        {
        /* shutdown_checkpoint() is given &cur, so cur is re-checked */

        if (cur != NULL)
          unlock_ji_mutex(cur, __func__, "1", LOGLEVEL);

        continue;
        }

      /* if no checkpoint (not supported, not allowed, or fails) */
      /* rerun if possible, else kill job */

      rerun_or_kill(&cur, msg_on_shutdown);
      }

    /* rerun_or_kill() is given &cur, so cur is re-checked before unlocking */

    if (cur != NULL)
      unlock_ji_mutex(cur, __func__, "2", LOGLEVEL);
    }
  }  /* END svr_shutdown() */