Beispiel #1
0
/**
 * @brief
 *	Work-task callback invoked with the reply to a rerun signal request.
 *
 * @par
 *	When the reply code is non-zero and the job named in the signal
 *	request can still be found, the rejection is logged against that job.
 *	If the reply code is PBSE_UNKJOBID and the request carries no
 *	rq_extra value, the job is moved to substate JOB_SUBSTATE_RERUN3,
 *	discarded with the reason "Force rerun" and forcibly requeued.
 *	The request attached to the task is released on every path.
 *
 * @param[in]	pwt - work task whose wt_parm1 is the batch request
 */
static void
post_rerun(struct work_task *pwt)
{
	struct batch_request *sigreq = (struct batch_request *)pwt->wt_parm1;
	job *target = NULL;

	/* the job is only looked up when the reply reports a failure */
	if (sigreq->rq_reply.brp_code != 0 &&
		(target = find_job(sigreq->rq_ind.rq_signal.rq_jid)) != NULL) {

		(void)sprintf(log_buffer, "rerun signal reject by mom: %d",
			sigreq->rq_reply.brp_code);
		log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO,
			sigreq->rq_ind.rq_signal.rq_jid, log_buffer);

		if (sigreq->rq_reply.brp_code == PBSE_UNKJOBID && !sigreq->rq_extra) {
			target->ji_qs.ji_substate = JOB_SUBSTATE_RERUN3;
			discard_job(target, "Force rerun", 1);
			force_reque(target);
		}
	}

	release_req(pwt);
}
Beispiel #2
0
/**
 * @brief
 *	Work-task callback invoked with the reply to a checkpoint request.
 *
 * @par
 *	On a zero reply code the checkpoint succeeded: if the reply's auxcode
 *	is set the job's JOB_SVFLG_CHKPT flag is replaced by
 *	JOB_SVFLG_ChkptMig and the job is saved; a PBS_ACCT_CHKPNT accounting
 *	record is written in either case.
 *	On any non-zero reply code other than PBSE_CKPBSY the checkpoint flag
 *	is cleared, the substate is reset to JOB_SUBSTATE_RUNNING, the job is
 *	saved and, if its state is JOB_STATE_RUNNING, rerun_or_kill() is
 *	called for it.
 *
 * @param[in]	ptask - work task whose wt_parm1 is the batch request
 */
static void
post_chkpt(struct work_task *ptask)
{
	job		     *pjob;
	struct batch_request *preq;

	preq = (struct batch_request *)ptask->wt_parm1;
	/* validate the request before reading the job name out of it */
	if (preq == NULL)
		return;

	pjob = find_job(preq->rq_ind.rq_hold.rq_orig.rq_objname);
	if (pjob == NULL) {
		/* the job is gone, but the request still has to be released */
		release_req(ptask);
		return;
	}

	if (preq->rq_reply.brp_code == 0) {
		/* checkpointed ok */
		if (preq->rq_reply.brp_auxcode) { /* chkpt can be moved */
			pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHKPT;
			pjob->ji_qs.ji_svrflags |= JOB_SVFLG_ChkptMig;
			pjob->ji_modified = 1;
			(void)job_save(pjob, SAVEJOB_QUICK);
		}
		account_record(PBS_ACCT_CHKPNT, pjob, (char *)0);
	} else {
		/* need to try rerun if possible or just abort the job */
		if (preq->rq_reply.brp_code != PBSE_CKPBSY) {
			pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHKPT;
			pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING;
			pjob->ji_modified = 1;
			(void)job_save(pjob, SAVEJOB_QUICK);
			if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
				rerun_or_kill(pjob, msg_on_shutdown);
		}
	}

	release_req(ptask);
}
/*
 * chkpt_xfr_hold - work-task callback run with the reply to a checkpoint
 * file copy request.
 *
 * The job pointer is taken from the request's rq_extra field.  At
 * LOGLEVEL 7 or above the job's current state/substate is logged.  The
 * request is then released and an immediate work task is queued to run
 * mom_cleanup_checkpoint_hold() with the job as its parameter.
 *
 * ptask - work task whose wt_parm1 is the batch request
 */
void chkpt_xfr_hold(

  struct work_task *ptask)

  {
  job       *pjob;

  struct batch_request *preq;

  preq = (struct batch_request *)ptask->wt_parm1;
  pjob = (job *)preq->rq_extra;

  if (LOGLEVEL >= 7)
    {
    sprintf(log_buffer,
      "BLCR copy completed (state is %s-%s)",
      PJobState[pjob->ji_qs.ji_state],
      PJobSubState[pjob->ji_qs.ji_substate]);
    LOG_EVENT(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);
    }
  
  release_req(ptask);

  /* the returned task handle was never used, so it is not kept */
  set_task(WORK_Immed, 0, mom_cleanup_checkpoint_hold, (void*)pjob);

  return;
  }  /* END chkpt_xfr_hold() */
Beispiel #4
0
/*
 * chkpt_xfr_done - work-task callback that only releases the request
 * attached to the task.
 *
 * Neither the job nor the request is inspected here.  If that is ever
 * added, thread protection must be added with it.
 *
 * ptask - the work task handed to release_req()
 */
void chkpt_xfr_done(struct work_task *ptask)
{
  release_req(ptask);
}  /* END chkpt_xfr_done() */
/*
 * chkpt_xfr_done - work-task callback that releases the request attached
 * to the task.
 *
 * The request and job pointers used to be fetched here but were never
 * used; the dead locals (and the needless dereference of the request
 * they required) have been removed.
 *
 * ptask - the work task handed to release_req()
 */
void chkpt_xfr_done(

  struct work_task *ptask)

  {
  release_req(ptask);

  return;
  }  /* END chkpt_xfr_done() */
Beispiel #6
0
static void job_delete_nanny(

  struct work_task *pwt)

  {
  job *pjob;
  char *sigk = "SIGKILL";

  struct batch_request *newreq;

  /* short-circuit if nanny isn't enabled */

  if (!server.sv_attr[SRV_ATR_JobNanny].at_val.at_long)
    {
    release_req(pwt);

    return;
    }

  pjob = (job *)pwt->wt_parm1;

  sprintf(log_buffer, "exiting job '%s' still exists, sending a SIGKILL",
          pjob->ji_qs.ji_jobid);

  log_err(-1, "job nanny", log_buffer);

  /* build up a Signal Job batch request */

  if ((newreq = alloc_br(PBS_BATCH_SignalJob)) != NULL)
    {
    strcpy(newreq->rq_ind.rq_signal.rq_jid, pjob->ji_qs.ji_jobid);
    strncpy(newreq->rq_ind.rq_signal.rq_signame, sigk, PBS_SIGNAMESZ);
    }

  issue_signal(pjob, sigk, post_job_delete_nanny, newreq);


  apply_job_delete_nanny(pjob, time_now + 60);

  return;
  } /* END job_delete_nanny() */
Beispiel #7
0
/*
 * post_checkpoint - work-task callback run with the reply to a
 * checkpoint request.
 *
 * The job is looked up by the object name stored in the hold request and
 * may no longer exist; both branches therefore guard against a NULL job.
 *
 * Reply code zero: if the reply's auxcode is set, the job's
 * JOB_SVFLG_CHECKPOINT_FILE flag is cleared and JOB_SVFLG_HASRUN and
 * JOB_SVFLG_CHECKPOINT_MIGRATEABLE are set.
 *
 * Reply code non-zero: the checkpoint-file flag is cleared, the substate
 * is reset to JOB_SUBSTATE_RUNNING and, if the job state is
 * JOB_STATE_RUNNING, rerun_or_kill() is called for it.
 *
 * The request is released on every path.
 *
 * ptask - work task whose wt_parm1 is the batch request
 */
static void post_checkpoint(

  struct work_task *ptask)

  {
  job                  *pjob;

  struct batch_request *preq;

  preq = (struct batch_request *)ptask->wt_parm1;
  pjob = find_job(preq->rq_ind.rq_hold.rq_orig.rq_objname);

  if (preq->rq_reply.brp_code == 0)
    {
    /* checkpointed ok */
    /* the job may have gone away while the request was outstanding */
    if ((pjob != NULL) && preq->rq_reply.brp_auxcode) /* checkpoint can be moved */
      {
      pjob->ji_qs.ji_svrflags =
        (pjob->ji_qs.ji_svrflags & ~JOB_SVFLG_CHECKPOINT_FILE) |
        JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_MIGRATEABLE;

      }
    }
  else
    {
    /* need to try rerun if possible or just abort the job */

    if (pjob != NULL)
      {
      pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHECKPOINT_FILE;
      pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING;

      if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
        rerun_or_kill(pjob, msg_on_shutdown);
      }
    }

  release_req(ptask);
  }  /* END post_checkpoint() */
Beispiel #8
0
/**
 * @brief
 *	Work-task callback invoked with the reply to a dependency register
 *	request sent on behalf of a child job.
 *
 * @par
 *	Nothing is done for a zero reply code beyond releasing the request.
 *	For a non-zero reply code the rejection is logged against the child
 *	job.  If the child still exists:
 *	  - when the reply is PBSE_JOB_MOVED and the parent job is found
 *	    locally in the MOVED state/substate with a destination containing
 *	    '@', the register request is re-sent to the server named after
 *	    the '@', with this function again as the reply handler; if
 *	    sending fails the child is aborted;
 *	  - in every other case the child is aborted.
 *	Aborts always use the text assembled in log_buffer (msg_regrej, the
 *	parent id and, when available, the error text), not the local
 *	log_msg buffer.
 *
 * @param[in]	pwt - work task whose wt_parm1 is the batch request
 */
static void
post_doq(struct work_task *pwt)
{
	struct batch_request *preq = (struct batch_request *)pwt->wt_parm1;
	char *jobid = preq->rq_ind.rq_register.rq_child;
	char *msg;
	job  *pjob;
	job  *ppjob;
	struct depend_job pparent;
	int rc;

	if (preq->rq_reply.brp_code) {
		/* request was rejected */

		(void)strcpy(log_buffer, msg_regrej);
		(void)strcat(log_buffer, preq->rq_ind.rq_register.rq_parent);

		log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO,
			jobid, log_buffer);
		pjob = find_job(jobid);
		/* the error text is appended after logging, so it only reaches the abort message */
		if ((msg = pbse_to_txt(preq->rq_reply.brp_code)) != NULL) {
			(void)strcat(log_buffer, " ");
			(void)strcat(log_buffer, msg);
		}
		if (pjob) {
			if (preq->rq_reply.brp_code == PBSE_JOB_MOVED) {
				/* Creating a separate log buffer because if we end up aborting the submitted job
				 * we don't want to change what goes into accounting log via job_abt
				 */
				char log_msg[LOG_BUF_SIZE];
				snprintf(log_msg, sizeof(log_msg), "%s, %s", msg_job_moved,
					"sending dependency request to remote server");
				log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, jobid, log_msg);
				ppjob = find_job(preq->rq_ind.rq_register.rq_parent);
				if(ppjob && (ppjob->ji_qs.ji_state == JOB_STATE_MOVED) && (ppjob->ji_qs.ji_substate == JOB_SUBSTATE_MOVED)) {
					char *destin;
					/* job destination should be <remote queue>@<remote server> */
					destin = strchr(ppjob->ji_qs.ji_destin, (int)'@');
					if (destin != NULL) {
						/*
						 * strncpy() does not terminate the destination when the
						 * source fills it, so copy one byte less and terminate
						 * explicitly.
						 */
						strncpy(pparent.dc_child, ppjob->ji_qs.ji_jobid, sizeof(pparent.dc_child) - 1);
						pparent.dc_child[sizeof(pparent.dc_child) - 1] = '\0';
						strncpy(pparent.dc_svr, destin+1, sizeof(pparent.dc_svr) - 1);
						pparent.dc_svr[sizeof(pparent.dc_svr) - 1] = '\0';
						rc = send_depend_req(pjob, &pparent, preq->rq_ind.rq_register.rq_dependtype,
							JOB_DEPEND_OP_REGISTER,
							SYNC_SCHED_HINT_NULL, post_doq);
						if (rc) {
							snprintf(log_msg, sizeof(log_msg), "%s",
								"Failed to send dependency request to remote server, aborting job");
							log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_ERR, jobid, log_msg);
							check_block(pjob, log_buffer);
							job_abt(pjob, log_buffer);
						}
					}
					else {
						/* Ideally if a job is moved, destination can not be empty */
						/* If we come across an empty destination, abort the job */
						check_block(pjob, log_buffer);
						job_abt(pjob, log_buffer);
					}
				}
				else {
					/* parent not found locally, or not in the MOVED state */
					check_block(pjob, log_buffer);
					job_abt(pjob, log_buffer);
				}
			}
			else {
				/* any rejection other than "job moved" aborts the child */
				check_block(pjob, log_buffer);
				job_abt(pjob, log_buffer);
			}
		}
	}

	release_req(pwt);
}
Beispiel #9
0
/*
 * post_delete_mom1 - first-stage work-task callback for a job delete,
 * run with MOM's reply to the signal request.
 *
 * The signal request is released straight away; the original client
 * request (kept in the signal request's rq_extra) is answered here.
 *
 *  - job no longer found:        client request rejected, PBSE_UNKJOBID
 *  - MOM replied PBSE_UNKJOBID:  job's nodes and resources are given
 *                                back, the job is purged, client acked
 *  - MOM replied any other error: client request rejected with it
 *  - MOM accepted the signal:    client acked, post_delete_mom2() is
 *                                scheduled after the kill delay and the
 *                                delete nanny is pushed out to 60
 *                                seconds after that
 *
 * The kill delay comes from the client's extend string when it starts
 * with deldelaystr; if that yields zero, attr_ifelse_long() picks it
 * from the queue/server KillDelay attributes with 2 as its last argument.
 *
 * pwt - work task whose wt_parm1 is the signal request sent to MOM
 */
static void post_delete_mom1(struct work_task *pwt)
{
  int                   kill_wait  = 0;
  int                   prefix_len = strlen(deldelaystr);
  struct batch_request *sig_req    = pwt->wt_parm1;            /* signal request to MOM */
  int                   mom_rc     = sig_req->rq_reply.brp_code;
  struct batch_request *client_req = sig_req->rq_extra;        /* original client request */
  struct work_task     *followup;
  pbs_queue            *que;
  job                  *target;

  release_req(pwt);

  target = find_job(client_req->rq_ind.rq_delete.rq_objname);

  if (target == NULL)
    {
    /* the job went away while the signal was outstanding */
    req_reject(PBSE_UNKJOBID, 0, client_req, NULL, NULL);

    return;
    }

  if (mom_rc != 0)
    {
    /* MOM refused the signal */
    if (mom_rc != PBSE_UNKJOBID)
      {
      req_reject(mom_rc, 0, client_req, NULL, NULL);

      return;
      }

    /* MOM does not know the job, so just purge it */
    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      target->ji_qs.ji_jobid,
      "MOM rejected signal during delete");

    /* hand back the resources that were assigned to the job */
    free_nodes(target);
    set_resc_assigned(target, DECR);
    job_purge(target);

    reply_ack(client_req);

    return;
    }

  /* the extend string must be read before the client request is acked */
  if ((client_req->rq_extend != NULL) &&
      (strncmp(client_req->rq_extend, deldelaystr, prefix_len) == 0))
    {
    kill_wait = atoi(client_req->rq_extend + prefix_len);
    }

  reply_ack(client_req);  /* client request no longer needed */

  if (kill_wait == 0)
    {
    /* no delay in the request: use the KillDelay attribute lookup */
    que = target->ji_qhdr;

    kill_wait = attr_ifelse_long(&que->qu_attr[QE_ATR_KillDelay],
                                 &server.sv_attr[SRV_ATR_KillDelay],
                                 2);
    }

  followup = set_task(WORK_Timed, kill_wait + time_now, post_delete_mom2, target);

  if (followup != NULL)
    {
    /* link the task to the job so it is removed if the job goes away */
    append_link(&target->ji_svrtask, &followup->wt_linkobj, followup);
    }

  /* first signal succeeded: move the nanny to one minute past phase two */
  apply_job_delete_nanny(target, time_now + kill_wait + 60);
}  /* END post_delete_mom1() */
Beispiel #10
0
/*
 * post_job_delete_nanny - work-task callback run with MOM's reply to the
 * SIGKILL sent by the delete nanny.
 *
 * When the server's JobNanny attribute is still on:
 *  - if the job named in the signal request cannot be found, an error
 *    is logged;
 *  - otherwise, if MOM replied PBSE_UNKJOBID, an error is logged, the
 *    job's nodes and resources are given back and the job is purged.
 * The request is released in every case.
 *
 * pwt - work task whose wt_parm1 is the signal request sent to MOM
 */
static void post_job_delete_nanny(struct work_task *pwt)
{
  struct batch_request *sig_req = pwt->wt_parm1;   /* signal request to MOM */
  int                   mom_rc  = sig_req->rq_reply.brp_code;
  job                  *target;

  /* skipped entirely if the admin turned the nanny off in the meantime */
  if (server.sv_attr[SRV_ATR_JobNanny].at_val.at_long)
    {
    /* the job id travels in the signal request */
    target = find_job(sig_req->rq_ind.rq_signal.rq_jid);

    if (target == NULL)
      {
      sprintf(log_buffer, "job delete nanny: the job disappeared (this is a BUG!)");

      LOG_EVENT(
        PBSEVENT_ERROR,
        PBS_EVENTCLASS_JOB,
        sig_req->rq_ind.rq_signal.rq_jid,
        log_buffer);
      }
    else if (mom_rc == PBSE_UNKJOBID)
      {
      sprintf(log_buffer, "job delete nanny returned, but does not exist on mom");

      LOG_EVENT(
        PBSEVENT_ERROR,
        PBS_EVENTCLASS_JOB,
        sig_req->rq_ind.rq_signal.rq_jid,
        log_buffer);

      free_nodes(target);
      set_resc_assigned(target, DECR);
      job_purge(target);
      }
    }

  /* single exit: the request is always released */
  release_req(pwt);
}  /* END post_job_delete_nanny() */
Beispiel #11
0
/*
 * post_stagein - work-task callback run with MOM's reply to a stage-in
 * request.
 *
 * The request's rq_extra value is passed to find_job() to locate the job
 * and is then freed.  The field is reset to NULL afterwards because the
 * request itself stays in use (it is handed to svr_send_checkpoint() and
 * finally to release_req()) and must not carry a dangling pointer.
 *
 * Reply code non-zero (stage-in failed): the job's nodes are freed; if
 * no execution time is set, one is set to time_now + PBS_STAGEFAIL_WAIT;
 * the job is put in WAITING/STAGEFAIL; and if the reply carries text it
 * is mailed to the owner.
 *
 * Reply code zero: the job is flagged JOB_SVFLG_StagedIn.  If its
 * substate is JOB_SUBSTATE_STAGEGO it either has its checkpoint file
 * sent to MOM (when is_checkpoint_restart() says so) or continues
 * starting through svr_strtjob2(); otherwise its state is re-evaluated.
 *
 * Nothing but the cleanup is done when the job cannot be found.
 *
 * pwt - work task whose wt_parm1 is the batch request
 */
static void post_stagein(

  struct work_task *pwt)

  {
  int        code;
  int        newstate;
  int        newsub;
  job       *pjob;

  struct batch_request *preq;
  attribute      *pwait;

  preq = pwt->wt_parm1;
  code = preq->rq_reply.brp_code;
  pjob = find_job(preq->rq_extra);

  free(preq->rq_extra);

  /* preq lives on below; do not leave a pointer to freed memory in it */
  preq->rq_extra = NULL;

  if (pjob != NULL)
    {
    if (code != 0)
      {
      /* stage in failed - hold job */

      free_nodes(pjob);

      pwait = &pjob->ji_wattr[(int)JOB_ATR_exectime];

      if ((pwait->at_flags & ATR_VFLAG_SET) == 0)
        {
        pwait->at_val.at_long = time_now + PBS_STAGEFAIL_WAIT;

        pwait->at_flags |= ATR_VFLAG_SET;

        job_set_wait(pwait, pjob, 0);
        }

      svr_setjobstate(pjob, JOB_STATE_WAITING, JOB_SUBSTATE_STAGEFAIL);

      if (preq->rq_reply.brp_choice == BATCH_REPLY_CHOICE_Text)
        {
        /* set job comment */

        /* NYI */

        svr_mailowner(
          pjob,
          MAIL_STAGEIN,
          MAIL_FORCE,
          preq->rq_reply.brp_un.brp_txt.brp_str);
        }
      }
    else
      {
      /* stage in was successful */

      pjob->ji_qs.ji_svrflags |= JOB_SVFLG_StagedIn;

      if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_STAGEGO)
        {
        if (is_checkpoint_restart(pjob))
          {
          /* need to copy checkpoint file to mom before running */
          svr_send_checkpoint(
              pjob,
              preq,
              JOB_STATE_RUNNING,
              JOB_SUBSTATE_CHKPTGO);
          }
        else
          {
          /* continue to start job running */

          svr_strtjob2(pjob, NULL);
          }
        }
      else
        {
        svr_evaljobstate(pjob, &newstate, &newsub, 0);

        svr_setjobstate(pjob, newstate, newsub);
        }
      }
    }    /* END if (pjob != NULL) */

  release_req(pwt); /* close connection and release request */

  return;
  }  /* END post_stagein() */
Beispiel #12
0
/*
 * post_checkpointsend - work-task callback run with MOM's reply to a
 * checkpoint file copy request.
 *
 * The request's rq_extra value is passed to find_job() to locate the job
 * and is then freed.  The field is reset to NULL afterwards because the
 * request is still handed to release_req() and must not carry a dangling
 * pointer.
 *
 * Reply code non-zero (copy failed): the job's nodes are freed; if no
 * execution time is set, one is set to time_now + PBS_STAGEFAIL_WAIT;
 * the job is put in WAITING/STAGEFAIL; and if the reply carries text it
 * is logged and mailed to the owner.
 *
 * Reply code zero: the job is flagged JOB_SVFLG_CHECKPOINT_COPIED, its
 * restart_name attribute is set from checkpoint_name, the job is fully
 * saved and start-up continues through svr_strtjob2().
 *
 * Nothing but the cleanup is done when the job cannot be found.
 *
 * pwt - work task whose wt_parm1 is the batch request
 */
static void post_checkpointsend(

  struct work_task *pwt)

  {
  int        code;
  job       *pjob;

  struct batch_request *preq;
  attribute      *pwait;

  preq = pwt->wt_parm1;
  code = preq->rq_reply.brp_code;
  pjob = find_job(preq->rq_extra);

  free(preq->rq_extra);

  /* preq lives on below; do not leave a pointer to freed memory in it */
  preq->rq_extra = NULL;

  if (pjob != NULL)
    {
    if (code != 0)
      {
      /* copy failed - hold job */

      free_nodes(pjob);

      pwait = &pjob->ji_wattr[(int)JOB_ATR_exectime];

      if ((pwait->at_flags & ATR_VFLAG_SET) == 0)
        {
        pwait->at_val.at_long = time_now + PBS_STAGEFAIL_WAIT;

        pwait->at_flags |= ATR_VFLAG_SET;

        job_set_wait(pwait, pjob, 0);
        }

      svr_setjobstate(pjob, JOB_STATE_WAITING, JOB_SUBSTATE_STAGEFAIL);

      if (preq->rq_reply.brp_choice == BATCH_REPLY_CHOICE_Text)
        {

        /* NOTE(review): the reply text length is not bounded here -
         * confirm it cannot exceed the size of log_buffer */
        sprintf(log_buffer, "Failed to copy checkpoint file to mom - %s",
                preq->rq_reply.brp_un.brp_txt.brp_str);

        log_event(
          PBSEVENT_JOB,
          PBS_EVENTCLASS_JOB,
          pjob->ji_qs.ji_jobid,
          log_buffer);

        /* NYI */

        svr_mailowner(
          pjob,
          MAIL_CHKPTCOPY,
          MAIL_FORCE,
          preq->rq_reply.brp_un.brp_txt.brp_str);
        }
      }
    else
      {
      /* checkpoint copy was successful */

      pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHECKPOINT_COPIED;
      
      /* set restart_name attribute to the checkpoint_name we just copied */
      
      job_attr_def[(int)JOB_ATR_restart_name].at_set(
        &pjob->ji_wattr[(int)JOB_ATR_restart_name],
        &pjob->ji_wattr[(int)JOB_ATR_checkpoint_name],
        SET);

      pjob->ji_modified = 1;
      
      job_save(pjob, SAVEJOB_FULL);
      
      /* continue to start job running */

      svr_strtjob2(pjob, NULL);
      }
    }    /* END if (pjob != NULL) */

  release_req(pwt); /* close connection and release request */

  return;
  }  /* END post_checkpointsend() */
Beispiel #13
0
static void stat_update(

  struct work_task *pwt)

  {

  struct stat_cntl     *cntl;
  job                  *pjob;

  struct batch_request *preq;

  struct batch_reply   *preply;

  struct brp_status    *pstatus;
  svrattrl        *sattrl;
  int    oldsid;

  preq = pwt->wt_parm1;
  preply = &preq->rq_reply;
  cntl = preq->rq_extra;

  if (preply->brp_choice == BATCH_REPLY_CHOICE_Status)
    {
    pstatus = (struct brp_status *)GET_NEXT(preply->brp_un.brp_status);

    while (pstatus != NULL)
      {
      if ((pjob = find_job(pstatus->brp_objname)))
        {
        sattrl = (svrattrl *)GET_NEXT(pstatus->brp_attr);

        oldsid = pjob->ji_wattr[(int)JOB_ATR_session_id].at_val.at_long;

        modify_job_attr(
          pjob,
          sattrl,
          ATR_DFLAG_MGWR | ATR_DFLAG_SvWR,
          &bad);

        if (oldsid != pjob->ji_wattr[(int)JOB_ATR_session_id].at_val.at_long)
          {
          /* first save since running job (or the sid has changed), */
          /* must save session id    */

          job_save(pjob, SAVEJOB_FULL);

          svr_mailowner(pjob, MAIL_BEGIN, MAIL_NORMAL, NULL);
          }

#ifdef USESAVEDRESOURCES
        else
          {
          /* save so we can recover resources used */
          job_save(pjob, SAVEJOB_FULL);
          }
#endif    /* USESAVEDRESOURCES */


        pjob->ji_momstat = time_now;
        }

      pstatus = (struct brp_status *)GET_NEXT(pstatus->brp_stlink);
      }  /* END while (pstatus != NULL) */
    }    /* END if (preply->brp_choice == BATCH_REPLY_CHOICE_Status) */
  else
    {
    if (preply->brp_code == PBSE_UNKJOBID)
      {
      /* we sent a stat request, but mom says it doesn't know anything about
         the job */
      if ((pjob = find_job(preq->rq_ind.rq_status.rq_id)))
        {
        /* job really isn't running any more - mom doesn't know anything about it
           this can happen if a diskless node reboots and the mom_priv/jobs
           directory is cleared, set its state to queued so job_abt doesn't
           think it is still running */
        svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_ABORT);
        rel_resc(pjob);
        job_abt(&pjob, "Job does not exist on node");

        /* TODO, if the job is rerunnable we should set its state back to queued */

        }
      }
    }

  release_req(pwt);

  cntl->sc_conn = -1;

  if (cntl->sc_post)
    cntl->sc_post(cntl); /* continue where we left off */
  else
    free(cntl); /* a bit of a kludge but its saves an extra func */

  return;
  }  /* END stat_update() */