コード例 #1
0
ファイル: test_uut.c プロジェクト: dooodlesomething/torque
END_TEST




START_TEST(rel_resc_test)
  {
  job pjob;

  rel_resc(&pjob);
  fail_unless(svr_do_schedule == SCH_SCHEDULE_TERM);
  fail_unless(listener_command == SCH_SCHEDULE_TERM);
  }
コード例 #2
0
ファイル: req_rerun.c プロジェクト: vinodchitrali/pbspro
/**
 * @brief
 * 		force_reque - requeue (rerun) a job
 *
 * @param[in,out]	pwt	-	job which needs to be rerun
 */
void
force_reque(job *pjob)
{
	int  newstate;
	int  newsubstate;

	pjob->ji_modified = 1;
	pjob->ji_momhandle = -1;
	pjob->ji_mom_prot = PROT_INVALID;

	/* simulate rerun: free nodes, clear checkpoint flag, and */
	/* clear exec_vnode string				  */

	rel_resc(pjob);

	/* note in accounting file */
	account_jobend(pjob, pjob->ji_acctrec, PBS_ACCT_RERUN);

	/* if a subjob,  we set substate to RERUN3 to cause trktbl entry */
	/* to be reset to Qeued, and then blow away the job struct       */

	if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_SubJob) {
		pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN3;
		job_purge(pjob);
		return;
	}

	/*
	 * Clear any JOB_SVFLG_Actsuspd flag too, as the job is no longer
	 * suspended (User busy).  A suspended job is rerun in case of a
	 * MOM failure after the workstation becomes active(busy).
	 */
	pjob->ji_qs.ji_svrflags &= ~(JOB_SVFLG_Actsuspd | JOB_SVFLG_StagedIn | JOB_SVFLG_CHKPT);
	job_attr_def[(int)JOB_ATR_exec_host].at_free(
		&pjob->ji_wattr[(int)JOB_ATR_exec_host]);
	job_attr_def[(int)JOB_ATR_exec_host2].at_free(
		&pjob->ji_wattr[(int)JOB_ATR_exec_host2]);
	job_attr_def[(int)JOB_ATR_exec_vnode].at_free(
		&pjob->ji_wattr[(int)JOB_ATR_exec_vnode]);
	job_attr_def[(int)JOB_ATR_pset].at_free(
		&pjob->ji_wattr[(int)JOB_ATR_pset]);
	/* job dir has no meaning for re-queued jobs, so unset it */
	job_attr_def[(int)JOB_ATR_jobdir].at_free(&pjob->
		ji_wattr[(int)JOB_ATR_jobdir]);
	svr_evaljobstate(pjob, &newstate, &newsubstate, 1);
	(void)svr_setjobstate(pjob, newstate, newsubstate);
}
コード例 #3
0
ファイル: req_rerun.c プロジェクト: dkoes/torque
/*
 * requeue_job_without_contacting_mom()
 *
 */
void requeue_job_without_contacting_mom(

  job &pjob)

  {
  if (pjob.ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    rel_resc(&pjob);
    svr_setjobstate(&pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_QUEUED, FALSE);
    pjob.ji_wattr[JOB_ATR_exec_host].at_flags &= ~ATR_VFLAG_SET;

    if (pjob.ji_wattr[JOB_ATR_exec_host].at_val.at_str != NULL)
      {
      free(pjob.ji_wattr[JOB_ATR_exec_host].at_val.at_str);
      pjob.ji_wattr[JOB_ATR_exec_host].at_val.at_str = NULL;
      }
    }
  } /* END requeue_job_without_contacting_mom() */
コード例 #4
0
ファイル: req_rerun.c プロジェクト: dkoes/torque
int finalize_rerunjob(
    
  batch_request *preq,
  job           *pjob,
  int            rc)

  {
  int       Force;
  char      log_buf[LOCAL_LOG_BUF_SIZE];

  if (pjob == NULL)
    return(PBSE_BAD_PARAMETER);

  mutex_mgr pjob_mutex = mutex_mgr(pjob->ji_mutex, true);

  if (preq->rq_extend && !strncasecmp(preq->rq_extend, RERUNFORCE, strlen(RERUNFORCE)))
    Force = 1;
  else
    Force = 0;

  switch (rc)
    {

    case -1:

      /* completed job was requeued */

      /* clear out job completion time if there is one */
      break;

    case 0:

      /* requeue request successful */

      pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

      break;

    case PBSE_SYSTEM: /* This may not be accurate...*/
      rc = PBSE_MEM_MALLOC;
      snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Can not allocate memory");
      req_reject(rc, 0, preq, NULL, log_buf);
      return rc;
      break;

    default:

      if (Force == 0)
        {
        rc = PBSE_MOMREJECT;
        snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Rejected by mom");
        req_reject(rc, 0, preq, NULL, log_buf);
        return rc;
        }
      else
        {
        int           newstate;
        int           newsubst;
        unsigned int  dummy;
        char         *tmp;

        if ((cray_enabled == true) &&
            (pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str != NULL))
          tmp = parse_servername(pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str, &dummy);
        else
          tmp = parse_servername(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, &dummy);

        /* Cannot communicate with MOM, forcibly requeue job.
           This is a relatively disgusting thing to do */

        sprintf(log_buf, "rerun req to %s failed (rc=%d), forcibly requeueing job",
          tmp, rc);

        free(tmp);

        log_event(
          PBSEVENT_ERROR | PBSEVENT_ADMIN | PBSEVENT_JOB,
          PBS_EVENTCLASS_JOB,
          pjob->ji_qs.ji_jobid,
          log_buf);

        log_err(-1, __func__, log_buf);

        strcat(log_buf, ", previous output files may be lost");

        svr_mailowner(pjob, MAIL_OTHER, MAIL_FORCE, log_buf);

        svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_RERUN3, FALSE);

        rel_resc(pjob); /* free resc assigned to job */

        pjob->ji_modified = 1;    /* force full job save */

        pjob->ji_momhandle = -1;
        pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_StagedIn;

        svr_evaljobstate(*pjob, newstate, newsubst, 0);
        svr_setjobstate(pjob, newstate, newsubst, FALSE);
        }

      break;
    }  /* END switch (rc) */

  pjob->ji_qs.ji_svrflags = (pjob->ji_qs.ji_svrflags &
      ~(JOB_SVFLG_CHECKPOINT_FILE |JOB_SVFLG_CHECKPOINT_MIGRATEABLE |
        JOB_SVFLG_CHECKPOINT_COPIED)) | JOB_SVFLG_HASRUN;

  sprintf(log_buf, msg_manager, msg_jobrerun, preq->rq_user, preq->rq_host);
  log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

  reply_ack(preq);

  /* note in accounting file */
  account_record(PBS_ACCT_RERUN, pjob, NULL);

  return rc;
  }  /* END req_rerunjob() */
コード例 #5
0
ファイル: req_stat.c プロジェクト: craigprescott/torque
void stat_update(
    
  struct batch_request *preq,
  struct stat_cntl     *cntl)

  {
  job                  *pjob;
  struct batch_reply   *preply;
  struct brp_status    *pstatus;
  svrattrl             *sattrl;
  int                   oldsid;
  int                   bad = 0;
  time_t                time_now = time(NULL);
  char                 *msg_ptr = NULL;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];

  preply = &preq->rq_reply;

  if (preply->brp_un.brp_txt.brp_str != NULL)
    {
    msg_ptr = strstr(preply->brp_un.brp_txt.brp_str, PBS_MSG_EQUAL);
  
    if (msg_ptr != NULL)
      msg_ptr += strlen(PBS_MSG_EQUAL);
    }

  if (preply->brp_choice == BATCH_REPLY_CHOICE_Status)
    {
    pstatus = (struct brp_status *)GET_NEXT(preply->brp_un.brp_status);

    while (pstatus != NULL)
      {
      if ((pjob = svr_find_job(pstatus->brp_objname, FALSE)) != NULL)
        {
        mutex_mgr job_mutex(pjob->ji_mutex, true);

        sattrl = (svrattrl *)GET_NEXT(pstatus->brp_attr);

        oldsid = pjob->ji_wattr[JOB_ATR_session_id].at_val.at_long;

        modify_job_attr(
          pjob,
          sattrl,
          ATR_DFLAG_MGWR | ATR_DFLAG_SvWR,
          &bad);

        if (oldsid != pjob->ji_wattr[JOB_ATR_session_id].at_val.at_long)
          {
          /* first save since running job (or the sid has changed), */
          /* must save session id    */

          job_save(pjob, SAVEJOB_FULL, 0);
          }

#ifdef USESAVEDRESOURCES
        else
          {
          /* save so we can recover resources used */
          job_save(pjob, SAVEJOB_FULL, 0);
          }
#endif    /* USESAVEDRESOURCES */

        pjob->ji_momstat = time_now;
        }

      pstatus = (struct brp_status *)GET_NEXT(pstatus->brp_stlink);
      }  /* END while (pstatus != NULL) */
    }    /* END if (preply->brp_choice == BATCH_REPLY_CHOICE_Status) */
  else if ((preply->brp_choice == BATCH_REPLY_CHOICE_Text) &&
           (preply->brp_code == PBSE_UNKJOBID) &&
           (msg_ptr != NULL) &&
           (!strcmp(msg_ptr,  preq->rq_ind.rq_status.rq_id)))
    {
    /* we sent a stat request, but mom says it doesn't know anything about
       the job */
    if ((pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE)) != NULL)
      {
      /* job really isn't running any more - mom doesn't know anything about it
         this can happen if a diskless node reboots and the mom_priv/jobs
         directory is cleared, set its state to queued so job_abt doesn't
         think it is still running */
      mutex_mgr job_mutex(pjob->ji_mutex, true);
      
      snprintf(log_buf, sizeof(log_buf),
        "mother superior no longer recognizes %s as a valid job, aborting. Last reported time was %ld",
        preq->rq_ind.rq_status.rq_id, pjob->ji_last_reported_time);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
      
      svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_ABORT, FALSE);
      rel_resc(pjob);
      job_mutex.set_unlock_on_exit(false);
      job_abt(&pjob, "Job does not exist on node");

      /* TODO, if the job is rerunnable we should set its state back to queued */
      }
    }
  else
    {
    snprintf(log_buf, sizeof(log_buf),
      "Poll job request failed for job %s", preq->rq_ind.rq_status.rq_id);
    log_err(preply->brp_code, __func__, log_buf);
    }
  
  cntl->sc_conn = -1;

  if (cntl->sc_post)
    cntl->sc_post(cntl); /* continue where we left off */

  /* If sc_post has a value it is:
   * req_stat_job_step2
   * if so, it expects cntl to be free'd after the call
   */
  free(cntl); /* a bit of a kludge but its saves an extra func */

  return;
  }  /* END stat_update() */
コード例 #6
0
void stat_update(
    
  struct batch_request *preq,
  struct stat_cntl     *cntl)

  {
  job                  *pjob;
  struct batch_reply   *preply;
  struct brp_status    *pstatus;
  svrattrl             *sattrl;
  int                   oldsid;
  int                   bad = 0;
  time_t                time_now = time(NULL);

  preply = &preq->rq_reply;

  if (preply->brp_choice == BATCH_REPLY_CHOICE_Status)
    {
    pstatus = (struct brp_status *)GET_NEXT(preply->brp_un.brp_status);

    while (pstatus != NULL)
      {
      if ((pjob = svr_find_job(pstatus->brp_objname, FALSE)) != NULL)
        {
        sattrl = (svrattrl *)GET_NEXT(pstatus->brp_attr);

        oldsid = pjob->ji_wattr[JOB_ATR_session_id].at_val.at_long;

        modify_job_attr(
          pjob,
          sattrl,
          ATR_DFLAG_MGWR | ATR_DFLAG_SvWR,
          &bad);

        if (oldsid != pjob->ji_wattr[JOB_ATR_session_id].at_val.at_long)
          {
          /* first save since running job (or the sid has changed), */
          /* must save session id    */

          job_save(pjob, SAVEJOB_FULL, 0);
          }

#ifdef USESAVEDRESOURCES
        else
          {
          /* save so we can recover resources used */
          job_save(pjob, SAVEJOB_FULL, 0);
          }
#endif    /* USESAVEDRESOURCES */

        pjob->ji_momstat = time_now;

        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
        }

      pstatus = (struct brp_status *)GET_NEXT(pstatus->brp_stlink);
      }  /* END while (pstatus != NULL) */
    }    /* END if (preply->brp_choice == BATCH_REPLY_CHOICE_Status) */
  else
    {
    if (preply->brp_code == PBSE_UNKJOBID)
      {
      /* we sent a stat request, but mom says it doesn't know anything about
         the job */
      if ((pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE)) != NULL)
        {
        /* job really isn't running any more - mom doesn't know anything about it
           this can happen if a diskless node reboots and the mom_priv/jobs
           directory is cleared, set its state to queued so job_abt doesn't
           think it is still running */
        svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_ABORT, FALSE);
        rel_resc(pjob);
        job_abt(&pjob, "Job does not exist on node");

        /* TODO, if the job is rerunnable we should set its state back to queued */

        }
      }
    }
  cntl->sc_conn = -1;

  /* MUTSU - Unlock job here? */
  if (cntl->sc_post)
    cntl->sc_post(cntl); /* continue where we left off */

  /* If sc_post has a value it is:
   * req_stat_job_step2
   * if so, it expects cntl to be free'd after the call
   */
  free(cntl); /* a bit of a kludge but its saves an extra func */

  return;
  }  /* END stat_update() */
コード例 #7
0
ファイル: req_rerun.c プロジェクト: actorquedeveloper/torque
int req_rerunjob(
   
  struct batch_request *preq)

  {
  int     rc = PBSE_NONE;
  job    *pjob;

  int     Force;
  int     MgrRequired = TRUE;
  char    log_buf[LOCAL_LOG_BUF_SIZE];

  /* check if requestor is admin, job owner, etc */

  if ((pjob = chk_job_request(preq->rq_ind.rq_rerun, preq)) == 0)
    {
    /* FAILURE */

    /* chk_job_request calls req_reject() */

    rc = PBSE_SYSTEM;
    return rc; /* This needs to fixed to return an accurate error */
    }

  /* the job must be running or completed */

  if (pjob->ji_qs.ji_state >= JOB_STATE_EXITING)
    {
    if (pjob->ji_wattr[JOB_ATR_checkpoint_name].at_flags & ATR_VFLAG_SET)
      {
      /* allow end-users to rerun checkpointed jobs */

      MgrRequired = FALSE;
      }
    }
  else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* job is running */

    /* NO-OP */
    }
  else
    {
    /* FAILURE - job is in bad state */
    rc = PBSE_BADSTATE;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s is in a bad state",
        preq->rq_ind.rq_rerun);
    req_reject(rc, 0, preq, NULL, log_buf);
    unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
    return rc;
    }

  if ((MgrRequired == TRUE) &&
      ((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0))
    {
    /* FAILURE */

    rc = PBSE_PERM;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
        "additional permissions required (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)");
    req_reject(rc, 0, preq, NULL, log_buf);
    unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
    return rc;
    }

  /* the job must be rerunnable */

  if (pjob->ji_wattr[JOB_ATR_rerunable].at_val.at_long == 0)
    {
    /* NOTE:  should force override this constraint? maybe (???) */
    /*          no, the user is saying that the job will break, and
                IEEE Std 1003.1 specifically says rerun is to be rejected
                if rerunable==FALSE -garrick */

    rc = PBSE_NORERUN;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s not rerunnable",
        preq->rq_ind.rq_rerun);
    req_reject(rc, 0, preq, NULL, log_buf);
    unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
    return rc;
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* ask MOM to kill off the job if it is running */
    static const char *rerun = "rerun";
    char              *extra = strdup(rerun);

    rc = issue_signal(&pjob, "SIGKILL", post_rerun, extra);
    }
  else
    { 
    if (pjob->ji_wattr[JOB_ATR_hold].at_val.at_long == HOLD_n)
      {
      svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_QUEUED, FALSE);
      }
    else
      {
      svr_setjobstate(pjob, JOB_STATE_HELD, JOB_SUBSTATE_HELD, FALSE);
      }

    /* reset some job attributes */
    
    pjob->ji_wattr[JOB_ATR_comp_time].at_flags &= ~ATR_VFLAG_SET;
    pjob->ji_wattr[JOB_ATR_reported].at_flags &= ~ATR_VFLAG_SET;

    set_statechar(pjob);

    rc = -1;
    }

  if (preq->rq_extend && !strncasecmp(preq->rq_extend, RERUNFORCE, strlen(RERUNFORCE)))
    Force = 1;
  else
    Force = 0;

  switch (rc)
    {

    case - 1:

      /* completed job was requeued */

      /* clear out job completion time if there is one */
      break;

    case 0:

      /* requeue request successful */

      if (pjob != NULL)
        pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

      break;

    case PBSE_SYSTEM: /* This may not be accurate...*/
      rc = PBSE_MEM_MALLOC;
      snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Can not allocate memory");
      req_reject(rc, 0, preq, NULL, log_buf);
      return rc;
      break;

    default:

      if (Force == 0)
        {
        rc = PBSE_MOMREJECT;
        snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Rejected by mom");
        req_reject(rc, 0, preq, NULL, log_buf);
        if (pjob != NULL)
          unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);
        return rc;
        }
      else
        {
        int           newstate;
        int           newsubst;
        unsigned int  dummy;
        char         *tmp;
        long          cray_enabled = FALSE;
       
        if (pjob != NULL)
          {
          get_svr_attr_l(SRV_ATR_CrayEnabled, &cray_enabled);

          if ((cray_enabled == TRUE) &&
              (pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str != NULL))
            tmp = parse_servername(pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str, &dummy);
          else
            tmp = parse_servername(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, &dummy);
          
          /* Cannot communicate with MOM, forcibly requeue job.
             This is a relatively disgusting thing to do */
          
          sprintf(log_buf, "rerun req to %s failed (rc=%d), forcibly requeueing job",
            tmp, rc);

          free(tmp);
  
          log_event(
            PBSEVENT_ERROR | PBSEVENT_ADMIN | PBSEVENT_JOB,
            PBS_EVENTCLASS_JOB,
            pjob->ji_qs.ji_jobid,
            log_buf);
          
          log_err(-1, __func__, log_buf);
          
          strcat(log_buf, ", previous output files may be lost");
  
          svr_mailowner(pjob, MAIL_OTHER, MAIL_FORCE, log_buf);
  
          svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_RERUN3, FALSE);
  
          rel_resc(pjob); /* free resc assigned to job */
          
          if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_HOTSTART) == 0)
            {
            /* in case of server shutdown, don't clear exec_host */
            /* will use it on hotstart when next comes up        */
            
            job_attr_def[JOB_ATR_exec_host].at_free(&pjob->ji_wattr[JOB_ATR_exec_host]);
  
            job_attr_def[JOB_ATR_session_id].at_free(&pjob->ji_wattr[JOB_ATR_session_id]);
            
            job_attr_def[JOB_ATR_exec_gpus].at_free(&pjob->ji_wattr[JOB_ATR_exec_gpus]);          
            }
          
          pjob->ji_modified = 1;    /* force full job save */
          
          pjob->ji_momhandle = -1;
          pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_StagedIn;
          
          svr_evaljobstate(pjob, &newstate, &newsubst, 0);
          svr_setjobstate(pjob, newstate, newsubst, FALSE);
          }
        }

      break;
    }  /* END switch (rc) */

  /* So job has run and is to be rerun (not restarted) */
  if (pjob == NULL)
    {
    rc = PBSE_JOB_RERUN;
    }
  else
    {
    pjob->ji_qs.ji_svrflags = (pjob->ji_qs.ji_svrflags &
        ~(JOB_SVFLG_CHECKPOINT_FILE |JOB_SVFLG_CHECKPOINT_MIGRATEABLE |
          JOB_SVFLG_CHECKPOINT_COPIED)) | JOB_SVFLG_HASRUN;
    
    sprintf(log_buf, msg_manager, msg_jobrerun, preq->rq_user, preq->rq_host);
    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    reply_ack(preq);
  
    /* note in accounting file */
    account_record(PBS_ACCT_RERUN, pjob, NULL);
    unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL);
    }

  return rc;
  }  /* END req_rerunjob() */
コード例 #8
0
ファイル: req_signal.c プロジェクト: adaptivecomputing/torque
int issue_signal(

  job        **pjob_ptr,
  const char  *signame, /* name of the signal to send */
  void       (*func)(struct batch_request *),
  void        *extra, /* extra parameter to be stored in sig request */
  char        *extend) /* Parameter to put in extended part of request */

  {
  int                   rc;
  job                  *pjob = *pjob_ptr;
  struct batch_request *newreq;
  char                  jobid[PBS_MAXSVRJOBID + 1];

  /* build up a Signal Job batch request */

  if ((newreq = alloc_br(PBS_BATCH_SignalJob)) == NULL)
    {
    /* FAILURE */

    return(PBSE_SYSTEM);
    }

  newreq->rq_extra = extra;
  newreq->rq_extend = extend;
  if (extend != NULL)
    {
    newreq->rq_extsz = strlen(extend);
    }

  strcpy(jobid, pjob->ji_qs.ji_jobid);
  strcpy(newreq->rq_ind.rq_signal.rq_jid, pjob->ji_qs.ji_jobid);

  snprintf(newreq->rq_ind.rq_signal.rq_signame, sizeof(newreq->rq_ind.rq_signal.rq_signame), "%s", signame);

  /* The newreq is freed in relay_to_mom (failure)
   * or in issue_Drequest (success) */
  rc = relay_to_mom(&pjob, newreq, NULL);

  if ((rc == PBSE_NONE) &&
      (pjob != NULL))
    {
    strcpy(jobid, pjob->ji_qs.ji_jobid);
    unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);
    func(newreq);

    *pjob_ptr = svr_find_job((char *)jobid, TRUE);
    }
  else if ((extend != NULL) && 
      (!strcmp(extend, RERUNFORCE)))
    {
    if (pjob == NULL)
      {
      *pjob_ptr = svr_find_job((char *)jobid, TRUE);
      pjob = *pjob_ptr;
      }
    /* The job state is normally set when the obit arrives. But since the 
       MOM is not responding we need to set the state here */

    if (pjob != NULL)
      {
      /* Rerunning job, if not checkpointed, clear "resources_used and requeue job */
      if ((pjob->ji_qs.ji_svrflags & (JOB_SVFLG_CHECKPOINT_FILE | JOB_SVFLG_CHECKPOINT_MIGRATEABLE)) == 0)
        {
        job_attr_def[JOB_ATR_resc_used].at_free(&pjob->ji_wattr[JOB_ATR_resc_used]);
        }
      else if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE)
        {
        /* non-migratable checkpoint (cray), leave there */
        /* and just requeue the job         */

        rel_resc(pjob);

        pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN;

        svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_QUEUED, FALSE);

        pjob->ji_momhandle = -1;

        unlock_ji_mutex(pjob, __func__, "8", LOGLEVEL);

        return(PBSE_SYSTEM);
        }

      rel_resc(pjob); /* free resc assigned to job */

      /* Now re-queue the job */
      pjob->ji_modified = 1; /* force full job save */

      pjob->ji_momhandle = -1;
      pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_StagedIn;

      svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_QUEUED, FALSE);
      unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);
      func(newreq);

      rc = PBSE_NONE;
      }
    else
      rc = PBSE_JOBNOTFOUND;
    }
  else
    {
    free_br(newreq);

    if (pjob == NULL)
      *pjob_ptr = NULL;
    }

  return(rc);
  }  /* END issue_signal() */
コード例 #9
0
ファイル: req_stat.c プロジェクト: risyomei-poppin-games/PBS
static void stat_update(

  struct work_task *pwt)

  {

  struct stat_cntl     *cntl;
  job                  *pjob;

  struct batch_request *preq;

  struct batch_reply   *preply;

  struct brp_status    *pstatus;
  svrattrl        *sattrl;
  int    oldsid;

  preq = pwt->wt_parm1;
  preply = &preq->rq_reply;
  cntl = preq->rq_extra;

  if (preply->brp_choice == BATCH_REPLY_CHOICE_Status)
    {
    pstatus = (struct brp_status *)GET_NEXT(preply->brp_un.brp_status);

    while (pstatus != NULL)
      {
      if ((pjob = find_job(pstatus->brp_objname)))
        {
        sattrl = (svrattrl *)GET_NEXT(pstatus->brp_attr);

        oldsid = pjob->ji_wattr[(int)JOB_ATR_session_id].at_val.at_long;

        modify_job_attr(
          pjob,
          sattrl,
          ATR_DFLAG_MGWR | ATR_DFLAG_SvWR,
          &bad);

        if (oldsid != pjob->ji_wattr[(int)JOB_ATR_session_id].at_val.at_long)
          {
          /* first save since running job (or the sid has changed), */
          /* must save session id    */

          job_save(pjob, SAVEJOB_FULL);

          svr_mailowner(pjob, MAIL_BEGIN, MAIL_NORMAL, NULL);
          }

#ifdef USESAVEDRESOURCES
        else
          {
          /* save so we can recover resources used */
          job_save(pjob, SAVEJOB_FULL);
          }
#endif    /* USESAVEDRESOURCES */


        pjob->ji_momstat = time_now;
        }

      pstatus = (struct brp_status *)GET_NEXT(pstatus->brp_stlink);
      }  /* END while (pstatus != NULL) */
    }    /* END if (preply->brp_choice == BATCH_REPLY_CHOICE_Status) */
  else
    {
    if (preply->brp_code == PBSE_UNKJOBID)
      {
      /* we sent a stat request, but mom says it doesn't know anything about
         the job */
      if ((pjob = find_job(preq->rq_ind.rq_status.rq_id)))
        {
        /* job really isn't running any more - mom doesn't know anything about it
           this can happen if a diskless node reboots and the mom_priv/jobs
           directory is cleared, set its state to queued so job_abt doesn't
           think it is still running */
        svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_ABORT);
        rel_resc(pjob);
        job_abt(&pjob, "Job does not exist on node");

        /* TODO, if the job is rerunnable we should set its state back to queued */

        }
      }
    }

  release_req(pwt);

  cntl->sc_conn = -1;

  if (cntl->sc_post)
    cntl->sc_post(cntl); /* continue where we left off */
  else
    free(cntl); /* a bit of a kludge but its saves an extra func */

  return;
  }  /* END stat_update() */