Exemple #1
0
/*
 * post_rerun - handle the reply to the rerun signal request
 *
 * Nothing is done when preq is NULL or when the reply code is 0. When the
 * reply carries a non-zero code, the rejection is logged and, if the job
 * named in the request can still be found, its state is re-evaluated and
 * stored.
 *
 * @param preq - (I) the signal request together with its reply
 */
void post_rerun(

  batch_request *preq)

  {
  int   newstate;
  int   newsub;
  job  *pjob;

  char  log_buf[LOCAL_LOG_BUF_SIZE];

  if (preq == NULL)
    return;

  if (preq->rq_reply.brp_code != 0)
    {
    /* bounded write: the job id is copied from the request */
    snprintf(log_buf, sizeof(log_buf), "rerun signal reject by mom: %s - %d",
      preq->rq_ind.rq_signal.rq_jid,
      preq->rq_reply.brp_code);
    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,__func__,log_buf);

    if ((pjob = svr_find_job(preq->rq_ind.rq_signal.rq_jid, FALSE)))
      {
      /* NOTE(review): presumably svr_find_job returns the job locked and
       * the manager releases ji_mutex when it leaves scope - confirm */
      mutex_mgr job_mutex(pjob->ji_mutex, true);
      
      /* work out the state the job should now be in, then apply it */
      svr_evaljobstate(*pjob, newstate, newsub, 1);
      svr_setjobstate(pjob, newstate, newsub, FALSE);
      }
    }

  return;
  }  /* END post_rerun() */
/*
 * release_job - releases the hold on job j
 *
 * The holds to release are decoded from the request, checked against the
 * requester's permissions and then removed from the job's hold attribute.
 * If the job's hold value actually changed, the job is marked modified and
 * its state is re-evaluated and stored. The release is logged against the
 * job id whether or not the hold value changed.
 *
 * @param preq - the hold request (hold list, permissions, user and host)
 * @param j - the job to modify
 * @return 0 if successful, a PBS error on failure
 */
int release_job(

  struct batch_request *preq, /* I */
  void                 *j)    /* I/O */

  {
  long           old_hold;
  int            rc = 0;
  int            newstate;
  int            newsub;
  char          *pset;
  job           *pjob = (job *)j;
  char           log_buf[LOCAL_LOG_BUF_SIZE];

  pbs_attribute  temphold;

  /* cannot do anything until we decode the holds to be set */

  if ((rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &pset, &temphold)) != 0)
    {
    return(rc);
    }

  /* if other than HOLD_u is being released, must have privil */

  if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0)
    {
    return(rc);
    }

  /* unset the hold */

  old_hold = pjob->ji_wattr[JOB_ATR_hold].at_val.at_long;

  if ((rc = job_attr_def[JOB_ATR_hold].at_set(&pjob->ji_wattr[JOB_ATR_hold], &temphold, DECR)))
    {
    return(rc);
    }

  /* everything went well, if holds changed, update the job state */

  if (old_hold != pjob->ji_wattr[JOB_ATR_hold].at_val.at_long)
    {
    pjob->ji_modified = 1; /* indicates attributes changed */

    svr_evaljobstate(pjob, &newstate, &newsub, 0);

    svr_setjobstate(pjob, newstate, newsub, FALSE); /* saves job */
    }

  /* bounded write: hold string, user and host all come from the request */
  snprintf(log_buf, sizeof(log_buf), msg_jobholdrel,
    pset,
    preq->rq_user,
    preq->rq_host);

  log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

  return(rc);
  } /* END release_job() */
Exemple #3
0
/*
 * hold_job - add the holds in temphold to job j
 *
 * The hold bits are OR-ed into the job's hold attribute and the attribute
 * is flagged as set. For a running job whose checkpoint attribute contains
 * "s", "c" or "enabled" nothing further is done yet (see TODO below).
 * Otherwise, if the hold value changed, the job is marked modified and its
 * state is re-evaluated and stored.
 *
 * @param temphold - the holds to add; a NULL pointer makes this a no-op
 * @param j - the job to modify; a NULL pointer makes this a no-op
 */
void hold_job(

  attribute *temphold, /* I */
  void      *j)        /* I */

  {
  long *hold_val;
  long old_hold;

  int newstate;
  int newsub;

  attribute *pattr;
  job *pjob = (job *)j;

  /* temphold is dereferenced below, so guard it the same way as the job */
  if ((pjob == NULL) || (temphold == NULL))
    return;

  hold_val = &pjob->ji_wattr[JOB_ATR_hold].at_val.at_long;
  old_hold = *hold_val;
  *hold_val |= temphold->at_val.at_long;
  pjob->ji_wattr[JOB_ATR_hold].at_flags |= ATR_VFLAG_SET;
  
  pattr = &pjob->ji_wattr[JOB_ATR_checkpoint];
  
  if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
      ((pattr->at_flags & ATR_VFLAG_SET) &&
       ((csv_find_string(pattr->at_val.at_str, "s") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "c") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "enabled") != NULL))))
    {
    /* TODO */
    /* preq_tmp = alloc_br(preq->rq_type); */

    /* NOTE(review): in this branch the hold bits are already stored but
     * neither ji_modified nor the job state is updated - confirm intent */
    
    }
  else if (old_hold != *hold_val)
    {
    /* indicate attributes changed  */
    
    pjob->ji_modified = 1;

    svr_evaljobstate(pjob, &newstate, &newsub, 0);

    svr_setjobstate(pjob, newstate, newsub);
    }

  }
Exemple #4
0
/**
 * @brief
 * 		force_reque - requeue (rerun) a job
 *
 * @par
 * 		Resets the MOM handle and protocol, releases the job's resources,
 * 		writes a rerun accounting record and then either purges the job
 * 		(sub jobs) or frees its execution attributes and re-evaluates
 * 		its state.
 *
 * @param[in,out]	pjob	-	job which needs to be rerun
 */
void
force_reque(job *pjob)
{
	/* attributes freed on requeue, in the order they are released; */
	/* the job dir has no meaning for re-queued jobs, so it goes too */
	const int released_attrs[] = {
		(int)JOB_ATR_exec_host,
		(int)JOB_ATR_exec_host2,
		(int)JOB_ATR_exec_vnode,
		(int)JOB_ATR_pset,
		(int)JOB_ATR_jobdir
	};
	const int released_count =
		(int)(sizeof(released_attrs) / sizeof(released_attrs[0]));
	int  next_state;
	int  next_substate;
	int  k;

	pjob->ji_modified = 1;
	pjob->ji_momhandle = -1;
	pjob->ji_mom_prot = PROT_INVALID;

	/* simulate rerun: give back the resources held by the job */
	rel_resc(pjob);

	/* record the rerun in the accounting file */
	account_jobend(pjob, pjob->ji_acctrec, PBS_ACCT_RERUN);

	/*
	 * A subjob is put in substate RERUN3 so that its trktbl entry is
	 * reset to Queued, and then the job struct itself is discarded.
	 */
	if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_SubJob) != 0) {
		pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN3;
		job_purge(pjob);
		return;
	}

	/*
	 * Drop JOB_SVFLG_Actsuspd along with the staged-in and checkpoint
	 * flags: the job is no longer suspended (User busy).  A suspended
	 * job is rerun in case of a MOM failure after the workstation
	 * becomes active(busy).
	 */
	pjob->ji_qs.ji_svrflags &=
		~(JOB_SVFLG_Actsuspd | JOB_SVFLG_StagedIn | JOB_SVFLG_CHKPT);

	for (k = 0; k < released_count; k++) {
		const int which = released_attrs[k];

		job_attr_def[which].at_free(&pjob->ji_wattr[which]);
	}

	/* work out the state the job belongs in now and apply it */
	svr_evaljobstate(pjob, &next_state, &next_substate, 1);
	(void)svr_setjobstate(pjob, next_state, next_substate);
}
Exemple #5
0
END_TEST

/*
 * svr_evaljobstate_test - exercises svr_evaljobstate() with NULL arguments
 * and with several zeroed jobs that each have one field changed.
 */
START_TEST(svr_evaljobstate_test)
  {
  struct job test_job;
  int state = 0;
  int substate = 0;

  memset(&test_job, 0, sizeof(test_job));

  /* NULL arguments: nothing is asserted, the calls only have to return */
  svr_evaljobstate(NULL, &state, &substate, 0);
  svr_evaljobstate(&test_job, NULL, &substate, 0);
  svr_evaljobstate(&test_job, &state, NULL, 0);

  /* case 1: running job, not forced */
  test_job.ji_qs.ji_state = JOB_STATE_RUNNING;
  svr_evaljobstate(&test_job, &state, &substate, 0);
  fail_unless(test_job.ji_qs.ji_state == state, "svr_evaljobstate state fail case 1");
  fail_unless(test_job.ji_qs.ji_substate == substate, "svr_evaljobstate substate fail case 1");
  memset(&test_job, 0, sizeof(test_job));

  /* case 2: hold value present, not forced */
  test_job.ji_wattr[JOB_ATR_hold].at_val.at_long = 1;
  svr_evaljobstate(&test_job, &state, &substate, 0);
  fail_unless(test_job.ji_qs.ji_state == state, "svr_evaljobstate state fail case 2");
  fail_unless(test_job.ji_qs.ji_substate == substate, "svr_evaljobstate substate fail case 2");
  memset(&test_job, 0, sizeof(test_job));

  /* case 3: stagein attribute flagged, not forced */
  test_job.ji_wattr[JOB_ATR_stagein].at_flags = 1;
  svr_evaljobstate(&test_job, &state, &substate, 0);
  fail_unless(test_job.ji_qs.ji_state == state, "svr_evaljobstate state fail case 3");
  fail_unless(test_job.ji_qs.ji_substate == substate, "svr_evaljobstate substate fail case 3");
  memset(&test_job, 0, sizeof(test_job));

  /* case 4: fully zeroed job, not forced */
  svr_evaljobstate(&test_job, &state, &substate, 0);
  fail_unless(test_job.ji_qs.ji_state == state, "svr_evaljobstate state fail case 4");
  fail_unless(test_job.ji_qs.ji_substate == substate, "svr_evaljobstate substate fail case 4");
  memset(&test_job, 0, sizeof(test_job));

  /* case 5: fully zeroed job, forced evaluation must report queued */
  svr_evaljobstate(&test_job, &state, &substate, 1);
  fail_unless(JOB_STATE_QUEUED == state, "svr_evaljobstate state fail case 5");
  fail_unless(JOB_SUBSTATE_QUEUED == substate, "svr_evaljobstate substate fail case 5");

  }
Exemple #6
0
int req_rerunjob(
   
  struct batch_request *preq)

  {
  int     rc = PBSE_NONE;
  job    *pjob;

  int     Force;
  int     MgrRequired = TRUE;
  char    log_buf[LOCAL_LOG_BUF_SIZE];

  /* check if requestor is admin, job owner, etc */

  if ((pjob = chk_job_request(preq->rq_ind.rq_rerun, preq)) == 0)
    {
    /* FAILURE */

    /* chk_job_request calls req_reject() */

    rc = PBSE_SYSTEM;
    return rc; /* This needs to fixed to return an accurate error */
    }

  /* the job must be running or completed */

  if (pjob->ji_qs.ji_state >= JOB_STATE_EXITING)
    {
    if (pjob->ji_wattr[JOB_ATR_checkpoint_name].at_flags & ATR_VFLAG_SET)
      {
      /* allow end-users to rerun checkpointed jobs */

      MgrRequired = FALSE;
      }
    }
  else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* job is running */

    /* NO-OP */
    }
  else
    {
    /* FAILURE - job is in bad state */
    rc = PBSE_BADSTATE;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s is in a bad state",
        preq->rq_ind.rq_rerun);
    req_reject(rc, 0, preq, NULL, log_buf);
    unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
    return rc;
    }

  if ((MgrRequired == TRUE) &&
      ((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0))
    {
    /* FAILURE */

    rc = PBSE_PERM;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
        "additional permissions required (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)");
    req_reject(rc, 0, preq, NULL, log_buf);
    unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
    return rc;
    }

  /* the job must be rerunnable */

  if (pjob->ji_wattr[JOB_ATR_rerunable].at_val.at_long == 0)
    {
    /* NOTE:  should force override this constraint? maybe (???) */
    /*          no, the user is saying that the job will break, and
                IEEE Std 1003.1 specifically says rerun is to be rejected
                if rerunable==FALSE -garrick */

    rc = PBSE_NORERUN;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s not rerunnable",
        preq->rq_ind.rq_rerun);
    req_reject(rc, 0, preq, NULL, log_buf);
    unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
    return rc;
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* ask MOM to kill off the job if it is running */
    static const char *rerun = "rerun";
    char              *extra = strdup(rerun);

    rc = issue_signal(&pjob, "SIGKILL", post_rerun, extra);
    }
  else
    { 
    if (pjob->ji_wattr[JOB_ATR_hold].at_val.at_long == HOLD_n)
      {
      svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_QUEUED, FALSE);
      }
    else
      {
      svr_setjobstate(pjob, JOB_STATE_HELD, JOB_SUBSTATE_HELD, FALSE);
      }

    /* reset some job attributes */
    
    pjob->ji_wattr[JOB_ATR_comp_time].at_flags &= ~ATR_VFLAG_SET;
    pjob->ji_wattr[JOB_ATR_reported].at_flags &= ~ATR_VFLAG_SET;

    set_statechar(pjob);

    rc = -1;
    }

  if (preq->rq_extend && !strncasecmp(preq->rq_extend, RERUNFORCE, strlen(RERUNFORCE)))
    Force = 1;
  else
    Force = 0;

  switch (rc)
    {

    case - 1:

      /* completed job was requeued */

      /* clear out job completion time if there is one */
      break;

    case 0:

      /* requeue request successful */

      if (pjob != NULL)
        pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

      break;

    case PBSE_SYSTEM: /* This may not be accurate...*/
      rc = PBSE_MEM_MALLOC;
      snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Can not allocate memory");
      req_reject(rc, 0, preq, NULL, log_buf);
      return rc;
      break;

    default:

      if (Force == 0)
        {
        rc = PBSE_MOMREJECT;
        snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Rejected by mom");
        req_reject(rc, 0, preq, NULL, log_buf);
        if (pjob != NULL)
          unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);
        return rc;
        }
      else
        {
        int           newstate;
        int           newsubst;
        unsigned int  dummy;
        char         *tmp;
        long          cray_enabled = FALSE;
       
        if (pjob != NULL)
          {
          get_svr_attr_l(SRV_ATR_CrayEnabled, &cray_enabled);

          if ((cray_enabled == TRUE) &&
              (pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str != NULL))
            tmp = parse_servername(pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str, &dummy);
          else
            tmp = parse_servername(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, &dummy);
          
          /* Cannot communicate with MOM, forcibly requeue job.
             This is a relatively disgusting thing to do */
          
          sprintf(log_buf, "rerun req to %s failed (rc=%d), forcibly requeueing job",
            tmp, rc);

          free(tmp);
  
          log_event(
            PBSEVENT_ERROR | PBSEVENT_ADMIN | PBSEVENT_JOB,
            PBS_EVENTCLASS_JOB,
            pjob->ji_qs.ji_jobid,
            log_buf);
          
          log_err(-1, __func__, log_buf);
          
          strcat(log_buf, ", previous output files may be lost");
  
          svr_mailowner(pjob, MAIL_OTHER, MAIL_FORCE, log_buf);
  
          svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_RERUN3, FALSE);
  
          rel_resc(pjob); /* free resc assigned to job */
          
          if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_HOTSTART) == 0)
            {
            /* in case of server shutdown, don't clear exec_host */
            /* will use it on hotstart when next comes up        */
            
            job_attr_def[JOB_ATR_exec_host].at_free(&pjob->ji_wattr[JOB_ATR_exec_host]);
  
            job_attr_def[JOB_ATR_session_id].at_free(&pjob->ji_wattr[JOB_ATR_session_id]);
            
            job_attr_def[JOB_ATR_exec_gpus].at_free(&pjob->ji_wattr[JOB_ATR_exec_gpus]);          
            }
          
          pjob->ji_modified = 1;    /* force full job save */
          
          pjob->ji_momhandle = -1;
          pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_StagedIn;
          
          svr_evaljobstate(pjob, &newstate, &newsubst, 0);
          svr_setjobstate(pjob, newstate, newsubst, FALSE);
          }
        }

      break;
    }  /* END switch (rc) */

  /* So job has run and is to be rerun (not restarted) */
  if (pjob == NULL)
    {
    rc = PBSE_JOB_RERUN;
    }
  else
    {
    pjob->ji_qs.ji_svrflags = (pjob->ji_qs.ji_svrflags &
        ~(JOB_SVFLG_CHECKPOINT_FILE |JOB_SVFLG_CHECKPOINT_MIGRATEABLE |
          JOB_SVFLG_CHECKPOINT_COPIED)) | JOB_SVFLG_HASRUN;
    
    sprintf(log_buf, msg_manager, msg_jobrerun, preq->rq_user, preq->rq_host);
    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    reply_ack(preq);
  
    /* note in accounting file */
    account_record(PBS_ACCT_RERUN, pjob, NULL);
    unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL);
    }

  return rc;
  }  /* END req_rerunjob() */
Exemple #7
0
void req_deletejob(

  struct batch_request *preq)  /* I */

  {
  job              *pjob;

  struct work_task *pwtold;

  struct work_task *pwtnew;
  struct work_task *pwtcheck;

  int               rc;
  char             *sigt = "SIGTERM";

  char             *Msg = NULL;

  /* check if we are getting a purgecomplete from scheduler */
  if ((preq->rq_extend != NULL) && 
        !strncmp(preq->rq_extend,PURGECOMP,strlen(PURGECOMP)))
    {

    /*
     * purge_completed_jobs will respond with either an ack or reject
     */
    purge_completed_jobs(preq);

    return;
    }

  /* The way this is implemented, if the user enters the command "qdel -p <jobid>",
   * they can then delete jobs other than their own since the authorization
   * checks are made below in chk_job_request. This should probably be fixed.
   */

  if (forced_jobpurge(preq) != 0)
    {
    return;
    }

  /* NOTE:  should support rq_objname={<JOBID>|ALL|<name:<JOBNAME>} */

  /* NYI */

  pjob = chk_job_request(preq->rq_ind.rq_delete.rq_objname, preq);

  if (pjob == NULL)
    {
    /* NOTE:  chk_job_request() will issue req_reject() */

    return;
    }

  if (preq->rq_extend != NULL)
    {
    if (strncmp(preq->rq_extend, deldelaystr, strlen(deldelaystr)) &&
        strncmp(preq->rq_extend, delasyncstr, strlen(delasyncstr)) &&
        strncmp(preq->rq_extend, delpurgestr, strlen(delpurgestr)))
      {
      /* have text message in request extension, add it */

      Msg = preq->rq_extend;

      /*
       * Message capability is only for operators and managers.
       * Check if request is authorized
      */

      if ((preq->rq_perm & (ATR_DFLAG_OPRD | ATR_DFLAG_OPWR |
                            ATR_DFLAG_MGRD | ATR_DFLAG_MGWR)) == 0)
        {
        req_reject(PBSE_PERM, 0, preq, NULL,
                   "must have operator or manager privilege to use -m parameter");
        return;
        }
      }
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /*
     * Find pid of router from existing work task entry,
     * then establish another work task on same child.
     * Next, signal the router and wait for its completion;
     */

    pwtold = (struct work_task *)GET_NEXT(pjob->ji_svrtask);

    while (pwtold != NULL)
      {
      if ((pwtold->wt_type == WORK_Deferred_Child) ||
          (pwtold->wt_type == WORK_Deferred_Cmp))
        {
        pwtnew = set_task(
                   pwtold->wt_type,
                   pwtold->wt_event,
                   post_delete_route,
                   preq);

        if (pwtnew != NULL)
          {
          /*
           * reset type in case the SIGCHLD came
           * in during the set_task;  it makes
           * sure that next_task() will find the
           * new entry.
           */

          pwtnew->wt_type = pwtold->wt_type;
          pwtnew->wt_aux = pwtold->wt_aux;

          kill((pid_t)pwtold->wt_event, SIGTERM);

          pjob->ji_qs.ji_substate = JOB_SUBSTATE_ABORT;

          return; /* all done for now */
          }
        else
          {
          req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

          return;
          }
        }

      pwtold = (struct work_task *)GET_NEXT(pwtold->wt_linkobj);
      }

    /* should never get here ...  */

    log_err(-1, "req_delete", "Did not find work task for router");

    req_reject(PBSE_INTERNAL, 0, preq, NULL, NULL);

    return;
    }

  if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN1 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN2 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN3 )
    {
    /* If JOB_SUBSTATE_PRERUN being sent to MOM, wait till she gets it going */
    /* retry in one second                            */
    /* If JOB_SUBSTATE_RERUN, RERUN1, RERUN2 or RERUN3 the
       job is being requeued. Wait until finished */

    static time_t  cycle_check_when = 0;
    static char    cycle_check_jid[PBS_MAXSVRJOBID + 1];

    if (cycle_check_when != 0)
      {
      if (!strcmp(pjob->ji_qs.ji_jobid, cycle_check_jid) &&
          (time_now - cycle_check_when > 10))
        {
        /* state not updated after 10 seconds */

        /* did the mom ever get it? delete it anyways... */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;

        goto jump;
        }

      if (time_now - cycle_check_when > 20)
        {
        /* give up after 20 seconds */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;
        }
      }    /* END if (cycle_check_when != 0) */

    if (cycle_check_when == 0)
      {
      /* new PRERUN job located */

      cycle_check_when = time_now;
      strcpy(cycle_check_jid, pjob->ji_qs.ji_jobid);
      }

    sprintf(log_buffer, "job cannot be deleted, state=PRERUN, requeuing delete request");

    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);

    pwtnew = set_task(
               WORK_Timed,
               time_now + 1,
               post_delete_route,
               preq);

    if (pwtnew == 0)
      req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

    return;
    }  /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */

jump:

  /*
   * Log delete and if requesting client is not job owner, send mail.
   */

  sprintf(log_buffer, "requestor=%s@%s",
          preq->rq_user,
          preq->rq_host);


  /* NOTE:  should annotate accounting record with extend message (NYI) */

  account_record(PBS_ACCT_DEL, pjob, log_buffer);

  sprintf(log_buffer, msg_manager,
          msg_deletejob,
          preq->rq_user,
          preq->rq_host);

  log_event(
    PBSEVENT_JOB,
    PBS_EVENTCLASS_JOB,
    pjob->ji_qs.ji_jobid,
    log_buffer);

  /* NOTE:  should incorporate job delete message */

  if (Msg != NULL)
    {
    /* have text message in request extension, add it */

    strcat(log_buffer, "\n");
    strcat(log_buffer, Msg);
    }

  if ((svr_chk_owner(preq, pjob) != 0) &&
      !has_job_delete_nanny(pjob))
    {
    /* only send email if owner did not delete job and job deleted
       has not been previously attempted */

    svr_mailowner(pjob, MAIL_DEL, MAIL_FORCE, log_buffer);
    /*
     * If we sent mail and already sent the extra message
     * then reset message so we don't trigger a redundant email
     * in job_abt()
    */

    if (Msg != NULL)
      {
      Msg = NULL;
      }
    }

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, change restart comment if failed */

    change_restart_comment_if_needed(pjob);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * setup a nanny task to make sure the job is actually deleted (see the
     * comments at job_delete_nanny()).
     */

    if (has_job_delete_nanny(pjob))
      {
      req_reject(PBSE_IVALREQ, 0, preq, NULL, "job cancel in progress");

      return;
      }

    apply_job_delete_nanny(pjob, time_now + 60);

    /* check if we are getting a asynchronous delete */

    if ((preq->rq_extend != NULL) &&
          !strncmp(preq->rq_extend,DELASYNC,strlen(DELASYNC)))
      {
      struct batch_request *preq_tmp = NULL;
      /*
       * Respond with an ack now instead of after MOM processing
       * Create a new batch request and fill it in. It will be freed by reply_ack
       */

      snprintf(log_buffer,sizeof(log_buffer), "Deleting job asynchronously");
      log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buffer);

      preq_tmp = alloc_br(PBS_BATCH_DeleteJob);
      preq_tmp->rq_perm = preq->rq_perm;
      preq_tmp->rq_ind.rq_manager.rq_cmd = preq->rq_ind.rq_manager.rq_cmd;
      preq_tmp->rq_ind.rq_manager.rq_objtype = preq->rq_ind.rq_manager.rq_objtype;
      preq_tmp->rq_fromsvr = preq->rq_fromsvr;
      preq_tmp->rq_extsz = preq->rq_extsz;
      preq_tmp->rq_conn = preq->rq_conn;
      memcpy(preq_tmp->rq_ind.rq_manager.rq_objname,
          preq->rq_ind.rq_manager.rq_objname, PBS_MAXSVRJOBID + 1);
      memcpy(preq_tmp->rq_user, preq->rq_user, PBS_MAXUSER + 1);
      memcpy(preq_tmp->rq_host, preq->rq_host, PBS_MAXHOSTNAME + 1);

      reply_ack(preq_tmp);
      preq->rq_noreply = TRUE; /* set for no more replies */
      }
  
    /* make a cleanup task if set */
    if ((server.sv_attr[SRV_ATR_JobForceCancelTime].at_flags & ATR_VFLAG_SET) &&
        (server.sv_attr[SRV_ATR_JobForceCancelTime].at_val.at_long > 0))
      {
      pwtcheck = set_task(
        WORK_Timed,
        time_now + server.sv_attr[SRV_ATR_JobForceCancelTime].at_val.at_long,
        ensure_deleted,
        preq);
    
      if (pwtcheck != NULL)
        append_link(&pjob->ji_svrtask, &pwtcheck->wt_linkobj, pwtcheck);
      }

    /*
     * Send signal request to MOM.  The server will automagically
     * pick up and "finish" off the client request when MOM replies.
     */

    if ((rc = issue_signal(pjob, sigt, post_delete_mom1, preq)))
      {
      /* cant send to MOM */

      req_reject(rc, 0, preq, NULL, NULL);
      }

    /* normally will ack reply when mom responds */

    sprintf(log_buffer, msg_delrunjobsig,
            sigt);

    LOG_EVENT(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);

    return;
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* make a cleanup task if set */
  if ((server.sv_attr[SRV_ATR_JobForceCancelTime].at_flags & ATR_VFLAG_SET) &&
      (server.sv_attr[SRV_ATR_JobForceCancelTime].at_val.at_long > 0))
    {
    pwtcheck = set_task(
        WORK_Timed,
        time_now + server.sv_attr[SRV_ATR_JobForceCancelTime].at_val.at_long,
        ensure_deleted,
        preq);
    
    if (pwtcheck != NULL)
      append_link(&pjob->ji_svrtask, &pwtcheck->wt_linkobj, pwtcheck);
    }

  /* if configured, and this job didn't have a slot limit hold, free a job
   * held with the slot limit hold */
  if ((server.sv_attr[SRV_ATR_MoabArrayCompatible].at_val.at_long != FALSE) &&
      ((pjob->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) == FALSE))
    {
    if ((pjob->ji_arraystruct != NULL) &&
        (pjob->ji_is_array_template == FALSE))
      {
      int        i;
      int        newstate;
      int        newsub;
      job       *tmp;
      job_array *pa = pjob->ji_arraystruct;

      for (i = 0; i < pa->ai_qs.array_size; i++)
        {
        if (pa->jobs[i] == NULL)
          continue;

        tmp = (job *)pa->jobs[i];

        if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l)
          {
          tmp->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;
              
          if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
            {
            tmp->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
            }
          
          svr_evaljobstate(tmp, &newstate, &newsub, 1);
          svr_setjobstate(tmp, newstate, newsub);
          job_save(tmp, SAVEJOB_FULL, 0);

          break;
          }
        }
      }
    } /* END MoabArrayCompatible check */

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, do end job processing */
    
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING);

    pjob->ji_momhandle = -1;

    /* force new connection */

    pwtnew = set_task(WORK_Immed, 0, on_job_exit, (void *)pjob);

    if (pwtnew)
      {
      append_link(&pjob->ji_svrtask, &pwtnew->wt_linkobj, pwtnew);
      }
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in file, should remove them */

    remove_stagein(pjob);

    job_abt(&pjob, Msg);
    }
  else
    {
    /*
     * the job is not transitting (though it may have been) and
     * is not running, so put in into a complete state.
     */

    struct work_task *ptask;
    struct pbs_queue *pque;
    int  KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE);

    if ((pque = pjob->ji_qhdr) && (pque != NULL))
      {
      pque->qu_numcompleted++;
      }

    KeepSeconds = attr_ifelse_long(
                    &pque->qu_attr[QE_ATR_KeepCompleted],
                    &server.sv_attr[SRV_ATR_KeepCompleted],
                    0);
    ptask = set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit, pjob);

    if (ptask != NULL)
      {
      append_link(&pjob->ji_svrtask, &ptask->wt_linkobj, ptask);
      }
    }  /* END else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0) */

  reply_ack(preq);

  return;
  }  /* END req_deletejob() */
Exemple #8
0
/**
 *
 * @brief
 * 		Send a job over the network to some other server or MOM.
 * @par
 * 		Under Linux/Unix, this starts a child process to do the work.
 *		Connect to the destination host and port,
 * 		and go through the protocol to transfer the job.
 * 		Signals are blocked.
 *
 * @param[in]	jobp	-	pointer to the job being sent.
 * @param[in]	hostaddr	-	the address of host to send job to, host byte order.
 * @param[in]	port	-	the destination port, host byte order
 * @param[in]	move_type	-	the type of move (e.g. MOVE_TYPE_exec)
 * @param[in]	post_func	-	the function to execute once the child process
 *								sending job completes (Linux/Unix only)
 * @param[in]	data	-	input data to 'post_func'
 *
 * @return	int
 * @retval	2	parent	: success (child forked)
 * @retval	-1	parent	: on failure (pbs_errno set to error number)
 * @retval	SEND_JOB_OK	child	: 0 success, job sent
 * @retval	SEND_JOB_FATAL	child	: 1 permenent failure or rejection,
 * @retval	SEND_JOB_RETRY	child	: 2 failed but try again
 * @retval	SEND_JOB_NODEDW child	: 3 execution node down, retry different node
 */
int
send_job(job *jobp, pbs_net_t hostaddr, int port, int move_type,
	void (*post_func)(struct work_task *), struct batch_request *preq)
{

#ifdef WIN32
	char	cmdline[80];
	pio_handles	pio;
	char	buf[4096];
	struct work_task *ptask;
	int	newstate;
	int	newsub;
	long	tempval;
	char	script_name[MAXPATHLEN+1];
	int 		gridproxy_cred = 0;

#ifdef  PBS_CRED_GRIDPROXY
	if (jobp->ji_extended.ji_ext.ji_credtype == PBS_CREDTYPE_GRIDPROXY)
		gridproxy_cred = 1;
#endif

	if (pbs_conf.pbs_use_tcp == 1 && move_type == MOVE_TYPE_Exec && gridproxy_cred == 0) {
		return (send_job_exec(jobp, hostaddr, port, preq));
	}

	sprintf(cmdline, "%s/sbin/pbs_send_job", pbs_conf.pbs_exec_path);

	if (win_popen(cmdline, "w", &pio, NULL) == 0) {
		errno = GetLastError();
		pbs_errno = errno;
		(void)sprintf(log_buffer, "executing %s for job %s failed errno=%d", cmdline, jobp->ji_qs.ji_jobid, errno);
		log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_ERR,
			jobp->ji_qs.ji_jobid, log_buffer);
		/* force re-eval of job state out of Transit */
		svr_evaljobstate(jobp, &newstate, &newsub, 1);
		svr_setjobstate(jobp, newstate, newsub);

		win_pclose(&pio);
		return (-1);
	}

	ptask = set_task(WORK_Deferred_Child, (long)pio.pi.hProcess, post_func, preq);
	if (!ptask) {
		log_err(errno, __func__, msg_err_malloc);
		errno = ENOMEM;
		pbs_errno = errno;
		win_pclose(&pio);
		/* force re-eval of job state out of Transit */
		svr_evaljobstate(jobp, &newstate, &newsub, 1);
		svr_setjobstate(jobp, newstate, newsub);
		return (-1);
	} else {
		ptask->wt_parm2 = jobp;
		append_link(&((job *)jobp)->ji_svrtask, &ptask->wt_linkobj, ptask);
	}

	script_name[0] = '\0';
	/* if job has a script read it from database */
	if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT) {
		/*
		 * copy the job script from database to a temp file
		 * PBSD_jscript works with a file
		 * delete it at the end of the send
		 */
		if (svr_create_tmp_jobscript(jobp, &script_name) != 0) {
			pbs_errno = PBSE_SYSTEM;
			snprintf(log_buffer, sizeof(log_buffer),
				"Failed to create temporary job script for job %s",
				jobp->ji_qs.ji_jobid);
			log_err(pbs_errno, "send_job", log_buffer);
			win_pclose2(&pio);
			return (-1);
		}
	}

	addpid(pio.pi.hProcess);

	/* our job is to calc eligible time accurately and save it */
	/* on new server, accrue type should be calc afresh */
	/* Note: if job is being sent for execution on mom, then don't calc eligible time */

	if ((jobp->ji_wattr[(int)JOB_ATR_accrue_type].at_val.at_long == JOB_ELIGIBLE) &&
		(server.sv_attr[(int)SRV_ATR_EligibleTimeEnable].at_val.at_long == 1) &&
		(move_type != MOVE_TYPE_Exec)) {
		tempval = ((long)time_now - jobp->ji_wattr[(int)JOB_ATR_sample_starttime].at_val.at_long);
		jobp->ji_wattr[(int)JOB_ATR_eligible_time].at_val.at_long += tempval;
		jobp->ji_wattr[(int)JOB_ATR_eligible_time].at_flags |= ATR_VFLAG_MODCACHE;
	}

	/* in windows code, a child process "w32_send_job" handles the send
	 * This needs the job information, so we save using the filesystem
	 * This avoids the child process from having to "connect" to the database again
	 * The file is deleted by the send_job child process when it has done recovering the job
	 */
	job_save_fs(jobp, SAVEJOB_FULLFORCE);	/* so the spawned process can get a fresh copy of job */

	if (*jobp->ji_qs.ji_fileprefix != '\0')
		sprintf(buf, "jobfile=%s%s\n", jobp->ji_qs.ji_fileprefix, JOB_FILE_SUFFIX);
	else
		sprintf(buf, "jobfile=%s%s\n", jobp->ji_qs.ji_jobid, JOB_FILE_SUFFIX);

	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "destaddr=%ld\n", hostaddr);
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "destport=%d\n", port);
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "move_type=%d\n", move_type);
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "in_server=%d\n", is_linked(&svr_alljobs, &jobp->ji_alljobs));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "server_name=%s\n", (server_name?server_name:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "server_host=%s\n", (server_host?server_host:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "server_addr=%ld\n", pbs_server_addr);
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "server_port=%d\n", pbs_server_port_dis);
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "log_file=%s\n", (log_file?log_file:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "path_log=%s\n", (path_log?path_log:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "path_jobs=%s\n", (path_jobs?path_jobs:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "path_spool=%s\n", (path_spool?path_spool:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "path_rescdef=%s\n", (path_rescdef?path_rescdef:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "path_users=%s\n", (path_users?path_users:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "path_hooks_workdir=%s\n",
		(path_hooks_workdir?path_hooks_workdir:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "svr_history_enable=%ld\n", svr_history_enable);
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "svr_history_duration=%ld\n", svr_history_duration);
	win_pwrite(&pio, buf, strlen(buf));

	if ( (server.sv_attr[SRV_ATR_ssignon_enable].at_flags & \
                                                ATR_VFLAG_SET) && \
             (server.sv_attr[SRV_ATR_ssignon_enable].at_val.at_long == 1) )
		strcpy(buf, "single_signon_password_enable=1\n");
	else
		strcpy(buf, "single_signon_password_enable=0\n");

	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "script_name=%s\n", script_name);
	win_pwrite(&pio, buf, strlen(buf));

	strcpy(buf, "quit\n");
	win_pwrite(&pio, buf, strlen(buf));
	win_pclose2(&pio);	/* closes all handles except the process handle */
	return (2);
#else
	pbs_list_head	 attrl;
	enum conn_type   cntype = ToServerDIS;
	int		 con;
	char		*credbuf = NULL;
	size_t		 credlen = 0;
	char		*destin = jobp->ji_qs.ji_destin;
	int		 encode_type;
	int		 i;
	char		 job_id[PBS_MAXSVRJOBID+1];
	attribute	*pattr;
	pid_t		 pid;
	struct attropl  *pqjatr;      /* list (single) of attropl for quejob */
	char		 script_name[MAXPATHLEN+1];
	struct work_task *ptask;
	struct  hostent *hp;
	struct in_addr   addr;
	long		 tempval;
	int 		gridproxy_cred = 0;
	int 		rpp = 0;

#ifdef  PBS_CRED_GRIDPROXY
	if (jobp->ji_extended.ji_ext.ji_credtype == PBS_CREDTYPE_GRIDPROXY)
		gridproxy_cred = 1;
#endif

	if (pbs_conf.pbs_use_tcp == 1 && move_type == MOVE_TYPE_Exec && gridproxy_cred == 0) {
		return (send_job_exec(jobp, hostaddr, port, preq));
	}

	script_name[0] = '\0';
	/* if job has a script read it from database */
	if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT) {
		/*
		 * copy the job script from database to a temp file
		 * PBSD_jscript works with a file
		 * delete it at the end of the send
		 */
		if (svr_create_tmp_jobscript(jobp, script_name) != 0) {
			pbs_errno = PBSE_SYSTEM;
			snprintf(log_buffer, sizeof(log_buffer),
				"Failed to create temporary job script for job %s",
				jobp->ji_qs.ji_jobid);
			log_err(pbs_errno, "send_job", log_buffer);
			return -1;
		}
	}

	pid = fork();
	if (pid == -1) {	/* Error on fork */
		log_err(errno, __func__, "fork failed\n");
		pbs_errno = PBSE_SYSTEM;
		return -1;
	}

	if (pid != 0) {		/* The parent (main server) */

		ptask = set_task(WORK_Deferred_Child, pid, post_func, preq);
		if (!ptask) {
			log_err(errno, __func__, msg_err_malloc);
			return (-1);
		} else {
			ptask->wt_parm2 = jobp;
			append_link(&((job *)jobp)->ji_svrtask,
				&ptask->wt_linkobj, ptask);
		}
		return 2;
	}

	/*
	 * the child process
	 *
	 * set up signal cather for error return
	 */
	DBPRT(("%s: child started, sending to port %d\n", __func__, port))
	rpp_terminate();

	/* Unprotect child from being killed by kernel */
	daemon_protect(0, PBS_DAEMON_PROTECT_OFF);

#ifdef WIN32
	/* get host name */
	/*
	 * If host address is loopback address then do not resolve with dns
	 * Use "localhost" as the host name.
	 */
	if ((htonl(hostaddr) == loopback_addr->sin_addr.s_addr)) {
		(void)get_credential(LOCALHOST_SHORTNAME, jobp, PBS_GC_BATREQ,
			&credbuf, &credlen);
	} else {
#endif
		addr.s_addr = htonl(hostaddr);
		hp = gethostbyaddr((void *)&addr, sizeof(struct in_addr), AF_INET);
		if (hp == NULL) {
			sprintf(log_buffer, "%s: h_errno=%d",
				inet_ntoa(addr), h_errno);
			log_err(-1, __func__, log_buffer);
		} else {
			/* read any credential file */
			(void)get_credential(hp->h_name, jobp, PBS_GC_BATREQ,
				&credbuf, &credlen);
		}
#ifdef WIN32
	}
#endif

	/* encode job attributes to be moved */

	CLEAR_HEAD(attrl);

	/* select attributes/resources to send based on move type */

	if (move_type == MOVE_TYPE_Exec) {
		resc_access_perm = ATR_DFLAG_MOM;
		encode_type = ATR_ENCODE_MOM;
		cntype = ToServerDIS;
	} else {
		resc_access_perm = ATR_DFLAG_USWR | ATR_DFLAG_OPWR |
			ATR_DFLAG_MGWR | ATR_DFLAG_SvRD;
		encode_type = ATR_ENCODE_SVR;
		svr_dequejob(jobp);	/* clears default resource settings */
	}

	/* our job is to calc eligible time accurately and save it */
	/* on new server, accrue type should be calc afresh */
	/* Note: if job is being sent for execution on mom, then don't calc eligible time */

	if ((jobp->ji_wattr[(int)JOB_ATR_accrue_type].at_val.at_long == JOB_ELIGIBLE) &&
		(server.sv_attr[(int)SRV_ATR_EligibleTimeEnable].at_val.at_long == 1) &&
		(move_type != MOVE_TYPE_Exec)) {
		tempval = ((long)time_now - jobp->ji_wattr[(int)JOB_ATR_sample_starttime].at_val.at_long);
		jobp->ji_wattr[(int)JOB_ATR_eligible_time].at_val.at_long += tempval;
		jobp->ji_wattr[(int)JOB_ATR_eligible_time].at_flags |= ATR_VFLAG_MODCACHE;
	}

	pattr = jobp->ji_wattr;
	for (i=0; i < (int)JOB_ATR_LAST; i++) {
		if ((job_attr_def+i)->at_flags & resc_access_perm) {
			(void)(job_attr_def+i)->at_encode(pattr+i, &attrl,
				(job_attr_def+i)->at_name, (char *)0,
				encode_type, NULL);
		}
	}
	attrl_fixlink(&attrl);


	/* save the job id for when after we purge the job */

	(void)strcpy(job_id, jobp->ji_qs.ji_jobid);

	pbs_errno = 0;
	con = -1;

	for (i=0; i<RETRY; i++) {

		/* connect to receiving server with retries */

		if (i > 0) {	/* recycle after an error */
			if (con >= 0)
				svr_disconnect(con);
			if (should_retry_route(pbs_errno) == -1) {
				/* delete the temp script file */
				unlink(script_name);
				exit(SEND_JOB_FATAL);	/* fatal error, don't retry */
			}
			sleep(1<<i);
		}
		if ((con = svr_connect(hostaddr, port, 0, cntype, rpp)) ==
			PBS_NET_RC_FATAL) {
			(void)sprintf(log_buffer, "send_job failed to %lx port %d",
				hostaddr, port);
			log_err(pbs_errno, __func__, log_buffer);

			/* delete the temp script file */
			unlink(script_name);

			if ((move_type == MOVE_TYPE_Exec) && (pbs_errno == PBSE_BADCRED))
				exit(SEND_JOB_NODEDW);

			exit(SEND_JOB_FATAL);
		} else if (con == PBS_NET_RC_RETRY) {
			pbs_errno = ECONNREFUSED;	/* should retry */
			continue;
		}

		/*
		 * if the job is substate JOB_SUBSTATE_TRNOUTCM which means
		 * we are recovering after being down or a late failure, we
		 * just want to send the commit"
		 */

		if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUTCM) {

			if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUT) {
				jobp->ji_qs.ji_substate = JOB_SUBSTATE_TRNOUT;
			}

			pqjatr = &((svrattrl *)GET_NEXT(attrl))->al_atopl;
			if (PBSD_queuejob(con, jobp->ji_qs.ji_jobid, destin,
				pqjatr, (char *)0, rpp, NULL) == 0) {
				if (pbs_errno == PBSE_JOBEXIST &&
					move_type == MOVE_TYPE_Exec) {
					/* already running, mark it so */
					log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_JOB,
						LOG_INFO, jobp->ji_qs.ji_jobid,
						"Mom reports job already running");
					exit(SEND_JOB_OK);
				}
				else if ((pbs_errno == PBSE_HOOKERROR) ||
					(pbs_errno == PBSE_HOOK_REJECT)  ||
					(pbs_errno == PBSE_HOOK_REJECT_RERUNJOB)  ||
					(pbs_errno == PBSE_HOOK_REJECT_DELETEJOB)) {
					char		name_buf[MAXPATHLEN+1];
					int		rfd;
					int		len;
					char		*reject_msg;
					int		err;

					err = pbs_errno;

					reject_msg = pbs_geterrmsg(con);
					(void)sprintf(log_buffer,
						"send of job to %s failed error = %d reject_msg=%s",
						destin, err,
						reject_msg?reject_msg:"");
					log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
						LOG_INFO, jobp->ji_qs.ji_jobid,
						log_buffer);

					(void)strcpy(name_buf, path_hooks_workdir);
					(void)strcat(name_buf, jobp->ji_qs.ji_jobid);
					(void)strcat(name_buf, HOOK_REJECT_SUFFIX);

					if ((reject_msg != NULL) &&
						(reject_msg[0] != '\0')) {

						if ((rfd = open(name_buf,
							O_RDWR|O_CREAT|O_TRUNC, 0600)) == -1) {
							sprintf(log_buffer,
								"open of reject file %s failed: errno %d",
								name_buf, errno);
							log_event(PBSEVENT_JOB,
								PBS_EVENTCLASS_JOB,
								LOG_INFO, jobp->ji_qs.ji_jobid,
								log_buffer);
						} else {
#ifdef WIN32
							secure_file(name_buf, "Administrators",
								READS_MASK|WRITES_MASK|STANDARD_RIGHTS_REQUIRED);
							setmode(rfd, O_BINARY);
#endif
							len = strlen(reject_msg)+1;
							/* write also trailing null char */
							if (write(rfd, reject_msg, len) != len) {
								sprintf(log_buffer,
									"write to file %s incomplete: errno %d", name_buf, errno);
								log_event(PBSEVENT_JOB,
									PBS_EVENTCLASS_JOB,
									LOG_INFO, jobp->ji_qs.ji_jobid,
									log_buffer);
							}
							close(rfd);
						}
					}

					if (err == PBSE_HOOKERROR)
						exit(SEND_JOB_HOOKERR);
					if (err == PBSE_HOOK_REJECT)
						exit(SEND_JOB_HOOK_REJECT);
					if (err == PBSE_HOOK_REJECT_RERUNJOB)
						exit(SEND_JOB_HOOK_REJECT_RERUNJOB);
					if (err == PBSE_HOOK_REJECT_DELETEJOB)
						exit(SEND_JOB_HOOK_REJECT_DELETEJOB);
				}
				else {
					(void)sprintf(log_buffer,
						"send of job to %s failed error = %d",
						destin, pbs_errno);
					log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
						LOG_INFO, jobp->ji_qs.ji_jobid,
						log_buffer);
					continue;
				}
			}

			if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT) {
				if (PBSD_jscript(con, script_name, rpp, NULL) != 0)
					continue;
			}

			if (credlen > 0) {
				int	ret;

				ret = PBSD_jcred(con,
					jobp->ji_extended.ji_ext.ji_credtype,
					credbuf, credlen, rpp, NULL);
				if ((ret == 0) || (i == (RETRY - 1)))
					free(credbuf);	/* free credbuf if cred info is sent successfully OR */
				/* at the end of all retry attempts */
				if (ret != 0)
					continue;
			}

			if ((move_type == MOVE_TYPE_Exec) &&
				(jobp->ji_qs.ji_svrflags & JOB_SVFLG_HASRUN) &&
				(hostaddr !=  pbs_server_addr)) {
				/* send files created on prior run */
				if ((move_job_file(con, jobp, StdOut, rpp, NULL) != 0) ||
					(move_job_file(con, jobp, StdErr, rpp, NULL) != 0) ||
					(move_job_file(con, jobp, Chkpt, rpp, NULL) != 0))
					continue;
			}

			jobp->ji_qs.ji_substate = JOB_SUBSTATE_TRNOUTCM;
		}

		if (PBSD_rdytocmt(con, job_id, rpp, NULL) != 0)
			continue;

		if (PBSD_commit(con, job_id, rpp, NULL) != 0) {
			/* delete the temp script file */
			unlink(script_name);
			exit(SEND_JOB_FATAL);
		}
		svr_disconnect(con);

		/* delete the temp script file */
		unlink(script_name);

		exit(SEND_JOB_OK);	/* This child process is all done */
	}
	if (con >= 0)
		svr_disconnect(con);
	/*
	 * If connection is actively refused by the execution node(or mother superior) OR
	 * the execution node(or mother superior) is rejecting request with error
	 * PBSE_BADHOST(failing to authorize server host), the node should be marked down.
	 */
	if ((move_type == MOVE_TYPE_Exec) && (pbs_errno == ECONNREFUSED  || pbs_errno == PBSE_BADHOST)) {
		i = SEND_JOB_NODEDW;
	} else if (should_retry_route(pbs_errno) == -1) {
		i = SEND_JOB_FATAL;
	} else {
		i = SEND_JOB_RETRY;
	}
	(void)sprintf(log_buffer, "send_job failed with error %d", pbs_errno);
	log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_NOTICE,
		jobp->ji_qs.ji_jobid, log_buffer);

	/* delete the temp script file */
	unlink(script_name);

	exit(i);
	return -1;		/* NOT REACHED */

#endif /* !WIN32 */
}
/*
 * post_movejob - work-task callback run after the child process that was
 * moving a job (PBS_BATCH_MoveJob) has exited.
 *
 * pwt->wt_parm1 is compared against the job found by id, pwt->wt_parm2 is
 * the originating batch request, and pwt->wt_aux is the child's wait status.
 *
 * On a clean exit with status 0 the local job is cleaned up and purged and
 * the request is acknowledged.  Any other outcome maps to PBSE_ROUTEREJ
 * (non-zero exit) or PBSE_SYSTEM (abnormal termination); the job, if still
 * known, is re-evaluated out of Transit and the request is rejected.
 *
 * The job may no longer be found when this callback runs.  That condition is
 * logged, and every later use of jobp is guarded so the request still gets a
 * reply instead of the server dereferencing a NULL pointer.
 */
static void post_movejob(

  struct work_task *pwt)

  {
  char *id = "post_movejob";

  struct batch_request *req;
  int newstate;
  int newsub;
  int stat;
  int r;
  job *jobp;

  req  = (struct batch_request *)pwt->wt_parm2;

  stat = pwt->wt_aux;

  pbs_errno = PBSE_NONE;

  if (req->rq_type != PBS_BATCH_MoveJob)
    {
    sprintf(log_buffer, "bad request type %d\n",
            req->rq_type);

    log_err(-1, id, log_buffer);

    return;
    }

  jobp = find_job(req->rq_ind.rq_move.rq_jid);

  if ((jobp == NULL) || (jobp != (job *)pwt->wt_parm1))
    {
    /* logged only; processing continues so that the request is answered */

    sprintf(log_buffer, "job %s not found\n",
            req->rq_ind.rq_move.rq_jid);

    log_err(-1, id, log_buffer);
    }

  if (WIFEXITED(stat))
    {
    r = WEXITSTATUS(stat);

    if (r == 0)
      {
      /* NOTE(review): this message is assembled in log_buffer but no log
       * call follows it in this function - confirm whether that is intended */

      strcpy(log_buffer, msg_movejob);

      sprintf(log_buffer + strlen(log_buffer), msg_manager,
              req->rq_ind.rq_move.rq_destin,
              req->rq_user,
              req->rq_host);

      /* purge server's job structure, but only if the job still exists */

      if (jobp != NULL)
        {
        if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn)
          remove_stagein(jobp);

        if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_COPIED)
          remove_checkpoint(jobp);

        job_purge(jobp);
        }
      }
    else
      {
      r = PBSE_ROUTEREJ;
      }
    }
  else
    {
    r = PBSE_SYSTEM;

    sprintf(log_buffer, msg_badexit, stat);

    strcat(log_buffer, id);

    /* fall back to the id carried in the request when the job is gone */

    log_event(
      PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      (jobp != NULL) ? jobp->ji_qs.ji_jobid : req->rq_ind.rq_move.rq_jid,
      log_buffer);
    }

  if (r)
    {
    if (jobp != NULL)
      {
      /* force re-eval of job state out of Transit */

      svr_evaljobstate(jobp, &newstate, &newsub, 1);
      svr_setjobstate(jobp, newstate, newsub);
      }

    req_reject(r, 0, req, NULL, NULL);
    }
  else
    {
    reply_ack(req);
    }

  return;
  }  /* END post_movejob() */
Exemple #10
0
/**
 * @brief
 * 		req_modifyjob - service a Modify Job batch request.
 * @par
 * 		Runs the modifyjob hooks, validates the target job and every
 * 		attribute in the request, applies the changes through
 * 		modify_job_attr(), re-applies resource defaults, then either
 * 		re-evaluates the job state (job not running) or saves the job
 * 		(job running).  When a resource of a running job was altered the
 * 		request is relayed to MOM; otherwise it is acknowledged here.
 * @par
 * 		Every failure path replies to the request (reject / bad attribute /
 * 		text) before returning.
 *
 * @param[in,out]	preq	-	the modify-job batch request
 *
 * @return	none.
 */
void
req_modifyjob(struct batch_request *preq)
{
	int		 add_to_am_list = 0; /* if altered during sched cycle */
	int		 bad = 0;
	int		 jt;		/* job type */
	int		 newstate;
	int		 newsubstate;
	resource_def	*outsideselect = NULL;
	job		*pjob;
	svrattrl	*plist;
	resource	*presc;
	resource_def	*prsd;
	int		 rc;
	int		 running = 0;
	int		 sendmom = 0;
	char		hook_msg[HOOK_MSG_SIZE];
	int		mod_project = 0;
	pbs_sched	*psched;

	switch (process_hooks(preq, hook_msg, sizeof(hook_msg),
			pbs_python_set_interrupt)) {
		case 0:	/* explicit reject */
			reply_text(preq, PBSE_HOOKERROR, hook_msg);
			return;
		case 1:   /* explicit accept */
			if (recreate_request(preq) == -1) { /* error */
				/* we have to reject the request, as 'preq' */
				/* may have been partly modified            */
				strcpy(hook_msg,
					"modifyjob event: rejected request");
				log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_HOOK,
					LOG_ERR, "", hook_msg);
				reply_text(preq, PBSE_HOOKERROR, hook_msg);
				return;
			}
			break;
		case 2:	/* no hook script executed - go ahead and accept event*/
			break;
		default:
			log_event(PBSEVENT_DEBUG2, PBS_EVENTCLASS_HOOK,
				LOG_INFO, "", "modifyjob event: accept req by default");
			break;
	}

	if (pseldef == NULL)  /* do one time to keep handy */
		pseldef = find_resc_def(svr_resc_def, "select", svr_resc_size);

	pjob = chk_job_request(preq->rq_ind.rq_modify.rq_objname, preq, &jt);
	if (pjob == NULL)
		return;

	if ((jt == IS_ARRAY_Single) || (jt == IS_ARRAY_Range)) {
		req_reject(PBSE_IVALREQ, 0, preq);
		return;
	}

	psched = find_sched_from_sock(preq->rq_conn);
	/* allow scheduler to modify job */
	if (psched == NULL) {
		/* provisioning job is not allowed to be modified */
		if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
			(pjob->ji_qs.ji_substate == JOB_SUBSTATE_PROVISION)) {
			req_reject(PBSE_BADSTATE, 0, preq);
			return;
		}
	}

	/* cannot be in exiting or transit, exiting has already been checked */

	if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT) {
		req_reject(PBSE_BADSTATE, 0, preq);
		return;
	}

	plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr);
	if (plist == NULL) {	/* nothing to do */
		reply_ack(preq);
		return;
	}

	/*
	 * Special checks must be made:
	 *	if during a scheduling cycle and certain attributes are altered,
	 *	   make a note of the job to prevent it from being run now;
	 *	if job is running, only certain attributes/resources can be
	 *	   altered.
	 */

	if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) {
		running = 1;
	}
	while (plist) {
		int i;

		i = find_attr(job_attr_def, plist->al_name, JOB_ATR_LAST);

		/*
		 * Unknown attribute name: reject now, before i is used to
		 * index job_attr_def[] below (a negative index would read
		 * outside the table).
		 */
		if (i < 0) {
			reply_badattr(PBSE_NOATTR, 1, plist, preq);
			return;
		}

		/*
		 * Is the attribute being altered one which could change
		 * scheduling (ATR_DFLAG_SCGALT set) and if a scheduling
		 * cycle is in progress, then set flag to add the job to list
		 * of jobs which cannot be run in this cycle.
		 * If the scheduler itself sends a modify job request,
		 * no need to delay the job until next cycle.
		 */
		if ((psched == NULL) && (scheduler_jobs_stat) && (job_attr_def[i].at_flags & ATR_DFLAG_SCGALT))
			add_to_am_list = 1;

		/* Is the attribute modifiable in RUN state ? */

		if ((running == 1) &&
			((job_attr_def[i].at_flags & ATR_DFLAG_ALTRUN) == 0)) {

			reply_badattr(PBSE_MODATRRUN, 1, plist, preq);
			return;
		}
		if (i == (int)JOB_ATR_resource) {

			prsd = find_resc_def(svr_resc_def, plist->al_resc,
				svr_resc_size);

			if (prsd == 0) {
				reply_badattr(PBSE_UNKRESC, 1, plist, preq);
				return;
			}

			/* is the specified resource modifiable while */
			/* the job is running                         */

			if (running) {

				if ((prsd->rs_flags & ATR_DFLAG_ALTRUN) == 0) {
					reply_badattr(PBSE_MODATRRUN, 1, plist, preq);
					return;
				}

				sendmom = 1;
			}

			/* should the resource be only in a select spec */

			if (prsd->rs_flags & ATR_DFLAG_CVTSLT && !outsideselect &&
				plist->al_atopl.value && plist->al_atopl.value[0]) {
				/* if "-lresource" is set and has non-NULL value,
				** remember as potential bad resource
				** if this appears along "select".
				*/
				outsideselect = prsd;
			}
		}
		if (strcmp(plist->al_name, ATTR_project) == 0) {
			mod_project = 1;
		} else if ((strcmp(plist->al_name, ATTR_runcount) == 0) &&
			((plist->al_flags & ATR_VFLAG_HOOK) == 0) &&
			(plist->al_value != NULL) &&
			(plist->al_value[0] != '\0') &&
			((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0) &&
		(atol(plist->al_value) < \
		    pjob->ji_wattr[(int)JOB_ATR_runcount].at_val.at_long)) {
			/* a request without manager/operator write permission */
			/* may not lower the run count                         */
			sprintf(log_buffer,
				"regular user %s@%s cannot decrease '%s' attribute value from %ld to %ld",
				preq->rq_user, preq->rq_host, ATTR_runcount,
				pjob->ji_wattr[(int)JOB_ATR_runcount].at_val.at_long,
				atol(plist->al_value));
			log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_ERR,
				pjob->ji_qs.ji_jobid, log_buffer);
			req_reject(PBSE_PERM, 0, preq);
			return;
		}
		plist = (svrattrl *)GET_NEXT(plist->al_link);
	}

	if (outsideselect) {
		presc = find_resc_entry(&pjob->ji_wattr[(int)JOB_ATR_resource],
			pseldef);
		if (presc &&
			((presc->rs_value.at_flags & ATR_VFLAG_DEFLT) == 0)) {
			/* select is not a default, so reject qalter */

			resc_in_err = strdup(outsideselect->rs_name);
			req_reject(PBSE_INVALJOBRESC, 0, preq);
			return;
		}

	}

	/* modify the jobs attributes */

	bad = 0;
	plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr);
	rc = modify_job_attr(pjob, plist, preq->rq_perm, &bad);
	if (rc) {
		if (pjob->ji_clterrmsg)
			reply_text(preq, rc, pjob->ji_clterrmsg);
		else
			reply_badattr(rc, bad, plist, preq);
		return;
	}

	/* If certain attributes modified and if in scheduling cycle  */
	/* then add to list of jobs which cannot be run in this cycle */

	if (add_to_am_list)
		am_jobs_add(pjob);	/* see req_runjob() */

	/* check if project attribute was requested to be modified to */
	/* be the default project value */
	if (mod_project && (pjob->ji_wattr[(int)JOB_ATR_project].at_flags & \
							ATR_VFLAG_SET)) {

		if (strcmp(pjob->ji_wattr[(int)JOB_ATR_project].at_val.at_str,
			PBS_DEFAULT_PROJECT) == 0) {
			sprintf(log_buffer, msg_defproject,
				ATTR_project, PBS_DEFAULT_PROJECT);
#ifdef NAS /* localmod 107 */
			log_event(PBSEVENT_DEBUG4, PBS_EVENTCLASS_JOB, LOG_INFO,
				pjob->ji_qs.ji_jobid, log_buffer);
#else
			log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO,
				pjob->ji_qs.ji_jobid, log_buffer);
#endif /* localmod 107 */
		}
	}

	if (pjob->ji_wattr[(int)JOB_ATR_resource].at_flags & ATR_VFLAG_MODIFY) {
		presc = find_resc_entry(&pjob->ji_wattr[(int)JOB_ATR_resource],
			pseldef);
		if (presc && (presc->rs_value.at_flags & ATR_VFLAG_DEFLT)) {
			/* changing Resource_List and select is a default    */
			/* clear "select" so it is rebuilt in set_resc_deflt */
			pseldef->rs_free(&presc->rs_value);
		}
	}

	/* Reset any defaults resource limit which might have been unset */
	if ((rc = set_resc_deflt((void *)pjob, JOB_OBJECT, NULL)) != 0) {
		req_reject(rc, 0, preq);
		return;
	}

	/* if job is not running, may need to change its state */

	if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING) {
		svr_evaljobstate(pjob, &newstate, &newsubstate, 0);
		(void)svr_setjobstate(pjob, newstate, newsubstate);
	} else {
		(void)job_save(pjob, SAVEJOB_FULL);
	}
	(void)sprintf(log_buffer, msg_manager, msg_jobmod,
		preq->rq_user, preq->rq_host);
	log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO,
		pjob->ji_qs.ji_jobid, log_buffer);

	/* if a resource limit changed for a running job, send to MOM */

	if (sendmom) {
		rc = relay_to_mom(pjob, preq, post_modify_req);
		if (rc)
			req_reject(rc, 0, preq);    /* unable to get to MOM */
		return;
	}

	reply_ack(preq);
}
Exemple #11
0
/*
 * process_hold_reply - finish a hold request once the reply relayed back
 * from MOM is available in preq->rq_reply.
 *
 * The client connection is restored on the request, the job named in the
 * original hold request is looked up, and then:
 *   - job not found:  the request is rejected with PBSE_UNKJOBID;
 *   - non-zero reply code:  the holds carried by the request are removed
 *     again (DECR), the substate is reset to RUNNING, the job state is
 *     re-evaluated, and the request is rejected - except for PBSE_NOSUP,
 *     which is acknowledged;
 *   - zero reply code:  the checkpoint flags are recorded on the job, the
 *     job state is re-evaluated, an accounting record is written and the
 *     request is acknowledged.
 *
 * @param preq - the hold request carrying the reply; NULL means it was
 *               already handled and nothing is done
 */
void process_hold_reply(

  batch_request *preq)

  {
  job           *pjob;
  pbs_attribute  temphold;
  char          *pset;
  char           log_buf[LOCAL_LOG_BUF_SIZE];
  int            newstate;
  int            newsub;
  int            rc;

  /* preq was handled previously */
  if (preq == NULL)
    return;

  /* restore client socket */
  preq->rq_conn = preq->rq_orgconn;

  pjob = svr_find_job(preq->rq_ind.rq_hold.rq_orig.rq_objname, FALSE);

  if (pjob == NULL)
    {
    log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB,
              preq->rq_ind.rq_hold.rq_orig.rq_objname,
              msg_postmomnojob);
    req_reject(PBSE_UNKJOBID, 0, preq, NULL, msg_postmomnojob);

    return;
    }

  /* from here on the manager object is responsible for the job's mutex */
  mutex_mgr job_mutex(pjob->ji_mutex, true);

  if (preq->rq_reply.brp_code == 0)
    {
    /* record what kind of checkpoint MOM reported.
     * NOTE (carried over from the previous comment, not verifiable here):
     * PBS_CHECKPOINT_MIGRATEABLE is said to be defined as zero, so the
     * migrateable branch would never fire; without these flags start_exec
     * will not try to run the job from the checkpoint image file. */
    if (preq->rq_reply.brp_auxcode)
      {
      /* checkpoint can be moved */
      pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHECKPOINT_FILE;
      pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_MIGRATEABLE;
      }
    else
      {
      /* MOM has a checkpoint file */
      pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHECKPOINT_FILE;
      }

    /* indicate attributes changed */
    pjob->ji_modified = 1;

    svr_evaljobstate(*pjob, newstate, newsub, 0);
    svr_setjobstate(pjob, newstate, newsub, FALSE); /* saves job */

    /* note in accounting file */
    account_record(PBS_ACCT_CHKPNT, pjob, "Checkpointed and held");
    reply_ack(preq);

    return;
    }

  /* MOM reported a failure: take the requested holds back off the job */
  if ((rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, (const char **)&pset, &temphold)) == 0)
    {
    rc = job_attr_def[JOB_ATR_hold].at_set(&pjob->ji_wattr[JOB_ATR_hold],
        &temphold, DECR);
    }

  /* reset the substate and mark the attributes as changed */
  pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING;
  pjob->ji_modified = 1;

  svr_evaljobstate(*pjob, newstate, newsub, 0);
  svr_setjobstate(pjob, newstate, newsub, FALSE); /* saves job */

  if (preq->rq_reply.brp_code == PBSE_NOSUP)
    {
    reply_ack(preq);
    }
  else
    {
    sprintf(log_buf, msg_mombadhold, preq->rq_reply.brp_code);
    log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
    req_reject(preq->rq_reply.brp_code, 0, preq, NULL, log_buf);
    }

  } /* END process_hold_reply() */
/*
 * post_stagein - work-task callback processing the reply to a stage-in
 * request.
 *
 * pwt->wt_parm1 is the batch request; its rq_extra holds the job id used
 * for the lookup and is freed here.
 *
 * Stage-in failed (non-zero reply code):  the job's nodes are freed, an
 * execution-time wait of PBS_STAGEFAIL_WAIT is set up unless one is already
 * set, the job goes to WAITING/STAGEFAIL, and the owner is mailed when the
 * reply carries text.
 *
 * Stage-in succeeded:  the job is flagged as staged in.  If it was waiting
 * in STAGEGO it is started - via a checkpoint send first when this is a
 * checkpoint restart; otherwise its state is simply re-evaluated.
 *
 * The request is released in every case, including when the job is unknown.
 */
static void post_stagein(

  struct work_task *pwt)

  {
  struct batch_request *preq;
  attribute            *pwait;
  job                  *pjob;
  int                   code;
  int                   newstate;
  int                   newsub;

  preq = pwt->wt_parm1;
  code = preq->rq_reply.brp_code;
  pjob = find_job(preq->rq_extra);

  free(preq->rq_extra);

  if (pjob == NULL)
    {
    release_req(pwt); /* close connection and release request */

    return;
    }

  if (code != 0)
    {
    /* stage in failed - hold job */

    free_nodes(pjob);

    pwait = &pjob->ji_wattr[(int)JOB_ATR_exectime];

    if (!(pwait->at_flags & ATR_VFLAG_SET))
      {
      /* no execution time set yet: retry after the stage-fail delay */

      pwait->at_val.at_long = time_now + PBS_STAGEFAIL_WAIT;
      pwait->at_flags |= ATR_VFLAG_SET;

      job_set_wait(pwait, pjob, 0);
      }

    svr_setjobstate(pjob, JOB_STATE_WAITING, JOB_SUBSTATE_STAGEFAIL);

    if (preq->rq_reply.brp_choice == BATCH_REPLY_CHOICE_Text)
      {
      /* set job comment - NYI */

      svr_mailowner(
        pjob,
        MAIL_STAGEIN,
        MAIL_FORCE,
        preq->rq_reply.brp_un.brp_txt.brp_str);
      }
    }
  else
    {
    /* stage in was successful */

    pjob->ji_qs.ji_svrflags |= JOB_SVFLG_StagedIn;

    if (pjob->ji_qs.ji_substate != JOB_SUBSTATE_STAGEGO)
      {
      svr_evaljobstate(pjob, &newstate, &newsub, 0);

      svr_setjobstate(pjob, newstate, newsub);
      }
    else if (is_checkpoint_restart(pjob))
      {
      /* need to copy checkpoint file to mom before running */

      svr_send_checkpoint(
          pjob,
          preq,
          JOB_STATE_RUNNING,
          JOB_SUBSTATE_CHKPTGO);
      }
    else
      {
      /* continue to start job running */

      svr_strtjob2(pjob, NULL);
      }
    }

  release_req(pwt); /* close connection and release request */

  return;
  }  /* END post_stagein() */
/*
 * post_sendmom - work-task callback run after the child that was sending a
 * job to MOM has exited.
 *
 * pwt->wt_parm1 is the job, pwt->wt_parm2 the (possibly NULL) run request,
 * pwt->wt_aux the child's wait status.  The child's exit code selects the
 * outcome:
 *   0      send succeeded: acknowledge the request, record the start time,
 *          account for assigned resources, move PRERUN jobs to RUNNING,
 *          write the accounting record, process dependencies and ask MOM
 *          for the job's status;
 *   10     connection to MOM timed out: the MOM is reported through
 *          stream_eof(), the nodes are freed and the job is re-evaluated
 *          (unless it is being aborted);
 *   other  send failed (an abnormal child exit is treated as 2): the nodes
 *          are freed and the job is re-evaluated, unless the reply says the
 *          job already exists on MOM, in which case only its MOM status is
 *          refreshed.
 */
static void post_sendmom(

  struct work_task *pwt)  /* I */

  {
  char *id = "post_sendmom";

  job                  *jobp = (job *)pwt->wt_parm1;
  struct batch_request *preq = (struct batch_request *)pwt->wt_parm2;

  int   wstatus;
  int   rc;
  int   next_state;
  int   next_sub;
  int   slot;

  long  dispatched_at = time_now - 10000;
  char *mom_name = NULL;

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      jobp->ji_qs.ji_jobid,
      "entering post_sendmom");
    }

  wstatus = pwt->wt_aux;

  if (!WIFEXITED(wstatus))
    {
    /* cannot get child exit status */

    rc = 2;

    sprintf(log_buffer, msg_badexit,
            wstatus);

    strcat(log_buffer, id);

    log_event(
      PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      jobp->ji_qs.ji_jobid,
      log_buffer);
    }
  else
    {
    rc = WEXITSTATUS(wstatus);
    }

  /* look the job up in the dispatch tables and release its slot */

  for (slot = 0; slot < 20; slot++)
    {
    if (DispatchJob[slot] != jobp)
      continue;

    dispatched_at = DispatchTime[slot];

    DispatchJob[slot] = NULL;

    mom_name = DispatchNode[slot];

    break;
    }

  if (LOGLEVEL >= 1)
    {
    const char *outcome;
    const char *dest;

    if (rc == 0)
      outcome = "success";
    else
      outcome = "failure";

    if (mom_name != NULL)
      dest = mom_name;
    else
      dest = "???";

    sprintf(log_buffer, "child reported %s for job after %ld seconds (dest=%s), rc=%d",
            outcome,
            time_now - dispatched_at,
            dest,
            rc);

    log_event(
      PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      jobp->ji_qs.ji_jobid,
      log_buffer);
    }

  if (rc == 0)
    {
    /* send to MOM went ok */

    jobp->ji_qs.ji_svrflags &= ~JOB_SVFLG_HOTSTART;

    if (preq != NULL)
      reply_ack(preq);

    /* record start time for accounting */

    jobp->ji_qs.ji_stime = time_now;

    /* update resource usage attributes */

    set_resc_assigned(jobp, INCR);

    /* the substate may already be EXITING if the job finished first */

    if (jobp->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN)
      {
      /* this also saves the job structure */

      svr_setjobstate(jobp, JOB_STATE_RUNNING, JOB_SUBSTATE_RUNNING);
      }

    /* accounting log for start or restart */

    if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE)
      account_record(PBS_ACCT_RESTRT, jobp, "Restart from checkpoint");
    else
      account_jobstr(jobp);

    /* if any dependencies, see if action required */

    if (jobp->ji_wattr[(int)JOB_ATR_depend].at_flags & ATR_VFLAG_SET)
      depend_on_exec(jobp);

    /*
     * The session id of the now-executing job can only be learned by
     * making a status request of MOM; the sending child has no way of
     * passing it up to this parent.
     */

    jobp->ji_momstat = 0;

    stat_mom_job(jobp);
    }
  else if (rc == 10)
    {
    /* connection to mom timed out.  Mark node down */

    stream_eof(-1, jobp->ji_qs.ji_un.ji_exect.ji_momaddr, 0);

    /* send failed, requeue the job */

    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      jobp->ji_qs.ji_jobid,
      "unable to run job, MOM rejected/timeout");

    free_nodes(jobp);

    if (jobp->ji_qs.ji_substate == JOB_SUBSTATE_ABORT)
      {
      if (preq != NULL)
        req_reject(PBSE_BADSTATE, 0, preq, mom_name, "job was aborted by mom");
      }
    else
      {
      if (preq != NULL)
        req_reject(PBSE_MOMREJECT, 0, preq, mom_name, "connection to mom timed out");

      svr_evaljobstate(jobp, &next_state, &next_sub, 1);

      svr_setjobstate(jobp, next_state, next_sub);
      }
    }
  else
    {
    /* commit failed (1) or any other failure */

    /* send failed, requeue the job */

    sprintf(log_buffer, "unable to run job, MOM rejected/rc=%d",
            rc);

    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      jobp->ji_qs.ji_jobid,
      log_buffer);

    free_nodes(jobp);

    if (jobp->ji_qs.ji_substate == JOB_SUBSTATE_ABORT)
      {
      if (preq != NULL)
        req_reject(PBSE_BADSTATE, 0, preq, mom_name, "send failed - abort");
      }
    else
      {
      /* job already running: the start request failed, but the desired
       * result (job is running) has been reached, so it counts as OK */

      int already_running =
        (preq != NULL) && (preq->rq_reply.brp_code == PBSE_JOBEXIST);

      if ((preq != NULL) && !already_running)
        {
        char reject_text[1024];
        const char *target;

        if (mom_name != NULL)
          target = mom_name;
        else
          target = "mom";

        sprintf(reject_text, "cannot send job to %s, state=%s",
                target,
                PJobSubState[jobp->ji_qs.ji_substate]);

        req_reject(PBSE_MOMREJECT, 0, preq, mom_name, reject_text);
        }

      if (already_running)
        {
        /* do not re-establish accounting - completed first time job was started */

        /* update mom-based job status */

        jobp->ji_momstat = 0;

        stat_mom_job(jobp);
        }
      else
        {
        svr_evaljobstate(jobp, &next_state, &next_sub, 1);

        svr_setjobstate(jobp, next_state, next_sub);
        }
      }
    }

  return;
  }  /* END post_sendmom() */
Exemple #14
0
/**
 * update_array_values()
 *
 * Updates the internal bookkeeping counters of a job array after an event
 * on one of its sub-jobs, then refreshes the array's dependency holds and
 * saves the array.
 *
 * @param pa - array to update
 * @param old_state - state of the sub-job, compared against
 *                    JOB_STATE_RUNNING to adjust the running count
 * @param event - code for what event just happened
 * @param job_id - id of the sub-job the event happened on; it is skipped
 *                 when looking for a held sibling to release
 * @param job_atr_hold - hold bits of that sub-job
 * @param job_exit_status - exit status of that sub-job; 0 counts as success
 */
void update_array_values(

  job_array            *pa,        /* I */
  int                   old_state, /* I */
  enum ArrayEventsEnum  event,     /* I */
  char                 *job_id,
  long                  job_atr_hold,
  int                   job_exit_status)

  {
  long  moab_compatible;
  int   idx;
  int   next_state;
  int   next_sub;
  job  *held;

  switch (event)
    {
    case aeRun:

      /* a job that was already running has been counted before */
      if (old_state != JOB_STATE_RUNNING)
        {
        pa->ai_qs.jobs_running++;
        pa->ai_qs.num_started++;
        }

      break;

    case aeTerminate:

      if ((old_state == JOB_STATE_RUNNING) &&
          (pa->ai_qs.jobs_running > 0))
        {
        pa->ai_qs.jobs_running--;
        }

      if (job_exit_status == 0)
        pa->ai_qs.num_successful++;
      else
        pa->ai_qs.num_failed++;

      pa->ai_qs.jobs_done++;

      array_save(pa);

      /* update slot limit hold if necessary */
      if (get_svr_attr_l(SRV_ATR_MoabArrayCompatible, &moab_compatible) != PBSE_NONE)
        moab_compatible = FALSE;

      /* only need to update if the job wasn't previously held */
      if ((moab_compatible != FALSE) &&
          ((job_atr_hold & HOLD_l) == FALSE))
        {
        /* find the first held job and release its hold */
        for (idx = 0; idx < pa->ai_qs.array_size; idx++)
          {
          if (pa->job_ids[idx] == NULL)
            continue;

          if (!strcmp(pa->job_ids[idx], job_id))
            continue;

          held = svr_find_job(pa->job_ids[idx], TRUE);

          if (held == NULL)
            {
            /* the job could not be found: drop its id from the array */
            free(pa->job_ids[idx]);
            pa->job_ids[idx] = NULL;

            continue;
            }

          if (!(held->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l))
            {
            /* not slot-limit held, keep looking */
            unlock_ji_mutex(held, __func__, "2", LOGLEVEL);

            continue;
            }

          held->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;

          /* no holds left at all: the attribute is no longer set */
          if (held->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
            held->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;

          svr_evaljobstate(held, &next_state, &next_sub, 1);
          svr_setjobstate(held, next_state, next_sub, FALSE);
          job_save(held, SAVEJOB_FULL, 0);
          unlock_ji_mutex(held, __func__, "1", LOGLEVEL);

          break;
          }
        }

      break;

    case aeQueue:

      /* NYI, nothing needs to be done for this yet */

    default:

      /* log error? */

      break;
    }

  set_array_depend_holds(pa);
  array_save(pa);

  } /* END update_array_values() */
Exemple #15
0
/**
 * @brief
 * 		post_routejob - clean up after the child started in
 *		net_move/send_job to "route" a job to another server.
 * @par
 * 		If the route was successful, the local job is either kept as
 * 		history (when job history is configured) or purged.
 * @par
 * 		If the route failed permanently, the destination is marked so it is
 * 		not tried again for this job - unless the job is being deleted, in
 * 		which case it is only put back to queued.  In every remaining case
 * 		the job state is re-evaluated and routing is attempted again; the
 * 		job is aborted when that attempt returns an error.
 *
 * @param[in]	pwt	-	work task structure; wt_aux is the child's
 *				wait status and wt_parm2 the job
 *
 * @return	none.
 */
static void
post_routejob(struct work_task *pwt)
{
	job	*jobp = (job *)pwt->wt_parm2;
	int	 wstatus = pwt->wt_aux;
	int	 exit_code;
	int	 route_rc;
	int	 next_state;
	int	 next_sub;

	if (jobp == NULL) {
		log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_JOB, LOG_INFO, "", "post_routejob failed, jobp NULL");
		return;
	}

	if (!WIFEXITED(wstatus)) {
		/* child did not exit normally: treat as a permanent failure */
		exit_code = SEND_JOB_FATAL;
		(void)sprintf(log_buffer, msg_badexit, wstatus);
		(void)strcat(log_buffer, __func__);
		log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, LOG_NOTICE,
			jobp->ji_qs.ji_jobid, log_buffer);
	} else {
		exit_code = WEXITSTATUS(wstatus);
	}

	if (exit_code == SEND_JOB_OK) {
		/* normal return, job was routed */

		if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn)
			remove_stagein(jobp);

		/*
		 * With job history configured the job structure is kept for
		 * history purposes instead of being purged.  Sub-jobs need no
		 * check here as they can not be routed.
		 */
		if (svr_chk_history_conf())
			svr_setjob_histinfo(jobp, T_MOV_JOB);
		else
			job_purge(jobp); /* need to remove server job struct */
		return;
	}

	if (exit_code == SEND_JOB_FATAL) {
		/* permanent rejection (or signal) */

		if (jobp->ji_qs.ji_substate == JOB_SUBSTATE_ABORT) {
			/* Job Delete in progress, just set to queued status */
			(void)svr_setjobstate(jobp, JOB_STATE_QUEUED,
				JOB_SUBSTATE_ABORT);
			return;
		}

		/* mark destination as bad, then try routing again below */
		add_dest(jobp);
	}

	/* try routing again: force re-eval of job state out of Transit */
	svr_evaljobstate(jobp, &next_state, &next_sub, 1);
	(void)svr_setjobstate(jobp, next_state, next_sub);
	jobp->ji_retryok = 1;

	route_rc = job_route(jobp);
	if (route_rc == PBSE_ROUTEREJ)
		(void)job_abt(jobp, msg_routebad);
	else if (route_rc != 0)
		(void)job_abt(jobp, msg_routexceed);

	return;
}
Exemple #16
0
END_TEST

START_TEST(svr_evaljobstate_test)
  {
  /*
   * Exercises svr_evaljobstate() on a zeroed job structure that is tweaked
   * before each call.  Cases 1-4 pass 0 as the last argument and expect the
   * evaluated state/substate to equal what the job structure itself records;
   * the remaining cases pass 1 as the last argument.
   */
  struct job test_job;
  int state = 0;
  int substate = 0;

  memset(&test_job, 0, sizeof(test_job));

  /* case 1: job marked as running */
  test_job.ji_qs.ji_state = JOB_STATE_RUNNING;
  svr_evaljobstate(test_job, state, substate, 0);
  fail_unless(test_job.ji_qs.ji_state == state, "svr_evaljobstate state fail case 1");
  fail_unless(test_job.ji_qs.ji_substate == substate, "svr_evaljobstate substate fail case 1");
  memset(&test_job, 0, sizeof(test_job));

  /* case 2: hold attribute value is non-zero */
  test_job.ji_wattr[JOB_ATR_hold].at_val.at_long = 1;
  svr_evaljobstate(test_job, state, substate, 0);
  fail_unless(test_job.ji_qs.ji_state == state, "svr_evaljobstate state fail case 2");
  fail_unless(test_job.ji_qs.ji_substate == substate, "svr_evaljobstate substate fail case 2");
  memset(&test_job, 0, sizeof(test_job));

  /* case 3: stagein attribute has a flag bit set */
  test_job.ji_wattr[JOB_ATR_stagein].at_flags = 1;
  svr_evaljobstate(test_job, state, substate, 0);
  fail_unless(test_job.ji_qs.ji_state == state, "svr_evaljobstate state fail case 3");
  fail_unless(test_job.ji_qs.ji_substate == substate, "svr_evaljobstate substate fail case 3");
  memset(&test_job, 0, sizeof(test_job));

  /* case 4: completely zeroed job */
  svr_evaljobstate(test_job, state, substate, 0);
  fail_unless(test_job.ji_qs.ji_state == state, "svr_evaljobstate state fail case 4");
  fail_unless(test_job.ji_qs.ji_substate == substate, "svr_evaljobstate substate fail case 4");
  memset(&test_job, 0, sizeof(test_job));

  /* case 5: zeroed job with the last argument set to 1 must come out queued */
  svr_evaljobstate(test_job, state, substate, 1);
  fail_unless(JOB_STATE_QUEUED == state, "svr_evaljobstate state fail case 5");
  fail_unless(JOB_SUBSTATE_QUEUED == substate, "svr_evaljobstate substate fail case 5");

  int old_state;
  int old_substate;

  /* an exiting job keeps its state and substate */
  test_job.ji_qs.ji_state = JOB_STATE_EXITING;
  test_job.ji_qs.ji_substate = JOB_SUBSTATE_EXITING;
  old_state = test_job.ji_qs.ji_state;
  old_substate = test_job.ji_qs.ji_substate;
  svr_evaljobstate(test_job, state, substate, 1);
  fail_unless(old_state == state);
  fail_unless(old_substate == substate);

  /* exiting with substate RERUN3 is expected to evaluate to queued */
  test_job.ji_qs.ji_state = JOB_STATE_EXITING;
  test_job.ji_qs.ji_substate = JOB_SUBSTATE_RERUN3;
  svr_evaljobstate(test_job, state, substate, 1);
  fail_unless(state == JOB_STATE_QUEUED);
  fail_unless(substate == JOB_SUBSTATE_QUEUED);

  /* a completed job keeps its state and substate */
  test_job.ji_qs.ji_state = JOB_STATE_COMPLETE;
  test_job.ji_qs.ji_substate = JOB_SUBSTATE_COMPLETE;
  old_state = test_job.ji_qs.ji_state;
  old_substate = test_job.ji_qs.ji_substate;
  svr_evaljobstate(test_job, state, substate, 1);
  fail_unless(old_state == state);
  fail_unless(old_substate == substate);
  }
Exemple #17
0
/*
 * req_holdjob - service a Hold Job batch request
 *
 * Decodes the requested hold types, checks the requester's privilege and
 * ORs the new holds into the job's hold attribute.  For a running job whose
 * checkpoint attribute is set and contains "s", "c" or "enabled", a copy of
 * the request is relayed to the MOM; otherwise the job state is re-evaluated
 * locally and the request is acknowledged.
 *
 * @param vp - the hold request (I)
 * @return PBSE_NONE on every path; failures are reported to the client
 *         through req_reject()
 */
int req_holdjob(

  batch_request *vp) /* I */

  {
  long          *hold_val;
  int            newstate;
  int            newsub;
  long           old_hold;
  job           *pjob;
  char          *pset;
  int            rc;
  pbs_attribute  temphold;
  pbs_attribute *pattr;
  batch_request *preq = (struct batch_request *)vp;
  char           log_buf[LOCAL_LOG_BUF_SIZE];
  batch_request *dup_req = NULL;

  pjob = chk_job_request(preq->rq_ind.rq_hold.rq_orig.rq_objname, preq);

  if (pjob == NULL)
    {
    return(PBSE_NONE);
    }

  /* job_mutex releases the job's mutex when this function returns, unless
   * set_unlock_on_exit(false) is called first */
  mutex_mgr job_mutex(pjob->ji_mutex, true);

  /* cannot do anything until we decode the holds to be set */
  if ((rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, (const char **)&pset,
                     &temphold)) != 0)
    {
    req_reject(rc, 0, preq, NULL, NULL);

    return(PBSE_NONE);
    }

  /* if other than HOLD_u is being set, must have privil */

  if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0)
    {
    req_reject(rc, 0, preq, NULL, NULL);

    return(PBSE_NONE);
    }

  hold_val = &pjob->ji_wattr[JOB_ATR_hold].at_val.at_long;

  old_hold = *hold_val;
  *hold_val |= temphold.at_val.at_long;
  pjob->ji_wattr[JOB_ATR_hold].at_flags |= ATR_VFLAG_SET;
  snprintf(log_buf, sizeof(log_buf), msg_jobholdset, pset, preq->rq_user, preq->rq_host);

  pattr = &pjob->ji_wattr[JOB_ATR_checkpoint];

  if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
      ((pattr->at_flags & ATR_VFLAG_SET) &&
       ((csv_find_string(pattr->at_val.at_str, "s") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "c") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "enabled") != NULL))))
    {

    /* have MOM attempt checkpointing */

    /*
    ** The jobid in the request always has the server suffix attached,
    ** which is dropped when the server attribute
    ** 'display_job_server_suffix' is FALSE, and so it is in the MOM's.
    ** Therefore, the job's own id must be passed to the MOM so she can
    ** find the job to hold.
    */
    if (strncmp(pjob->ji_qs.ji_jobid,
          preq->rq_ind.rq_hold.rq_orig.rq_objname, PBS_MAXSVRJOBID))
      {
      snprintf(preq->rq_ind.rq_hold.rq_orig.rq_objname,
          sizeof(preq->rq_ind.rq_hold.rq_orig.rq_objname), "%s",
          pjob->ji_qs.ji_jobid);
      }

    if ((dup_req = duplicate_request(preq)) == NULL)
      {
      /* nothing reached the MOM, so do not leave the new hold on the job */
      *hold_val = old_hold;

      /* NOTE(review): rc is still PBSE_NONE at this point, so the reject
       * is sent with code 0 - a real error code should be supplied here */
      req_reject(rc, 0, preq, NULL, "memory allocation failure");
      }
    /* NOTE(review): the original comment here said relay_to_mom frees
     * dup_req on failure, yet the failure branch below frees it as well -
     * confirm which side owns the request */
    else if ((rc = relay_to_mom(&pjob, dup_req, NULL)) != PBSE_NONE)
      {
      free_br(dup_req);

      /* hold_val points into the job; relay_to_mom may have cleared pjob,
       * in which case the job must not be touched any more */
      if (pjob != NULL)
        *hold_val = old_hold;  /* reset to the old value */

      req_reject(rc, 0, preq, NULL, "relay to mom failed");

      if (pjob == NULL)
        job_mutex.set_unlock_on_exit(false);
      }
    else
      {
      if (pjob != NULL)
        {
        pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_FILE;

        job_save(pjob, SAVEJOB_QUICK, 0);

        /* fill in log_buf again, since relay_to_mom changed it */
        snprintf(log_buf, sizeof(log_buf), msg_jobholdset, pset, preq->rq_user, preq->rq_host);

        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

        /* the mutex was just released by hand; keep job_mutex from
         * unlocking it a second time on return */
        job_mutex.set_unlock_on_exit(false);
        pjob = NULL;
        reply_ack(preq);
        }
      else
        job_mutex.set_unlock_on_exit(false);

      process_hold_reply(dup_req);
      }
    }
#ifdef ENABLE_BLCR
  else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * This system is configured with BLCR checkpointing to be used,
     * but this Running job does not have checkpointing enabled,
     * so we reject the request
     */

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    req_reject(PBSE_IVALREQ, 0, preq, NULL,
      "job not held since checkpointing is expected but not enabled for job");
    }
#endif
  else
    {
    /* everything went well, may need to update the job state */
    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    if (old_hold != *hold_val)
      {
      /* indicate attributes changed     */
      pjob->ji_modified = 1;

      svr_evaljobstate(*pjob, newstate, newsub, 0);

      svr_setjobstate(pjob, newstate, newsub, FALSE);
      }

    reply_ack(preq);
    }

  return(PBSE_NONE);
  }  /* END req_holdjob() */
Exemple #18
0
/**
 * @brief
 * 		req_holdjob - service the Hold Job request
 * @par
 * 		Validates the job and the requested hold types, adds the holds to the
 * 		job's hold attribute and records a "Job held by ..." comment on the
 * 		job.  A running job (not in PRERUN) whose chkpnt attribute string is
 * 		set and does not begin with 'n' is handed to MOM through
 * 		relay_to_mom(); any other job has its state re-evaluated here and
 * 		the request acknowledged.
 *
 * @param[in]	preq	-	the hold request; rejected through req_reject() on
 *				failure
 *
 * @return	none.
 */
void
req_holdjob(struct batch_request *preq)
{
	long		*pholds;
	int		 jobtype;	/* job type */
	int		 next_state;
	int		 next_substate;
	long		 prev_holds;
	job		*pjob;
	char		*hold_str;
	int		 rc;
	int		 ask_mom;
	char		 stamp[32];
	time_t		 tnow;

	pjob = chk_job_request(preq->rq_ind.rq_hold.rq_orig.rq_objname, preq, &jobtype);
	if (pjob == (job *)0)
		return;

	/* only regular jobs and whole array jobs may be held */
	if ((jobtype != IS_ARRAY_NO) && (jobtype != IS_ARRAY_ArrayJob)) {
		req_reject(PBSE_IVALREQ, 0, preq);
		return;
	}

	/* a job that is being provisioned cannot be held */
	if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
		(pjob->ji_qs.ji_substate == JOB_SUBSTATE_PROVISION)) {
		req_reject(PBSE_BADSTATE, 0, preq);
		return;
	}

	/* cannot do anything until we decode the holds to be set */
	rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &hold_str);
	if (rc != 0) {
		req_reject(rc, 0, preq);
		return;
	}

	/* if other than HOLD_u is being set, must have privil */
	rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm);
	if (rc != 0) {
		req_reject(rc, 0, preq);
		return;
	}

	/* HOLD_bad_password can only be done by root or admin */
#ifdef WIN32
	if ((temphold.at_val.at_long & HOLD_bad_password) &&
		!isAdminPrivilege(preq->rq_user))
#else
	if ((temphold.at_val.at_long & HOLD_bad_password) &&
		strcasecmp(preq->rq_user, PBS_DEFAULT_ADMIN) != 0)
#endif
	{
		req_reject(PBSE_PERM, 0, preq);
		return;
	}

	/* add the requested holds, remembering the previous value */
	pholds = &pjob->ji_wattr[(int)JOB_ATR_hold].at_val.at_long;
	prev_holds = *pholds;
	*pholds |= temphold.at_val.at_long;
	pjob->ji_wattr[(int)JOB_ATR_hold].at_flags |= ATR_VFLAG_SET | ATR_VFLAG_MODCACHE;

	/* Note the hold time in the job comment. */
	tnow = time(NULL);
	(void)strncpy(stamp, (const char *)ctime(&tnow), 24);
	stamp[24] = '\0';	/* keep only the first 24 characters of the ctime() text */
	(void)sprintf(log_buffer, "Job held by %s on %s", preq->rq_user, stamp);
	job_attr_def[(int)JOB_ATR_Comment].at_decode(&pjob->ji_wattr[(int)JOB_ATR_Comment], (char *)0, (char *)0, log_buffer);

	(void)sprintf(log_buffer, msg_jobholdset, hold_str, preq->rq_user,
		preq->rq_host);

	ask_mom = (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
		(pjob->ji_qs.ji_substate != JOB_SUBSTATE_PRERUN) &&
		(pjob->ji_wattr[(int)JOB_ATR_chkpnt].at_val.at_str) &&
		(*pjob->ji_wattr[(int)JOB_ATR_chkpnt].at_val.at_str != 'n');

	if (!ask_mom) {
		/* every thing went well, may need to update the job state */
		log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO,
			pjob->ji_qs.ji_jobid, log_buffer);
		if (prev_holds != *pholds) {
			/* indicate attributes changed */
			pjob->ji_modified = 1;
			svr_evaljobstate(pjob, &next_state, &next_substate, 0);
			(void)svr_setjobstate(pjob, next_state, next_substate);
		}
		reply_ack(preq);
		return;
	}

	/* have MOM attempt checkpointing */
	rc = relay_to_mom(pjob, preq, post_hold);
	if (rc != 0) {
		*pholds = prev_holds;	/* reset to the old value */
		req_reject(rc, 0, preq);
		return;
	}

	pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;
	pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHKPT;
	(void)job_save(pjob, SAVEJOB_QUICK);
	log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO,
		pjob->ji_qs.ji_jobid, log_buffer);
}
Exemple #19
0
/*
 * release_job - releases the hold on job j
 *
 * Decodes the hold types named in the request, checks the requester's
 * privilege, removes those holds from the job's hold attribute and, when the
 * hold value actually changed, re-evaluates and stores the job state.  Jobs
 * whose state is beyond JOB_STATE_RUNNING are left untouched.
 *
 * @param preq - the release request carrying the hold attribute (I)
 * @param j - the job to modify (I/O)
 * @param pa - a pointer to an array whose mutex we hold - always this job's array (I)
 * @return 0 if successful, a PBS error on failure
 */
int release_job(

  struct batch_request *preq, /* I */
  void                 *j,    /* I/O */
  job_array            *pa)   /* I */

  {
  long           old_hold;
  int            rc = PBSE_NONE;
  int            newstate;
  int            newsub;
  char          *pset;
  job           *pjob = (job *)j;
  char           log_buf[LOCAL_LOG_BUF_SIZE];

  pbs_attribute  temphold;

  // this function is meaningless for jobs in exiting or completed
  if (pjob->ji_qs.ji_state > JOB_STATE_RUNNING)
    return(PBSE_NONE);

  /* cannot do anything until we decode the holds to be set */

  if ((rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, (const char **)&pset, &temphold)) != 0)
    {
    return(rc);
    }

  /* if other than HOLD_u is being released, must have privil */

  if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0)
    {
    return(rc);
    }

  /* unset the hold */

  old_hold = pjob->ji_wattr[JOB_ATR_hold].at_val.at_long;

  if ((rc = job_attr_def[JOB_ATR_hold].at_set(&pjob->ji_wattr[JOB_ATR_hold], &temphold, DECR)))
    {
    return(rc);
    }

  if (pjob->ji_arraystructid[0] != '\0')
    {
    // Make sure our slot limit counts are correct
    check_array_slot_limits(pjob, pa);
    }

  /* everything went well, if holds changed, update the job state */

  if (old_hold != pjob->ji_wattr[JOB_ATR_hold].at_val.at_long)
    {
    pjob->ji_modified = 1; /* indicates attributes changed */

    svr_evaljobstate(*pjob, newstate, newsub, 0);

    svr_setjobstate(pjob, newstate, newsub, FALSE); /* saves job */
    }

  /* pset, user and host all come from the request, so bound the write */
  snprintf(log_buf, sizeof(log_buf), msg_jobholdrel,
    pset,
    preq->rq_user,
    preq->rq_host);

  log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

  return(rc);
  } /* END release_job() */
Exemple #20
0
/**
 * @brief
 * 		req_releasejob - service the Release Job request
 * @par
 * 		Validates the job and the hold types named in the request, removes
 * 		those holds from the job's hold attribute and re-evaluates the job
 * 		state.  When no hold remains, the job comment is freed.
 * @par
 * 		In a NAS build (localmod 105) the eligible time is reset and the
 * 		state re-evaluated on every release; otherwise the state is only
 * 		re-evaluated when the hold value actually changed.
 *
 * @param[in]	preq	-	the release request; rejected through req_reject()
 *				on failure, acknowledged on success
 *
 * @return	none.
 */
void
req_releasejob(struct batch_request *preq)
{
	int		 jobtype;	/* job type */
	int		 next_state;
	int		 next_substate;
	long		 prev_holds;
	long		*pholds;
	job		*pjob;
	char		*hold_str;
	int		 rc;
	int		 refresh_state;

	pjob = chk_job_request(preq->rq_ind.rq_release.rq_objname, preq, &jobtype);
	if (pjob == (job *)0)
		return;

	/* only regular jobs and whole array jobs may be released */
	if ((jobtype != IS_ARRAY_NO) && (jobtype != IS_ARRAY_ArrayJob)) {
		req_reject(PBSE_IVALREQ, 0, preq);
		return;
	}

	/* cannot do anything until we decode the holds to be released */
	rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &hold_str);
	if (rc != 0) {
		req_reject(rc, 0, preq);
		return;
	}

	/* if other than HOLD_u is being released, must have privil */
	rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm);
	if (rc != 0) {
		req_reject(rc, 0, preq);
		return;
	}

	/* all ok so far, unset the hold */
	pholds = &pjob->ji_wattr[(int)JOB_ATR_hold].at_val.at_long;
	prev_holds = *pholds;
	rc = job_attr_def[(int)JOB_ATR_hold].at_set(
		&pjob->ji_wattr[(int)JOB_ATR_hold], &temphold, DECR);
	if (rc) {
		req_reject(rc, 0, preq);
		return;
	}

	/* every thing went well, if holds changed, update the job state */
#ifdef NAS /* localmod 105 Always reset etime on release */
	refresh_state = 1;
#else
	refresh_state = (prev_holds != *pholds);
#endif /* localmod 105 */

	if (refresh_state) {
#ifdef NAS /* localmod 105 */
		attribute *etime = &pjob->ji_wattr[(int)JOB_ATR_etime];
		etime->at_val.at_long = time_now;
		etime->at_flags |= ATR_VFLAG_SET|ATR_VFLAG_MODCACHE;
#endif /* localmod 105 */
		pjob->ji_modified = 1;	/* indicates attributes changed    */
		svr_evaljobstate(pjob, &next_state, &next_substate, 0);
		(void)svr_setjobstate(pjob, next_state, next_substate); /* saves job */
	}

	/* no hold left: drop the job comment */
	if (*pholds == 0)
		job_attr_def[(int)JOB_ATR_Comment].at_free(&pjob->ji_wattr[(int)JOB_ATR_Comment]);

	(void)sprintf(log_buffer, msg_jobholdrel, hold_str, preq->rq_user,
		preq->rq_host);
	log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO,
		pjob->ji_qs.ji_jobid, log_buffer);
	reply_ack(preq);
}

/**
 * @brief
 * 		get_hold - search a list of attributes (svrattrl) for the hold-types
 * 		attribute.  This is used by the Hold Job and Release Job requests,
 *		therefore it is an error if the hold-types attribute is not present,
 *		if it appears more than once, or if any other attribute is in the
 *		list.
 *
 *		The hold value is decoded into temphold, which is declared outside
 *		this function.
 *
 * @param[in]	phead	- pbs list head of the request's attribute list.
 * @param[out]	pset	- RETURN - ptr to the hold value string of the list entry
 *
 * @return	error code
 * @retval	PBSE_IVALREQ	- the list does not consist of exactly one
 *				  hold-types entry
 * @retval	other		- whatever the hold attribute's at_decode routine
 *				  returns
 */

static int
get_hold(pbs_list_head *phead, char	 **pset)
{
	struct svrattrl *hold_entry = NULL;
	struct svrattrl *cur;
	int		 nfound = 0;

	for (cur = (struct svrattrl *)GET_NEXT((*phead));
		cur != NULL;
		cur = (struct svrattrl *)GET_NEXT(cur->al_link)) {

		/* anything other than the hold attribute invalidates the request */
		if (strcasecmp(cur->al_name, job_attr_def[(int)JOB_ATR_hold].at_name) != 0)
			return (PBSE_IVALREQ);

		hold_entry = cur;
		*pset = cur->al_value;
		nfound++;
	}

	if (nfound != 1)
		return (PBSE_IVALREQ);

	/* decode into temporary attribute structure */

	clear_attr(&temphold, &job_attr_def[(int)JOB_ATR_hold]);
	return (job_attr_def[(int)JOB_ATR_hold].at_decode(
		&temphold,
		hold_entry->al_name,
		(char *)0,
		hold_entry->al_value));
}
Exemple #21
0
/**
 * update_array_values()
 *
 * Keeps the bookkeeping counters of a job array in step with an event on one
 * of its jobs, then refreshes the array dependency holds and saves the array.
 *
 * @param pa - array to update
 * @param j - the job (passed as void *) that the event happened on
 * @param old_state - the state the job was in before the event
 * @param event - code for what event just happened
 */
void update_array_values(

  job_array            *pa,        /* I */
  void                 *j,         /* I */
  int                   old_state, /* I */
  enum ArrayEventsEnum  event)     /* I */

  {
  job *pjob = (job *)j;
  int  exit_status;

  if (event == aeRun)
    {
    /* count the job as started only on a real transition into running */
    if (old_state != JOB_STATE_RUNNING)
      {
      pa->ai_qs.jobs_running++;
      pa->ai_qs.num_started++;
      }
    }
  else if (event == aeTerminate)
    {
    exit_status = pjob->ji_qs.ji_un.ji_exect.ji_exitstat;

    if ((old_state == JOB_STATE_RUNNING) &&
        (pa->ai_qs.jobs_running > 0))
      {
      pa->ai_qs.jobs_running--;
      }

    /* a zero exit status counts as success, anything else as failure */
    if (exit_status == 0)
      pa->ai_qs.num_successful++;
    else
      pa->ai_qs.num_failed++;

    pa->ai_qs.jobs_done++;

    array_save(pa);

    /* update slot limit hold if necessary; only needed if the job that
     * terminated wasn't itself held by the slot limit */
    if ((server.sv_attr[SRV_ATR_MoabArrayCompatible].at_val.at_long != FALSE) &&
        ((pjob->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) == FALSE))
      {
      int  idx;
      int  next_state;
      int  next_substate;
      job *held;

      /* find the first held job and release its hold */
      for (idx = 0; idx < pa->ai_qs.array_size; idx++)
        {
        held = (job *)pa->jobs[idx];

        if ((held == NULL) ||
            ((held->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) == 0))
          {
          continue;
          }

        held->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;

        /* no hold bits left: the attribute is no longer set */
        if (held->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
          held->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;

        svr_evaljobstate(held, &next_state, &next_substate, 1);
        svr_setjobstate(held, next_state, next_substate);
        job_save(held, SAVEJOB_FULL, 0);

        break;
        }
      }
    }
  /* aeQueue needs nothing yet (NYI); unknown events are ignored as well */

  set_array_depend_holds(pa);
  array_save(pa);

  } /* END update_array_values() */
/*
 * finish_routing_processing - act on the outcome of an attempt to route a job
 *
 * LOCUTION_SUCCESS: staged-in and copied checkpoint files are removed and the
 * server's job structure is purged.
 * LOCUTION_FAIL: a job being deleted is put back to queued/abort; otherwise
 * the destination is marked as bad and routing is retried.
 * Any other status: the owner is mailed and routing is retried.
 *
 * @param pjob - the job that was being routed; nothing happens when NULL
 * @param status - result of the routing attempt
 */
void finish_routing_processing(

  job *pjob,
  int  status)

  {
  int          next_state;
  int          next_substate;

  if (pjob == NULL)
    return;

  if (LOGLEVEL >= 10)
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, pjob->ji_qs.ji_jobid);

  if (status == LOCUTION_SUCCESS)
    {
    /* normal return, job was routed */

    if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn)
      remove_stagein(&pjob);

    /* the cleanup helpers take &pjob and may leave it NULL */
    if (pjob == NULL)
      return;

    if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_COPIED)
      remove_checkpoint(&pjob);

    if (pjob != NULL)
      svr_job_purge(pjob); /* need to remove server job struct */

    return;
    }

  if (status == LOCUTION_FAIL)
    {
    /* permanent rejection (or signal) */

    if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_ABORT)
      {
      /* job delete in progress, just set to queued status */
      svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_ABORT, FALSE);

      svr_mailowner(pjob, 'a', TRUE, "Couldn't route job to remote server");

      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

      return;
      }

    add_dest(pjob);  /* else mark destination as bad */
    }

  /* failed or unknown status: try routing again */

  svr_mailowner(pjob, 'a', TRUE, "Couldn't route job to remote server");

  /* force re-eval of job state out of Transit */

  svr_evaljobstate(*pjob, next_state, next_substate, 1);
  svr_setjobstate(pjob, next_state, next_substate, FALSE);

  status = job_route(pjob);

  if (status == PBSE_ROUTEREJ)
    job_abt(&pjob, pbse_to_txt(PBSE_ROUTEREJ));
  else if (status != 0)
    job_abt(&pjob, msg_routexceed);
  else
    unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);

  return;
  } /* END finish_routing_processing() */
Exemple #23
0
/*
 * modify_job - apply the attribute changes of a modify request to a job
 *
 * For a running job every attribute in plist must be alterable while running
 * (ATR_DFLAG_ALTRUN); a resource change on a running job is also relayed to
 * the MOM.  For CHK_HOLD / CHK_CONT on a job in substate RUNNING the
 * checkpoint files are requested from the MOM.
 *
 * @param j - address of the job pointer; set to NULL when modify_job_attr
 *            reports PBSE_JOBNOTFOUND (O)
 * @param plist - attribute list checked against the running-job rules (I)
 * @param preq - the modify request; its rq_attr list is what gets applied (I)
 * @param checkpoint_req - CHK_HOLD or CHK_CONT to request checkpoint handling (I)
 * @param flag - NO_MOM_RELAY when the caller relays to the MOM itself (I)
 * @return PBSE_NONE on success, PBSE_RELAYED_TO_MOM when a resource change on
 *         a running job has to go to the MOM, otherwise a PBS error code
 */
int modify_job(

  void                 **j,               /* O */
  svrattrl              *plist,           /* I */
  struct batch_request  *preq,            /* I */
  int                    checkpoint_req,  /* I */
  int                    flag)            /* I */

  {
  int   bad = 0;
  int   i;
  int   newstate;
  int   newsubstate;
  resource_def *prsd;
  int   rc;
  int   sendmom = 0;
  int   copy_checkpoint_files = FALSE;

  char  log_buf[LOCAL_LOG_BUF_SIZE];
  struct batch_request *dup_req = NULL;

  job *pjob = (job *)*j;
  
  if (pjob == NULL)
    {
    snprintf(log_buf, sizeof(log_buf), "job structure is NULL");
    log_err(PBSE_IVALREQ, __func__, log_buf);
    return(PBSE_IVALREQ);
    }

  /* cannot be in exiting or transit, exiting has already been checked */

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /* FAILURE */
    snprintf(log_buf,sizeof(log_buf),
      "Cannot modify job '%s' in transit\n",
      pjob->ji_qs.ji_jobid);

    log_err(PBSE_BADSTATE, __func__, log_buf);

    return(PBSE_BADSTATE);
    }

  if (((checkpoint_req == CHK_HOLD) || (checkpoint_req == CHK_CONT)) &&
      (pjob->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING))
    {
    /* May need to request copy of the checkpoint file from mom */

    copy_checkpoint_files = TRUE;

    if (checkpoint_req == CHK_HOLD)
      {

      snprintf(log_buf, sizeof(log_buf),
        "setting jobsubstate for %s to RERUN\n", pjob->ji_qs.ji_jobid);

      pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

      job_save(pjob, SAVEJOB_QUICK, 0);

      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

      /* remove checkpoint restart file if there is one */
      
      if (pjob->ji_wattr[JOB_ATR_restart_name].at_flags & ATR_VFLAG_SET)
        {
        cleanup_restart_file(pjob);
        }

      }
    }

  /* if job is running, special checks must be made */

  /* NOTE:  must determine if job exists down at MOM - this will occur if
            job is running, job is held, or job was held and just barely
            released (ie qhold/qrls) */

  /* COMMENTED OUT BY JOSH B IN 2.3 DUE TO MAJOR PROBLEMS w/ CUSTOMERS
   * --FIX and uncomment once we know what is really going on.
   *
   * We now know that ji_destin gets set on a qmove and that the mom does not
   * have the job at that point.
   *
  if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) ||
     ((pjob->ji_qs.ji_state == JOB_STATE_HELD) && (pjob->ji_qs.ji_destin[0] != '\0')) ||
     ((pjob->ji_qs.ji_state == JOB_STATE_QUEUED) && (pjob->ji_qs.ji_destin[0] != '\0')))
  */
  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    while (plist != NULL)
      {
      /* is the pbs_attribute modifiable in RUN state ? */

      i = find_attr(job_attr_def, plist->al_name, JOB_ATR_LAST);

      if ((i < 0) ||
          ((job_attr_def[i].at_flags & ATR_DFLAG_ALTRUN) == 0))
        {
        /* FAILURE */
        snprintf(log_buf,sizeof(log_buf),
          "Cannot modify attribute '%s' while running\n",
          plist->al_name);
        log_err(PBSE_MODATRRUN, __func__, log_buf);

        return PBSE_MODATRRUN;
        }

      /* NOTE:  only explicitly specified job attributes are routed down to MOM */

      if (i == JOB_ATR_resource)
        {
        /* is the specified resource modifiable while */
        /* the job is running                         */

        prsd = find_resc_def(svr_resc_def, plist->al_resc, svr_resc_size);

        if (prsd == NULL)
          {
          /* FAILURE */
          snprintf(log_buf,sizeof(log_buf),
            "Unknown attribute '%s'\n",
            plist->al_name);

          log_err(PBSE_UNKRESC, __func__, log_buf);

          return(PBSE_UNKRESC);
          }

        if ((prsd->rs_flags & ATR_DFLAG_ALTRUN) == 0)
          {
          /* FAILURE */
          snprintf(log_buf,sizeof(log_buf),
            "Cannot modify attribute '%s' while running\n",
            plist->al_name);
          log_err(PBSE_MODATRRUN, __func__, log_buf);

          return(PBSE_MODATRRUN);
          }

        sendmom = 1;
        }
/*
        else if ((i == JOB_ATR_checkpoint_name) || (i == JOB_ATR_variables))
        {
        sendmom = 1;
        }
*/

      plist = (svrattrl *)GET_NEXT(plist->al_link);
      }
    }    /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* modify the job's attributes, always starting from the request's own list */

  plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr);

  rc = modify_job_attr(pjob, plist, preq->rq_perm, &bad);

  if (rc)
    {
    /* FAILURE */
    snprintf(log_buf,sizeof(log_buf),
      "Cannot set attributes for job '%s'\n",
      pjob->ji_qs.ji_jobid);
    log_err(rc, __func__, log_buf);

    /* tell the caller that the job is gone */
    if (rc == PBSE_JOBNOTFOUND)
      *j = NULL;

    return(rc);
    }

  /* Reset any defaults resource limit which might have been unset */

  set_resc_deflt(pjob, NULL, FALSE);

  /* if job is not running, may need to change its state */

  if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
    svr_evaljobstate(pjob, &newstate, &newsubstate, 0);

    svr_setjobstate(pjob, newstate, newsubstate, FALSE);
    }
  else
    {
    job_save(pjob, SAVEJOB_FULL, 0);
    }

  snprintf(log_buf, sizeof(log_buf), msg_manager, msg_jobmod, preq->rq_user, preq->rq_host);

  log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

  /* if a resource limit changed for a running job, send to MOM */

  if (sendmom)
    {
    /* if the NO_MOM_RELAY flag is set the calling function will call
       relay_to_mom so we do not need to do it here */
    if (flag != NO_MOM_RELAY)
      {
      /* The last number is unused unless this is an array */
      if ((rc = copy_batchrequest(&dup_req, preq, 0, -1)) != 0)
        {
        /* nothing was sent to the MOM: report the failure rather than
         * claiming that the request was relayed */
        snprintf(log_buf,sizeof(log_buf),
          "Unable to copy batch request for job '%s'\n",
          pjob->ji_qs.ji_jobid);

        log_err(rc, __func__, log_buf);

        return(rc);
        }

      /* NOTE(review): dup_req is said to be freed in relay_to_mom (failure)
       * or in issue_Drequest (success) - neither is visible here */
      if ((rc = relay_to_mom(&pjob, dup_req, post_modify_req)))
        {
        if (pjob != NULL)
          {
          snprintf(log_buf,sizeof(log_buf),
            "Unable to relay information to mom for job '%s'\n",
            pjob->ji_qs.ji_jobid);
          
          log_err(rc, __func__, log_buf);
          }

        return(rc); /* unable to get to MOM */
        }
      }

    return(PBSE_RELAYED_TO_MOM);
    }

  if (copy_checkpoint_files)
    {
    struct batch_request *momreq = 0;
    momreq = cpy_checkpoint(momreq, pjob, JOB_ATR_checkpoint_name, CKPT_DIR_OUT);

    if (momreq != NULL)
      {
      /* have files to copy */
      momreq->rq_extra = strdup(pjob->ji_qs.ji_jobid);

      /* NOTE(review): momreq is said to be freed in relay_to_mom (failure)
       * or in issue_Drequest (success) - neither is visible here */
      if (checkpoint_req == CHK_HOLD)
        {
        rc = relay_to_mom(&pjob, momreq, chkpt_xfr_hold);
        }
      else
        {
        rc = relay_to_mom(&pjob, momreq, chkpt_xfr_done);
        }

      if (rc != 0)
        {
        if (pjob != NULL)
          {
          snprintf(log_buf,sizeof(log_buf),
            "Unable to relay information to mom for job '%s'\n",
            pjob->ji_qs.ji_jobid);
          
          log_err(rc, __func__, log_buf);
          }

        /* the relay failure is only logged; PBSE_NONE is still returned */
        return(PBSE_NONE);
        }
      }
    else
      {
      log_err(-1, __func__, "Failed to get batch request");
      }
    }

  return(PBSE_NONE);
  } /* END modify_job() */
Exemple #24
0
/*
 * finalize_rerunjob - complete a rerun (requeue) request once the result of
 * the attempt is known
 *
 * rc == -1 or 0 is treated as success (0 also puts the job into substate
 * RERUN).  PBSE_SYSTEM is rejected as a memory allocation failure.  Any other
 * value rejects the request unless it carries the RERUNFORCE extension, in
 * which case the job is forcibly requeued.
 *
 * @param preq - the rerun request being answered (I)
 * @param pjob - the job being rerun (I/O)
 * @param rc - result of the rerun attempt (I)
 * @return PBSE_BAD_PARAMETER when pjob is NULL, PBSE_MEM_MALLOC or
 *         PBSE_MOMREJECT when the request is rejected, otherwise rc unchanged
 */
int finalize_rerunjob(
    
  batch_request *preq,
  job           *pjob,
  int            rc)

  {
  int       Force;
  char      log_buf[LOCAL_LOG_BUF_SIZE];

  if (pjob == NULL)
    return(PBSE_BAD_PARAMETER);

  /* releases the job's mutex when this function returns */
  mutex_mgr pjob_mutex = mutex_mgr(pjob->ji_mutex, true);

  if (preq->rq_extend && !strncasecmp(preq->rq_extend, RERUNFORCE, strlen(RERUNFORCE)))
    Force = 1;
  else
    Force = 0;

  switch (rc)
    {

    case -1:

      /* completed job was requeued */

      /* clear out job completion time if there is one */
      break;

    case 0:

      /* requeue request successful */

      pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

      break;

    case PBSE_SYSTEM: /* This may not be accurate...*/
      rc = PBSE_MEM_MALLOC;
      snprintf(log_buf, sizeof(log_buf), "Can not allocate memory");
      req_reject(rc, 0, preq, NULL, log_buf);
      return rc;

    default:

      if (Force == 0)
        {
        rc = PBSE_MOMREJECT;
        snprintf(log_buf, sizeof(log_buf), "Rejected by mom");
        req_reject(rc, 0, preq, NULL, log_buf);
        return rc;
        }
      else
        {
        int           newstate;
        int           newsubst;
        unsigned int  dummy;
        char         *tmp;
        size_t        used;

        if ((cray_enabled == true) &&
            (pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str != NULL))
          tmp = parse_servername(pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str, &dummy);
        else
          tmp = parse_servername(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, &dummy);

        /* Cannot communicate with MOM, forcibly requeue job.
           This is a relatively disgusting thing to do */

        /* tmp is not known to be non-NULL here, and %s must never be
           handed a NULL pointer */
        snprintf(log_buf, sizeof(log_buf),
          "rerun req to %s failed (rc=%d), forcibly requeueing job",
          (tmp != NULL) ? tmp : "(unknown)", rc);

        free(tmp);

        log_event(
          PBSEVENT_ERROR | PBSEVENT_ADMIN | PBSEVENT_JOB,
          PBS_EVENTCLASS_JOB,
          pjob->ji_qs.ji_jobid,
          log_buf);

        log_err(-1, __func__, log_buf);

        /* append the mail-only suffix without running past log_buf */
        used = strlen(log_buf);
        snprintf(log_buf + used, sizeof(log_buf) - used, "%s",
          ", previous output files may be lost");

        svr_mailowner(pjob, MAIL_OTHER, MAIL_FORCE, log_buf);

        svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_RERUN3, FALSE);

        rel_resc(pjob); /* free resc assigned to job */

        pjob->ji_modified = 1;    /* force full job save */

        pjob->ji_momhandle = -1;
        pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_StagedIn;

        svr_evaljobstate(*pjob, newstate, newsubst, 0);
        svr_setjobstate(pjob, newstate, newsubst, FALSE);
        }

      break;
    }  /* END switch (rc) */

  /* the job has run: clear the checkpoint flags and remember that it ran */
  pjob->ji_qs.ji_svrflags = (pjob->ji_qs.ji_svrflags &
      ~(JOB_SVFLG_CHECKPOINT_FILE |JOB_SVFLG_CHECKPOINT_MIGRATEABLE |
        JOB_SVFLG_CHECKPOINT_COPIED)) | JOB_SVFLG_HASRUN;

  snprintf(log_buf, sizeof(log_buf), msg_manager, msg_jobrerun, preq->rq_user, preq->rq_host);
  log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

  reply_ack(preq);

  /* note in accounting file */
  account_record(PBS_ACCT_RERUN, pjob, NULL);

  return rc;
  }  /* END finalize_rerunjob() */
static void post_routejob(

  struct work_task *pwt)

  {
  int  newstate;
  int  newsub;
  int  r;
  int  stat = pwt->wt_aux;
  char *id = "post_routejob";
  job *jobp = (job *)pwt->wt_parm1;

  if (WIFEXITED(stat))
    {
    r = WEXITSTATUS(stat);
    }
  else
    {
    r = 2;

    sprintf(log_buffer, msg_badexit,
            stat);

    strcat(log_buffer, id);

    log_event(
      PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      jobp->ji_qs.ji_jobid,
      log_buffer);
    }

  switch (r)
    {
    case 0:  /* normal return, job was routed */

      if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn)
        remove_stagein(jobp);

      if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_COPIED)
        remove_checkpoint(jobp);

      job_purge(jobp); /* need to remove server job struct */

      return;

      /*NOTREACHED*/

      break;

    case 1:  /* permanent rejection (or signal) */

      if (jobp->ji_qs.ji_substate == JOB_SUBSTATE_ABORT)
        {
        /* job delete in progress, just set to queued status */

        svr_setjobstate(jobp, JOB_STATE_QUEUED, JOB_SUBSTATE_ABORT);

        return;
        }

      add_dest(jobp);  /* else mark destination as bad */

      /* fall through */

    default : /* try routing again */

      /* force re-eval of job state out of Transit */

      svr_evaljobstate(jobp, &newstate, &newsub, 1);
      svr_setjobstate(jobp, newstate, newsub);

      if ((r = job_route(jobp)) == PBSE_ROUTEREJ)
        job_abt(&jobp, pbse_to_txt(PBSE_ROUTEREJ));
      else if (r != 0)
        job_abt(&jobp, msg_routexceed);

      break;
    }  /* END switch (r) */

  return;
  }  /* END post_routejob() */
Exemple #26
0
static void process_hold_reply(

  struct work_task *pwt)
  {
  job       *pjob;

  struct batch_request *preq;
  int   newstate;
  int   newsub;
  attribute temphold;
  char *pset;
  int rc;

  svr_disconnect(pwt->wt_event); /* close connection to MOM */

  preq = pwt->wt_parm1;
  preq->rq_conn = preq->rq_orgconn;  /* restore client socket */

  if ((pjob = find_job(preq->rq_ind.rq_hold.rq_orig.rq_objname)) == (job *)0)
    {
    LOG_EVENT(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB,
              preq->rq_ind.rq_hold.rq_orig.rq_objname,
              msg_postmomnojob);
    req_reject(PBSE_UNKJOBID, 0, preq, NULL, msg_postmomnojob);
    }
  else if (preq->rq_reply.brp_code != 0)
    {

    rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &pset, &temphold);

    if (rc == 0)
      {
      rc = job_attr_def[(int)JOB_ATR_hold].at_set(&pjob->ji_wattr[(int)JOB_ATR_hold],
           &temphold, DECR);
      }

    pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING;  /* reset it */

    pjob->ji_modified = 1;    /* indicate attributes changed */
    svr_evaljobstate(pjob, &newstate, &newsub, 0);
    svr_setjobstate(pjob, newstate, newsub); /* saves job */

    if (preq->rq_reply.brp_code != PBSE_NOSUP)
      {
      sprintf(log_buffer, msg_mombadhold, preq->rq_reply.brp_code);
      LOG_EVENT(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB,
                pjob->ji_qs.ji_jobid, log_buffer);
      req_reject(preq->rq_reply.brp_code, 0, preq, NULL, log_buffer);
      }
    else
      {
      reply_ack(preq);
      }
    }
  else
    {
    /* record that MOM has a checkpoint file */

    /* PBS_CHECKPOINT_MIGRATEABLE is defined as zero therefore this code will never fire.
     * And if these flags are not set, start_exec will not try to run the job from
     * the checkpoint image file.
     */

    pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHECKPOINT_FILE;

    if (preq->rq_reply.brp_auxcode)  /* checkpoint can be moved */
      {
      pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHECKPOINT_FILE;
      pjob->ji_qs.ji_svrflags |=  JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_MIGRATEABLE;
      }

    pjob->ji_modified = 1;    /* indicate attributes changed     */

    svr_evaljobstate(pjob, &newstate, &newsub, 0);
    svr_setjobstate(pjob, newstate, newsub); /* saves job */

    account_record(PBS_ACCT_CHKPNT, pjob, "Checkpointed and held"); /* note in accounting file */
    reply_ack(preq);
    }
  }
/*
 * finish_moving_processing - complete a MoveJob request once the outcome of
 * the move is known.
 *
 * @param pjob   - the job that was being moved; nothing is done if NULL
 * @param req    - the originating request, must be of type PBS_BATCH_MoveJob
 * @param status - LOCUTION_SUCCESS if the job was moved, anything else is
 *                 treated as a failed move
 *
 * On success the staged-in files and copied checkpoint are removed (when
 * the matching server flags are set), the server's copy of the job is
 * purged and the request is acknowledged.  On failure the job state is
 * re-evaluated, the job mutex is released and the request is rejected with
 * PBSE_ROUTEREJ.
 */
void finish_moving_processing(

  job                  *pjob,
  struct batch_request *req,
  int                   status)

  {
  char    log_buf[LOCAL_LOG_BUF_SIZE];
  size_t  used;
  int     newstate;
  int     newsub;

  if (req->rq_type != PBS_BATCH_MoveJob)
    {
    sprintf(log_buf, "bad request type %d\n", req->rq_type);
    log_err(-1, __func__, log_buf);

    return;
    }

  if (pjob == NULL)
    return;

  if (status != LOCUTION_SUCCESS)
    {
    /* move failed: force re-eval of job state out of Transit.
     * pjob is known to be non-NULL here (checked above). */
    svr_evaljobstate(*pjob, newstate, newsub, 1);
    svr_setjobstate(pjob, newstate, newsub, FALSE);

    unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

    req_reject(PBSE_ROUTEREJ, 0, req, NULL, NULL);

    return;
    }

  /* move succeeded: clean up, then purge the server's job structure.
   * The remove_* helpers receive &pjob, so pjob is re-checked for NULL
   * after each of them. */
  if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn)
    remove_stagein(&pjob);

  if ((pjob != NULL) &&
      (pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_COPIED))
    remove_checkpoint(&pjob);

  /* NOTE(review): the message below is assembled in log_buf but is not
   * passed to any logging call in this function. */
  snprintf(log_buf, sizeof(log_buf), "%s", msg_movejob);
  used = strlen(log_buf);
  snprintf(log_buf + used, sizeof(log_buf) - used, msg_manager,
    req->rq_ind.rq_move.rq_destin, req->rq_user, req->rq_host);

  if (pjob != NULL)
    svr_job_purge(pjob);

  reply_ack(req);
  } /* END finish_moving_processing() */
Exemple #28
0
/**
 * @brief
 * 		post_movejob - clean up action for child started in net_move/send_job
 *		   to "move" a job to another server
 * @par
 * 		If the move was successful, delete the server's copy of the job
 * 		structure (or keep it as history when the server is configured to
 * 		keep job history) and reply to the request.
 * @par
 * 		If the move didn't work, re-evaluate the job state and reject the
 * 		request.
 * @par
 * 		The job is looked up again by the id stored in the request.  If it
 * 		cannot be found this is logged and every job-specific step is
 * 		skipped, so the request is still answered without dereferencing a
 * 		NULL job pointer.
 *
 * @param[in]	pwt	-	work task structure; wt_parm1 is the batch request,
 *				wt_parm2 the job pointer the lookup is compared
 *				against, wt_aux the child's wait status
 *
 * @return	none.
 */
static void
post_movejob(struct work_task *pwt)
{
	struct batch_request *req;
	int	newstate;
	int	newsub;
	int	stat;
	int	r;
	job	*jobp;

	req  = (struct batch_request *)pwt->wt_parm1;
	stat = pwt->wt_aux;
	pbs_errno = PBSE_NONE;
	if (req->rq_type != PBS_BATCH_MoveJob) {
		sprintf(log_buffer, "bad request type %d", req->rq_type);
		log_err(-1, __func__, log_buffer);
		return;
	}

	jobp = find_job(req->rq_ind.rq_move.rq_jid);
	if ((jobp == NULL) || (jobp != (job *)pwt->wt_parm2)) {
		/* logged only; jobp may be NULL from here on and is checked before use */
		sprintf(log_buffer,
			"job %s not found",
			req->rq_ind.rq_move.rq_jid);
		log_err(-1, __func__, log_buffer);

	}

	if (WIFEXITED(stat)) {
		r = WEXITSTATUS(stat);
		if (r == SEND_JOB_OK) {	/* purge server's job structure */
			if ((jobp != NULL) &&
				(jobp->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn))
				remove_stagein(jobp);
			(void)strcpy(log_buffer, msg_movejob);
			(void)sprintf(log_buffer+strlen(log_buffer),
				msg_manager,
				req->rq_ind.rq_move.rq_destin,
				req->rq_user, req->rq_host);
			/*
			 * If server is configured to keep job history info and
			 * the job is created here, then keep the job structure
			 * for history purpose without purging. No need to check
			 * for sub-jobs as sub jobs can't be moved.
			 */
			if (jobp != NULL) {
				if (svr_chk_history_conf())
					svr_setjob_histinfo(jobp, T_MOV_JOB);
				else
					job_purge(jobp);
			}
		} else
			r = PBSE_ROUTEREJ;
	} else {
		r = PBSE_SYSTEM;
		(void)sprintf(log_buffer, msg_badexit, stat);
		(void)strcat(log_buffer, __func__);
		/* fall back to the id from the request when the job is gone */
		log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, LOG_NOTICE,
			(jobp != NULL) ? jobp->ji_qs.ji_jobid :
				req->rq_ind.rq_move.rq_jid,
			log_buffer);
	}

	if (r) {
		if (jobp) {
			/* force re-eval of job state out of Transit */
			svr_evaljobstate(jobp, &newstate, &newsub, 1);
			svr_setjobstate(jobp, newstate, newsub);
		}
		req_reject(r, 0, req);
	} else
		reply_ack(req);

	return;
}
Exemple #29
0
/*
 * execute_job_delete - carry out a delete request against one job.
 *
 * @param pjob - (M) the job to delete.  The job mutex is released on the
 *               exit paths where pjob is still valid - presumably the
 *               caller passes the job locked; confirm against callers.
 * @param Msg  - (I) optional text from the request extension; appended to
 *               the mail/log message, may be NULL
 * @param preq - (I) the delete request
 *
 * @return ROUTE_DELETE if the job is in PRERUN/RERUN* and the delete was
 *         re-queued as a timed task;
 *         -1 if permission checking cleared the job, the job is in transit,
 *         the retry task could not be created, the job is running (the
 *         request is then finished when MOM replies, or was rejected), or
 *         the job disappeared while its array was being fetched;
 *         PBSE_NONE otherwise.
 *
 * All text built from request data is written into log_buf with bounded
 * formatting, because Msg comes from the client and log_buf is a fixed
 * size stack buffer.
 */
int execute_job_delete(

  job                  *pjob,            /* M */
  char                 *Msg,             /* I */
  struct batch_request *preq)            /* I */

  {
  struct work_task *pwtnew;

  int               rc;
  char             *sigt = "SIGTERM";

  int               has_mutex = TRUE;
  char              log_buf[LOCAL_LOG_BUF_SIZE];
  time_t            time_now = time(NULL);
  long              force_cancel = FALSE;
  long              array_compatible = FALSE;

  chk_job_req_permissions(&pjob,preq);

  if (pjob == NULL)
    {
    /* preq is rejected in chk_job_req_permissions here */
    return(-1);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /* see note in req_delete - not sure this is possible still,
     * but the deleted code is irrelevant now. I will leave this
     * part --dbeer */
    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

    return(-1);
    }

  if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN1 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN2 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN3 )
    {
    /* If JOB_SUBSTATE_PRERUN being sent to MOM, wait till she gets it going */
    /* retry in one second                            */
    /* If JOB_SUBSTATE_RERUN, RERUN1, RERUN2 or RERUN3 the
       job is being requeued. Wait until finished */

    /* a single remembered job id / timestamp, shared by all calls */
    static time_t  cycle_check_when = 0;
    static char    cycle_check_jid[PBS_MAXSVRJOBID + 1];

    if (cycle_check_when != 0)
      {
      if (!strcmp(pjob->ji_qs.ji_jobid, cycle_check_jid) &&
          (time_now - cycle_check_when > 10))
        {
        /* state not updated after 10 seconds */

        /* did the mom ever get it? delete it anyways... */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;

        goto jump;
        }

      if (time_now - cycle_check_when > 20)
        {
        /* give up after 20 seconds */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;
        }
      }    /* END if (cycle_check_when != 0) */

    if (cycle_check_when == 0)
      {
      /* new PRERUN job located */

      cycle_check_when = time_now;
      strcpy(cycle_check_jid, pjob->ji_qs.ji_jobid);
      }

    sprintf(log_buf, "job cannot be deleted, state=PRERUN, requeuing delete request");

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    /* try the delete again in one second */
    pwtnew = set_task(WORK_Timed,time_now + 1,post_delete_route,preq,FALSE);
    
    unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);

    if (pwtnew == NULL)
      {
      req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

      return(-1);
      }
    else
      {
      return(ROUTE_DELETE);
      }
    }  /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */

jump:

  /*
   * Log delete and if requesting client is not job owner, send mail.
   */

  snprintf(log_buf, sizeof(log_buf), "requestor=%s@%s", preq->rq_user, preq->rq_host);


  /* NOTE:  should annotate accounting record with extend message (NYI) */
  account_record(PBS_ACCT_DEL, pjob, log_buf);

  snprintf(log_buf, sizeof(log_buf), msg_manager, msg_deletejob, preq->rq_user, preq->rq_host);

  log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

  /* NOTE:  should incorporate job delete message */

  if (Msg != NULL)
    {
    /* have text message in request extension, add it.  The append is
     * bounded by the space left in log_buf; an over-long message is
     * truncated instead of overrunning the buffer. */
    size_t used = strlen(log_buf);

    snprintf(log_buf + used, sizeof(log_buf) - used, "\n%s", Msg);
    }

  if ((svr_chk_owner(preq, pjob) != 0) &&
      (pjob->ji_has_delete_nanny == FALSE))
    {
    /* only send email if owner did not delete job and job deleted
       has not been previously attempted */

    svr_mailowner(pjob, MAIL_DEL, MAIL_FORCE, log_buf);
    /*
     * If we sent mail and already sent the extra message
     * then reset message so we don't trigger a redundant email
     * in job_abt()
    */

    if (Msg != NULL)
      {
      Msg = NULL;
      }
    }

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, change restart comment if failed */

    change_restart_comment_if_needed(pjob);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * setup a nanny task to make sure the job is actually deleted (see the
     * comments at job_delete_nanny()).
     */

    if (pjob->ji_has_delete_nanny == TRUE)
      {
      unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

      req_reject(PBSE_IVALREQ, 0, preq, NULL, "job cancel in progress");

      return(-1);
      }

    apply_job_delete_nanny(pjob, time_now + 60);

    /*
     * Send signal request to MOM.  The server will automagically
     * pick up and "finish" off the client request when MOM replies.
     */
    get_batch_request_id(preq);

    if ((rc = issue_signal(&pjob, sigt, post_delete_mom1, strdup(preq->rq_id))))
      {
      /* cant send to MOM */

      req_reject(rc, 0, preq, NULL, NULL);
      }

    /* normally will ack reply when mom responds */
    if (pjob != NULL)
      {
      sprintf(log_buf, msg_delrunjobsig, sigt);
      log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);
  
      unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
      }

    return(-1);
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* make a cleanup task if set */
  get_svr_attr_l(SRV_ATR_JobForceCancelTime, &force_cancel);
  if (force_cancel > 0)
    {
    char *dup_jobid = strdup(pjob->ji_qs.ji_jobid);
 
    set_task(WORK_Timed, time_now + force_cancel, ensure_deleted, dup_jobid, FALSE);    
    }

  /* if configured, and this job didn't have a slot limit hold, free a job
   * held with the slot limit hold */
  get_svr_attr_l(SRV_ATR_MoabArrayCompatible, &array_compatible);
  if ((array_compatible != FALSE) &&
      ((pjob->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) == FALSE))
    {
    if ((pjob->ji_arraystruct != NULL) &&
        (pjob->ji_is_array_template == FALSE))
      {
      int        i;
      int        newstate;
      int        newsub;
      job       *tmp;
      job_array *pa = get_jobs_array(&pjob);

      if (pjob == NULL)
        return(-1);

      /* defensive: do not touch the array (or its mutex) if none was
       * returned */
      if (pa != NULL)
        {
        for (i = 0; i < pa->ai_qs.array_size; i++)
          {
          if (pa->job_ids[i] == NULL)
            continue;

          if (!strcmp(pa->job_ids[i], pjob->ji_qs.ji_jobid))
            continue;

          if ((tmp = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
            {
            /* sibling no longer exists, forget its id */
            free(pa->job_ids[i]);
            pa->job_ids[i] = NULL;
            }
          else
            {
            if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l)
              {
              /* release the slot limit hold on exactly one sibling */
              tmp->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;
              
              if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
                {
                tmp->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
                }
              
              svr_evaljobstate(tmp, &newstate, &newsub, 1);
              svr_setjobstate(tmp, newstate, newsub, FALSE);
              job_save(tmp, SAVEJOB_FULL, 0);

              unlock_ji_mutex(tmp, __func__, "5", LOGLEVEL);
              
              break;
              }

            unlock_ji_mutex(tmp, __func__, "6", LOGLEVEL);
            }
          }

        if (LOGLEVEL >= 7)
          {
          sprintf(log_buf, "%s: unlocking ai_mutex", __func__);
          log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
          }
        pthread_mutex_unlock(pa->ai_mutex);
        }
      }
    } /* END MoabArrayCompatible check */

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, do end job processing */
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE);

    /* force new connection */
    pjob->ji_momhandle = -1;

    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "calling on_job_exit from %s", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      }

    set_task(WORK_Immed, 0, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in file, should remove them */

    remove_stagein(&pjob);

    if (pjob != NULL)
      job_abt(&pjob, Msg);

    /* the job is not unlocked by this function on this path */
    has_mutex = FALSE;
    }
  else
    {
    /*
     * the job is not transitting (though it may have been) and
     * is not running, so put in into a complete state.
     */
    struct pbs_queue *pque;
    int  KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE, FALSE);

    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      pque->qu_numcompleted++;

      unlock_queue(pque, __func__, NULL, LOGLEVEL);
      
      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "calling on_job_exit from %s", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }
    
      pthread_mutex_lock(server.sv_attr_mutex);
      KeepSeconds = attr_ifelse_long(
                    &pque->qu_attr[QE_ATR_KeepCompleted],
                    &server.sv_attr[SRV_ATR_KeepCompleted],
                    0);
      pthread_mutex_unlock(server.sv_attr_mutex);
      }
    else
      KeepSeconds = 0;

    if (pjob != NULL)
      {
      /* run the exit processing once the keep-completed time has passed */
      set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
      }
    else
      has_mutex = FALSE;
    }  /* END else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0) */

  if (has_mutex == TRUE)
    unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL);

  return(PBSE_NONE);
  } /* END execute_job_delete() */
Exemple #30
0
/*
 * req_releasejob - service a Release Job request: take the requested
 * hold(s) off a job.
 *
 * The request is rejected if the hold list cannot be decoded, if the
 * requester lacks the privilege for one of the holds, or if decrementing
 * the job's hold attribute fails.  When the job's hold value actually
 * changed, the job is marked modified and its state is re-evaluated.
 * On success the release is logged and the request acknowledged.
 */
void req_releasejob(

  struct batch_request *preq) /* ptr to the decoded request   */

  {
  job        *pjob;
  attribute  *job_hold;
  attribute   release_hold;
  char       *hold_text;
  long        hold_before = 0;
  int         newstate;
  int         newsub;
  int         rc;

  pjob = chk_job_request(preq->rq_ind.rq_release.rq_objname, preq);

  if (pjob == NULL)
    return;

  job_hold = &pjob->ji_wattr[(int)JOB_ATR_hold];

  /* decode the holds named in the request */
  rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &hold_text, &release_hold);

  /* if other than HOLD_u is being released, must have privil */
  if (rc == 0)
    rc = chk_hold_priv(release_hold.at_val.at_long, preq->rq_perm);

  /* all ok so far, unset the hold (remember the old value first) */
  if (rc == 0)
    {
    hold_before = job_hold->at_val.at_long;

    rc = job_attr_def[(int)JOB_ATR_hold].at_set(job_hold, &release_hold, DECR);
    }

  if (rc != 0)
    {
    /* the first step that failed decides the error sent back */
    req_reject(rc, 0, preq, NULL, NULL);

    return;
    }

  /* if holds changed, update the job state */
  if (job_hold->at_val.at_long != hold_before)
    {
    pjob->ji_modified = 1; /* indicates attributes changed */

    svr_evaljobstate(pjob, &newstate, &newsub, 0);
    svr_setjobstate(pjob, newstate, newsub); /* saves job */
    }

  sprintf(log_buffer, msg_jobholdrel, hold_text, preq->rq_user, preq->rq_host);

  LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid,
            log_buffer);

  reply_ack(preq);
  }  /* END req_releasejob() */