Esempio n. 1
0
/*
 * send_sig_kill
 *
 * Work-task callback: the SIGTERM has been sent and we've waited for the
 * kill_delay, so now send the SIGKILL and, if the signal was issued, mark the
 * job as being rerun.
 *
 * This function consumes the task: pwt, pwt->wt_mutex and the job id string
 * stored in pwt->wt_parm1 are all freed here.
 *
 * @pre-cond: pwt must point to a valid task
 * @pre-cond: pwt->wt_parm1 must point to a valid character string
 *
 * @param pwt - the work task; wt_parm1 carries the heap-allocated job id
 */
void send_sig_kill(
    
  struct work_task *pwt)

  {
  job                  *pjob;
  char                 *job_id = (char *)pwt->wt_parm1;
  static const char    *rerun = "rerun";

  free(pwt->wt_mutex);
  free(pwt);

  if (job_id == NULL)
    return;

  pjob = svr_find_job(job_id, FALSE);

  /* the id string is only needed for the lookup above */
  free(job_id);

  if (pjob == NULL)
    return;

  char *extra = strdup(rerun);

  /* issue_signal() receives &pjob and can reset it to NULL (other callers
   * re-check the pointer afterwards), so only touch the job if it is still
   * valid once the call returns */
  if ((issue_signal(&pjob, "SIGKILL", post_rerun, extra, NULL) == 0) &&
      (pjob != NULL))
    {
    pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

    /* drop the checkpoint-related flags and remember that the job has run */
    pjob->ji_qs.ji_svrflags = (pjob->ji_qs.ji_svrflags &
        ~(JOB_SVFLG_CHECKPOINT_FILE |JOB_SVFLG_CHECKPOINT_MIGRATEABLE |
          JOB_SVFLG_CHECKPOINT_COPIED)) | JOB_SVFLG_HASRUN;
    }

  if (pjob != NULL)
    unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL);
  } /* END send_sig_kill() */
Esempio n. 2
0
static void post_delete_mom2(

  struct work_task *pwt)

  {
  job  *pjob;
  char *sigk = "SIGKILL";

  pjob = (job *)pwt->wt_parm1;

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    issue_signal(pjob, sigk, release_req, 0);

    sprintf(log_buffer, msg_delrunjobsig, sigk);

    LOG_EVENT(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);
    }

  return;
  }  /* END post_delete_mom2() */
Esempio n. 3
0
/*
 * job_delete_nanny
 *
 * Work-task callback that follows up on a job which should already have been
 * deleted. When the JobNanny server attribute is enabled and the job named by
 * pwt->wt_parm1 still exists, a SIGKILL is issued for it and the nanny is
 * re-armed to fire again in 60 seconds.
 *
 * This function consumes the task: pwt, pwt->wt_mutex and the job id string in
 * pwt->wt_parm1 are always freed before returning.
 *
 * @param pwt - the work task; wt_parm1 carries the heap-allocated job id
 *              (NULL is reported as an allocation failure)
 */
static void job_delete_nanny(

  struct work_task *pwt)

  {
  job                  *pjob;
  char                 *sigk = "SIGKILL";
  char                 *jobid;

  struct batch_request *newreq;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  time_t                time_now = time(NULL);
  long                  nanny = FALSE;

  /* short-circuit if nanny isn't enabled: the signalling below only runs when
   * the attribute is set (the test used to be inverted) */
  get_svr_attr_l(SRV_ATR_JobNanny, &nanny);
  if (nanny)
    {
    jobid = (char *)pwt->wt_parm1;
    
    if (jobid != NULL)
      {
      pjob = svr_find_job(jobid, FALSE);
      
      if (pjob != NULL)
        {
        snprintf(log_buf, sizeof(log_buf),
          "exiting job '%s' still exists, sending a SIGKILL", pjob->ji_qs.ji_jobid);
        log_err(-1, "job nanny", log_buf);
        
        /* build up a Signal Job batch request */
        if ((newreq = alloc_br(PBS_BATCH_SignalJob)) != NULL)
          {
          strcpy(newreq->rq_ind.rq_signal.rq_jid, pjob->ji_qs.ji_jobid);
          snprintf(newreq->rq_ind.rq_signal.rq_signame, sizeof(newreq->rq_ind.rq_signal.rq_signame), "%s", sigk);
          }
        
        /* newreq may be NULL here if the allocation failed; it is passed
         * through unchanged, as before */
        issue_signal(&pjob, sigk, post_job_delete_nanny, newreq);
        
        /* issue_signal() may have reset pjob */
        if (pjob != NULL)
          {
          apply_job_delete_nanny(pjob, time_now + 60);
  
          unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
          }
        }
      }
    else
      {
      log_err(ENOMEM, __func__, "Cannot allocate memory");
      }
    }
  
  /* free(NULL) is a no-op, so no guard is needed */
  free(pwt->wt_parm1);

  free(pwt->wt_mutex);
  free(pwt);
  } /* END job_delete_nanny() */
Esempio n. 4
0
/*
 * rerun_or_kill
 *
 * Decides what happens to a job, based on whether it is rerunable and on the
 * server state:
 *   - rerunable:                   SIGKILL it and mark it for requeue
 *   - not rerunable, not SHUTDEL:  log and abort the job
 *   - not rerunable, SHUTDEL:      leave it running
 * In every case a message built from a fixed prefix plus `text` is logged.
 *
 * @param pjob - the job to act on (passed to job_abt() on the abort path)
 * @param text - text appended to the log message
 */
static void rerun_or_kill(

  job  *pjob,  /* I (modified/freed) */
  char *text)  /* I */

  {
  long svr_state = server.sv_attr[(int)SRV_ATR_State].at_val.at_long;
  int  can_rerun = (pjob->ji_wattr[(int)JOB_ATR_rerunable].at_val.at_long != 0);

  if (!can_rerun && (svr_state != SV_STATE_SHUTDEL))
    {
    /* job not rerunable, immediate shutdown - kill it off */
    strcpy(log_buffer, msg_job_abort);
    strcat(log_buffer, text);

    /* the log record has to be written before the job is purged */
    log_event(
      PBSEVENT_SYSTEM | PBSEVENT_JOB | PBSEVENT_DEBUG,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);

    job_abt(&pjob, log_buffer);

    return;
    }

  if (can_rerun)
    {
    /* job is rerunable, mark it to be requeued */
    issue_signal(pjob, "SIGKILL", release_req, 0);

    pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

    strcpy(log_buffer, msg_init_queued);
    strcat(log_buffer, pjob->ji_qhdr->qu_qs.qu_name);
    }
  else
    {
    /* delayed shutdown, leave job running */
    strcpy(log_buffer, msg_leftrunning);
    }

  strcat(log_buffer, text);

  log_event(
    PBSEVENT_SYSTEM | PBSEVENT_JOB | PBSEVENT_DEBUG,
    PBS_EVENTCLASS_JOB,
    pjob->ji_qs.ji_jobid,
    log_buffer);
  }  /* END rerun_or_kill() */
Esempio n. 5
0
void post_delete_mom2(

  struct work_task *pwt)

  {
  char        *jobid;
  const char *sigk = "SIGKILL";
  char         log_buf[LOCAL_LOG_BUF_SIZE];
  job         *pjob;

  jobid = (char *)pwt->wt_parm1;
  free(pwt->wt_mutex);
  free(pwt);
  
  if (jobid == NULL)
    {
    log_err(ENOMEM, __func__, "Cannot allocate memory");
    return;
    }

  pjob = svr_find_job(jobid, FALSE);
  free(jobid);

  if (pjob != NULL)
    {
    mutex_mgr job_mutex(pjob->ji_mutex, true);

    if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
      {
      issue_signal(&pjob, sigk, free_br, NULL);
      
      if (pjob != NULL)
        {
        sprintf(log_buf, msg_delrunjobsig, sigk);
        log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);
        }
      }
    
    if (pjob == NULL)
      job_mutex.set_lock_on_exit(false);
    }
  }  /* END post_delete_mom2() */
Esempio n. 6
0
static void job_delete_nanny(

  struct work_task *pwt)

  {
  job *pjob;
  char *sigk = "SIGKILL";

  struct batch_request *newreq;

  /* short-circuit if nanny isn't enabled */

  if (!server.sv_attr[SRV_ATR_JobNanny].at_val.at_long)
    {
    release_req(pwt);

    return;
    }

  pjob = (job *)pwt->wt_parm1;

  sprintf(log_buffer, "exiting job '%s' still exists, sending a SIGKILL",
          pjob->ji_qs.ji_jobid);

  log_err(-1, "job nanny", log_buffer);

  /* build up a Signal Job batch request */

  if ((newreq = alloc_br(PBS_BATCH_SignalJob)) != NULL)
    {
    strcpy(newreq->rq_ind.rq_signal.rq_jid, pjob->ji_qs.ji_jobid);
    strncpy(newreq->rq_ind.rq_signal.rq_signame, sigk, PBS_SIGNAMESZ);
    }

  issue_signal(pjob, sigk, post_job_delete_nanny, newreq);


  apply_job_delete_nanny(pjob, time_now + 60);

  return;
  } /* END job_delete_nanny() */
Esempio n. 7
0
/*
 * execute_job_delete
 *
 * Carries out a delete request against a single job. Depending on the job's
 * state this either
 *   - re-queues the delete request for one second later (job in PRERUN or one
 *     of the RERUN substates),
 *   - sends SIGTERM to the job and arms the delete nanny (job RUNNING), or
 *   - moves the job to EXITING/COMPLETE (or aborts it) and schedules
 *     on_job_exit.
 *
 * The delete is recorded in the accounting log and the event log, and mail is
 * sent to the owner when someone else deleted the job and no delete was
 * previously attempted.
 *
 * @param pjob - the job to delete. The job mutex is released through
 *               unlock_ji_mutex() on the return paths where pjob is still
 *               valid and has not been handed to job_abt().
 *               NOTE(review): presumably the caller holds that mutex on
 *               entry - confirm.
 * @param Msg  - optional text from the request extension, appended to the
 *               mail/abort message; may be NULL
 * @param preq - the batch request that asked for the delete
 *
 * @return ROUTE_DELETE when the request was re-queued for a later retry,
 *         PBSE_NONE when the non-running delete path completed,
 *         -1 on every other path (rejected, job gone, or signal sent and the
 *         reply is left to the MOM callback)
 */
int execute_job_delete(

  job                  *pjob,            /* M */
  char                 *Msg,             /* I */
  struct batch_request *preq)            /* I */

  {
  struct work_task *pwtnew;

  int               rc;
  char             *sigt = "SIGTERM";

  int               has_mutex = TRUE;
  char              log_buf[LOCAL_LOG_BUF_SIZE];
  time_t            time_now = time(NULL);
  long              force_cancel = FALSE;
  long              array_compatible = FALSE;

  chk_job_req_permissions(&pjob,preq);

  if (pjob == NULL)
    {
    /* preq is rejected in chk_job_req_permissions here */
    return(-1);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /* see note in req_delete - not sure this is possible still,
     * but the deleted code is irrelevant now. I will leave this
     * part --dbeer */
    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

    return(-1);
    }

  if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN1 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN2 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN3 )
    {
    /* If JOB_SUBSTATE_PRERUN being sent to MOM, wait till she gets it going */
    /* retry in one second                            */
    /* If JOB_SUBSTATE_RERUN, RERUN1, RERUN2 or RERUN3 the
       job is being requeued. Wait until finished */

    /* function-static state: remembers which job we started waiting on and
     * when, so a job stuck in this state is eventually deleted anyway */
    static time_t  cycle_check_when = 0;
    static char    cycle_check_jid[PBS_MAXSVRJOBID + 1];

    if (cycle_check_when != 0)
      {
      if (!strcmp(pjob->ji_qs.ji_jobid, cycle_check_jid) &&
          (time_now - cycle_check_when > 10))
        {
        /* state not updated after 10 seconds */

        /* did the mom ever get it? delete it anyways... */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;

        goto jump;
        }

      if (time_now - cycle_check_when > 20)
        {
        /* give up after 20 seconds */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;
        }
      }    /* END if (cycle_check_when != 0) */

    if (cycle_check_when == 0)
      {
      /* new PRERUN job located */

      cycle_check_when = time_now;
      strcpy(cycle_check_jid, pjob->ji_qs.ji_jobid);
      }

    sprintf(log_buf, "job cannot be deleted, state=PRERUN, requeuing delete request");

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    pwtnew = set_task(WORK_Timed,time_now + 1,post_delete_route,preq,FALSE);
    
    unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);

    if (pwtnew == NULL)
      {
      req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

      return(-1);
      }
    else
      {
      return(ROUTE_DELETE);
      }
    }  /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */

jump:

  /*
   * Log delete and if requesting client is not job owner, send mail.
   */

  snprintf(log_buf, sizeof(log_buf), "requestor=%s@%s", preq->rq_user, preq->rq_host);


  /* NOTE:  should annotate accounting record with extend message (NYI) */
  account_record(PBS_ACCT_DEL, pjob, log_buf);

  snprintf(log_buf, sizeof(log_buf), msg_manager, msg_deletejob, preq->rq_user, preq->rq_host);

  log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

  /* NOTE:  should incorporate job delete message */

  if (Msg != NULL)
    {
    /* have text message in request extension, add it. Msg comes from the
     * client, so the append is bounded by the space left in log_buf and is
     * truncated rather than overrunning the buffer */
    size_t used = strlen(log_buf);

    snprintf(log_buf + used, sizeof(log_buf) - used, "\n%s", Msg);
    }

  if ((svr_chk_owner(preq, pjob) != 0) &&
      (pjob->ji_has_delete_nanny == FALSE))
    {
    /* only send email if owner did not delete job and job deleted
       has not been previously attempted */

    svr_mailowner(pjob, MAIL_DEL, MAIL_FORCE, log_buf);
    /*
     * If we sent mail and already sent the extra message
     * then reset message so we don't trigger a redundant email
     * in job_abt()
    */

    if (Msg != NULL)
      {
      Msg = NULL;
      }
    }

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, change restart comment if failed */

    change_restart_comment_if_needed(pjob);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * setup a nanny task to make sure the job is actually deleted (see the
     * comments at job_delete_nanny()).
     */

    if (pjob->ji_has_delete_nanny == TRUE)
      {
      unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

      req_reject(PBSE_IVALREQ, 0, preq, NULL, "job cancel in progress");

      return(-1);
      }

    apply_job_delete_nanny(pjob, time_now + 60);

    /*
     * Send signal request to MOM.  The server will automagically
     * pick up and "finish" off the client request when MOM replies.
     */
    get_batch_request_id(preq);

    if ((rc = issue_signal(&pjob, sigt, post_delete_mom1, strdup(preq->rq_id))))
      {
      /* cant send to MOM */

      req_reject(rc, 0, preq, NULL, NULL);
      }

    /* normally will ack reply when mom responds */
    if (pjob != NULL)
      {
      sprintf(log_buf, msg_delrunjobsig, sigt);
      log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);
  
      unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
      }

    return(-1);
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* make a cleanup task if set */
  get_svr_attr_l(SRV_ATR_JobForceCancelTime, &force_cancel);
  if (force_cancel > 0)
    {
    char *dup_jobid = strdup(pjob->ji_qs.ji_jobid);
 
    set_task(WORK_Timed, time_now + force_cancel, ensure_deleted, dup_jobid, FALSE);    
    }

  /* if configured, and this job didn't have a slot limit hold, free a job
   * held with the slot limit hold */
  get_svr_attr_l(SRV_ATR_MoabArrayCompatible, &array_compatible);
  if ((array_compatible != FALSE) &&
      ((pjob->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) == FALSE))
    {
    if ((pjob->ji_arraystruct != NULL) &&
        (pjob->ji_is_array_template == FALSE))
      {
      int        i;
      int        newstate;
      int        newsub;
      job       *tmp;
      job_array *pa = get_jobs_array(&pjob);

      /* get_jobs_array() takes &pjob and may have reset it */
      if (pjob == NULL)
        return(-1);

      /* NOTE(review): pa is used below without a NULL check - confirm that
       * get_jobs_array() cannot return NULL while leaving pjob valid */
      for (i = 0; i < pa->ai_qs.array_size; i++)
        {
        if (pa->job_ids[i] == NULL)
          continue;

        /* skip the job that is being deleted */
        if (!strcmp(pa->job_ids[i], pjob->ji_qs.ji_jobid))
          continue;

        if ((tmp = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
          {
          /* the sibling no longer exists; forget its id */
          free(pa->job_ids[i]);
          pa->job_ids[i] = NULL;
          }
        else
          {
          if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l)
            {
            tmp->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;
            
            if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
              {
              tmp->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
              }
            
            svr_evaljobstate(tmp, &newstate, &newsub, 1);
            svr_setjobstate(tmp, newstate, newsub, FALSE);
            job_save(tmp, SAVEJOB_FULL, 0);

            unlock_ji_mutex(tmp, __func__, "5", LOGLEVEL);
            
            /* only one held sibling is released per delete */
            break;
            }

          unlock_ji_mutex(tmp, __func__, "6", LOGLEVEL);
          }
        }

      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "%s: unlocking ai_mutex", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }
      pthread_mutex_unlock(pa->ai_mutex);
      }
    } /* END MoabArrayCompatible check */

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, do end job processing */
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE);

    /* force new connection */
    pjob->ji_momhandle = -1;

    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "calling on_job_exit from %s", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      }

    set_task(WORK_Immed, 0, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in file, should remove them */

    remove_stagein(&pjob);

    if (pjob != NULL)
      job_abt(&pjob, Msg);

    /* the job was either lost in remove_stagein() or handed to job_abt();
     * in both cases it must not be unlocked below */
    has_mutex = FALSE;
    }
  else
    {
    /*
     * the job is not in transit (though it may have been) and
     * is not running, so put it into a complete state.
     */
    struct pbs_queue *pque;
    int  KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE, FALSE);

    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      pque->qu_numcompleted++;

      unlock_queue(pque, __func__, NULL, LOGLEVEL);
      
      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "calling on_job_exit from %s", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }
    
      /* NOTE(review): pque is read here after unlock_queue() - confirm this
       * is safe */
      pthread_mutex_lock(server.sv_attr_mutex);
      KeepSeconds = attr_ifelse_long(
                    &pque->qu_attr[QE_ATR_KeepCompleted],
                    &server.sv_attr[SRV_ATR_KeepCompleted],
                    0);
      pthread_mutex_unlock(server.sv_attr_mutex);
      }
    else
      KeepSeconds = 0;

    /* get_jobs_queue() takes &pjob and may have reset it */
    if (pjob != NULL)
      {
      set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
      }
    else
      has_mutex = FALSE;
    }  /* END else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0) */

  if (has_mutex == TRUE)
    unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL);

  return(PBSE_NONE);
  } /* END execute_job_delete() */
Esempio n. 8
0
/*
 * req_rerunjob
 *
 * Services a rerun (requeue) batch request.
 *
 * A request for "all" is delegated to handle_requeue_all(). Otherwise the job
 * is looked up and checked: it must be running or in an EXITING-or-later
 * state, the requestor needs manager/operator write permission unless the job
 * has a checkpoint name set, and the job must be rerunable. A job that is
 * already QUEUED is simply acknowledged.
 *
 * For a running job a signal is issued: SIGTERM followed by a delayed SIGKILL
 * when a kill delay applies (the job's user_kill_delay first, else the
 * queue/server KillDelay), otherwise SIGKILL straight away. A non-running job
 * has its state reset to QUEUED or HELD. Final processing is done by
 * finalize_rerunjob().
 *
 * @param preq - the rerun request; the job id is in preq->rq_ind.rq_rerun
 *
 * @return the value returned by handle_requeue_all() or finalize_rerunjob(),
 *         or a PBSE_* code on an early return (PBSE_NONE when nothing had to
 *         be done)
 */
int req_rerunjob(
   
  batch_request *preq)

  {
  int     rc = PBSE_NONE;
  job    *pjob;

  int     MgrRequired = TRUE;
  char    log_buf[LOCAL_LOG_BUF_SIZE];

  /* check if requestor is admin, job owner, etc */
  if (!strcasecmp(preq->rq_ind.rq_rerun, "all"))
    {
    return(handle_requeue_all(preq));
    }
  
  if ((pjob = chk_job_request(preq->rq_ind.rq_rerun, preq)) == 0)
    {
    /* FAILURE */

    /* chk_job_request calls req_reject() */

    rc = PBSE_SYSTEM;
    return rc; /* This needs to fixed to return an accurate error */
    }

  /* releases the job mutex on every return below unless told otherwise */
  mutex_mgr pjob_mutex = mutex_mgr(pjob->ji_mutex, true);

  /* the job must be running or completed */

  if (pjob->ji_qs.ji_state >= JOB_STATE_EXITING)
    {
    if (pjob->ji_wattr[JOB_ATR_checkpoint_name].at_flags & ATR_VFLAG_SET)
      {
      /* allow end-users to rerun checkpointed jobs */

      MgrRequired = FALSE;
      }
    }
  else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* job is running */

    /* NO-OP */
    }
  else if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED)
    {
    // If we are already queued, then there is nothing to do.
    rc = PBSE_NONE;
    reply_ack(preq);
    return(rc);
    }
  else
    {
    /* FAILURE - job is in bad state */
    rc = PBSE_BADSTATE;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s is in a bad state",
        preq->rq_ind.rq_rerun);
    req_reject(rc, 0, preq, NULL, log_buf);
    return rc;
    }

  if ((MgrRequired == TRUE) &&
      ((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0))
    {
    /* FAILURE */

    rc = PBSE_PERM;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
        "additional permissions required (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)");
    req_reject(rc, 0, preq, NULL, log_buf);
    return rc;
    }

  /* the job must be rerunnable */

  if (pjob->ji_wattr[JOB_ATR_rerunable].at_val.at_long == 0)
    {
    /* NOTE:  should force override this constraint? maybe (???) */
    /*          no, the user is saying that the job will break, and
                IEEE Std 1003.1 specifically says rerun is to be rejected
                if rerunable==FALSE -garrick */

    rc = PBSE_NORERUN;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s not rerunnable",
        preq->rq_ind.rq_rerun);
    req_reject(rc, 0, preq, NULL, log_buf);
    return rc;
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* ask MOM to kill off the job if it is running */
    int                 delay = 0;
    pbs_queue          *pque;
  
    // Apply the user delay first so it takes precedence.
    if (pjob->ji_wattr[JOB_ATR_user_kill_delay].at_flags & ATR_VFLAG_SET)
      delay = pjob->ji_wattr[JOB_ATR_user_kill_delay].at_val.at_long;

    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      mutex_mgr pque_mutex = mutex_mgr(pque->qu_mutex, true);
      mutex_mgr server_mutex = mutex_mgr(server.sv_attr_mutex, false);

      if (delay == 0)
        {
        delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay],
                               &server.sv_attr[SRV_ATR_KillDelay],
                               0);
        }
      }
    else
      {
      /* why is the pque null. Something went wrong */

      /* get_jobs_queue() takes &pjob and may have reset it, so fall back to
       * the id from the request for the message in that case */
      snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "jobid %s returned a null queue",
        (pjob != NULL) ? pjob->ji_qs.ji_jobid : preq->rq_ind.rq_rerun);
      req_reject(PBSE_UNKQUE, 0, preq, NULL, log_buf);

      /* with the job gone there is no job mutex left for the manager to
       * release on exit */
      if (pjob == NULL)
        pjob_mutex.set_unlock_on_exit(false);

      return(PBSE_UNKQUE);
      }
    
    pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

    if (delay != 0)
      {
      static const char *rerun = "rerun";
      char               *extra = strdup(rerun);

      get_batch_request_id(preq);
      /* If a qrerun -f is given requeue the job regardless of the outcome of issue_signal*/
      if ((preq->rq_extend) && 
          (!strncasecmp(preq->rq_extend, RERUNFORCE, strlen(RERUNFORCE))))
        {
        /* work on a private copy of the request so it can be handed to
         * post_rerun() without disturbing preq */
        batch_request *dup = new batch_request(*preq);
        get_batch_request_id(dup);
        rc = issue_signal(&pjob, "SIGTERM", delay_and_send_sig_kill, extra, strdup(dup->rq_id.c_str()));

        if (rc == PBSE_NORELYMOM)
          {
          dup->rq_reply.brp_code = PBSE_NORELYMOM;
          pjob_mutex.unlock();
          post_rerun(dup);

          /* NOTE(review): the job id is read through rq_signal.rq_jid here
           * although the rest of the function uses rq_rerun - confirm both
           * name the same storage */
          pjob = svr_find_job(preq->rq_ind.rq_signal.rq_jid, FALSE);
          if (pjob == NULL)
            {
            delete dup;
            return(PBSE_NONE);
            }

          /* svr_find_job() handed the job back, so tell the manager the
           * mutex is held again */
          pjob_mutex.set_lock_state(true);
          rc = PBSE_NONE;
          }

        delete dup;
        }
      else
        {
        rc = issue_signal(&pjob, "SIGTERM", delay_and_send_sig_kill, extra, strdup(preq->rq_id.c_str()));
        if (rc != PBSE_NONE)
          {
          /* cant send to MOM */
          req_reject(rc, 0, preq, NULL, NULL);
          }

        return(rc);
        }
      }
    else
      {
      static const char *rerun = "rerun";
      char               *extra = strdup(rerun);

      /* If a qrerun -f is given requeue the job regardless of the outcome of issue_signal*/
      if (preq->rq_extend && !strncasecmp(preq->rq_extend, RERUNFORCE, strlen(RERUNFORCE)))
        {
        std::string extend = RERUNFORCE;
        rc = issue_signal(&pjob, "SIGKILL", post_rerun, extra, strdup(extend.c_str()));
        if (rc == PBSE_NORELYMOM)
          rc = PBSE_NONE;
        }
      else
        rc = issue_signal(&pjob, "SIGKILL", post_rerun, extra, NULL);
      }
    }
  else
    { 
    if (pjob->ji_wattr[JOB_ATR_hold].at_val.at_long == HOLD_n)
      {
      svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_QUEUED, FALSE);
      }
    else
      {
      svr_setjobstate(pjob, JOB_STATE_HELD, JOB_SUBSTATE_HELD, FALSE);
      }

    /* reset some job attributes */
    
    pjob->ji_wattr[JOB_ATR_comp_time].at_flags &= ~ATR_VFLAG_SET;
    pjob->ji_wattr[JOB_ATR_reported].at_flags &= ~ATR_VFLAG_SET;

    set_statechar(pjob);

    rc = -1;
    }

  /* finalize_rerunjob will return with pjob->ji_mutex unlocked */
  pjob_mutex.set_unlock_on_exit(false);
  return finalize_rerunjob(preq,pjob,rc);
  }
/**
 * attempt_delete()
 * deletes a job differently depending on the job's state:
 *   - TRANSIT:  SIGTERM the deferred child tasks and report the job as skipped
 *   - RUNNING:  arm the delete nanny (if not yet armed) and SIGTERM the job
 *   - otherwise the job is moved to EXITING, aborted, or marked COMPLETE and
 *     an on_job_exit task is scheduled
 *
 * @return TRUE if the job was deleted, FALSE if skipped
 * @param j - a pointer to the job being handled (NULL counts as deleted)
 */
int attempt_delete(

  void *j) /* I */

  {
  int skipped = FALSE;
  struct work_task *pwtold;
  struct work_task *pwtnew;
  job *pjob;

  /* job considered deleted if null */
  if (j == NULL)
    return(TRUE);

  pjob = (job *)j;

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /*
     * Find pid of router from existing work task entry,
     * then establish another work task on same child.
     * Next, signal the router and wait for its completion;
     */
    
    pwtold = (struct work_task *)GET_NEXT(pjob->ji_svrtask);
    
    while (pwtold != NULL)
      {
      if ((pwtold->wt_type == WORK_Deferred_Child) ||
          (pwtold->wt_type == WORK_Deferred_Cmp))
        {
        kill((pid_t)pwtold->wt_event, SIGTERM);
        
        pjob->ji_qs.ji_substate = JOB_SUBSTATE_ABORT;
        }
      
      pwtold = (struct work_task *)GET_NEXT(pwtold->wt_linkobj);
      }

    skipped = TRUE;
    
    return(!skipped);
    }  /* END if (pjob->ji_qs.ji_state == JOB_SUBSTATE_TRANSIT) */

  else if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN)
    {
    /* we'll wait for the mom to get this job, then delete it */

    /* NOTE(review): unlike the TRANSIT case there is no return here, so a
     * PRERUN job still falls through to the deletion code below while being
     * reported as skipped - confirm this is intended */
    skipped = TRUE;
    }  /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */

  else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* set up nanny */
    
    if (!has_job_delete_nanny(pjob))
      {
      apply_job_delete_nanny(pjob, time_now + 60);
      
      /* need to issue a signal to the mom, but we don't want to sent an ack to the
       * client when the mom replies */
      issue_signal(pjob, "SIGTERM", post_delete, NULL);
      }

    if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
      {
      /* job has restart file at mom, change restart comment if failed */
      change_restart_comment_if_needed(pjob);
      }
    
    return(!skipped);
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, change restart comment if failed */    
    change_restart_comment_if_needed(pjob);
    
    /* job has restart file at mom, do end job processing */
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING);

    pjob->ji_momhandle = -1;
    
    /* force new connection */
    pwtnew = set_task(WORK_Immed, 0, on_job_exit, (void *)pjob);
    
    if (pwtnew)
      {
      append_link(&pjob->ji_svrtask, &pwtnew->wt_linkobj, pwtnew);
      }
   
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in file, should remove them */
    
    remove_stagein(pjob);
    
    job_abt(&pjob, NULL);
    }
  else
    {
    /*
     * the job is not in transit (though it may have been) and
     * is not running, so put it into a complete state.
     */

    struct work_task *ptask;
    struct pbs_queue *pque;
    int  KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE);
    
    pque = pjob->ji_qhdr;

    /* only touch the queue when the job actually has one; without a queue
     * KeepSeconds stays 0 and the exit task fires at time_now */
    if (pque != NULL)
      {
      pque->qu_numcompleted++;

      KeepSeconds = attr_ifelse_long(
          &pque->qu_attr[(int)QE_ATR_KeepCompleted],
          &server.sv_attr[(int)SRV_ATR_KeepCompleted],
          0);
      }
    
    ptask = set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit, pjob);
    
    if (ptask != NULL)
      {
      append_link(&pjob->ji_svrtask, &ptask->wt_linkobj, ptask);
      }
    }

  return(!skipped);
  } /* END attempt_delete() */
Esempio n. 10
0
/**
 * attempt_delete()
 * deletes a job differently depending on the job's state:
 *   - TRANSIT:  nothing is done and the job is reported as skipped
 *   - RUNNING:  arm the delete nanny (if not yet armed) and SIGTERM the job
 *   - otherwise the job is moved to EXITING, aborted, or marked COMPLETE and
 *     an on_job_exit task is scheduled
 *
 * The job mutex is released through unlock_ji_mutex() on every path except
 * the TRANSIT return and the paths where the job pointer was reset or handed
 * to job_abt().
 *
 * @return TRUE if the job was deleted, FALSE if skipped
 * @param j - a pointer to the job being handled (NULL counts as deleted)
 */
int attempt_delete(

  void *j) /* I */

  {
  int        skipped = FALSE;
  int        release_mutex = TRUE;

  job       *pjob;
  time_t     time_now = time(NULL);
  char       log_buf[LOCAL_LOG_BUF_SIZE];

  /* job considered deleted if null */
  if (j == NULL)
    return(TRUE);

  pjob = (job *)j;

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /* I'm not sure if this is still possible since the thread
     * waits on the job to finish transmiting, but I'll leave
     * this part here --dbeer */
    skipped = TRUE;
    
    return(!skipped);
    }  /* END if (pjob->ji_qs.ji_state == JOB_SUBSTATE_TRANSIT) */

  else if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN)
    {
    /* we'll wait for the mom to get this job, then delete it */

    /* NOTE(review): unlike the TRANSIT case there is no return here, so a
     * PRERUN job still falls through to the deletion code below while being
     * reported as skipped - confirm this is intended */
    skipped = TRUE;
    }  /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */

  else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* set up nanny */
    
    if (pjob->ji_has_delete_nanny == FALSE)
      {
      apply_job_delete_nanny(pjob, time_now + 60);
      
      /* need to issue a signal to the mom, but we don't want to sent an ack to the
       * client when the mom replies */
      issue_signal(&pjob, "SIGTERM", post_delete, NULL);
      }

    /* issue_signal() takes &pjob and may have reset it */
    if (pjob != NULL)
      {
      if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
        {
        /* job has restart file at mom, change restart comment if failed */
        change_restart_comment_if_needed(pjob);
        }

      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
      }
    
    return(!skipped);
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, change restart comment if failed */    
    change_restart_comment_if_needed(pjob);
    
    /* job has restart file at mom, do end job processing */
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE);

    pjob->ji_momhandle = -1;
    
    /* force new connection */
    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "calling on_job_exit from %s", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      }

    set_task(WORK_Immed, 0, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in file, should remove them */
    
    remove_stagein(&pjob);
    
    if (pjob != NULL)
      job_abt(&pjob, NULL);

    /* the job was either lost in remove_stagein() or handed to job_abt();
     * in both cases it must not be unlocked below */
    release_mutex = FALSE;
    }
  else
    {
    /*
     * the job is not in transit (though it may have been) and
     * is not running, so put it into a complete state.
     */
    struct pbs_queue *pque;
    int  KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE, FALSE);
    
    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      pque->qu_numcompleted++;

      unlock_queue(pque, __func__, NULL, LOGLEVEL);
      }
    
    /* get_jobs_queue() takes &pjob and may have reset it */
    if (pjob != NULL)
      {
      /* the queue lookup can fail while the job is still valid; in that
       * case KeepSeconds stays 0 instead of dereferencing a NULL queue */
      if (pque != NULL)
        {
        /* NOTE(review): pque is read here after unlock_queue() - confirm
         * this is safe */
        pthread_mutex_lock(server.sv_attr_mutex);
        KeepSeconds = attr_ifelse_long(
          &pque->qu_attr[QE_ATR_KeepCompleted],
          &server.sv_attr[SRV_ATR_KeepCompleted],
          0);
        pthread_mutex_unlock(server.sv_attr_mutex);
        }
      
      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "calling on_job_exit from %s", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }
      
      set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
      }
    else
      release_mutex = FALSE;
    }

  if (release_mutex == TRUE)
    unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);

  return(!skipped);
  } /* END attempt_delete() */
Esempio n. 11
0
int req_rerunjob(
   
  struct batch_request *preq)

  {
  int     rc = PBSE_NONE;
  job    *pjob;

  int     Force;
  int     MgrRequired = TRUE;
  char    log_buf[LOCAL_LOG_BUF_SIZE];

  /* check if requestor is admin, job owner, etc */

  if ((pjob = chk_job_request(preq->rq_ind.rq_rerun, preq)) == 0)
    {
    /* FAILURE */

    /* chk_job_request calls req_reject() */

    rc = PBSE_SYSTEM;
    return rc; /* This needs to fixed to return an accurate error */
    }

  /* the job must be running or completed */

  if (pjob->ji_qs.ji_state >= JOB_STATE_EXITING)
    {
    if (pjob->ji_wattr[JOB_ATR_checkpoint_name].at_flags & ATR_VFLAG_SET)
      {
      /* allow end-users to rerun checkpointed jobs */

      MgrRequired = FALSE;
      }
    }
  else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* job is running */

    /* NO-OP */
    }
  else
    {
    /* FAILURE - job is in bad state */
    rc = PBSE_BADSTATE;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s is in a bad state",
        preq->rq_ind.rq_rerun);
    req_reject(rc, 0, preq, NULL, log_buf);
    unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
    return rc;
    }

  if ((MgrRequired == TRUE) &&
      ((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0))
    {
    /* FAILURE */

    rc = PBSE_PERM;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
        "additional permissions required (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)");
    req_reject(rc, 0, preq, NULL, log_buf);
    unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
    return rc;
    }

  /* the job must be rerunnable */

  if (pjob->ji_wattr[JOB_ATR_rerunable].at_val.at_long == 0)
    {
    /* NOTE:  should force override this constraint? maybe (???) */
    /*          no, the user is saying that the job will break, and
                IEEE Std 1003.1 specifically says rerun is to be rejected
                if rerunable==FALSE -garrick */

    rc = PBSE_NORERUN;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s not rerunnable",
        preq->rq_ind.rq_rerun);
    req_reject(rc, 0, preq, NULL, log_buf);
    unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
    return rc;
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* ask MOM to kill off the job if it is running */
    static const char *rerun = "rerun";
    char              *extra = strdup(rerun);

    rc = issue_signal(&pjob, "SIGKILL", post_rerun, extra);
    }
  else
    { 
    if (pjob->ji_wattr[JOB_ATR_hold].at_val.at_long == HOLD_n)
      {
      svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_QUEUED, FALSE);
      }
    else
      {
      svr_setjobstate(pjob, JOB_STATE_HELD, JOB_SUBSTATE_HELD, FALSE);
      }

    /* reset some job attributes */
    
    pjob->ji_wattr[JOB_ATR_comp_time].at_flags &= ~ATR_VFLAG_SET;
    pjob->ji_wattr[JOB_ATR_reported].at_flags &= ~ATR_VFLAG_SET;

    set_statechar(pjob);

    rc = -1;
    }

  if (preq->rq_extend && !strncasecmp(preq->rq_extend, RERUNFORCE, strlen(RERUNFORCE)))
    Force = 1;
  else
    Force = 0;

  switch (rc)
    {

    case - 1:

      /* completed job was requeued */

      /* clear out job completion time if there is one */
      break;

    case 0:

      /* requeue request successful */

      if (pjob != NULL)
        pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

      break;

    case PBSE_SYSTEM: /* This may not be accurate...*/
      rc = PBSE_MEM_MALLOC;
      snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Can not allocate memory");
      req_reject(rc, 0, preq, NULL, log_buf);
      return rc;
      break;

    default:

      if (Force == 0)
        {
        rc = PBSE_MOMREJECT;
        snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Rejected by mom");
        req_reject(rc, 0, preq, NULL, log_buf);
        if (pjob != NULL)
          unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);
        return rc;
        }
      else
        {
        int           newstate;
        int           newsubst;
        unsigned int  dummy;
        char         *tmp;
        long          cray_enabled = FALSE;
       
        if (pjob != NULL)
          {
          get_svr_attr_l(SRV_ATR_CrayEnabled, &cray_enabled);

          if ((cray_enabled == TRUE) &&
              (pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str != NULL))
            tmp = parse_servername(pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str, &dummy);
          else
            tmp = parse_servername(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, &dummy);
          
          /* Cannot communicate with MOM, forcibly requeue job.
             This is a relatively disgusting thing to do */
          
          sprintf(log_buf, "rerun req to %s failed (rc=%d), forcibly requeueing job",
            tmp, rc);

          free(tmp);
  
          log_event(
            PBSEVENT_ERROR | PBSEVENT_ADMIN | PBSEVENT_JOB,
            PBS_EVENTCLASS_JOB,
            pjob->ji_qs.ji_jobid,
            log_buf);
          
          log_err(-1, __func__, log_buf);
          
          strcat(log_buf, ", previous output files may be lost");
  
          svr_mailowner(pjob, MAIL_OTHER, MAIL_FORCE, log_buf);
  
          svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_RERUN3, FALSE);
  
          rel_resc(pjob); /* free resc assigned to job */
          
          if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_HOTSTART) == 0)
            {
            /* in case of server shutdown, don't clear exec_host */
            /* will use it on hotstart when next comes up        */
            
            job_attr_def[JOB_ATR_exec_host].at_free(&pjob->ji_wattr[JOB_ATR_exec_host]);
  
            job_attr_def[JOB_ATR_session_id].at_free(&pjob->ji_wattr[JOB_ATR_session_id]);
            
            job_attr_def[JOB_ATR_exec_gpus].at_free(&pjob->ji_wattr[JOB_ATR_exec_gpus]);          
            }
          
          pjob->ji_modified = 1;    /* force full job save */
          
          pjob->ji_momhandle = -1;
          pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_StagedIn;
          
          svr_evaljobstate(pjob, &newstate, &newsubst, 0);
          svr_setjobstate(pjob, newstate, newsubst, FALSE);
          }
        }

      break;
    }  /* END switch (rc) */

  /* So job has run and is to be rerun (not restarted) */
  if (pjob == NULL)
    {
    rc = PBSE_JOB_RERUN;
    }
  else
    {
    pjob->ji_qs.ji_svrflags = (pjob->ji_qs.ji_svrflags &
        ~(JOB_SVFLG_CHECKPOINT_FILE |JOB_SVFLG_CHECKPOINT_MIGRATEABLE |
          JOB_SVFLG_CHECKPOINT_COPIED)) | JOB_SVFLG_HASRUN;
    
    sprintf(log_buf, msg_manager, msg_jobrerun, preq->rq_user, preq->rq_host);
    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    reply_ack(preq);
  
    /* note in accounting file */
    account_record(PBS_ACCT_RERUN, pjob, NULL);
    unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL);
    }

  return rc;
  }  /* END req_rerunjob() */
Esempio n. 12
0
void req_deletejob(

  struct batch_request *preq)  /* I */

  {
  job              *pjob;

  struct work_task *pwtold;

  struct work_task *pwtnew;
  struct work_task *pwtcheck;

  int               rc;
  char             *sigt = "SIGTERM";

  char             *Msg = NULL;

  /* check if we are getting a purgecomplete from scheduler */
  if ((preq->rq_extend != NULL) && 
        !strncmp(preq->rq_extend,PURGECOMP,strlen(PURGECOMP)))
    {

    /*
     * purge_completed_jobs will respond with either an ack or reject
     */
    purge_completed_jobs(preq);

    return;
    }

  /* The way this is implemented, if the user enters the command "qdel -p <jobid>",
   * they can then delete jobs other than their own since the authorization
   * checks are made below in chk_job_request. This should probably be fixed.
   */

  if (forced_jobpurge(preq) != 0)
    {
    return;
    }

  /* NOTE:  should support rq_objname={<JOBID>|ALL|<name:<JOBNAME>} */

  /* NYI */

  pjob = chk_job_request(preq->rq_ind.rq_delete.rq_objname, preq);

  if (pjob == NULL)
    {
    /* NOTE:  chk_job_request() will issue req_reject() */

    return;
    }

  if (preq->rq_extend != NULL)
    {
    if (strncmp(preq->rq_extend, deldelaystr, strlen(deldelaystr)) &&
        strncmp(preq->rq_extend, delasyncstr, strlen(delasyncstr)) &&
        strncmp(preq->rq_extend, delpurgestr, strlen(delpurgestr)))
      {
      /* have text message in request extension, add it */

      Msg = preq->rq_extend;

      /*
       * Message capability is only for operators and managers.
       * Check if request is authorized
      */

      if ((preq->rq_perm & (ATR_DFLAG_OPRD | ATR_DFLAG_OPWR |
                            ATR_DFLAG_MGRD | ATR_DFLAG_MGWR)) == 0)
        {
        req_reject(PBSE_PERM, 0, preq, NULL,
                   "must have operator or manager privilege to use -m parameter");
        return;
        }
      }
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /*
     * Find pid of router from existing work task entry,
     * then establish another work task on same child.
     * Next, signal the router and wait for its completion;
     */

    pwtold = (struct work_task *)GET_NEXT(pjob->ji_svrtask);

    while (pwtold != NULL)
      {
      if ((pwtold->wt_type == WORK_Deferred_Child) ||
          (pwtold->wt_type == WORK_Deferred_Cmp))
        {
        pwtnew = set_task(
                   pwtold->wt_type,
                   pwtold->wt_event,
                   post_delete_route,
                   preq);

        if (pwtnew != NULL)
          {
          /*
           * reset type in case the SIGCHLD came
           * in during the set_task;  it makes
           * sure that next_task() will find the
           * new entry.
           */

          pwtnew->wt_type = pwtold->wt_type;
          pwtnew->wt_aux = pwtold->wt_aux;

          kill((pid_t)pwtold->wt_event, SIGTERM);

          pjob->ji_qs.ji_substate = JOB_SUBSTATE_ABORT;

          return; /* all done for now */
          }
        else
          {
          req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

          return;
          }
        }

      pwtold = (struct work_task *)GET_NEXT(pwtold->wt_linkobj);
      }

    /* should never get here ...  */

    log_err(-1, "req_delete", "Did not find work task for router");

    req_reject(PBSE_INTERNAL, 0, preq, NULL, NULL);

    return;
    }

  if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN1 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN2 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN3 )
    {
    /* If JOB_SUBSTATE_PRERUN being sent to MOM, wait till she gets it going */
    /* retry in one second                            */
    /* If JOB_SUBSTATE_RERUN, RERUN1, RERUN2 or RERUN3 the
       job is being requeued. Wait until finished */

    static time_t  cycle_check_when = 0;
    static char    cycle_check_jid[PBS_MAXSVRJOBID + 1];

    if (cycle_check_when != 0)
      {
      if (!strcmp(pjob->ji_qs.ji_jobid, cycle_check_jid) &&
          (time_now - cycle_check_when > 10))
        {
        /* state not updated after 10 seconds */

        /* did the mom ever get it? delete it anyways... */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;

        goto jump;
        }

      if (time_now - cycle_check_when > 20)
        {
        /* give up after 20 seconds */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;
        }
      }    /* END if (cycle_check_when != 0) */

    if (cycle_check_when == 0)
      {
      /* new PRERUN job located */

      cycle_check_when = time_now;
      strcpy(cycle_check_jid, pjob->ji_qs.ji_jobid);
      }

    sprintf(log_buffer, "job cannot be deleted, state=PRERUN, requeuing delete request");

    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);

    pwtnew = set_task(
               WORK_Timed,
               time_now + 1,
               post_delete_route,
               preq);

    if (pwtnew == 0)
      req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

    return;
    }  /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */

jump:

  /*
   * Log delete and if requesting client is not job owner, send mail.
   */

  sprintf(log_buffer, "requestor=%s@%s",
          preq->rq_user,
          preq->rq_host);


  /* NOTE:  should annotate accounting record with extend message (NYI) */

  account_record(PBS_ACCT_DEL, pjob, log_buffer);

  sprintf(log_buffer, msg_manager,
          msg_deletejob,
          preq->rq_user,
          preq->rq_host);

  log_event(
    PBSEVENT_JOB,
    PBS_EVENTCLASS_JOB,
    pjob->ji_qs.ji_jobid,
    log_buffer);

  /* NOTE:  should incorporate job delete message */

  if (Msg != NULL)
    {
    /* have text message in request extension, add it */

    strcat(log_buffer, "\n");
    strcat(log_buffer, Msg);
    }

  if ((svr_chk_owner(preq, pjob) != 0) &&
      !has_job_delete_nanny(pjob))
    {
    /* only send email if owner did not delete job and job deleted
       has not been previously attempted */

    svr_mailowner(pjob, MAIL_DEL, MAIL_FORCE, log_buffer);
    /*
     * If we sent mail and already sent the extra message
     * then reset message so we don't trigger a redundant email
     * in job_abt()
    */

    if (Msg != NULL)
      {
      Msg = NULL;
      }
    }

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, change restart comment if failed */

    change_restart_comment_if_needed(pjob);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * setup a nanny task to make sure the job is actually deleted (see the
     * comments at job_delete_nanny()).
     */

    if (has_job_delete_nanny(pjob))
      {
      req_reject(PBSE_IVALREQ, 0, preq, NULL, "job cancel in progress");

      return;
      }

    apply_job_delete_nanny(pjob, time_now + 60);

    /* check if we are getting a asynchronous delete */

    if ((preq->rq_extend != NULL) &&
          !strncmp(preq->rq_extend,DELASYNC,strlen(DELASYNC)))
      {
      struct batch_request *preq_tmp = NULL;
      /*
       * Respond with an ack now instead of after MOM processing
       * Create a new batch request and fill it in. It will be freed by reply_ack
       */

      snprintf(log_buffer,sizeof(log_buffer), "Deleting job asynchronously");
      log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buffer);

      preq_tmp = alloc_br(PBS_BATCH_DeleteJob);
      preq_tmp->rq_perm = preq->rq_perm;
      preq_tmp->rq_ind.rq_manager.rq_cmd = preq->rq_ind.rq_manager.rq_cmd;
      preq_tmp->rq_ind.rq_manager.rq_objtype = preq->rq_ind.rq_manager.rq_objtype;
      preq_tmp->rq_fromsvr = preq->rq_fromsvr;
      preq_tmp->rq_extsz = preq->rq_extsz;
      preq_tmp->rq_conn = preq->rq_conn;
      memcpy(preq_tmp->rq_ind.rq_manager.rq_objname,
          preq->rq_ind.rq_manager.rq_objname, PBS_MAXSVRJOBID + 1);
      memcpy(preq_tmp->rq_user, preq->rq_user, PBS_MAXUSER + 1);
      memcpy(preq_tmp->rq_host, preq->rq_host, PBS_MAXHOSTNAME + 1);

      reply_ack(preq_tmp);
      preq->rq_noreply = TRUE; /* set for no more replies */
      }
  
    /* make a cleanup task if set */
    if ((server.sv_attr[SRV_ATR_JobForceCancelTime].at_flags & ATR_VFLAG_SET) &&
        (server.sv_attr[SRV_ATR_JobForceCancelTime].at_val.at_long > 0))
      {
      pwtcheck = set_task(
        WORK_Timed,
        time_now + server.sv_attr[SRV_ATR_JobForceCancelTime].at_val.at_long,
        ensure_deleted,
        preq);
    
      if (pwtcheck != NULL)
        append_link(&pjob->ji_svrtask, &pwtcheck->wt_linkobj, pwtcheck);
      }

    /*
     * Send signal request to MOM.  The server will automagically
     * pick up and "finish" off the client request when MOM replies.
     */

    if ((rc = issue_signal(pjob, sigt, post_delete_mom1, preq)))
      {
      /* cant send to MOM */

      req_reject(rc, 0, preq, NULL, NULL);
      }

    /* normally will ack reply when mom responds */

    sprintf(log_buffer, msg_delrunjobsig,
            sigt);

    LOG_EVENT(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);

    return;
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* make a cleanup task if set */
  if ((server.sv_attr[SRV_ATR_JobForceCancelTime].at_flags & ATR_VFLAG_SET) &&
      (server.sv_attr[SRV_ATR_JobForceCancelTime].at_val.at_long > 0))
    {
    pwtcheck = set_task(
        WORK_Timed,
        time_now + server.sv_attr[SRV_ATR_JobForceCancelTime].at_val.at_long,
        ensure_deleted,
        preq);
    
    if (pwtcheck != NULL)
      append_link(&pjob->ji_svrtask, &pwtcheck->wt_linkobj, pwtcheck);
    }

  /* if configured, and this job didn't have a slot limit hold, free a job
   * held with the slot limit hold */
  if ((server.sv_attr[SRV_ATR_MoabArrayCompatible].at_val.at_long != FALSE) &&
      ((pjob->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) == FALSE))
    {
    if ((pjob->ji_arraystruct != NULL) &&
        (pjob->ji_is_array_template == FALSE))
      {
      int        i;
      int        newstate;
      int        newsub;
      job       *tmp;
      job_array *pa = pjob->ji_arraystruct;

      for (i = 0; i < pa->ai_qs.array_size; i++)
        {
        if (pa->jobs[i] == NULL)
          continue;

        tmp = (job *)pa->jobs[i];

        if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l)
          {
          tmp->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;
              
          if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
            {
            tmp->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
            }
          
          svr_evaljobstate(tmp, &newstate, &newsub, 1);
          svr_setjobstate(tmp, newstate, newsub);
          job_save(tmp, SAVEJOB_FULL, 0);

          break;
          }
        }
      }
    } /* END MoabArrayCompatible check */

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, do end job processing */
    
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING);

    pjob->ji_momhandle = -1;

    /* force new connection */

    pwtnew = set_task(WORK_Immed, 0, on_job_exit, (void *)pjob);

    if (pwtnew)
      {
      append_link(&pjob->ji_svrtask, &pwtnew->wt_linkobj, pwtnew);
      }
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in file, should remove them */

    remove_stagein(pjob);

    job_abt(&pjob, Msg);
    }
  else
    {
    /*
     * the job is not transitting (though it may have been) and
     * is not running, so put in into a complete state.
     */

    struct work_task *ptask;
    struct pbs_queue *pque;
    int  KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE);

    if ((pque = pjob->ji_qhdr) && (pque != NULL))
      {
      pque->qu_numcompleted++;
      }

    KeepSeconds = attr_ifelse_long(
                    &pque->qu_attr[QE_ATR_KeepCompleted],
                    &server.sv_attr[SRV_ATR_KeepCompleted],
                    0);
    ptask = set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit, pjob);

    if (ptask != NULL)
      {
      append_link(&pjob->ji_svrtask, &ptask->wt_linkobj, ptask);
      }
    }  /* END else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0) */

  reply_ack(preq);

  return;
  }  /* END req_deletejob() */
Esempio n. 13
0
/**
 * @brief
 * 		req_rerunjob2 - carry out a rerun request against one job
 *
 *		Rejects the request unless the job is rerunnable (or "force" was
 *		given) and is in the running state, then signals MoM with
 *		SIG_RERUN.  With "force" set, a signalling failure or a manager
 *		requester leads to an immediate server-side requeue.  Otherwise
 *		the request is parked on the job (ji_rerun_preq) and a timed task
 *		running timeout_rerun_request is scheduled for it.
 *
 *  @param[in,out]	preq	-	Job Request
 *  @param[in,out]	pjob	-	ptr to the subjob
 */
static void
req_rerunjob2(struct batch_request *preq, job *pjob)
{
	int	forced;
	int	from_manager;
	int	sig_rc;
	int	conn_idx;
	time_t	deadline;
	void	*signal_arg;
	struct	work_task *timeout_task;

	/* "force" must match the extension exactly */
	forced = (preq->rq_extend != NULL) &&
		(strcmp(preq->rq_extend, "force") == 0);

	/* is the requester a manager? */
	from_manager = (preq->rq_perm & (ATR_DFLAG_MGRD | ATR_DFLAG_MGWR)) != 0;

	/* a non-rerunnable job can only be rerun with force */
	if (!forced &&
		(pjob->ji_wattr[(int)JOB_ATR_rerunable].at_val.at_long == 0)) {
		req_reject(PBSE_NORERUN, 0, preq);
		return;
	}

	/* the job has to be running; without force its substate must be */
	/* running as well */
	if ((pjob->ji_qs.ji_state != JOB_STATE_RUNNING) ||
		(!forced && (pjob->ji_qs.ji_substate != JOB_SUBSTATE_RUNNING))) {
		req_reject(PBSE_BADSTATE, 0, preq);
		return;
	}

	/* post_rerun only receives the flag for a forced manager request */
	signal_arg = (forced && from_manager) ? (void *)1 : (void *)0;

	/* have MoM kill the job */
	sig_rc = issue_signal(pjob, SIG_RERUN, post_rerun, signal_arg);

	/*
	 * A forced rerun is requeued right away when the signal could not
	 * be issued, or when the request comes from a manager (whether or
	 * not the signal went out).  The existing design note says the
	 * server later sends MoM a discard message once MoM reports on the
	 * job, so the job is removed there too.
	 */
	if (forced && ((sig_rc != 0) || from_manager)) {
		pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN3;
		discard_job(pjob, "Force rerun", 1);
		force_reque(pjob);
		reply_ack(preq);
		return;
	}

	if (sig_rc != 0) {
		req_reject(sig_rc, 0, preq);
		return;
	}

	/* the job has run and will be rerun (not restarted) */
	pjob->ji_qs.ji_svrflags &= ~(JOB_SVFLG_CHKPT | JOB_SVFLG_ChkptMig);
	pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN;
	svr_setjobstate(pjob, JOB_STATE_RUNNING, JOB_SUBSTATE_RERUN);

	(void)sprintf(log_buffer, msg_manager, msg_jobrerun,
		preq->rq_user, preq->rq_host);
	log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO,
		pjob->ji_qs.ji_jobid, log_buffer);

	/* An earlier rerun request still attached to this job is not      */
	/* expected; if one exists, ack it so that its structure is freed. */
	if (pjob->ji_rerun_preq != NULL)
		reply_ack(pjob->ji_rerun_preq);
	pjob->ji_rerun_preq = preq;

	/* bound the wait for the rerun so neither the request nor a */
	/* scheduler waiting on a requeue hangs indefinitely          */
	time_now = time((time_t *)0);
	deadline = time_now +
		(((server.sv_attr[(int)SRV_ATR_JobRequeTimeout].at_flags & ATR_VFLAG_SET) != 0)
			? server.sv_attr[(int)SRV_ATR_JobRequeTimeout].at_val.at_long
			: PBS_DIS_TCP_TIMEOUT_RERUN);

	timeout_task = set_task(WORK_Timed, deadline, timeout_rerun_request, pjob);
	if (timeout_task != NULL) {
		/* linking the task to the job lets it be cleared if the job */
		/* is deleted before the task is served */
		append_link(&pjob->ji_svrtask, &timeout_task->wt_linkobj, timeout_task);
	}

	/* mark the client connection as not subject to timeout */
	if (preq->rq_conn == PBS_LOCAL_CONNECTION)
		return;

	conn_idx = connection_find_actual_index(preq->rq_conn);
	if (conn_idx != -1)
		svr_conn[conn_idx].cn_authen |= PBS_NET_CONN_NOTIMEOUT;
}
Esempio n. 14
0
/*
 * req_rerunjob
 *
 * Services a rerun request for the job named in preq->rq_ind.rq_rerun.
 *
 * For a running job the kill delay is looked up (queue attribute, then server
 * attribute, default 0):
 *  - non-zero delay: SIGTERM is issued with delay_and_send_sig_kill as the
 *    reply handler and the function returns immediately;
 *  - zero delay: SIGKILL is issued with post_rerun as the reply handler and
 *    finalize_rerunjob() completes the request.
 * A job at or beyond JOB_STATE_EXITING is put back into the queued/held state
 * and handed to finalize_rerunjob() with rc == -1.
 *
 * @param preq - the rerun request
 * @return the PBSE_* code used to reject the request, the issue_signal()
 *         result on the delayed-kill path, or whatever finalize_rerunjob()
 *         returns.
 */
int req_rerunjob(
   
  struct batch_request *preq)

  {
  static const char *rerun = "rerun";

  int     rc = PBSE_NONE;
  job    *pjob;

  int     MgrRequired = TRUE;
  char    log_buf[LOCAL_LOG_BUF_SIZE];

  /* check if requestor is admin, job owner, etc */

  if ((pjob = chk_job_request(preq->rq_ind.rq_rerun, preq)) == NULL)
    {
    /* FAILURE */

    /* chk_job_request calls req_reject() */

    rc = PBSE_SYSTEM;
    return rc; /* This needs to fixed to return an accurate error */
    }

  /* the job must be running or completed */

  if (pjob->ji_qs.ji_state >= JOB_STATE_EXITING)
    {
    if (pjob->ji_wattr[JOB_ATR_checkpoint_name].at_flags & ATR_VFLAG_SET)
      {
      /* allow end-users to rerun checkpointed jobs */

      MgrRequired = FALSE;
      }
    }
  else if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
    /* FAILURE - job is in bad state */
    rc = PBSE_BADSTATE;
    snprintf(log_buf, sizeof(log_buf), "job %s is in a bad state",
        preq->rq_ind.rq_rerun);
    req_reject(rc, 0, preq, NULL, log_buf);
    unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
    return rc;
    }

  if ((MgrRequired == TRUE) &&
      ((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0))
    {
    /* FAILURE */

    rc = PBSE_PERM;
    snprintf(log_buf, sizeof(log_buf),
        "additional permissions required (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)");
    req_reject(rc, 0, preq, NULL, log_buf);
    unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
    return rc;
    }

  /* the job must be rerunnable */

  if (pjob->ji_wattr[JOB_ATR_rerunable].at_val.at_long == 0)
    {
    /* NOTE:  should force override this constraint? maybe (???) */
    /*          no, the user is saying that the job will break, and
                IEEE Std 1003.1 specifically says rerun is to be rejected
                if rerunable==FALSE -garrick */

    rc = PBSE_NORERUN;
    snprintf(log_buf, sizeof(log_buf), "job %s not rerunnable",
        preq->rq_ind.rq_rerun);
    req_reject(rc, 0, preq, NULL, log_buf);
    unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
    return rc;
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* ask MOM to kill off the job if it is running */
    int                 delay = 0;
    pbs_queue          *pque;
    char               *extra;

    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      /* NOTE(review): presumably releases the queue mutex when this scope
       * ends -- confirm against mutex_mgr */
      mutex_mgr pque_mutex = mutex_mgr(pque->qu_mutex, true);
      pthread_mutex_lock(server.sv_attr_mutex);
      delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay],
                             &server.sv_attr[SRV_ATR_KillDelay],
                             0);
      pthread_mutex_unlock(server.sv_attr_mutex);
      }
    else if (pjob == NULL)
      {
      /* get_jobs_queue() cleared the job pointer: there is nothing left to
       * unlock (the old code passed NULL to unlock_ji_mutex), but the request
       * still needs an answer like every other failure path */
      rc = PBSE_NORERUN;
      snprintf(log_buf, sizeof(log_buf), "job %s is no longer available",
          preq->rq_ind.rq_rerun);
      req_reject(rc, 0, preq, NULL, log_buf);
      return rc;
      }

    extra = strdup(rerun);

    if (delay != 0)
      {
      get_batch_request_id(preq);

      rc = issue_signal(&pjob, "SIGTERM", delay_and_send_sig_kill, extra, strdup(preq->rq_id));

      if (rc != 0)
        {
        /* cant send to MOM */
        req_reject(rc, 0, preq, NULL, NULL);
        }

      /* issue_signal() takes the address of pjob, so only unlock a job
       * that is still there */
      if (pjob != NULL)
        unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);

      return rc;
      }

    rc = issue_signal(&pjob, "SIGKILL", post_rerun, extra, NULL);
    }
  else
    { 
    if (pjob->ji_wattr[JOB_ATR_hold].at_val.at_long == HOLD_n)
      {
      svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_QUEUED, FALSE);
      }
    else
      {
      svr_setjobstate(pjob, JOB_STATE_HELD, JOB_SUBSTATE_HELD, FALSE);
      }

    /* reset some job attributes */
    
    pjob->ji_wattr[JOB_ATR_comp_time].at_flags &= ~ATR_VFLAG_SET;
    pjob->ji_wattr[JOB_ATR_reported].at_flags &= ~ATR_VFLAG_SET;

    set_statechar(pjob);

    rc = -1;
    }

  return finalize_rerunjob(preq,pjob,rc);
  }
Esempio n. 15
0
/*
 * rerun_or_kill
 *
 * Decides what happens to a job based on its rerunable attribute and the
 * server state:
 *  - rerunable: SIGKILL is issued and the substate is set to
 *    JOB_SUBSTATE_RERUN;
 *  - not rerunable and the server state is not SV_STATE_SHUTDEL: the job is
 *    aborted through job_abt();
 *  - otherwise the job is left as it is and only a message is logged.
 *
 * @param pjob_ptr - address of the job pointer; handed to job_abt() on the
 *                   abort path
 * @param text     - suffix appended to the logged message
 *
 * NOTE(review): issue_signal() is given the address of the local copy, so a
 * job pointer it clears is not propagated back through pjob_ptr -- confirm
 * that callers do not rely on that.
 */
void rerun_or_kill(

  job  **pjob_ptr, /* I (modified/freed) */
  char  *text)     /* I */

  {
  long       server_state = SV_STATE_DOWN;
  char       log_buf[LOCAL_LOG_BUF_SIZE];
  pbs_queue *pque;
  job       *pjob = *pjob_ptr;

  /* start empty: the rerunable branch only builds a message when the job's
   * queue can be found, and the buffer used to be logged uninitialized */
  log_buf[0] = '\0';

  get_svr_attr_l(SRV_ATR_State, &server_state);
  if (pjob->ji_wattr[JOB_ATR_rerunable].at_val.at_long)
    {
    /* job is rerunable, mark it to be requeued */

    issue_signal(&pjob, "SIGKILL", free_br, NULL);

    if (pjob != NULL)
      {
      pjob->ji_qs.ji_substate  = JOB_SUBSTATE_RERUN;
      if ((pque = get_jobs_queue(&pjob)) != NULL)
        {
        snprintf(log_buf, sizeof(log_buf), "%s%s%s", msg_init_queued, pque->qu_qs.qu_name, text);

        unlock_queue(pque, __func__, NULL, LOGLEVEL);
        }
      }
    }
  else if (server_state != SV_STATE_SHUTDEL)
    {
    /* job not rerunable, immediate shutdown - kill it off */
    snprintf(log_buf, sizeof(log_buf), "%s%s", msg_job_abort, text);

    /* need to record log message before purging job */

    log_event(
      PBSEVENT_SYSTEM | PBSEVENT_JOB | PBSEVENT_DEBUG,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buf);

    job_abt(pjob_ptr, log_buf);

    return;
    }
  else
    {
    /* delayed shutdown, leave job running */
    snprintf(log_buf, sizeof(log_buf), "%s%s", msg_leftrunning, text);
    }

  /* log only when the job is still present and a message was built */
  if ((pjob != NULL) && (log_buf[0] != '\0'))
    {
    log_event(
      PBSEVENT_SYSTEM | PBSEVENT_JOB | PBSEVENT_DEBUG,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buf);
    }

  return;
  }  /* END rerun_or_kill() */