Exemplo n.º 1
0
static void reissue_to_svr(

  struct work_task *pwt)

  {
  time_t         time_now = time(NULL);
  char          *br_id = pwt->wt_parm1;
  batch_request *preq = get_remove_batch_request(br_id);

  /* if not timed-out, retry send to remote server */
  if (preq != NULL)
    {
    if (((time_now - preq->rq_time) > PBS_NET_RETRY_LIMIT) ||
        (issue_to_svr(preq->rq_host, preq, pwt->wt_parmfunc) != PBSE_NONE))
      {
      /* either timed-out or got hard error, tell post-function  */
      
      pwt->wt_aux = -1; /* seen as error by post function  */
      pwt->wt_event = -1; /* seen as connection by post func */
      
      if (pwt->wt_parmfunc != NULL)
        ((void (*)())pwt->wt_parmfunc)(pwt);
      }
    }

  free(pwt->wt_mutex);
  free(pwt);
  }  /* END reissue_to_svr() */
Exemplo n.º 2
0
static void post_message_req(

    struct work_task *pwt)

{
    struct batch_request *preq;
    char                  log_buf[LOCAL_LOG_BUF_SIZE];

    svr_disconnect(pwt->wt_event); /* close connection to MOM */

    preq = get_remove_batch_request(pwt->wt_parm1);

    free(pwt->wt_mutex);
    free(pwt);

    /* preq has been hadnled previously */
    if (preq == NULL)
        return;

    preq->rq_conn = preq->rq_orgconn;  /* restore socket to client */

    sprintf(log_buf, msg_messagejob, preq->rq_reply.brp_code);
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, preq->rq_ind.rq_message.rq_jid, log_buf);

    if (preq->rq_reply.brp_code)
        req_reject(preq->rq_reply.brp_code, 0, preq, NULL, NULL);
    else
        reply_ack(preq);
} /* END post_message_req() */
Exemplo n.º 3
0
static void post_delete_route(

  struct work_task *pwt)

  {
  batch_request *preq = get_remove_batch_request((char *)pwt->wt_parm1);

  if (preq != NULL)
    req_deletejob(preq);

  free(pwt->wt_mutex);
  free(pwt);
  return;
  }
Exemplo n.º 4
0
void release_req(

  struct work_task *pwt)

  {
  batch_request *preq;
  char          *br_id = pwt->wt_parm1;

  if ((preq = get_remove_batch_request(br_id)) != NULL)
    free_br(preq);

  if (pwt->wt_event != -1)
    svr_disconnect(pwt->wt_event);

  free(pwt->wt_mutex);
  free(pwt);
  } /* END release_req() */
Exemplo n.º 5
0
void reissue_to_svr(

    struct work_task *pwt)

{
    time_t         time_now = time(NULL);
    char          *br_id;
    batch_request *preq;
    char          *serverName = NULL;

    if (pwt == NULL)
        return;

    br_id = (char *)pwt->wt_parm1;
    preq = get_remove_batch_request(br_id);
    /* if not timed-out, retry send to remote server */
    if (preq != NULL)
    {
        if (preq->rq_host[0] != '\0')
            serverName = strdup(preq->rq_host);
        else
        {
            free(pwt->wt_mutex);
            free(pwt);
            return;
        }

        if (((time_now - preq->rq_time) > PBS_NET_RETRY_LIMIT) ||
                (issue_to_svr(serverName, &preq, pwt->wt_parmfunc) != PBSE_NONE))
        {
            /* either timed-out or got hard error, tell post-function  */

            pwt->wt_aux = -1; /* seen as error by post function  */
            pwt->wt_event = -1; /* seen as connection by post func */

            if (pwt->wt_parmfunc != NULL)
                (* pwt->wt_parmfunc)(pwt);
        }
    }

    if (serverName)
        free(serverName);

    free(pwt->wt_mutex);
    free(pwt);
}  /* END reissue_to_svr() */
Exemplo n.º 6
0
void chkpt_xfr_hold(

  struct work_task *ptask)

  {
  job                  *pjob;

  struct batch_request *preq;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];

  preq = get_remove_batch_request(ptask->wt_parm1);

  free(ptask->wt_mutex);
  free(ptask);

  if ((preq == NULL) ||
      (preq->rq_extra == NULL))
    return;

  if ((pjob = svr_find_job(preq->rq_extra, FALSE)) == NULL)
    return;

  if (LOGLEVEL >= 7)
    {
    sprintf(log_buf,
      "BLCR copy completed (state is %s-%s)",
      PJobState[pjob->ji_qs.ji_state],
      PJobSubState[pjob->ji_qs.ji_substate]);

    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
    }
  
  free_br(preq);

  set_task(WORK_Immed, 0, mom_cleanup_checkpoint_hold, strdup(pjob->ji_qs.ji_jobid), FALSE);

  unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

  return;
  }  /* END chkpt_xfr_hold() */
Exemplo n.º 7
0
static void process_checkpoint_reply(

  struct work_task *pwt)

  {
  job       *pjob;

  struct batch_request *preq;

  svr_disconnect(pwt->wt_event); /* close connection to MOM */

  preq = get_remove_batch_request(pwt->wt_parm1);

  free(pwt->wt_mutex);
  free(pwt);

  /* preq handled previously */
  if (preq == NULL)
    return;

  preq->rq_conn = preq->rq_orgconn;  /* restore client socket */

  if ((pjob = svr_find_job(preq->rq_ind.rq_manager.rq_objname, FALSE)) == NULL)
    {
    log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB,
      preq->rq_ind.rq_manager.rq_objname,
      msg_postmomnojob);
    req_reject(PBSE_UNKJOBID, 0, preq, NULL, msg_postmomnojob);
    }
  else
    {
    /* record that MOM has a checkpoint file */

    account_record(PBS_ACCT_CHKPNT, pjob, "Checkpointed"); /* note in accounting file */
    reply_ack(preq);

    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
    }
  } /* END process_checkpoint_reply() */
Exemplo n.º 8
0
static void post_delete_mom1(

  struct work_task *pwt)

  {
  int                   delay = 0;
  int                   dellen = strlen(deldelaystr);
  job                  *pjob;

  pbs_queue            *pque;

  char                 *preq_clt_id;
  struct batch_request *preq_sig;         /* signal request to MOM */

  struct batch_request *preq_clt = NULL;  /* original client request */
  int                   rc;
  time_t                time_now = time(NULL);

  preq_sig = get_remove_batch_request((char *)pwt->wt_parm1);
  
  free(pwt->wt_mutex);
  free(pwt);

  if (preq_sig == NULL)
    return;

  rc          = preq_sig->rq_reply.brp_code;
  preq_clt_id = preq_sig->rq_extra;

  free_br(preq_sig);

  if (preq_clt_id != NULL)
    {
    preq_clt = get_remove_batch_request(preq_clt_id);
    free(preq_clt_id);
    }

  /* the client request has been handled another way, nothing left to do */
  if (preq_clt == NULL)
    return;

  pjob = svr_find_job(preq_clt->rq_ind.rq_delete.rq_objname, FALSE);

  if (pjob == NULL)
    {
    /* job has gone away */
    req_reject(PBSE_UNKJOBID, 0, preq_clt, NULL, NULL);

    return;
    }

  if (rc)
    {
    /* mom rejected request */

    if (rc == PBSE_UNKJOBID)
      {
      /* MOM claims no knowledge, so just purge it */
      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        "MOM rejected signal during delete");

      /* removed the resources assigned to job */

      free_nodes(pjob);

      set_resc_assigned(pjob, DECR);

      svr_job_purge(pjob);

      reply_ack(preq_clt);
      }
    else
      {
      req_reject(rc, 0, preq_clt, NULL, NULL);

      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
      }

    return;
    }

  if (preq_clt->rq_extend)
    {
    if (strncmp(preq_clt->rq_extend, deldelaystr, dellen) == 0)
      {
      delay = atoi(preq_clt->rq_extend + dellen);
      }
    }

  reply_ack(preq_clt);  /* dont need it, reply now */

  /*
   * if no delay specified in original request, see if kill_delay
   * queue attribute is set.
   */
  if (delay == 0)
    {
    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      pthread_mutex_lock(server.sv_attr_mutex);
      delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay],
                             &server.sv_attr[SRV_ATR_KillDelay],
                             2);
      pthread_mutex_unlock(server.sv_attr_mutex);
      unlock_queue(pque, __func__, NULL, LOGLEVEL);
      }
    else if (pjob != NULL)
      return;
    }

  set_task(WORK_Timed, delay + time_now, post_delete_mom2, strdup(pjob->ji_qs.ji_jobid), FALSE);

  /*
   * Since the first signal has succeeded, let's reschedule the
   * nanny to be 1 minute after the second phase.
   */
  apply_job_delete_nanny(pjob, time_now + delay + 60);

  unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
  }  /* END post_delete_mom1() */
Exemplo n.º 9
0
static void post_job_delete_nanny(

  struct work_task *pwt)

  {
  struct batch_request *preq_sig;  /* signal request to MOM */
  int                   rc;
  job                  *pjob;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  long                  nanny = 0;

  preq_sig = get_remove_batch_request((char *)pwt->wt_parm1);
  
  free(pwt->wt_mutex);
  free(pwt);

  if (preq_sig == NULL)    
    return;

  rc       = preq_sig->rq_reply.brp_code;

  get_svr_attr_l(SRV_ATR_JobNanny, &nanny);
  if (!nanny)
    {
    /* the admin disabled nanny within the last minute or so */
    free_br(preq_sig);

    return;
    }

  /* extract job id from task */
  pjob = svr_find_job(preq_sig->rq_ind.rq_signal.rq_jid, FALSE);

  if (pjob == NULL)
    {
    sprintf(log_buf, "job delete nanny: the job disappeared (this is a BUG!)");

    log_event(PBSEVENT_ERROR,PBS_EVENTCLASS_JOB,preq_sig->rq_ind.rq_signal.rq_jid,log_buf);
    }
  else if (rc == PBSE_UNKJOBID)
    {
    sprintf(log_buf, "job delete nanny returned, but does not exist on mom");

    log_event(PBSEVENT_ERROR,PBS_EVENTCLASS_JOB,preq_sig->rq_ind.rq_signal.rq_jid,log_buf);

    free_nodes(pjob);

    set_resc_assigned(pjob, DECR);
  
    free_br(preq_sig);

    svr_job_purge(pjob);

    return;
    }
  
  unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

  /* free task */
  free_br(preq_sig);

  return;
  } /* END post_job_delete_nanny() */
Exemplo n.º 10
0
void delay_and_send_sig_kill(
    
  batch_request *preq_sig)

  {
  int                   delay = 0;
  job                  *pjob;

  pbs_queue            *pque;

  batch_request        *preq_clt = NULL;  /* original client request */
  int                   rc;
  time_t                time_now = time(NULL);
  char    log_buf[LOCAL_LOG_BUF_SIZE];

  if (preq_sig == NULL)
    return;

  rc = preq_sig->rq_reply.brp_code;

  if (preq_sig->rq_extend != NULL)
    {
    preq_clt = get_remove_batch_request(preq_sig->rq_extend);
    }

  /* the client request has been handled another way, nothing left to do */
  if (preq_clt == NULL)
    return;

  if ((pjob = chk_job_request(preq_clt->rq_ind.rq_rerun, preq_clt)) == NULL)
    {
    /* job has gone away, chk_job_request() calls req_reject() on failure */
    return;
    }

  mutex_mgr pjob_mutex = mutex_mgr(pjob->ji_mutex, true);

  if (rc)
    {
    /* mom rejected request */

    if (rc == PBSE_UNKJOBID)
      {
      /* MOM claims no knowledge, so just purge it */
      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        "MOM rejected signal during rerun");

      /* removed the resources assigned to job */

      free_nodes(pjob);

      set_resc_assigned(pjob, DECR);

      unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

      svr_job_purge(pjob);

      reply_ack(preq_clt);
      }
    else
      {
      pjob_mutex.unlock();
      req_reject(rc, 0, preq_clt, NULL, NULL);
      }

    return;
    }

  // Apply the user delay first so it takes precedence.
  if (pjob->ji_wattr[JOB_ATR_user_kill_delay].at_flags & ATR_VFLAG_SET)
    delay = pjob->ji_wattr[JOB_ATR_user_kill_delay].at_val.at_long;

  if ((pque = get_jobs_queue(&pjob)) != NULL)
    {
    mutex_mgr pque_mutex = mutex_mgr(pque->qu_mutex, true);
    mutex_mgr server_mutex = mutex_mgr(server.sv_attr_mutex, false);

    if (delay == 0)
      {
      delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay],
                             &server.sv_attr[SRV_ATR_KillDelay],
                             0);
      }
    }
  else
    {
    /* why is the pque null. Something went wrong */
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "jobid %s returned a null queue", pjob->ji_qs.ji_jobid);
    req_reject(PBSE_UNKQUE, 0, preq_clt, NULL, log_buf);
    return;
    }

  pjob_mutex.unlock();
  reply_ack(preq_clt);
  set_task(WORK_Timed, delay + time_now, send_sig_kill, strdup(pjob->ji_qs.ji_jobid), FALSE);
  } // END delay_and_send_sig_kill()
Exemplo n.º 11
0
void array_delete_wt(
    
  struct work_task *ptask)

  {
  struct batch_request *preq;
  job_array            *pa;

  int                   i;

  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  int                   num_jobs = 0;
  int                   num_prerun = 0;
  job                  *pjob;

  preq = get_remove_batch_request((char *)ptask->wt_parm1);
  
  free(ptask->wt_mutex);
  free(ptask);

  if (preq == NULL)
    return;

  pa = get_array(preq->rq_ind.rq_delete.rq_objname);

  if (pa == NULL)
    {
    /* jobs must have exited already */
    reply_ack(preq);

    return;
    }

  for (i = 0; i < pa->ai_qs.array_size; i++)
    {
    if (pa->job_ids[i] == NULL)
      continue;
    
    if ((pjob = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
      {
      free(pa->job_ids[i]);
      pa->job_ids[i] = NULL;
      }
    else
      {
      num_jobs++;
      
      if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN)
        {
        num_prerun++;
        /* mom still hasn't gotten job?? delete anyway */
        
        if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
          {
          /* job has restart file at mom, do end job processing */
          change_restart_comment_if_needed(pjob);
          
          svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE);
          
          pjob->ji_momhandle = -1;
          
          /* force new connection */
          if (LOGLEVEL >= 7)
            {
            sprintf(log_buf, "calling on_job_exit from %s", __func__);
            log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
            }
          set_task(WORK_Immed, 0, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
          
          unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
          }
        }
      else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
        {
        /* job has staged-in file, should remove them */
        remove_stagein(&pjob);
        
        if (pjob != NULL)
          {
          /* job_abt() calls svr_job_purge which will try to lock the array again */
          pthread_mutex_unlock(pa->ai_mutex);
          job_abt(&pjob, NULL);
          pthread_mutex_lock(pa->ai_mutex);
          }
        }
      else
        {
        /* job_abt() calls svr_job_purge which will try to lock the array again */
        pthread_mutex_unlock(pa->ai_mutex);
        job_abt(&pjob, NULL);
        pthread_mutex_lock(pa->ai_mutex);
        }
      } /* END if (ji_substate == JOB_SUBSTATE_PRERUN) */
    } /* END for each job in array */
  
  pthread_mutex_unlock(pa->ai_mutex);
  if (LOGLEVEL >= 7)
    {
    sprintf(log_buf, "%s: unlocked ai_mutex", __func__);
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
    }
  
  if (num_jobs == num_prerun)
    {
    reply_ack(preq);
    }
  else
    {
    req_deletearray(preq);
    }

  } /* END array_delete_wt() */
Exemplo n.º 12
0
static void post_modify_req(

  struct work_task *pwt)

  {

  struct batch_request *preq;
  job                  *pjob;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];

  svr_disconnect(pwt->wt_event);  /* close connection to MOM */

  preq = get_remove_batch_request(pwt->wt_parm1);

  free(pwt->wt_mutex);
  free(pwt);

  if (preq == NULL)
    return;

  preq->rq_conn = preq->rq_orgconn;  /* restore socket to client */

  if ((preq->rq_reply.brp_code) && (preq->rq_reply.brp_code != PBSE_UNKJOBID))
    {
    sprintf(log_buf, msg_mombadmodify, preq->rq_reply.brp_code);

    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      preq->rq_ind.rq_modify.rq_objname,
      log_buf);

    req_reject(preq->rq_reply.brp_code, 0, preq, NULL, NULL);
    }
  else
    {
    if (preq->rq_reply.brp_code == PBSE_UNKJOBID)
      {
      if ((pjob = svr_find_job(preq->rq_ind.rq_modify.rq_objname, FALSE)) == NULL)
        {
        req_reject(preq->rq_reply.brp_code, 0, preq, NULL, NULL);
        return;
        }
      else
        {
        if (LOGLEVEL >= 0)
          {
          sprintf(log_buf, "post_modify_req: PBSE_UNKJOBID for job %s in state %s-%s, dest = %s",
            (pjob->ji_qs.ji_jobid != NULL) ? pjob->ji_qs.ji_jobid : "",
            PJobState[pjob->ji_qs.ji_state],
            PJobSubState[pjob->ji_qs.ji_substate],
            pjob->ji_qs.ji_destin);

          log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);
          }
        
        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
        }
      }

    reply_ack(preq);
    }

  return;
  }  /* END post_modify_req() */
Exemplo n.º 13
0
void delay_and_send_sig_kill(batch_request *preq_sig)
  {
  int                   delay = 0;
  job                  *pjob;

  pbs_queue            *pque;

    struct batch_request *preq_clt = NULL;  /* original client request */
  int                   rc;
  time_t                time_now = time(NULL);

  if (preq_sig == NULL)
    return;

  rc          = preq_sig->rq_reply.brp_code;

  if (preq_sig->rq_extend != NULL)
    {
    preq_clt = get_remove_batch_request(preq_sig->rq_extend);
    }
  free_br(preq_sig);

  /* the client request has been handled another way, nothing left to do */
  if (preq_clt == NULL)
    return;

  if ((pjob = chk_job_request(preq_clt->rq_ind.rq_rerun, preq_clt)) == NULL)
    {
    /* job has gone away */
    req_reject(PBSE_UNKJOBID, 0, preq_clt, NULL, NULL);

    return;
    }

  if (rc)
    {
    /* mom rejected request */

    if (rc == PBSE_UNKJOBID)
      {
      /* MOM claims no knowledge, so just purge it */
      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        "MOM rejected signal during rerun");

      /* removed the resources assigned to job */

      free_nodes(pjob);

      set_resc_assigned(pjob, DECR);

      unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

      svr_job_purge(pjob);

      reply_ack(preq_clt);
      }
    else
      {
      unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
      req_reject(rc, 0, preq_clt, NULL, NULL);
      }

    return;
    }

  if ((pque = get_jobs_queue(&pjob)) != NULL)
    {
    mutex_mgr pque_mutex = mutex_mgr(pque->qu_mutex, true);
    pthread_mutex_lock(server.sv_attr_mutex);
    delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay],
                           &server.sv_attr[SRV_ATR_KillDelay],
                           0);
    pthread_mutex_unlock(server.sv_attr_mutex);
    }
  else if (pjob == NULL)
    {
    unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
    return;
    }

  unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
  reply_ack(preq_clt);
  set_task(WORK_Timed, delay + time_now, send_sig_kill, strdup(pjob->ji_qs.ji_jobid), FALSE);
  }
Exemplo n.º 14
0
static void process_hold_reply(

  struct work_task *pwt)

  {
  job                  *pjob;
  pbs_attribute         temphold;

  struct batch_request *preq;
  int                   newstate;
  int                   newsub;
  int                   rc;
  char                 *pset;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];

  svr_disconnect(pwt->wt_event); /* close connection to MOM */

  preq = get_remove_batch_request(pwt->wt_parm1);

  free(pwt->wt_mutex);
  free(pwt);

  /* preq was handled previously */
  if (preq == NULL)
    return;

  preq->rq_conn = preq->rq_orgconn;  /* restore client socket */

  if ((pjob = svr_find_job(preq->rq_ind.rq_hold.rq_orig.rq_objname, FALSE)) == NULL)
    {
    log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB,
              preq->rq_ind.rq_hold.rq_orig.rq_objname,
              msg_postmomnojob);
    req_reject(PBSE_UNKJOBID, 0, preq, NULL, msg_postmomnojob);

    return;
    }
  else if (preq->rq_reply.brp_code != 0)
    {

    rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &pset, &temphold);

    if (rc == 0)
      {
      rc = job_attr_def[JOB_ATR_hold].at_set(&pjob->ji_wattr[JOB_ATR_hold],
           &temphold, DECR);
      }

    pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING;  /* reset it */

    pjob->ji_modified = 1;    /* indicate attributes changed */
    svr_evaljobstate(pjob, &newstate, &newsub, 0);
    svr_setjobstate(pjob, newstate, newsub, FALSE); /* saves job */

    if (preq->rq_reply.brp_code != PBSE_NOSUP)
      {
      sprintf(log_buf, msg_mombadhold, preq->rq_reply.brp_code);
      log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      req_reject(preq->rq_reply.brp_code, 0, preq, NULL, log_buf);
      }
    else
      {
      reply_ack(preq);
      }
    }
  else
    {
    /* record that MOM has a checkpoint file */

    /* PBS_CHECKPOINT_MIGRATEABLE is defined as zero therefore this code will never fire.
     * And if these flags are not set, start_exec will not try to run the job from
     * the checkpoint image file.
     */

    pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHECKPOINT_FILE;

    if (preq->rq_reply.brp_auxcode)  /* checkpoint can be moved */
      {
      pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHECKPOINT_FILE;
      pjob->ji_qs.ji_svrflags |=  JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_MIGRATEABLE;
      }

    pjob->ji_modified = 1;    /* indicate attributes changed     */

    svr_evaljobstate(pjob, &newstate, &newsub, 0);
    svr_setjobstate(pjob, newstate, newsub, FALSE); /* saves job */

    account_record(PBS_ACCT_CHKPNT, pjob, "Checkpointed and held"); /* note in accounting file */
    reply_ack(preq);
    }

  unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

  } /* END process_hold_reply() */