Пример #1
0
void *req_messagejob(
    
  batch_request *preq) /* I */

  {
  job           *pjob;
  int            rc;
  batch_request *dup_req = NULL;

  if ((pjob = chk_job_request(preq->rq_ind.rq_message.rq_jid, preq)) == NULL)
    return(NULL);

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  /* the job must be running */

  if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
    req_reject(PBSE_BADSTATE, 0, preq, NULL, NULL);
    
    return(NULL);
    }

  if ((rc = copy_batchrequest(&dup_req, preq, 0, -1)) != 0)
    {
    req_reject(PBSE_MEM_MALLOC, 0, preq, NULL, NULL);
    }
  /* pass the request on to MOM */
  /* The dup_req is freed in relay_to_mom (failure)
   * or in issue_Drequest (success) */
  else if ((rc = relay_to_mom(&pjob, dup_req, NULL)) != PBSE_NONE)
    {
    req_reject(rc, 0, preq, NULL, NULL); /* unable to get to MOM */
    free_br(dup_req);
    }
  else
    {
    post_message_req(dup_req);
    free_br(preq);
    }

  /* After MOM acts and replies to us, we pick up in post_message_req() */
  if (pjob == NULL)
    job_mutex.set_lock_on_exit(false);

  return(NULL);
  } /* END req_messagejob() */
Пример #2
0
void *req_messagejob(

    void *vp)

{
    struct batch_request *preq = (struct batch_request *)vp;
    job                  *pjob;
    int                   rc;
    struct batch_request *dup_req = NULL;

    if ((pjob = chk_job_request(preq->rq_ind.rq_message.rq_jid, preq)) == NULL)
        return(NULL);

    /* the job must be running */

    if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
        req_reject(PBSE_BADSTATE, 0, preq, NULL, NULL);

        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

        return(NULL);
    }

    if ((rc = copy_batchrequest(&dup_req, preq, 0, -1)) != 0)
    {
        req_reject(PBSE_MEM_MALLOC, 0, preq, NULL, NULL);
    }
    /* pass the request on to MOM */
    /* The dup_req is freed in relay_to_mom (failure)
     * or in issue_Drequest (success) */
    else if ((rc = relay_to_mom(&pjob, dup_req, post_message_req)) != 0)
        req_reject(rc, 0, preq, NULL, NULL); /* unable to get to MOM */
    else
        free_br(preq);

    /* After MOM acts and replies to us, we pick up in post_message_req() */
    if (pjob != NULL)
        unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);

    return(NULL);
} /* END req_messagejob() */
Пример #3
0
/*
 * modify_whole_array()
 * modifies the entire job array 
 * @SEE req_modify_array PARENT
 */ 
int modify_whole_array(

  job_array *pa,              /* I/O */
  svrattrl  *plist,           /* I */
  struct batch_request *preq, /* I */
  int        checkpoint_req)  /* I */

  {
  int   i;
  int   rc = 0;
  int   mom_relay = 0;
  char  log_buf[LOCAL_LOG_BUF_SIZE];
  job  *pjob;

  for (i = 0; i < pa->ai_qs.array_size; i++)
    {
    if (pa->job_ids[i] == NULL)
      continue;

    if ((pjob = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
      {
      free(pa->job_ids[i]);
      pa->job_ids[i] = NULL;
      }
    else
      {
      /* NO_MOM_RELAY will prevent modify_job from calling relay_to_mom */
      rc = modify_job((void **)&pjob, plist, preq, checkpoint_req, NO_MOM_RELAY);

      if (rc == PBSE_RELAYED_TO_MOM)
        {
        struct batch_request *array_req = NULL;
        /* We told modify_job not to call relay_to_mom
         * so we need to contact the mom */
        rc = copy_batchrequest(&array_req, preq, 0, i);
        if (rc != 0)
          {
          unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
          return(rc);
          }

        preq->rq_refcount++;
        if (mom_relay == 0)
          {
          preq->rq_refcount++;
          }
        mom_relay++;
        /* The array_req is freed in relay_to_mom (failure)
         * or in issue_Drequest (success) */
        if ((rc = relay_to_mom(&pjob, array_req, post_modify_arrayreq)))
          {
          if (pjob != NULL)
            {
            snprintf(log_buf,sizeof(log_buf),
              "Unable to relay information to mom for job '%s'\n",
              pjob->ji_qs.ji_jobid);
            log_err(rc, __func__, log_buf);
            unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
            }

          return(rc); /* unable to get to MOM */
          }
        }

      if (pjob != NULL)
        unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
      }
    } /* END foreach job in array */

  if (mom_relay)
    {
    preq->rq_refcount--;
    if (preq->rq_refcount == 0)
      {
      free_br(preq);
      }
    return(PBSE_RELAYED_TO_MOM);
    }

  return(rc);
  } /* END modify_whole_array() */
Пример #4
0
int modify_job(

  void                 **j,               /* O */
  svrattrl              *plist,           /* I */
  struct batch_request  *preq,            /* I */
  int                    checkpoint_req,  /* I */
  int                    flag)            /* I */

  {
  int   bad = 0;
  int   i;
  int   newstate;
  int   newsubstate;
  resource_def *prsd;
  int   rc;
  int   sendmom = 0;
  int   copy_checkpoint_files = FALSE;

  char  log_buf[LOCAL_LOG_BUF_SIZE];
  struct batch_request *dup_req = NULL;

  job *pjob = (job *)*j;
  
  if (pjob == NULL)
    {
    sprintf(log_buf, "job structure is NULL");
    log_err(PBSE_IVALREQ, __func__, log_buf);
    return(PBSE_IVALREQ);
    }

  /* cannot be in exiting or transit, exiting has already been checked */

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /* FAILURE */
    snprintf(log_buf,sizeof(log_buf),
      "Cannot modify job '%s' in transit\n",
      pjob->ji_qs.ji_jobid);

    log_err(PBSE_BADSTATE, __func__, log_buf);

    return(PBSE_BADSTATE);
    }

  if (((checkpoint_req == CHK_HOLD) || (checkpoint_req == CHK_CONT)) &&
      (pjob->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING))
    {
    /* May need to request copy of the checkpoint file from mom */

    copy_checkpoint_files = TRUE;

    if (checkpoint_req == CHK_HOLD)
      {

      sprintf(log_buf,"setting jobsubstate for %s to RERUN\n", pjob->ji_qs.ji_jobid);

      pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

      job_save(pjob, SAVEJOB_QUICK, 0);

      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

      /* remove checkpoint restart file if there is one */
      
      if (pjob->ji_wattr[JOB_ATR_restart_name].at_flags & ATR_VFLAG_SET)
        {
        cleanup_restart_file(pjob);
        }

      }
    }

  /* if job is running, special checks must be made */

  /* NOTE:  must determine if job exists down at MOM - this will occur if
            job is running, job is held, or job was held and just barely
            released (ie qhold/qrls) */

  /* COMMENTED OUT BY JOSH B IN 2.3 DUE TO MAJOR PROBLEMS w/ CUSTOMERS
   * --FIX and uncomment once we know what is really going on.
   *
   * We now know that ji_destin gets set on a qmove and that the mom does not
   * have the job at that point.
   *
  if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) ||
     ((pjob->ji_qs.ji_state == JOB_STATE_HELD) && (pjob->ji_qs.ji_destin[0] != '\0')) ||
     ((pjob->ji_qs.ji_state == JOB_STATE_QUEUED) && (pjob->ji_qs.ji_destin[0] != '\0')))
  */
  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    while (plist != NULL)
      {
      /* is the pbs_attribute modifiable in RUN state ? */

      i = find_attr(job_attr_def, plist->al_name, JOB_ATR_LAST);

      if ((i < 0) ||
          ((job_attr_def[i].at_flags & ATR_DFLAG_ALTRUN) == 0))
        {
        /* FAILURE */
        snprintf(log_buf,sizeof(log_buf),
          "Cannot modify attribute '%s' while running\n",
          plist->al_name);
        log_err(PBSE_MODATRRUN, __func__, log_buf);

        return PBSE_MODATRRUN;
        }

      /* NOTE:  only explicitly specified job attributes are routed down to MOM */

      if (i == JOB_ATR_resource)
        {
        /* is the specified resource modifiable while */
        /* the job is running                         */

        prsd = find_resc_def(svr_resc_def, plist->al_resc, svr_resc_size);

        if (prsd == NULL)
          {
          /* FAILURE */
          snprintf(log_buf,sizeof(log_buf),
            "Unknown attribute '%s'\n",
            plist->al_name);

          log_err(PBSE_UNKRESC, __func__, log_buf);

          return(PBSE_UNKRESC);
          }

        if ((prsd->rs_flags & ATR_DFLAG_ALTRUN) == 0)
          {
          /* FAILURE */
          snprintf(log_buf,sizeof(log_buf),
            "Cannot modify attribute '%s' while running\n",
            plist->al_name);
          log_err(PBSE_MODATRRUN, __func__, log_buf);

          return(PBSE_MODATRRUN);
          }

        sendmom = 1;
        }
/*
        else if ((i == JOB_ATR_checkpoint_name) || (i == JOB_ATR_variables))
        {
        sendmom = 1;
        }
*/

      plist = (svrattrl *)GET_NEXT(plist->al_link);
      }
    }    /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* modify the job's attributes */

  bad = 0;

  plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr);

  rc = modify_job_attr(pjob, plist, preq->rq_perm, &bad);

  if (rc)
    {
    /* FAILURE */
    snprintf(log_buf,sizeof(log_buf),
      "Cannot set attributes for job '%s'\n",
      pjob->ji_qs.ji_jobid);
    log_err(rc, __func__, log_buf);

    if (rc == PBSE_JOBNOTFOUND)
      *j = NULL;

    return(rc);
    }

  /* Reset any defaults resource limit which might have been unset */

  set_resc_deflt(pjob, NULL, FALSE);

  /* if job is not running, may need to change its state */

  if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
    svr_evaljobstate(pjob, &newstate, &newsubstate, 0);

    svr_setjobstate(pjob, newstate, newsubstate, FALSE);
    }
  else
    {
    job_save(pjob, SAVEJOB_FULL, 0);
    }

  sprintf(log_buf, msg_manager, msg_jobmod, preq->rq_user, preq->rq_host);

  log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

  /* if a resource limit changed for a running job, send to MOM */

  if (sendmom)
    {
    /* if the NO_MOM_RELAY flag is set the calling function will call
       relay_to_mom so we do not need to do it here */
    if (flag != NO_MOM_RELAY)
      {
      /* The last number is unused unless this is an array */
      if ((rc = copy_batchrequest(&dup_req, preq, 0, -1)) != 0)
        {
        }
      /* The dup_req is freed in relay_to_mom (failure)
       * or in issue_Drequest (success) */
      else if ((rc = relay_to_mom(&pjob, dup_req, post_modify_req)))
        {
        if (pjob != NULL)
          {
          snprintf(log_buf,sizeof(log_buf),
            "Unable to relay information to mom for job '%s'\n",
            pjob->ji_qs.ji_jobid);
          
          log_err(rc, __func__, log_buf);
          }

        return(rc); /* unable to get to MOM */
        }
      }

    return(PBSE_RELAYED_TO_MOM);
    }

  if (copy_checkpoint_files)
    {
    struct batch_request *momreq = 0;
    momreq = cpy_checkpoint(momreq, pjob, JOB_ATR_checkpoint_name, CKPT_DIR_OUT);

    if (momreq != NULL)
      {
      /* have files to copy */
      momreq->rq_extra = strdup(pjob->ji_qs.ji_jobid);

      /* The momreq is freed in relay_to_mom (failure)
       * or in issue_Drequest (success) */
      if (checkpoint_req == CHK_HOLD)
        {
        rc = relay_to_mom(&pjob, momreq, chkpt_xfr_hold);
        }
      else
        {
        rc = relay_to_mom(&pjob, momreq, chkpt_xfr_done);
        }

      if (rc != 0)
        {
        if (pjob != NULL)
          {
          snprintf(log_buf,sizeof(log_buf),
            "Unable to relay information to mom for job '%s'\n",
            pjob->ji_qs.ji_jobid);
          
          log_err(rc, __func__, log_buf);
          }

        return(PBSE_NONE);  /* come back when mom replies */
        }
      }
    else
      {
      log_err(-1, __func__, "Failed to get batch request");
      }
    }

  return(PBSE_NONE);
  } /* END modify_job() */
Пример #5
0
int req_signaljob(

  void *vp)  /* I */

  {
  struct batch_request *preq = (struct batch_request *)vp;
  job                  *pjob;
  int                   rc;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  struct batch_request *dup_req = NULL;

  /* preq free'd in error cases */
  if ((pjob = chk_job_request(preq->rq_ind.rq_signal.rq_jid, preq)) == 0)
    {
    return(PBSE_NONE);
    }

  /* the job must be running */

  if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
    req_reject(PBSE_BADSTATE, 0, preq, NULL, NULL);

    unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL);
    return(PBSE_NONE);
    }

  /* Special pseudo signals for suspend and resume require op/mgr */

  if (!strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_RESUME) ||
      !strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_SUSPEND))
    {
    if ((preq->rq_perm & (ATR_DFLAG_OPRD | ATR_DFLAG_OPWR | ATR_DFLAG_MGRD | ATR_DFLAG_MGWR)) == 0)
      {
      /* for suspend/resume, must be mgr/op */
      req_reject(PBSE_PERM, 0, preq, NULL, NULL);
      
      unlock_ji_mutex(pjob, __func__, (char *)"2", LOGLEVEL);
      return(PBSE_NONE);
      }
  
    }

  /* save job ptr for post_signal_req() */
  preq->rq_extra = strdup(pjob->ji_qs.ji_jobid);

  /* FIXME: need a race-free check for available free subnodes before
   * resuming a suspended job */

#ifdef DONOTSUSPINTJOB
  /* interactive jobs don't resume correctly so don't allow a suspend */

  if (!strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_SUSPEND) &&
      (pjob->ji_wattr[JOB_ATR_interactive].at_flags & ATR_VFLAG_SET) &&
      (pjob->ji_wattr[JOB_ATR_interactive].at_val.at_long > 0))
    {
    req_reject(PBSE_JOBTYPE, 0, preq, NULL, NULL);

    unlock_ji_mutex(pjob, __func__, (char *)"3", LOGLEVEL);
    return(PBSE_NONE);
    }

#endif

  if (LOGLEVEL >= 6)
    {
    sprintf(log_buf, "relaying signal request to mom %lu", pjob->ji_qs.ji_un.ji_exect.ji_momaddr);

    log_record(PBSEVENT_SCHED,PBS_EVENTCLASS_REQUEST,"req_signaljob",log_buf);
    }

  /* send reply for asynchronous suspend */
  if (preq->rq_type == PBS_BATCH_AsySignalJob)
    {
    reply_ack(preq);
    preq->rq_noreply = TRUE;
    }

  /* pass the request on to MOM */

  if ((rc = copy_batchrequest(&dup_req, preq, 0, -1)) != 0)
    {
    req_reject(rc, 0, preq, NULL, "can not allocate memory");
    unlock_ji_mutex(pjob, __func__, (char *)"4", LOGLEVEL);
    }
  /* The dup_req is freed in relay_to_mom (failure)
   * or in issue_Drequest (success) */
  else 
    {
    rc = relay_to_mom(&pjob, dup_req, NULL);

    if (pjob != NULL)
      unlock_ji_mutex(pjob, __func__, (char *)"4", LOGLEVEL);

    if (rc != PBSE_NONE)
      {
      free_br(dup_req);
      req_reject(rc, 0, preq, NULL, NULL);  /* unable to get to MOM */
      }
    else
      {
      post_signal_req(dup_req);
      free_br(preq);
      }
    }

  /* If successful we ack after mom replies to us, we pick up in post_signal_req() */

  return(PBSE_NONE);
  }  /* END req_signaljob() */
Пример #6
0
int modify_array_range(

  job_array *pa,              /* I/O */
  char      *range,           /* I */
  svrattrl  *plist,           /* I */
  struct batch_request *preq, /* I */
  int        checkpoint_req)  /* I */

  {
  char id[] = "modify_array_range";
  tlist_head tl;
  int i, rc;
  int mom_relay = 0;

  array_request_node *rn;
  array_request_node *to_free;
  
  CLEAR_HEAD(tl);
  
  if (parse_array_request(range,&tl) > 0)
    {
    /* don't hold the jobs if range error */
    
    return(FAILURE);
    }
  else 
    {
    /* hold just that range from the array */
    rn = (array_request_node*)GET_NEXT(tl);
    
    while (rn != NULL)
      {
      for (i = rn->start; i <= rn->end; i++)
        {
        if ((i >= pa->ai_qs.array_size) ||
            (pa->jobs[i] == NULL))
          continue;
        
        rc = modify_job(pa->jobs[i],plist,preq,checkpoint_req, NO_MOM_RELAY);

        if (rc == PBSE_RELAYED_TO_MOM)
          {
          struct batch_request *array_req = NULL;
          
          /* We told modify_job not to call relay_to_mom so we need to contact the mom */
          rc = copy_batchrequest(&array_req, preq, 0, i);
          if (rc != 0)
            {
            return(rc);
            }
          
          preq->rq_refcount++;
          if (mom_relay == 0)
            {
            preq->rq_refcount++;
            }
          mom_relay++;
          if ((rc = relay_to_mom(
                      pa->jobs[i],
                      array_req,
                      post_modify_arrayreq)))
            {  
            snprintf(log_buffer,sizeof(log_buffer),
              "Unable to relay information to mom for job '%s'\n",
              pa->jobs[i]->ji_qs.ji_jobid);
            log_err(rc,id,log_buffer);
          
            return(rc); /* unable to get to MOM */
            }
        
          }  
        }
      
      /* release mem */
      to_free = rn;
      rn = (array_request_node*)GET_NEXT(rn->request_tokens_link);
      free(to_free);
      }
    }

  if (mom_relay)
    {
    preq->rq_refcount--;
    if (preq->rq_refcount == 0)
      {
      free_br(preq);
      }
    return(PBSE_RELAYED_TO_MOM);
    }

  return(PBSE_NONE);
  } /* END modify_array_range() */
Пример #7
0
int modify_array_range(

  job_array *pa,              /* I/O */
  char      *range,           /* I */
  svrattrl  *plist,           /* I */
  struct batch_request *preq, /* I */
  int        checkpoint_req)  /* I */

  {
  char                log_buf[LOCAL_LOG_BUF_SIZE];
  tlist_head          tl;
  int                 i;
  int                 rc;
  int                 mom_relay = 0;
  job                *pjob;

  array_request_node *rn;
  array_request_node *to_free;
  
  CLEAR_HEAD(tl);
  
  if (parse_array_request(range,&tl) > 0)
    {
    /* don't hold the jobs if range error */
    
    return(FAILURE);
    }
  else 
    {
    /* hold just that range from the array */
    rn = (array_request_node*)GET_NEXT(tl);
    
    while (rn != NULL)
      {
      for (i = rn->start; i <= rn->end; i++)
        {
        if ((i >= pa->ai_qs.array_size) ||
            (pa->job_ids[i] == NULL))
          continue;

        if ((pjob = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
          {
          free(pa->job_ids[i]);
          pa->job_ids[i] = NULL;
          }
        else
          {
          pthread_mutex_unlock(pa->ai_mutex);
          rc = modify_job((void **)&pjob, plist, preq, checkpoint_req, NO_MOM_RELAY);
          pa = get_jobs_array(&pjob);
          
          if (pjob != NULL)
            {
            if (rc == PBSE_RELAYED_TO_MOM)
              {
              struct batch_request *array_req = NULL;
              
              /* We told modify_job not to call relay_to_mom so we need to contact the mom */
              if ((rc = copy_batchrequest(&array_req, preq, 0, i)) != PBSE_NONE)
                {
                return(rc);
                }
              
              preq->rq_refcount++;
              if (mom_relay == 0)
                {
                preq->rq_refcount++;
                }
              mom_relay++;
              
              /* The array_req is freed in relay_to_mom (failure)
               * or in issue_Drequest (success) */
              
              if ((rc = relay_to_mom(&pjob, array_req, NULL)))
                {
                snprintf(log_buf,sizeof(log_buf),
                  "Unable to relay information to mom for job '%s'\n",
                  pjob->ji_qs.ji_jobid);
                log_err(rc, __func__, log_buf);
                
                unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
                
                return(rc); /* unable to get to MOM */
                }
              else
                {
                unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
                post_modify_arrayreq(array_req);
                }
              }
            else
              unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
            }
          else
            pa->job_ids[i] = NULL;

          }
        }
      
      /* release mem */
      to_free = rn;
      rn = (array_request_node*)GET_NEXT(rn->request_tokens_link);
      free(to_free);
      }
    }

  if (mom_relay)
    {
    preq->rq_refcount--;
    if (preq->rq_refcount == 0)
      {
      free_br(preq);
      }
    return(PBSE_RELAYED_TO_MOM);
    }

  return(PBSE_NONE);
  } /* END modify_array_range() */
Пример #8
0
void *req_checkpointjob(

  void *vp)

  {
  struct batch_request *preq = (struct batch_request *)vp;
  job                  *pjob;
  int                   rc;
  pbs_attribute        *pattr;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  struct batch_request *dup_req = NULL;

  if ((pjob = chk_job_request(preq->rq_ind.rq_manager.rq_objname, preq)) == NULL)
    {
    return(NULL);
    }

  pattr = &pjob->ji_wattr[JOB_ATR_checkpoint];

  if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
      ((pattr->at_flags & ATR_VFLAG_SET) &&
       ((csv_find_string(pattr->at_val.at_str, "s") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "c") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "enabled") != NULL))))
    {
    /* have MOM attempt checkpointing */

    if ((rc = copy_batchrequest(&dup_req, preq, 0, -1)) != 0)
      {
      req_reject(rc, 0, preq, NULL, "failure to allocate memory");
      }
    /* The dup_req is freed in relay_to_mom (failure)
     * or in issue_Drequest (success) */
    else if ((rc = relay_to_mom(&pjob, dup_req, NULL)) != PBSE_NONE)
      {
      req_reject(rc, 0, preq, NULL, NULL);
      free_br(dup_req);
      }
    else
      {
      if (pjob != NULL)
        {
        pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHECKPOINT_FILE;
        
        job_save(pjob, SAVEJOB_QUICK, 0);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL);
        pjob = NULL;
        }

      process_checkpoint_reply(dup_req);
      }
    }
  else
    {
    /* Job does not have checkpointing enabled, so reject the request */

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    req_reject(PBSE_IVALREQ, 0, preq, NULL, "job is not checkpointable");
    }

  if (pjob != NULL)
    unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL);

  return(NULL);
  }  /* END req_checkpointjob() */
Пример #9
0
int req_holdjob(

  void *vp) /* I */

  {
  long                 *hold_val;
  int                   newstate;
  int                   newsub;
  long                  old_hold;
  job                  *pjob;
  char                 *pset;
  int                   rc;
  pbs_attribute         temphold;
  pbs_attribute        *pattr;
  struct batch_request *preq = (struct batch_request *)vp;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  struct batch_request *dup_req = NULL;

  pjob = chk_job_request(preq->rq_ind.rq_hold.rq_orig.rq_objname, preq);

  if (pjob == NULL)
    {
    return(PBSE_NONE);
    }

  /* cannot do anything until we decode the holds to be set */

  if ((rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &pset,
                     &temphold)) != 0)
    {
    req_reject(rc, 0, preq, NULL, NULL);

    unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL);

    return(PBSE_NONE);
    }

  /* if other than HOLD_u is being set, must have privil */

  if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0)
    {
    req_reject(rc, 0, preq, NULL, NULL);

    unlock_ji_mutex(pjob, __func__, (char *)"2", LOGLEVEL);

    return(PBSE_NONE);
    }

  hold_val = &pjob->ji_wattr[JOB_ATR_hold].at_val.at_long;

  old_hold = *hold_val;
  *hold_val |= temphold.at_val.at_long;
  pjob->ji_wattr[JOB_ATR_hold].at_flags |= ATR_VFLAG_SET;
  sprintf(log_buf, msg_jobholdset, pset, preq->rq_user, preq->rq_host);

  pattr = &pjob->ji_wattr[JOB_ATR_checkpoint];

  if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
      ((pattr->at_flags & ATR_VFLAG_SET) &&
       ((csv_find_string(pattr->at_val.at_str, "s") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "c") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "enabled") != NULL))))
    {

    /* have MOM attempt checkpointing */

    if ((rc = copy_batchrequest(&dup_req, preq, 0, -1)) != 0)
      {
      req_reject(rc, 0, preq, NULL, "memory allocation failure");
      }
    /* The dup_req is freed in relay_to_mom (failure)
     * or in issue_Drequest (success) */
    else if ((rc = relay_to_mom(&pjob, dup_req, NULL)) != PBSE_NONE)
      {
      free_br(dup_req);
      *hold_val = old_hold;  /* reset to the old value */
      req_reject(rc, 0, preq, NULL, NULL);
      }
    else
      {
      if (pjob != NULL)
        {
        pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_FILE;
        
        job_save(pjob, SAVEJOB_QUICK, 0);
        
        /* fill in log_buf again, since relay_to_mom changed it */
        sprintf(log_buf, msg_jobholdset, pset, preq->rq_user, preq->rq_host);
        
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        unlock_ji_mutex(pjob, __func__, (char *)"3", LOGLEVEL);
        pjob = NULL;
        req_reject(rc, 0, preq, NULL, "relay to mom failed");
        }

      process_hold_reply(dup_req);
      }
    }
#ifdef ENABLE_BLCR
  else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * This system is configured with BLCR checkpointing to be used,
     * but this Running job does not have checkpointing enabled,
     * so we reject the request
     */

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    req_reject(PBSE_IVALREQ, 0, preq, NULL,
      "job not held since checkpointing is expected but not enabled for job");
    }
#endif
  else
    {
    /* everything went well, may need to update the job state */

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    if (old_hold != *hold_val)
      {
      /* indicate attributes changed     */

      pjob->ji_modified = 1;

      svr_evaljobstate(pjob, &newstate, &newsub, 0);

      svr_setjobstate(pjob, newstate, newsub, FALSE);
      }

    reply_ack(preq);
    }

  if (pjob != NULL)
    unlock_ji_mutex(pjob, __func__, (char *)"3", LOGLEVEL);

  return(PBSE_NONE);
  }  /* END req_holdjob() */
Пример #10
0
/*
 * modify_whole_array()
 * modifies the entire job array 
 * @SEE req_modify_array PARENT
 */ 
int modify_whole_array(

  job_array *pa,              /* I/O */
  svrattrl  *plist,           /* I */
  struct batch_request *preq, /* I */
  int        checkpoint_req)  /* I */

  {
  char id[] = "modify_whole_array";
  int i;
  int rc = 0;
  int mom_relay = 0;

  for (i = 0; i < pa->ai_qs.array_size; i++)
    {
    if (pa->jobs[i] == NULL)
      continue;

    /* NO_MOM_RELAY will prevent modify_job from calling relay_to_mom */
    rc = modify_job(pa->jobs[i],plist,preq,checkpoint_req, NO_MOM_RELAY);

    if(rc == PBSE_RELAYED_TO_MOM)
      {
      struct batch_request *array_req = NULL;

      /* We told modify_job not to call relay_to_mom so we need to contact the mom */
      rc = copy_batchrequest(&array_req, preq, 0, i);
      if(rc != 0)
        {
        return(rc);
        }

      preq->rq_refcount++;
      if(mom_relay == 0)
        {
        preq->rq_refcount++;
        }
      mom_relay++;
      if ((rc = relay_to_mom(
                  pa->jobs[i]->ji_qs.ji_un.ji_exect.ji_momaddr,
                  array_req,
                  post_modify_arrayreq)))
        {  
        snprintf(log_buffer,sizeof(log_buffer),
          "Unable to relay information to mom for job '%s'\n",
          pa->jobs[i]->ji_qs.ji_jobid);
        log_err(rc,id,log_buffer);

        return(rc); /* unable to get to MOM */
        }

      }
    }

  if(mom_relay)
    {
    preq->rq_refcount--;
    if(preq->rq_refcount == 0)
      {
      free_br(preq);
      }
    return(PBSE_RELAYED_TO_MOM);
    }

  return(rc);
  } /* END modify_whole_array() */