/*
 * svr_send_checkpoint - start copying a job's checkpoint files to the MOM.
 *
 * Builds a copy request with cpy_checkpoint() (direction CKPT_DIR_IN).  When
 * there is nothing to copy the job is handed straight to svr_strtjob2().
 * Otherwise the job id is attached to the request as rq_extra (it is read
 * back by post_checkpointsend) and the request is relayed to the MOM.
 *
 * pjob     - (I) job whose checkpoint files are to be sent
 * preq     - (I) originating client request, may be NULL; acknowledged as
 *                soon as the copy has been started
 * state    - (I) job state to set once the relay has been started
 * substate - (I) job substate to set once the relay has been started
 *
 * Returns the result of svr_strtjob2() when there are no files to send,
 * PBSE_SYSTEM when the job id buffer cannot be allocated, otherwise the
 * return code of relay_to_mom() (0 on success).
 */
static int svr_send_checkpoint(

  job                  *pjob,     /* I */
  struct batch_request *preq,     /* I */
  int                   state,    /* I */
  int                   substate) /* I */

  {

  struct batch_request *momreq = NULL;
  char                 *jobid_copy;
  int                   rc;

  momreq = cpy_checkpoint(momreq, pjob, JOB_ATR_checkpoint_name, CKPT_DIR_IN);

  if (momreq == NULL)
    {
    /* no files to send, go directly to sending job to mom */

    return(svr_strtjob2(pjob, preq));
    }

  /* save job id for post_checkpointsend */

  jobid_copy = (char *)malloc(PBS_MAXSVRJOBID + 1);

  if (jobid_copy == NULL)
    {
    /* NOTE(review): momreq is not released on this path - confirm who owns
     * it and release it with the project's batch_request destructor. */

    return(PBSE_SYSTEM);
    }

  /* bounded copy: never write past the buffer, always NUL-terminate */

  strncpy(jobid_copy, pjob->ji_qs.ji_jobid, PBS_MAXSVRJOBID);

  jobid_copy[PBS_MAXSVRJOBID] = '\0';

  momreq->rq_extra = jobid_copy;

  rc = relay_to_mom(
         pjob->ji_qs.ji_un.ji_exect.ji_momaddr,
         momreq,
         post_checkpointsend);

  if (rc == 0)
    {
    svr_setjobstate(pjob, state, substate);

    /*
     * checkpoint copy started ok - reply to client as copy may
     * take too long to wait.
     */

    if (preq != NULL)
      reply_ack(preq);
    }
  else
    {
    /* relay failed: drop the job id copy and clear the pointer so that no
     * later user of momreq can see (or free) the stale buffer.
     * NOTE(review): momreq itself is still not released here - confirm. */

    free(jobid_copy);

    momreq->rq_extra = NULL;
    }

  return(rc);
  }  /* END svr_send_checkpoint() */
Exemple #2
0
END_TEST

START_TEST(cpy_checkpoint_test)
  {
  /* cpy_checkpoint() is expected to hand back NULL both when it is given
   * no batch_request to fill in and when it is given no job. */
  struct job           *job_under_test = job_alloc();
  struct batch_request *copied;
  struct batch_request *seed_request;

  /* case 1: NULL batch_request, valid job */
  copied = cpy_checkpoint(NULL, job_under_test, JOB_ATR_checkpoint_name, CKPT_DIR_IN);

  seed_request = alloc_br(/*PBS_BATCH_CheckpointJob*/0);

  fail_unless(copied == NULL, "NULL batch_request input fail");

  /* case 2: valid batch_request, NULL job */
  copied = cpy_checkpoint(seed_request, NULL, JOB_ATR_checkpoint_name, CKPT_DIR_IN);

  fail_unless(copied == NULL, "NULL job input fail");

  /*TODO: add test for valid input, invalid dir value*/
  }
Exemple #3
0
/*
 * modify_job - apply a set of attribute changes to a job and, where needed,
 * forward the change (or a checkpoint-file copy request) to the MOM.
 *
 * j              - (I/O) address of the job pointer; *j is set to NULL when
 *                  modify_job_attr() reports PBSE_JOBNOTFOUND
 * plist          - (I) attribute list that is checked against the
 *                  "modifiable while running" rules
 * preq           - (I) the modify request; its rq_attr list is the one that
 *                  is actually applied, and its rq_perm/rq_user/rq_host are
 *                  used for permission checks and logging
 * checkpoint_req - (I) CHK_HOLD or CHK_CONT requests a copy of the
 *                  checkpoint files when the job substate is RUNNING
 * flag           - (I) NO_MOM_RELAY means the caller relays to the MOM
 *                  itself, so no relay is issued here
 *
 * Returns:
 *   PBSE_NONE            attributes applied, nothing relayed as a modify
 *   PBSE_RELAYED_TO_MOM  a resource of a running job changed and the request
 *                        was relayed (or the caller will relay it)
 *   PBSE_IVALREQ         *j is NULL
 *   PBSE_BADSTATE        job is in transit
 *   PBSE_MODATRRUN       attribute/resource may not be changed while running
 *   PBSE_UNKRESC         unknown resource name
 *   otherwise the error returned by modify_job_attr(), copy_batchrequest()
 *   or relay_to_mom() for the modify relay
 */
int modify_job(

  void                 **j,               /* I/O */
  svrattrl              *plist,           /* I */
  struct batch_request  *preq,            /* I */
  int                    checkpoint_req,  /* I */
  int                    flag)            /* I */

  {
  int   bad = 0;
  int   i;
  int   newstate;
  int   newsubstate;
  resource_def *prsd;
  int   rc;
  int   sendmom = 0;
  int   copy_checkpoint_files = FALSE;

  char  log_buf[LOCAL_LOG_BUF_SIZE];
  struct batch_request *dup_req = NULL;

  job *pjob = (job *)*j;

  if (pjob == NULL)
    {
    log_err(PBSE_IVALREQ, __func__, "job structure is NULL");
    return(PBSE_IVALREQ);
    }

  /* cannot be in exiting or transit, exiting has already been checked */

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /* FAILURE */
    snprintf(log_buf,sizeof(log_buf),
      "Cannot modify job '%s' in transit\n",
      pjob->ji_qs.ji_jobid);

    log_err(PBSE_BADSTATE, __func__, log_buf);

    return(PBSE_BADSTATE);
    }

  if (((checkpoint_req == CHK_HOLD) || (checkpoint_req == CHK_CONT)) &&
      (pjob->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING))
    {
    /* May need to request copy of the checkpoint file from mom */

    copy_checkpoint_files = TRUE;

    if (checkpoint_req == CHK_HOLD)
      {
      snprintf(log_buf, sizeof(log_buf),
        "setting jobsubstate for %s to RERUN\n",
        pjob->ji_qs.ji_jobid);

      pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

      job_save(pjob, SAVEJOB_QUICK, 0);

      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

      /* remove checkpoint restart file if there is one */

      if (pjob->ji_wattr[JOB_ATR_restart_name].at_flags & ATR_VFLAG_SET)
        {
        cleanup_restart_file(pjob);
        }
      }
    }

  /* if job is running, special checks must be made */

  /* NOTE:  must determine if job exists down at MOM - this will occur if
            job is running, job is held, or job was held and just barely
            released (ie qhold/qrls) */

  /* HISTORY: a wider test that also treated HELD and QUEUED jobs with a
   * non-empty ji_destin as "present at the MOM" was disabled in 2.3 because
   * of customer problems: ji_destin is also set by a qmove, at which point
   * the MOM does not have the job.  Only the RUNNING state is checked. */

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    while (plist != NULL)
      {
      /* is the pbs_attribute modifiable in RUN state ? */

      i = find_attr(job_attr_def, plist->al_name, JOB_ATR_LAST);

      if ((i < 0) ||
          ((job_attr_def[i].at_flags & ATR_DFLAG_ALTRUN) == 0))
        {
        /* FAILURE */
        snprintf(log_buf,sizeof(log_buf),
          "Cannot modify attribute '%s' while running\n",
          plist->al_name);
        log_err(PBSE_MODATRRUN, __func__, log_buf);

        return(PBSE_MODATRRUN);
        }

      /* NOTE:  only explicitly specified job attributes are routed down to MOM */

      if (i == JOB_ATR_resource)
        {
        /* is the specified resource modifiable while */
        /* the job is running                         */

        prsd = find_resc_def(svr_resc_def, plist->al_resc, svr_resc_size);

        if (prsd == NULL)
          {
          /* FAILURE */
          snprintf(log_buf,sizeof(log_buf),
            "Unknown attribute '%s'\n",
            plist->al_name);

          log_err(PBSE_UNKRESC, __func__, log_buf);

          return(PBSE_UNKRESC);
          }

        if ((prsd->rs_flags & ATR_DFLAG_ALTRUN) == 0)
          {
          /* FAILURE */
          snprintf(log_buf,sizeof(log_buf),
            "Cannot modify attribute '%s' while running\n",
            plist->al_name);
          log_err(PBSE_MODATRRUN, __func__, log_buf);

          return(PBSE_MODATRRUN);
          }

        /* a resource of a running job changed: the MOM has to be told */

        sendmom = 1;
        }

      plist = (svrattrl *)GET_NEXT(plist->al_link);
      }
    }    /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* modify the job's attributes */

  plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr);

  /* Format the failure message BEFORE the call: on PBSE_JOBNOTFOUND the job
   * pointer is treated as invalid (*j is cleared below), so the job id must
   * not be read from pjob once modify_job_attr() has returned an error. */

  snprintf(log_buf,sizeof(log_buf),
    "Cannot set attributes for job '%s'\n",
    pjob->ji_qs.ji_jobid);

  rc = modify_job_attr(pjob, plist, preq->rq_perm, &bad);

  if (rc)
    {
    /* FAILURE */
    log_err(rc, __func__, log_buf);

    if (rc == PBSE_JOBNOTFOUND)
      *j = NULL;

    return(rc);
    }

  /* Reset any defaults resource limit which might have been unset */

  set_resc_deflt(pjob, NULL, FALSE);

  /* if job is not running, may need to change its state */

  if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
    svr_evaljobstate(pjob, &newstate, &newsubstate, 0);

    svr_setjobstate(pjob, newstate, newsubstate, FALSE);
    }
  else
    {
    job_save(pjob, SAVEJOB_FULL, 0);
    }

  /* bounded: rq_user / rq_host come from the request */

  snprintf(log_buf, sizeof(log_buf), msg_manager, msg_jobmod, preq->rq_user, preq->rq_host);

  log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

  /* if a resource limit changed for a running job, send to MOM */

  if (sendmom)
    {
    /* if the NO_MOM_RELAY flag is set the calling function will call
       relay_to_mom so we do not need to do it here */
    if (flag != NO_MOM_RELAY)
      {
      /* The last number is unused unless this is an array */
      rc = copy_batchrequest(&dup_req, preq, 0, -1);

      if (rc != 0)
        {
        /* Nothing was relayed, so the caller must not be told
         * PBSE_RELAYED_TO_MOM - report the copy failure instead. */
        snprintf(log_buf,sizeof(log_buf),
          "Unable to copy batch request for job '%s'\n",
          pjob->ji_qs.ji_jobid);

        log_err(rc, __func__, log_buf);

        return(rc);
        }

      /* The dup_req is freed in relay_to_mom (failure)
       * or in issue_Drequest (success) */
      rc = relay_to_mom(&pjob, dup_req, post_modify_req);

      if (rc != 0)
        {
        /* relay_to_mom() takes &pjob and may leave it NULL */
        if (pjob != NULL)
          {
          snprintf(log_buf,sizeof(log_buf),
            "Unable to relay information to mom for job '%s'\n",
            pjob->ji_qs.ji_jobid);

          log_err(rc, __func__, log_buf);
          }

        return(rc); /* unable to get to MOM */
        }
      }

    return(PBSE_RELAYED_TO_MOM);
    }

  if (copy_checkpoint_files)
    {
    struct batch_request *momreq = NULL;

    momreq = cpy_checkpoint(momreq, pjob, JOB_ATR_checkpoint_name, CKPT_DIR_OUT);

    if (momreq != NULL)
      {
      /* have files to copy */

      /* NOTE(review): strdup() result is not checked; the completion
       * handlers presumably read rq_extra as the job id - confirm how a
       * NULL rq_extra should be handled. */
      momreq->rq_extra = strdup(pjob->ji_qs.ji_jobid);

      /* The momreq is freed in relay_to_mom (failure)
       * or in issue_Drequest (success) */
      if (checkpoint_req == CHK_HOLD)
        {
        rc = relay_to_mom(&pjob, momreq, chkpt_xfr_hold);
        }
      else
        {
        rc = relay_to_mom(&pjob, momreq, chkpt_xfr_done);
        }

      if (rc != 0)
        {
        if (pjob != NULL)
          {
          snprintf(log_buf,sizeof(log_buf),
            "Unable to relay information to mom for job '%s'\n",
            pjob->ji_qs.ji_jobid);

          log_err(rc, __func__, log_buf);
          }

        /* NOTE(review): a relay failure is only logged and still reported
         * as PBSE_NONE, exactly like the success path below.  The original
         * comment on this return read "come back when mom replies" -
         * confirm whether the failure should be propagated instead. */
        return(PBSE_NONE);
        }
      }
    else
      {
      log_err(-1, __func__, "Failed to get batch request");
      }
    }

  return(PBSE_NONE);
  } /* END modify_job() */