Example #1
/*
 * get_job_script_path()
 *
 * @pre-cond: pjob must point to a valid job
 * @post-cond: script_path will be populated with the path to the job's script on success
 * @return: PBSE_NONE on success, PBSE_JOB_RECYCLED if the job disappears, or -1 if the
 * array cannot be found for an array job.
 */
int get_job_script_path(

  job         *pjob,
  std::string &script_path)

  {
  // get the adjusted path to the script
  script_path = get_path_jobdata(pjob->ji_qs.ji_jobid, path_jobs);

  if (pjob->ji_arraystructid[0] != '\0')
    {
    job_array *pa = get_jobs_array(&pjob);

    if (pa != NULL)
      {
      script_path += pa->ai_qs.fileprefix;
      unlock_ai_mutex(pa, __func__, NULL, LOGLEVEL);
      }
    else if (pjob == NULL)
      return(PBSE_JOB_RECYCLED);
    else
      return(-1);
    }
  else
    {
    script_path += pjob->ji_qs.ji_fileprefix;
    }
    
  script_path += JOB_SCRIPT_SUFFIX;
  return(PBSE_NONE);
  } /* END get_job_script_path() */
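
A minimal usage sketch for the function above. It is illustrative only: open_job_script() is a hypothetical caller, not a TORQUE function, and it assumes the usual server headers and that the caller already holds pjob's mutex, as the examples here do.

/* Hypothetical caller (illustration only): resolve the job's script path and
 * open it read-only. Assumes pjob's mutex is held by the caller. */
int open_job_script(

  job   *pjob,
  FILE **fp)

  {
  std::string script_path;
  int         rc = get_job_script_path(pjob, script_path);

  if (rc != PBSE_NONE)
    return(rc); /* PBSE_JOB_RECYCLED or -1 propagated from the lookup */

  if ((*fp = fopen(script_path.c_str(), "r")) == NULL)
    return(-1);

  return(PBSE_NONE);
  } /* END open_job_script() */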
Example #2

START_TEST(get_jobs_array_test)
  {
  struct job *test_job = NULL;

  struct job_array *result = get_jobs_array(NULL);
  fail_unless(result == NULL, "get_jobs_array() should return NULL when passed a NULL pointer");

  result = get_jobs_array(&test_job);
  fail_unless(result == NULL, "get_jobs_array() should return NULL when *pjob is NULL");

  test_job = job_alloc();
  result = get_jobs_array(&test_job);
  fail_unless(result == NULL, "get_jobs_array() should return NULL for a job that is not part of an array");
  }
END_TEST
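
For completeness, a hedged sketch of how such a test is typically registered with the Check framework; the suite and tcase names here are illustrative rather than taken from the TORQUE test scaffolding.

Suite *get_jobs_array_suite(void)
  {
  Suite *s  = suite_create("get_jobs_array");
  TCase *tc = tcase_create("get_jobs_array_test");

  tcase_add_test(tc, get_jobs_array_test);
  suite_add_tcase(s, tc);

  return(s);
  }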
Example #3
int modify_whole_array(

  job_array            *pa,             /* I/O */
  svrattrl             *plist,          /* I */
  struct batch_request *preq,           /* I */
  int                   checkpoint_req) /* I */

  {
  int   i;
  int   rc = PBSE_NONE;
  int   modify_job_rc = PBSE_NONE;
  job  *pjob;

  for (i = 0; i < pa->ai_qs.array_size; i++)
    {
    if (pa->job_ids[i] == NULL)
      continue;

    if ((pjob = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
      {
      free(pa->job_ids[i]);
      pa->job_ids[i] = NULL;
      }
    else
      {
      /* NO_MOM_RELAY will prevent modify_job from calling relay_to_mom */
      batch_request *array_req = duplicate_request(preq, i);
      mutex_mgr job_mutex(pjob->ji_mutex, true);
      pthread_mutex_unlock(pa->ai_mutex);
      array_req->rq_noreply = TRUE;
      rc = modify_job((void **)&pjob, plist, array_req, checkpoint_req, NO_MOM_RELAY);
      if (rc != PBSE_NONE)
        {
        modify_job_rc = rc;
        }
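      /* re-find the array: ai_mutex was dropped above, and pjob may have been
       * recycled while it was unlocked */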
      pa = get_jobs_array(&pjob);
      
      if (pa == NULL)
        {
        if (pjob == NULL)
          job_mutex.set_lock_on_exit(false);

        return(PBSE_JOB_RECYCLED);
        }

      if (pjob == NULL)
        {
        pa->job_ids[i] = NULL;
        job_mutex.set_lock_on_exit(false);
        continue;
        }
      }
    } /* END foreach job in array */

  return(modify_job_rc);
  } /* END modify_whole_array() */
Example #4
int execute_job_delete(

  job                  *pjob,            /* M */
  char                 *Msg,             /* I */
  struct batch_request *preq)            /* I */

  {
  struct work_task *pwtnew;

  int               rc;
  char             *sigt = "SIGTERM";

  int               has_mutex = TRUE;
  char              log_buf[LOCAL_LOG_BUF_SIZE];
  time_t            time_now = time(NULL);
  long              force_cancel = FALSE;
  long              array_compatible = FALSE;

  chk_job_req_permissions(&pjob,preq);

  if (pjob == NULL)
    {
    /* preq is rejected in chk_job_req_permissions here */
    return(-1);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /* see note in req_delete - not sure this is still possible,
     * but the deleted code is irrelevant now. I will leave this
     * check in place. --dbeer */
    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

    return(-1);
    }

  if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN1 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN2 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN3 )
    {
    /* If JOB_SUBSTATE_PRERUN, the job is being sent to MOM; wait until she
     * gets it going and retry in one second.
     * If JOB_SUBSTATE_RERUN, RERUN1, RERUN2 or RERUN3, the job is being
     * requeued; wait until that finishes. */

    static time_t  cycle_check_when = 0;
    static char    cycle_check_jid[PBS_MAXSVRJOBID + 1];

    if (cycle_check_when != 0)
      {
      if (!strcmp(pjob->ji_qs.ji_jobid, cycle_check_jid) &&
          (time_now - cycle_check_when > 10))
        {
        /* state not updated after 10 seconds */

        /* did the mom ever get it? delete it anyway... */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;

        goto jump;
        }

      if (time_now - cycle_check_when > 20)
        {
        /* give up after 20 seconds */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;
        }
      }    /* END if (cycle_check_when != 0) */

    if (cycle_check_when == 0)
      {
      /* new PRERUN job located */

      cycle_check_when = time_now;
      strcpy(cycle_check_jid, pjob->ji_qs.ji_jobid);
      }

    sprintf(log_buf, "job cannot be deleted, state=PRERUN, requeuing delete request");

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    pwtnew = set_task(WORK_Timed,time_now + 1,post_delete_route,preq,FALSE);
    
    unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);

    if (pwtnew == NULL)
      {
      req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

      return(-1);
      }
    else
      {
      return(ROUTE_DELETE);
      }
    }  /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */

jump:

  /*
   * Log delete and if requesting client is not job owner, send mail.
   */

  sprintf(log_buf, "requestor=%s@%s", preq->rq_user, preq->rq_host);


  /* NOTE:  should annotate accounting record with extend message (NYI) */
  account_record(PBS_ACCT_DEL, pjob, log_buf);

  sprintf(log_buf, msg_manager, msg_deletejob, preq->rq_user, preq->rq_host);

  log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

  /* NOTE:  should incorporate job delete message */

  if (Msg != NULL)
    {
    /* have text message in request extension, add it */
    strcat(log_buf, "\n");
    strcat(log_buf, Msg);
    }

  if ((svr_chk_owner(preq, pjob) != 0) &&
      (pjob->ji_has_delete_nanny == FALSE))
    {
    /* only send email if the owner did not delete the job and a delete
       has not already been attempted for this job */

    svr_mailowner(pjob, MAIL_DEL, MAIL_FORCE, log_buf);
    /*
     * If we sent mail and already sent the extra message
     * then reset message so we don't trigger a redundant email
     * in job_abt()
     */

    if (Msg != NULL)
      {
      Msg = NULL;
      }
    }

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, change restart comment if failed */

    change_restart_comment_if_needed(pjob);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * setup a nanny task to make sure the job is actually deleted (see the
     * comments at job_delete_nanny()).
     */

    if (pjob->ji_has_delete_nanny == TRUE)
      {
      unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

      req_reject(PBSE_IVALREQ, 0, preq, NULL, "job cancel in progress");

      return(-1);
      }

    apply_job_delete_nanny(pjob, time_now + 60);

    /*
     * Send signal request to MOM.  The server will automagically
     * pick up and "finish" off the client request when MOM replies.
     */
    get_batch_request_id(preq);

    if ((rc = issue_signal(&pjob, sigt, post_delete_mom1, strdup(preq->rq_id))))
      {
      /* can't send to MOM */

      req_reject(rc, 0, preq, NULL, NULL);
      }

    /* normally will ack reply when mom responds */
    if (pjob != NULL)
      {
      sprintf(log_buf, msg_delrunjobsig, sigt);
      log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);
  
      unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
      }

    return(-1);
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* if a force-cancel time is configured, make a cleanup task */
  get_svr_attr_l(SRV_ATR_JobForceCancelTime, &force_cancel);
  if (force_cancel > 0)
    {
    char *dup_jobid = strdup(pjob->ji_qs.ji_jobid);
 
    set_task(WORK_Timed, time_now + force_cancel, ensure_deleted, dup_jobid, FALSE);    
    }

  /* if configured, and this job didn't have a slot limit hold, free a job
   * held with the slot limit hold */
  get_svr_attr_l(SRV_ATR_MoabArrayCompatible, &array_compatible);
  if ((array_compatible != FALSE) &&
      ((pjob->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) == FALSE))
    {
    if ((pjob->ji_arraystruct != NULL) &&
        (pjob->ji_is_array_template == FALSE))
      {
      int        i;
      int        newstate;
      int        newsub;
      job       *tmp;
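      /* get_jobs_array() can come back with pjob set to NULL (the job was
       * recycled) or with pa NULL (the array no longer exists); check both
       * before using them */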
      job_array *pa = get_jobs_array(&pjob);

      if (pjob == NULL)
        return(-1);

      for (i = 0; (pa != NULL) && (i < pa->ai_qs.array_size); i++)
        {
        if (pa->job_ids[i] == NULL)
          continue;

        if (!strcmp(pa->job_ids[i], pjob->ji_qs.ji_jobid))
          continue;

        if ((tmp = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
          {
          free(pa->job_ids[i]);
          pa->job_ids[i] = NULL;
          }
        else
          {
          if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l)
            {
            tmp->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;
            
            if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
              {
              tmp->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
              }
            
            svr_evaljobstate(tmp, &newstate, &newsub, 1);
            svr_setjobstate(tmp, newstate, newsub, FALSE);
            job_save(tmp, SAVEJOB_FULL, 0);

            unlock_ji_mutex(tmp, __func__, "5", LOGLEVEL);
            
            break;
            }

          unlock_ji_mutex(tmp, __func__, "6", LOGLEVEL);
          }
        }

      if (pa != NULL)
        {
        if (LOGLEVEL >= 7)
          {
          sprintf(log_buf, "%s: unlocking ai_mutex", __func__);
          log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
          }

        pthread_mutex_unlock(pa->ai_mutex);
        }
      }
    } /* END MoabArrayCompatible check */

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, do end job processing */
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE);

    /* force new connection */
    pjob->ji_momhandle = -1;

    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "calling on_job_exit from %s", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      }

    set_task(WORK_Immed, 0, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in files, so remove them */

    remove_stagein(&pjob);

    if (pjob != NULL)
      job_abt(&pjob, Msg);

    has_mutex = FALSE;
    }
  else
    {
    /*
     * the job is not in transit (though it may have been) and
     * is not running, so put it into a complete state.
     */
    struct pbs_queue *pque;
    int  KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE, FALSE);
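    /* get_jobs_queue(), like get_jobs_array(), can return with pjob set to
     * NULL if the job was recycled; hence the pjob checks below */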

    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      pque->qu_numcompleted++;

      unlock_queue(pque, __func__, NULL, LOGLEVEL);
      
      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "calling on_job_exit from %s", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }
    
      pthread_mutex_lock(server.sv_attr_mutex);
      KeepSeconds = attr_ifelse_long(
                    &pque->qu_attr[QE_ATR_KeepCompleted],
                    &server.sv_attr[SRV_ATR_KeepCompleted],
                    0);
      pthread_mutex_unlock(server.sv_attr_mutex);
      }
    else
      KeepSeconds = 0;

    if (pjob != NULL)
      {
      set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
      }
    else
      has_mutex = FALSE;
    }  /* END else (not running, no checkpoint file, no staged-in files) */

  if (has_mutex == TRUE)
    unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL);

  return(PBSE_NONE);
  } /* END execute_job_delete() */
Example #5
int modify_array_range(

  job_array *pa,              /* I/O */
  char      *range,           /* I */
  svrattrl  *plist,           /* I */
  struct batch_request *preq, /* I */
  int        checkpoint_req)  /* I */

  {
  char                log_buf[LOCAL_LOG_BUF_SIZE];
  tlist_head          tl;
  int                 i;
  int                 rc;
  int                 mom_relay = 0;
  job                *pjob;

  array_request_node *rn;
  array_request_node *to_free;
  
  CLEAR_HEAD(tl);
  
  if (parse_array_request(range,&tl) > 0)
    {
    /* don't modify the jobs if there is a range error */
    
    return(FAILURE);
    }
  else 
    {
    /* modify just that range from the array */
    rn = (array_request_node*)GET_NEXT(tl);
    
    while (rn != NULL)
      {
      for (i = rn->start; i <= rn->end; i++)
        {
        if ((i >= pa->ai_qs.array_size) ||
            (pa->job_ids[i] == NULL))
          continue;

        if ((pjob = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
          {
          free(pa->job_ids[i]);
          pa->job_ids[i] = NULL;
          }
        else
          {
          pthread_mutex_unlock(pa->ai_mutex);
          rc = modify_job((void **)&pjob, plist, preq, checkpoint_req, NO_MOM_RELAY);
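          /* re-find the array: modify_job() ran with ai_mutex dropped, so
           * both pa and pjob must be re-validated before use */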
          pa = get_jobs_array(&pjob);

          if (pa == NULL)
            {
            if (pjob != NULL)
              unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

            return(PBSE_JOB_RECYCLED);
            }

          if (pjob != NULL)
            {
            if (rc == PBSE_RELAYED_TO_MOM)
              {
              struct batch_request *array_req = NULL;
              
              /* We told modify_job not to call relay_to_mom so we need to contact the mom */
              if ((rc = copy_batchrequest(&array_req, preq, 0, i)) != PBSE_NONE)
                {
                return(rc);
                }
              
              preq->rq_refcount++;
              if (mom_relay == 0)
                {
                preq->rq_refcount++;
                }
              mom_relay++;
              
              /* The array_req is freed in relay_to_mom (failure)
               * or in issue_Drequest (success) */
              
              if ((rc = relay_to_mom(&pjob, array_req, NULL)))
                {
                snprintf(log_buf,sizeof(log_buf),
                  "Unable to relay information to mom for job '%s'\n",
                  pjob->ji_qs.ji_jobid);
                log_err(rc, __func__, log_buf);
                
                unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
                
                return(rc); /* unable to get to MOM */
                }
              else
                {
                unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
                post_modify_arrayreq(array_req);
                }
              }
            else
              unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
            }
          else
            pa->job_ids[i] = NULL;

          }
        }
      
      /* release mem */
      to_free = rn;
      rn = (array_request_node*)GET_NEXT(rn->request_tokens_link);
      free(to_free);
      }
    }

  if (mom_relay)
    {
    preq->rq_refcount--;
    if (preq->rq_refcount == 0)
      {
      free_br(preq);
      }
    return(PBSE_RELAYED_TO_MOM);
    }

  return(PBSE_NONE);
  } /* END modify_array_range() */