Example #1
void update_array_statuses()

  {
  job_array      *pa;
  job            *pjob;
  int             iter = -1;
  unsigned int    running;
  unsigned int    queued;
  unsigned int    complete;
  char            log_buf[LOCAL_LOG_BUF_SIZE];
  char            jobid[PBS_MAXSVRJOBID+1];

  while ((pa = next_array(&iter)) != NULL)
    {
    running  = pa->ai_qs.jobs_running;
    complete = pa->ai_qs.num_failed + pa->ai_qs.num_successful;
    queued   = pa->ai_qs.num_jobs - running - complete;
    
    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "%s: unlocking ai_mutex", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
      }

    strcpy(jobid, pa->ai_qs.parent_id);
    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
    
    if ((pjob = svr_find_job(jobid, TRUE)) != NULL)
      {
      mutex_mgr pjob_mutex = mutex_mgr(pjob->ji_mutex, true);
      if (running > 0)
        {
        svr_setjobstate(pjob, JOB_STATE_RUNNING, pjob->ji_qs.ji_substate, FALSE);
        }
      else if ((complete > 0) && 
               (queued == 0))
        {
        svr_setjobstate(pjob, JOB_STATE_COMPLETE, pjob->ji_qs.ji_substate, FALSE);
        }
      else 
        {
        /* default to just calling the array queued */
        svr_setjobstate(pjob, JOB_STATE_QUEUED, pjob->ji_qs.ji_substate, FALSE);
        }
      }
    } /* END for each array */

  } /* END update_array_statuses() */
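/* Usage sketch (not from the source): update_array_statuses() takes no
 * arguments, so a server could refresh template-job states from a periodic
 * work task. The wrapper name and the 30-second period are assumptions;
 * set_task() is used the same way it is elsewhere in these examples. */
void poll_array_statuses(

  struct work_task *pwt)

  {
  update_array_statuses();

  /* re-arm the task */
  set_task(WORK_Timed, time(NULL) + 30, poll_array_statuses, NULL, FALSE);
  }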
Example #2
/* return a server's array info struct corresponding to an array id;
 * on success the array is returned with its ai_mutex held */
job_array *get_array(
    
  char *id)

  {
  job_array *pa;
 
  int        iter = -1;

  while ((pa = next_array(&iter)) != NULL)
    {
    lock_ai_mutex(pa, __func__, NULL, LOGLEVEL);
    
    if (strcmp(pa->ai_qs.parent_id, id) == 0)
      {
      /* found it - return with ai_mutex still held for the caller */
      return(pa);
      }
      }

    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
    }

  return(NULL);
  } /* END get_array() */
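/* Usage sketch (assumption, not from the source): since get_array() returns
 * with ai_mutex held, every successful call must be paired with an
 * unlock_ai_mutex(). The helper name here is illustrative only. */
int array_exists(

  char *id)

  {
  job_array *pa = get_array(id);

  if (pa == NULL)
    return(FALSE);

  unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

  return(TRUE);
  }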
Example #3
static void req_stat_job_step2(

  struct stat_cntl *cntl)  /* I/O (free'd on return) */

  {
  svrattrl              *pal;
  job                   *pjob = NULL;

  struct batch_request  *preq;
  struct batch_reply    *preply;
  int                    rc = 0;
  enum TJobStatTypeEnum  type;
  pbs_queue             *pque = NULL;
  int                    exec_only = 0;

  int                    bad = 0;
  long                   DTime;  /* delta time - only report full pbs_attribute list if J->MTime > DTime */
  static svrattrl       *dpal = NULL;
  int                    job_array_index = 0;
  job_array             *pa = NULL;
  char                   log_buf[LOCAL_LOG_BUF_SIZE];
  all_jobs_iterator      *iter;

  preq   = cntl->sc_origrq;
  type   = (enum TJobStatTypeEnum)cntl->sc_type;
  preply = &preq->rq_reply;

  /* See pbs_server_attributes(1B) for details on "poll_jobs" behaviour */

  if (dpal == NULL)
    {
    /* build 'delta' pbs_attribute list */

    svrattrl *tpal;

    tlist_head dalist;

    int aindex;

    int atrlist[] =
      {
      JOB_ATR_jobname,
      JOB_ATR_resc_used,
      JOB_ATR_LAST
      };

    CLEAR_LINK(dalist);

    for (aindex = 0;atrlist[aindex] != JOB_ATR_LAST;aindex++)
      {
      if ((tpal = attrlist_create("", "", 23)) == NULL)
        {
        return;
        }

      tpal->al_valln = atrlist[aindex];

      if (dpal == NULL)
        dpal = tpal;

      append_link(&dalist, &tpal->al_link, tpal);
      }
    }  /* END if (dpal == NULL) */

  if (type == tjstArray)
    {
    pa = get_array(preq->rq_ind.rq_status.rq_id);

    if (pa == NULL)
      {
      req_reject(PBSE_UNKARRAYID, 0, preq, NULL, "unable to find array");
      return;
      }
    }

  {
  all_jobs *ajptr = NULL;

  if (type == tjstQueue)
    ajptr = cntl->sc_pque->qu_jobs;

  else if (type == tjstSummarizeArraysQueue)
    ajptr = cntl->sc_pque->qu_jobs_array_sum;

  else if (type == tjstSummarizeArraysServer)
    ajptr = &array_summary;

  else
    ajptr = &alljobs;

  ajptr->lock();
  iter = ajptr->get_iterator();
  ajptr->unlock();
  }

  /*
   * now ready for part 3, building the status reply,
   * loop through again
   */

  if ((type == tjstSummarizeArraysQueue) || 
      (type == tjstSummarizeArraysServer))
    {
    /* No array can be owned for these options */
    update_array_statuses();
    }


  if (type == tjstJob)
    pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE);

  else if (type == tjstQueue)
    pjob = next_job(cntl->sc_pque->qu_jobs,iter);

  else if (type == tjstSummarizeArraysQueue)
    pjob = next_job(cntl->sc_pque->qu_jobs_array_sum,iter);

  else if (type == tjstSummarizeArraysServer)
    pjob = next_job(&array_summary,iter);

  else if (type == tjstArray)
    {
    job_array_index = -1;
    pjob = NULL;
    /* increment job_array_index until we find a non-null pointer or hit the end */
    while (++job_array_index < pa->ai_qs.array_size)
      {
      if (pa->job_ids[job_array_index] != NULL)
        {
        if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
          {
          break;
          }
        }
      }
    }
  else
    pjob = next_job(&alljobs,iter);

  DTime = 0;

  if (preq->rq_extend != NULL)
    {
    char *ptr;

    /* FORMAT:  { EXECQONLY | DELTA:<EPOCHTIME> } */

    if (strstr(preq->rq_extend, EXECQUEONLY))
      exec_only = 1;

    ptr = strstr(preq->rq_extend, "DELTA:");

    if (ptr != NULL)
      {
      ptr += strlen("delta:");

      DTime = strtol(ptr, NULL, 10);
      }
    }

  if ((type == tjstTruncatedServer) || 
      (type == tjstTruncatedQueue))
    {
    long sentJobCounter;
    long qjcounter;
    long qmaxreport;
    all_queues_iterator *queue_iter = NULL;  /* distinct name; 'iter' above is the job iterator */

    svr_queues.lock();
    queue_iter = svr_queues.get_iterator();
    svr_queues.unlock();

    /* loop through all queues */
    while ((pque = next_queue(&svr_queues,queue_iter)) != NULL)
      {
      qjcounter = 0;

      if ((exec_only == 1) &&
          (pque->qu_qs.qu_type != QTYPE_Execution))
        {
        /* ignore routing queues */
        unlock_queue(pque, __func__, "ignore queue", LOGLEVEL);
        continue;
        }

      if (((pque->qu_attr[QA_ATR_MaxReport].at_flags & ATR_VFLAG_SET) != 0) &&
          (pque->qu_attr[QA_ATR_MaxReport].at_val.at_long >= 0))
        {
        qmaxreport = pque->qu_attr[QA_ATR_MaxReport].at_val.at_long;
        }
      else
        {
        qmaxreport = TMAX_JOB;
        }

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buf,"giving scheduler up to %ld idle jobs in queue %s\n",
          qmaxreport,
          pque->qu_qs.qu_name);

        log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_QUEUE,pque->qu_qs.qu_name,log_buf);
        }

      sentJobCounter = 0;

      /* loop through jobs in queue */
      if (pjob != NULL)
        unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);

      all_jobs_iterator *jobiter = NULL;
      pque->qu_jobs->lock();
      jobiter = pque->qu_jobs->get_iterator();
      pque->qu_jobs->unlock();

      while ((pjob = next_job(pque->qu_jobs,jobiter)) != NULL)
        {
        if ((qjcounter >= qmaxreport) &&
            (pjob->ji_qs.ji_state == JOB_STATE_QUEUED))
          {
          /* max_report of queued jobs reached for queue */
          unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL);

          continue;
          }

        pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

        rc = status_job(
               pjob,
               preq,
               (pjob->ji_wattr[JOB_ATR_mtime].at_val.at_long >= DTime) ? pal : dpal,
               &preply->brp_un.brp_status,
               &bad);

        if ((rc != 0) && (rc != PBSE_PERM))
          {
          req_reject(rc, bad, preq, NULL, NULL);

          unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL);
          unlock_queue(pque, __func__, "perm", LOGLEVEL);

          delete jobiter;
          delete queue_iter;
          delete iter;

          return;
          }

        sentJobCounter++;

        if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED)
          qjcounter++;

        unlock_ji_mutex(pjob, __func__, "8", LOGLEVEL);
        }    /* END foreach (pjob from pque) */

      delete jobiter;  /* free the per-queue job iterator */

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buf,"sent scheduler %ld total jobs for queue %s\n",
          sentJobCounter,
          pque->qu_qs.qu_name);

        log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_QUEUE,pque->qu_qs.qu_name,log_buf);
        }
    
      unlock_queue(pque, __func__, "end while", LOGLEVEL);
      }      /* END for (pque) */

    reply_send_svr(preq);

    delete queue_iter;
    delete iter;

    return;
    } /* END if ((type == tjstTruncatedServer) || ...) */

  while (pjob != NULL)
    {
    /* go ahead and build the status reply for this job */

    if (exec_only)
      {
      if (cntl->sc_pque != NULL)
        {
        if (cntl->sc_pque->qu_qs.qu_type != QTYPE_Execution)
          goto nextjob;
        }
      else
        {
        if (pa != NULL)
          pthread_mutex_unlock(pa->ai_mutex);
        pque = get_jobs_queue(&pjob);
        if (pa != NULL)
          pthread_mutex_lock(pa->ai_mutex);

        if ((pjob == NULL) ||
            (pque == NULL))
          {
          /* don't leak the queue lock if the job vanished but its queue didn't */
          if (pque != NULL)
            unlock_queue(pque, __func__, "job gone", LOGLEVEL);

          goto nextjob;
          }
        
        mutex_mgr pque_mutex = mutex_mgr(pque->qu_mutex, true);
        if (pque->qu_qs.qu_type != QTYPE_Execution)
          {
          goto nextjob;
          }
        }
      }

    pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

    rc = status_job(
           pjob,
           preq,
           pal,
           &preply->brp_un.brp_status,
           &bad);

    if ((rc != 0) && 
        (rc != PBSE_PERM))
      {
      if (pa != NULL)
        {
        unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
        }

      unlock_ji_mutex(pjob, __func__, "9", LOGLEVEL);

      req_reject(rc, bad, preq, NULL, NULL);

      delete iter;

      return;
      }

    /* get next job */

nextjob:

    if (pjob != NULL)
      unlock_ji_mutex(pjob, __func__, "10", LOGLEVEL);

    if (type == tjstJob)
      break;

    if (type == tjstQueue)
      pjob = next_job(cntl->sc_pque->qu_jobs,iter);
    else if (type == tjstSummarizeArraysQueue)
      pjob = next_job(cntl->sc_pque->qu_jobs_array_sum,iter);
    else if (type == tjstSummarizeArraysServer)
      pjob = next_job(&array_summary,iter);
    else if (type == tjstArray)
      {
      pjob = NULL;
      /* increment job_array_index until we find a non-null pointer or hit the end */
      while (++job_array_index < pa->ai_qs.array_size)
        {
        if (pa->job_ids[job_array_index] != NULL)
          {
          if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
            {
            break;
            }
          }
        }
      }
    else
      pjob = next_job(&alljobs,iter);

    rc = 0;
    }  /* END while (pjob != NULL) */

  delete iter;

  if (pa != NULL)
    {
    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
    }
 
  reply_send_svr(preq);

  if (LOGLEVEL >= 7)
    {
    log_event(PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      "req_statjob",
      "Successfully returned the status of queued jobs\n");
    }

  return;
  }  /* END req_stat_job_step2() */
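/* Caller sketch (assumption, not from the source): req_stat_job() would
 * prepare a stat_cntl along these lines before handing it to step2. Only
 * fields referenced above (sc_origrq, sc_type, sc_pque, sc_jobid) are set,
 * and cntl is not freed here because the header comment says it is free'd
 * on return. */
void stat_queue_jobs(

  struct batch_request *preq,
  pbs_queue            *pque)

  {
  struct stat_cntl *cntl = (struct stat_cntl *)calloc(1, sizeof(struct stat_cntl));

  if (cntl == NULL)
    return;

  cntl->sc_origrq   = preq;
  cntl->sc_type     = (int)tjstQueue;  /* status every job in one queue */
  cntl->sc_pque     = pque;
  cntl->sc_jobid[0] = '\0';

  req_stat_job_step2(cntl);  /* replies to (or rejects) preq */
  }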
Example #4
int execute_job_delete(

  job                  *pjob,            /* M */
  char                 *Msg,             /* I */
  struct batch_request *preq)            /* I */

  {
  struct work_task *pwtnew;

  int               rc;
  char             *sigt = "SIGTERM";

  int               has_mutex = TRUE;
  char              log_buf[LOCAL_LOG_BUF_SIZE];
  time_t            time_now = time(NULL);
  long              force_cancel = FALSE;
  long              array_compatible = FALSE;

  chk_job_req_permissions(&pjob,preq);

  if (pjob == NULL)
    {
    /* preq is rejected in chk_job_req_permissions here */
    return(-1);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /* see note in req_delete - not sure this is possible still,
     * but the deleted code is irrelevant now. I will leave this
     * part --dbeer */
    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

    return(-1);
    }

  if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN1 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN2 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN3 )
    {
    /* If JOB_SUBSTATE_PRERUN being sent to MOM, wait till she gets it going */
    /* retry in one second                            */
    /* If JOB_SUBSTATE_RERUN, RERUN1, RERUN2 or RERUN3 the
       job is being requeued. Wait until finished */

    static time_t  cycle_check_when = 0;
    static char    cycle_check_jid[PBS_MAXSVRJOBID + 1];

    if (cycle_check_when != 0)
      {
      if (!strcmp(pjob->ji_qs.ji_jobid, cycle_check_jid) &&
          (time_now - cycle_check_when > 10))
        {
        /* state not updated after 10 seconds */

        /* did the mom ever get it? delete it anyways... */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;

        goto jump;
        }

      if (time_now - cycle_check_when > 20)
        {
        /* give up after 20 seconds */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;
        }
      }    /* END if (cycle_check_when != 0) */

    if (cycle_check_when == 0)
      {
      /* new PRERUN job located */

      cycle_check_when = time_now;
      strcpy(cycle_check_jid, pjob->ji_qs.ji_jobid);
      }

    sprintf(log_buf, "job cannot be deleted, state=PRERUN, requeuing delete request");

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    pwtnew = set_task(WORK_Timed,time_now + 1,post_delete_route,preq,FALSE);
    
    unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);

    if (pwtnew == NULL)
      {
      req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

      return(-1);
      }
    else
      {
      return(ROUTE_DELETE);
      }
    }  /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */

jump:

  /*
   * Log delete and if requesting client is not job owner, send mail.
   */

  sprintf(log_buf, "requestor=%s@%s", preq->rq_user, preq->rq_host);


  /* NOTE:  should annotate accounting record with extend message (NYI) */
  account_record(PBS_ACCT_DEL, pjob, log_buf);

  sprintf(log_buf, msg_manager, msg_deletejob, preq->rq_user, preq->rq_host);

  log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

  /* NOTE:  should incorporate job delete message */

  if (Msg != NULL)
    {
    /* have text message in request extension, add it */
    strcat(log_buf, "\n");
    strcat(log_buf, Msg);
    }

  if ((svr_chk_owner(preq, pjob) != 0) &&
      (pjob->ji_has_delete_nanny == FALSE))
    {
    /* only send email if the owner did not delete the job and a delete
       has not previously been attempted */

    svr_mailowner(pjob, MAIL_DEL, MAIL_FORCE, log_buf);
    /*
     * If we sent mail and already sent the extra message
     * then reset message so we don't trigger a redundant email
     * in job_abt()
    */

    if (Msg != NULL)
      {
      Msg = NULL;
      }
    }

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, change restart comment if failed */

    change_restart_comment_if_needed(pjob);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * setup a nanny task to make sure the job is actually deleted (see the
     * comments at job_delete_nanny()).
     */

    if (pjob->ji_has_delete_nanny == TRUE)
      {
      unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

      req_reject(PBSE_IVALREQ, 0, preq, NULL, "job cancel in progress");

      return(-1);
      }

    apply_job_delete_nanny(pjob, time_now + 60);

    /*
     * Send signal request to MOM.  The server will automagically
     * pick up and "finish" off the client request when MOM replies.
     */
    get_batch_request_id(preq);

    if ((rc = issue_signal(&pjob, sigt, post_delete_mom1, strdup(preq->rq_id))))
      {
      /* can't send to MOM */

      req_reject(rc, 0, preq, NULL, NULL);
      }

    /* normally will ack reply when mom responds */
    if (pjob != NULL)
      {
      sprintf(log_buf, msg_delrunjobsig, sigt);
      log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);
  
      unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
      }

    return(-1);
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* make a cleanup task if set */
  get_svr_attr_l(SRV_ATR_JobForceCancelTime, &force_cancel);
  if (force_cancel > 0)
    {
    char *dup_jobid = strdup(pjob->ji_qs.ji_jobid);
 
    set_task(WORK_Timed, time_now + force_cancel, ensure_deleted, dup_jobid, FALSE);    
    }

  /* if configured, and this job didn't have a slot limit hold, free a job
   * held with the slot limit hold */
  get_svr_attr_l(SRV_ATR_MoabArrayCompatible, &array_compatible);
  if ((array_compatible != FALSE) &&
      ((pjob->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) == FALSE))
    {
    if ((pjob->ji_arraystructid[0] != '\0') &&
        (pjob->ji_is_array_template == FALSE))
      {
      int        i;
      int        newstate;
      int        newsub;
      job       *tmp;
      job_array *pa = get_jobs_array(&pjob);

      if (pjob == NULL)
        {
        /* the job disappeared during the array lookup */
        if (pa != NULL)
          unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

        return(-1);
        }

      for (i = 0; (pa != NULL) && (i < pa->ai_qs.array_size); i++)
        {
        if (pa->job_ids[i] == NULL)
          continue;

        if (!strcmp(pa->job_ids[i], pjob->ji_qs.ji_jobid))
          continue;

        if ((tmp = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
          {
          free(pa->job_ids[i]);
          pa->job_ids[i] = NULL;
          }
        else
          {
          if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l)
            {
            tmp->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;
            
            if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
              {
              tmp->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
              }
            
            svr_evaljobstate(tmp, &newstate, &newsub, 1);
            svr_setjobstate(tmp, newstate, newsub, FALSE);
            job_save(tmp, SAVEJOB_FULL, 0);

            unlock_ji_mutex(tmp, __func__, "5", LOGLEVEL);
            
            break;
            }

          unlock_ji_mutex(tmp, __func__, "6", LOGLEVEL);
          }
        }

      unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
      }
    } /* END MoabArrayCompatible check */

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, do end job processing */
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE);

    /* force new connection */
    pjob->ji_momhandle = -1;

    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "calling on_job_exit from %s", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      }

    set_task(WORK_Immed, 0, on_job_exit_task, strdup(pjob->ji_qs.ji_jobid), FALSE);
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in file, should remove them */

    remove_stagein(&pjob);

    if (pjob != NULL)
      job_abt(&pjob, Msg);

    has_mutex = FALSE;
    }
  else
    {
    /*
     * the job is not transitting (though it may have been) and
     * is not running, so put in into a complete state.
     */
    struct pbs_queue *pque;
    int  KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE, FALSE);

    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      pque->qu_numcompleted++;

      unlock_queue(pque, __func__, NULL, LOGLEVEL);
      
      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "calling on_job_exit from %s", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }
    
      pthread_mutex_lock(server.sv_attr_mutex);
      KeepSeconds = attr_ifelse_long(
                    &pque->qu_attr[QE_ATR_KeepCompleted],
                    &server.sv_attr[SRV_ATR_KeepCompleted],
                    0);
      pthread_mutex_unlock(server.sv_attr_mutex);
      }
    else
      KeepSeconds = 0;

    if (pjob != NULL)
      {
      set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit_task, strdup(pjob->ji_qs.ji_jobid), FALSE);
      }
    else
      has_mutex = FALSE;
    }  /* END else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0) */

  if (has_mutex == TRUE)
    unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL);

  return(PBSE_NONE);
  } /* END execute_job_delete() */
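/* Dispatch sketch (assumption, not from the source): a qdel handler might
 * act on execute_job_delete()'s return codes like this. ROUTE_DELETE means
 * the request was re-queued for a later retry; on -1 the request has
 * already been rejected, so only PBSE_NONE is acknowledged here. Passing
 * preq->rq_extend as the message text is an assumption. */
void handle_job_delete(

  job                  *pjob,
  struct batch_request *preq)

  {
  int rc = execute_job_delete(pjob, preq->rq_extend, preq);

  if (rc == PBSE_NONE)
    reply_ack(preq);
  }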
Example #5
static void req_stat_job_step2(

  struct stat_cntl *cntl)  /* I/O (free'd on return) */

  {
  svrattrl              *pal;
  job                   *pjob = NULL;

  struct batch_request  *preq;
  struct batch_reply    *preply;
  int                    rc = 0;
  enum TJobStatTypeEnum  type;
  pbs_queue             *pque = NULL;
  int                    exec_only = 0;

  int                    bad = 0;
  long                   DTime;  /* delta time - only report full pbs_attribute list if J->MTime > DTime */
  static svrattrl       *dpal = NULL;
  int                    job_array_index = 0;
  job_array             *pa = NULL;
  char                   log_buf[LOCAL_LOG_BUF_SIZE];
  int                    iter;
  time_t                 time_now = time(NULL);
  long                   poll_jobs = 0;
  char                   job_id[PBS_MAXSVRJOBID+1];
  int                    job_substate = -1;
  time_t                 job_momstattime = -1;

  preq   = cntl->sc_origrq;
  type   = (enum TJobStatTypeEnum)cntl->sc_type;
  preply = &preq->rq_reply;

  /* See pbs_server_attributes(1B) for details on "poll_jobs" behaviour */

  if (dpal == NULL)
    {
    /* build 'delta' pbs_attribute list */

    svrattrl *tpal;

    tlist_head dalist;

    int aindex;

    int atrlist[] =
      {
      JOB_ATR_jobname,
      JOB_ATR_resc_used,
      JOB_ATR_LAST
      };

    CLEAR_LINK(dalist);

    for (aindex = 0;atrlist[aindex] != JOB_ATR_LAST;aindex++)
      {
      if ((tpal = attrlist_create("", "", 23)) == NULL)
        {
        return;
        }

      tpal->al_valln = atrlist[aindex];

      if (dpal == NULL)
        dpal = tpal;

      append_link(&dalist, &tpal->al_link, tpal);
      }
    }  /* END if (dpal == NULL) */

  if (type == tjstArray)
    {
    pa = get_array(preq->rq_ind.rq_status.rq_id);

    if (pa == NULL)
      {
      req_reject(PBSE_UNKARRAYID, 0, preq, NULL, "unable to find array");
      return;
      }
    }

  iter = -1;

  get_svr_attr_l(SRV_ATR_PollJobs, &poll_jobs);
  if (!poll_jobs)
    {
    /* polljobs not set - indicates we may need to obtain fresh data from
       MOM */

    if (cntl->sc_jobid[0] == '\0')
      pjob = NULL;
    else
      pjob = svr_find_job(cntl->sc_jobid, FALSE);

    while (1)
      {
      if (pjob == NULL)
        {
        /* start from the first job */

        if (type == tjstJob)
          {
          pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE);
          }
        else if (type == tjstQueue)
          {
          pjob = next_job(cntl->sc_pque->qu_jobs,&iter);
          }
        else if (type == tjstArray)
          {
          job_array_index = 0;
          /* increment job_array_index until we find a non-null pointer or hit the end */
          while (job_array_index < pa->ai_qs.array_size)
            {
            if (pa->job_ids[job_array_index] != NULL)
              {
              if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
                {
                /* keep pjob locked; it is unlocked below once its state is copied */
                break;
                }
              }

            job_array_index++;
            }
          }
        else
          {
          pjob = next_job(&alljobs,&iter);
          }

        }    /* END if (pjob == NULL) */
      else
        {
        strcpy(job_id, pjob->ji_qs.ji_jobid);
        unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

        if (type == tjstJob)
          break;

        if (type == tjstQueue)
          pjob = next_job(cntl->sc_pque->qu_jobs,&iter);
        else if (type == tjstArray)
          {
          pjob = NULL;
          /* increment job_array_index until we find a non-null pointer or hit the end */
          while (++job_array_index < pa->ai_qs.array_size)
            {
            if (pa->job_ids[job_array_index] != NULL)
              {
              if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
                {
                /* keep pjob locked; it is unlocked below once its state is copied */
                break;
                }
              }
            }
          }
        else
          pjob = next_job(&alljobs,&iter);
          
        }

      if (pjob == NULL)
        break;

      strcpy(job_id, pjob->ji_qs.ji_jobid);
      job_substate = pjob->ji_qs.ji_substate;
      job_momstattime = pjob->ji_momstat;
      strcpy(cntl->sc_jobid, job_id);
      unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
      pjob = NULL;

      /* PBS_RESTAT_JOB defaults to 30 seconds */
      if ((job_substate == JOB_SUBSTATE_RUNNING) &&
          ((time_now - job_momstattime) > JobStatRate))
        {
        /* go to MOM for status */
        if ((rc = stat_to_mom(job_id, cntl)) == PBSE_MEM_MALLOC)
          break;

        if (rc != 0)
          {
          pjob = svr_find_job(job_id, FALSE);

          rc = 0;

          continue;
          }
        
        if (pa != NULL)
          unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

        return; /* will pick up after mom replies */
        }
      }    /* END while(1) */

    if (rc != 0)
      {
      if (pa != NULL)
        unlock_ai_mutex(pa, __func__, "2", LOGLEVEL);

      reply_free(preply);

      req_reject(rc, 0, preq, NULL, "cannot get update from mom");

      return;
      }
    }    /* END if (!server.sv_attr[SRV_ATR_PollJobs].at_val.at_long) */

  /*
   * now ready for part 3, building the status reply,
   * loop through again
   */

  if ((type == tjstSummarizeArraysQueue) || 
      (type == tjstSummarizeArraysServer))
    {
    /* No array can be owned for these options */
    update_array_statuses();
    }

  if (type == tjstJob)
    pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE);

  else if (type == tjstQueue)
    pjob = next_job(cntl->sc_pque->qu_jobs,&iter);

  else if (type == tjstSummarizeArraysQueue)
    pjob = next_job(cntl->sc_pque->qu_jobs_array_sum,&iter);

  else if (type == tjstSummarizeArraysServer)
    pjob = next_job(&array_summary,&iter);

  else if (type == tjstArray)
    {
    job_array_index = -1;
    pjob = NULL;
    /* increment job_array_index until we find a non-null pointer or hit the end */
    while (++job_array_index < pa->ai_qs.array_size)
      {
      if (pa->job_ids[job_array_index] != NULL)
        {
        if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
          {
          break;
          }
        }
      }
    }
  else
    pjob = next_job(&alljobs,&iter);

  DTime = 0;

  if (preq->rq_extend != NULL)
    {
    char *ptr;

    /* FORMAT:  { EXECQONLY | DELTA:<EPOCHTIME> } */

    if (strstr(preq->rq_extend, EXECQUEONLY))
      exec_only = 1;

    ptr = strstr(preq->rq_extend, "DELTA:");

    if (ptr != NULL)
      {
      ptr += strlen("delta:");

      DTime = strtol(ptr, NULL, 10);
      }
    }


  if ((type == tjstTruncatedServer) || 
      (type == tjstTruncatedQueue))
    {
    long sentJobCounter;
    long qjcounter;
    long qmaxreport;
    int  iter = -1;

    /* loop through all queues */
    while ((pque = next_queue(&svr_queues,&iter)) != NULL)
      {
      qjcounter = 0;

      if ((exec_only == 1) &&
          (pque->qu_qs.qu_type != QTYPE_Execution))
        {
        /* ignore routing queues */
        unlock_queue(pque, __func__, "ignore queue", LOGLEVEL);
        continue;
        }

      if (((pque->qu_attr[QA_ATR_MaxReport].at_flags & ATR_VFLAG_SET) != 0) &&
          (pque->qu_attr[QA_ATR_MaxReport].at_val.at_long >= 0))
        {
        qmaxreport = pque->qu_attr[QA_ATR_MaxReport].at_val.at_long;
        }
      else
        {
        qmaxreport = TMAX_JOB;
        }

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buf,"giving scheduler up to %ld idle jobs in queue %s\n",
          qmaxreport,
          pque->qu_qs.qu_name);

        log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_QUEUE,pque->qu_qs.qu_name,log_buf);
        }

      sentJobCounter = 0;

      /* loop through jobs in queue */
      if (pjob != NULL)
        unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);

      int job_iter = -1;  /* use a separate iterator so the queue loop's 'iter' survives */

      while ((pjob = next_job(pque->qu_jobs,&job_iter)) != NULL)
        {
        if ((qjcounter >= qmaxreport) &&
            (pjob->ji_qs.ji_state == JOB_STATE_QUEUED))
          {
          /* max_report of queued jobs reached for queue */
          unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL);

          continue;
          }

        pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

        rc = status_job(
               pjob,
               preq,
               (pjob->ji_wattr[JOB_ATR_mtime].at_val.at_long >= DTime) ? pal : dpal,
               &preply->brp_un.brp_status,
               &bad);

        if ((rc != 0) && (rc != PBSE_PERM))
          {
          req_reject(rc, bad, preq, NULL, NULL);

          if (pa != NULL)
            {
            unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
            }
          unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL);
          unlock_queue(pque, __func__, "perm", LOGLEVEL);
          return;
          }

        sentJobCounter++;

        if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED)
          qjcounter++;

        unlock_ji_mutex(pjob, __func__, "8", LOGLEVEL);
        }    /* END foreach (pjob from pque) */

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buf,"sent scheduler %ld total jobs for queue %s\n",
          sentJobCounter,
          pque->qu_qs.qu_name);

        log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_QUEUE,pque->qu_qs.qu_name,log_buf);
        }
    
      unlock_queue(pque, __func__, "end while", LOGLEVEL);
      }      /* END for (pque) */
      
    if (pa != NULL)
      unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

    reply_send_svr(preq);

    return;
    }        /* END if ((type == tjstTruncatedServer) || ...) */

  while (pjob != NULL)
    {
    /* go ahead and build the status reply for this job */

    if (exec_only)
      {
      if (cntl->sc_pque != NULL)
        {
        if (cntl->sc_pque->qu_qs.qu_type != QTYPE_Execution)
          goto nextjob;
        }
      else
        {
        if (pa != NULL)
          pthread_mutex_unlock(pa->ai_mutex);
        pque = get_jobs_queue(&pjob);
        if (pa != NULL)
          pthread_mutex_lock(pa->ai_mutex);

        if ((pjob == NULL) ||
            (pque == NULL))
          {
          /* don't leak the queue lock if the job vanished but its queue didn't */
          if (pque != NULL)
            unlock_queue(pque, __func__, "job gone", LOGLEVEL);

          goto nextjob;
          }
        
        if (pque->qu_qs.qu_type != QTYPE_Execution)
          {
          unlock_queue(pque, __func__, "not exec", LOGLEVEL);
        
          goto nextjob;
          }

        unlock_queue(pque, __func__, "exec", LOGLEVEL);
        }
      }

    pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

    rc = status_job(
           pjob,
           preq,
           pal,
           &preply->brp_un.brp_status,
           &bad);

    if ((rc != 0) && 
        (rc != PBSE_PERM))
      {
      if (pa != NULL)
        {
        unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
        }
      unlock_ji_mutex(pjob, __func__, "9", LOGLEVEL);

      req_reject(rc, bad, preq, NULL, NULL);

      return;
      }

    /* get next job */

nextjob:

    if (pjob != NULL)
      unlock_ji_mutex(pjob, __func__, "10", LOGLEVEL);

    if (type == tjstJob)
      break;

    if (type == tjstQueue)
      pjob = next_job(cntl->sc_pque->qu_jobs,&iter);
    else if (type == tjstSummarizeArraysQueue)
      pjob = next_job(cntl->sc_pque->qu_jobs_array_sum,&iter);
    else if (type == tjstSummarizeArraysServer)
      pjob = next_job(&array_summary,&iter);
    else if (type == tjstArray)
      {
      pjob = NULL;
      /* increment job_array_index until we find a non-null pointer or hit the end */
      while (++job_array_index < pa->ai_qs.array_size)
        {
        if (pa->job_ids[job_array_index] != NULL)
          {
          if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
            {
            break;
            }
          }
        }
      }
    else
      pjob = next_job(&alljobs,&iter);

    rc = 0;
    }  /* END while (pjob != NULL) */

  if (pa != NULL)
    {
    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
    }
 
  reply_send_svr(preq);

  if (LOGLEVEL >= 7)
    {
    log_event(PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      "req_statjob",
      "Successfully returned the status of queued jobs\n");
    }

  return;
  }  /* END req_stat_job_step2() */
Example #6
void req_stat_job_step2(

  struct stat_cntl *cntl)  /* I/O (free'd on return) */

  {
  batch_request         *preq = cntl->sc_origrq;
  svrattrl              *pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);
  job                   *pjob = NULL;

  struct batch_reply    *preply = &preq->rq_reply;
  int                    rc = 0;
  enum TJobStatTypeEnum  type = (enum TJobStatTypeEnum)cntl->sc_type;
  bool                   exec_only = false;

  int                    bad = 0;
  /* delta time - only report full pbs_attribute list if J->MTime > DTime */
  int                    job_array_index = -1;
  job_array             *pa = NULL;
  all_jobs_iterator     *iter;

  if (preq->rq_extend != NULL)
    {
    /* FORMAT:  { EXECQONLY } */
    if (strstr(preq->rq_extend, EXECQUEONLY))
      exec_only = true;
    }

  if ((type == tjstTruncatedServer) || 
      (type == tjstTruncatedQueue))
    {
    handle_truncated_qstat(exec_only, cntl->sc_condensed, preq);

    return;
    } /* END if ((type == tjstTruncatedServer) || ...) */
  else if (type == tjstJob)
    {
    pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE);

    if (pjob != NULL)
      {
      if ((rc = status_job(pjob, preq, pal, &preply->brp_un.brp_status, cntl->sc_condensed, &bad)))
        req_reject(rc, bad, preq, NULL, NULL);
      else
        reply_send_svr(preq);

      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
      }
    else
      {
      req_reject(PBSE_JOBNOTFOUND, bad, preq, NULL, NULL);
      }
    }
  else
    {
    if (type == tjstArray)
      {
      pa = get_array(preq->rq_ind.rq_status.rq_id);

      if (pa == NULL)
        {
        req_reject(PBSE_UNKARRAYID, 0, preq, NULL, "unable to find array");
        return;
        }
      }
    else if ((type == tjstSummarizeArraysQueue) || 
             (type == tjstSummarizeArraysServer))
      update_array_statuses();

    iter = get_correct_status_iterator(cntl);

    for (pjob = get_next_status_job(cntl, job_array_index, pa, iter);
         pjob != NULL;
         pjob = get_next_status_job(cntl, job_array_index, pa, iter))
      {
      mutex_mgr job_mutex(pjob->ji_mutex, true);

      /* go ahead and build the status reply for this job */
      if (pjob->ji_being_recycled == true)
        continue;

      if (exec_only)
        {
        if (cntl->sc_pque != NULL)
          {
          if (cntl->sc_pque->qu_qs.qu_type != QTYPE_Execution)
            continue;
          }
        else if (in_execution_queue(pjob, pa) == false)
          continue;
        }

      rc = status_job(pjob, preq, pal, &preply->brp_un.brp_status, cntl->sc_condensed, &bad);

      if ((rc != PBSE_NONE) && 
          (rc != PBSE_PERM))
        {
        if (pa != NULL)
          unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

        req_reject(rc, bad, preq, NULL, NULL);

        delete iter;

        return;
        }
      }  /* END for (pjob != NULL) */

    delete iter;

    if (pa != NULL)
      {
      unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
      }
   
    reply_send_svr(preq);
    }

  if (LOGLEVEL >= 7)
    {
    log_event(PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      "req_statjob",
      "Successfully returned the status of queued jobs\n");
    }

  return;
  }  /* END req_stat_job_step2() */
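/* Sketch (assumption, not from the source): in_execution_queue() is not
 * shown above; one implementation consistent with the exec_only logic in
 * the earlier versions of req_stat_job_step2() could be: */
bool in_execution_queue(

  job       *pjob,
  job_array *pa)

  {
  /* drop the array lock while acquiring the queue, as the older code did */
  if (pa != NULL)
    pthread_mutex_unlock(pa->ai_mutex);

  pbs_queue *pque = get_jobs_queue(&pjob);

  if (pa != NULL)
    pthread_mutex_lock(pa->ai_mutex);

  if ((pjob == NULL) ||
      (pque == NULL))
    return(false);

  bool is_exec = (pque->qu_qs.qu_type == QTYPE_Execution);

  unlock_queue(pque, __func__, "type checked", LOGLEVEL);

  return(is_exec);
  }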
Example #7
int is_array(
    
  char *id)

  {
  job_array      *pa;

  int             iter = -1;

  char      *bracket_ptr;
  char      *end_bracket_ptr;
  char       jobid[PBS_MAXSVRJOBID];
  char       temp_jobid[PBS_MAXSVRJOBID];

  snprintf(jobid, sizeof(jobid), "%s", id);

  /* Check to see if we have an array dependency.
   * If there is an array dependency count then we will have an id of
   * something like arrayid[][1]. We need to take off the [1] so we can
   * compare the array id with an existing array entry. */
  if ((bracket_ptr = strchr(jobid,'[')) != NULL)
    {
    /* Make sure the next character is ']' */
    if (*(++bracket_ptr) != ']')
      {
      /* If we do not have a ']' then we have bad syntax. */
      return(FALSE);
      }

    if (*(++bracket_ptr) == '[')
      {
      /* we made it to here. That means we have a count inside
         brackets. Just truncate them for the name comparison */
      end_bracket_ptr = strchr(bracket_ptr, ']');
      if (end_bracket_ptr == NULL)
        {
        /* if we do not have a ']' then we have bad syntax */
        return(FALSE);
        }
      /* advance end_bracket_ptr by one; it should then point to '\0' or '.' */
      end_bracket_ptr++;

      /* truncate the string */
      *bracket_ptr = 0; /* this makes jobid just the arrayid name */
      /* append the rest of the job id */
      snprintf(temp_jobid, sizeof(temp_jobid), "%s%s", jobid, end_bracket_ptr);
      snprintf(jobid, sizeof(jobid), "%s", temp_jobid);
      }
    }
  else
    {
    /* No '[' then we do not have an array */
    return (FALSE);
    }

  while ((pa = next_array(&iter)) != NULL)
    {
    lock_ai_mutex(pa, __func__, NULL, LOGLEVEL);
    
    if (strcmp(pa->ai_qs.parent_id, jobid) == 0)
      {
      unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

      return(TRUE);
      }

    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
    }

  return(FALSE);
  } /* END is_array() */
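/* Behavior sketch (illustrative): expected results on a server "svr" that
 * has an array "123[].svr", given the bracket handling above. */
void is_array_examples(void)

  {
  is_array((char *)"123[].svr");    /* TRUE:  matches the array's parent id */
  is_array((char *)"123[][2].svr"); /* TRUE:  the dependency count [2] is stripped */
  is_array((char *)"123[4].svr");   /* FALSE: a single subjob, not the array */
  is_array((char *)"124.svr");      /* FALSE: no '[' at all */
  }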
Example #8
int setup_array_struct(
    
  job *pjob)

  {
  job_array          *pa;
  array_request_node *rn;

  int                 bad_token_count;
  int                 array_size;
  int                 rc;
  char                log_buf[LOCAL_LOG_BUF_SIZE];
  long                max_array_size;

  pa = (job_array *)calloc(1, sizeof(job_array));

  if (pa == NULL)
    return(PBSE_MEM_MALLOC);

  pa->ai_qs.struct_version = ARRAY_QS_STRUCT_VERSION;
  
  strcpy(pa->ai_qs.parent_id, pjob->ji_qs.ji_jobid);
  strcpy(pa->ai_qs.fileprefix, pjob->ji_qs.ji_fileprefix);
  snprintf(pa->ai_qs.owner, sizeof(pa->ai_qs.owner), "%s", pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str);
  snprintf(pa->ai_qs.submit_host, sizeof(pa->ai_qs.submit_host), "%s", get_variable(pjob, pbs_o_host));

  pa->ai_qs.num_cloned = 0;
  CLEAR_HEAD(pa->request_tokens);

  pa->ai_mutex = (pthread_mutex_t *)calloc(1, sizeof(pthread_mutex_t));
  pthread_mutex_init(pa->ai_mutex,NULL);
  lock_ai_mutex(pa, __func__, NULL, LOGLEVEL);

  if (job_save(pjob, SAVEJOB_FULL, 0) != 0)
    {
    /* the array is deleted in svr_job_purge */
    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
    svr_job_purge(pjob);
    /* Does job array need to be removed? */

    if (LOGLEVEL >= 6)
      {
      log_record(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        (pjob != NULL) ? pjob->ji_qs.ji_jobid : "NULL",
        "cannot save job");
      }

    return(1);
    }

  if ((rc = set_slot_limit(pjob->ji_wattr[JOB_ATR_job_array_request].at_val.at_str, pa)))
    {
    long max_limit = 0;
    get_svr_attr_l(SRV_ATR_MaxSlotLimit, &max_limit);
    array_delete(pa);

    snprintf(log_buf,sizeof(log_buf),
      "Array %s requested a slot limit above the max limit %ld, rejecting\n",
      pa->ai_qs.parent_id,
      max_limit);

    log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_JOB,pa->ai_qs.parent_id,log_buf);

    return(INVALID_SLOT_LIMIT);
    }

  pa->ai_qs.jobs_running = 0;
  pa->ai_qs.num_started = 0;
  pa->ai_qs.num_failed = 0;
  pa->ai_qs.num_successful = 0;
  
  bad_token_count = parse_array_request(
                      pjob->ji_wattr[JOB_ATR_job_array_request].at_val.at_str,
                      &(pa->request_tokens));

  /* get the number of elements that should be allocated in the array */
  rn = (array_request_node *)GET_NEXT(pa->request_tokens);
  array_size = 0;
  pa->ai_qs.num_jobs = 0;
  while (rn != NULL) 
    {
    if (rn->end > array_size)
      array_size = rn->end;
    /* calculate the actual number of jobs (different from array size) */
    pa->ai_qs.num_jobs += rn->end - rn->start + 1;

    rn = (array_request_node *)GET_NEXT(rn->request_tokens_link);
    }

  /* size of array is the biggest index + 1 */
  array_size++; 

  if (get_svr_attr_l(SRV_ATR_MaxArraySize, &max_array_size) == PBSE_NONE)
    {
    if (max_array_size < pa->ai_qs.num_jobs)
      {
      array_delete(pa);

      return(ARRAY_TOO_LARGE);
      }
    }

  /* initialize the array */
  pa->job_ids = (char **)calloc(array_size, sizeof(char *));
  if (pa->job_ids == NULL)
    {
    sprintf(log_buf, "Failed to alloc job_ids: job %s", pjob->ji_qs.ji_jobid);
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
    return(PBSE_MEM_MALLOC);
    }


  /* remember array_size */
  pa->ai_qs.array_size = array_size;

  CLEAR_HEAD(pa->ai_qs.deps);

  array_save(pa);

  if (bad_token_count > 0)
    {
    array_delete(pa);
    return(2);
    }

  pjob->ji_arraystruct = pa;

  insert_array(pa);

  unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

  return(PBSE_NONE);
  } /* END setup_array_struct() */
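/* Caller sketch (assumption, not from the source): qsub -t handling would
 * call setup_array_struct() on the template job once
 * JOB_ATR_job_array_request holds a spec such as "0-99%5". The wrapper and
 * its error mapping are illustrative only. */
int create_array_for(

  job *pjob)

  {
  int rc = setup_array_struct(pjob);

  if (rc == PBSE_NONE)
    return(PBSE_NONE);     /* pjob->ji_arraystruct now points at the array */

  if (rc == INVALID_SLOT_LIMIT)
    return(PBSE_IVALREQ);  /* slot limit above the server maximum */

  return(PBSE_SYSTEM);     /* save failure (1), bad tokens (2), or ARRAY_TOO_LARGE */
  }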
Example #9
/* delete a job array struct from memory and disk. This is used when the number
 *  of jobs that belong to the array becomes zero.
 *  returns zero if there are no errors, non-zero otherwise
 */
int array_delete(
    
  job_array *pa)

  {
  int                      i;
  char                     path[MAXPATHLEN + 1];
  char                     log_buf[LOCAL_LOG_BUF_SIZE];
  array_request_node      *rn;
  struct array_depend     *pdep;
  struct array_depend_job *pdj;

  /* first thing to do is take this out of the servers list of all arrays */
  remove_array(pa);

  /* unlock the mutex and free it */
  unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
  free(pa->ai_mutex);

  /* delete the on disk copy of the struct */
  snprintf(path, sizeof(path), "%s%s%s",
    path_arrays, pa->ai_qs.fileprefix, ARRAY_FILE_SUFFIX);

  if (unlink(path))
    {
    sprintf(log_buf, "unable to delete %s", path);
    log_err(errno, "array_delete", log_buf);
    }

  /* clear array request linked list */
  for (rn = (array_request_node *)GET_NEXT(pa->request_tokens);
       rn != NULL;
       rn = (array_request_node *)GET_NEXT(pa->request_tokens))
    {
    delete_link(&rn->request_tokens_link);
    free(rn);
    }

  /* free the memory for the job pointers */
  for (i = 0; i < pa->ai_qs.array_size; i++)
    {
    if (pa->job_ids[i] != NULL)
      free(pa->job_ids[i]);
    }

  free(pa->job_ids);

  /* free the dependencies, if any */
  for (pdep = (struct array_depend *)GET_NEXT(pa->ai_qs.deps); 
       pdep != NULL;
       pdep = (struct array_depend *)GET_NEXT(pa->ai_qs.deps))
    {
    delete_link(&pdep->dp_link);

    for (pdj = (struct array_depend_job *)GET_NEXT(pdep->dp_jobs);
         pdj != NULL;
         pdj = (struct array_depend_job *)GET_NEXT(pdep->dp_jobs))
      {
      delete_link(&pdj->dc_link);
      free(pdj);
      }

    free(pdep);
    }

  /* purge the "template" job, 
     this also deletes the shared script file for the array*/
  if (pa->ai_qs.parent_id[0] != '\0')
    {
    job *pjob;
    if ((pjob = svr_find_job(pa->ai_qs.parent_id, FALSE)) != NULL)
      svr_job_purge(pjob);
    }

  /* free the memory allocated for the struct */
  free(pa);

  return(0);
  } /* END array_delete() */
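/* Usage sketch (assumption, not from the source): array_delete() expects
 * ai_mutex to be held and both unlocks and frees it, so pa must not be
 * touched afterwards. The helper name is illustrative. */
void drop_array(

  char *array_id)

  {
  job_array *pa = get_array(array_id);  /* returns with ai_mutex held */

  if (pa != NULL)
    array_delete(pa);                   /* unlocks + frees pa, removes disk copy */
  }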
Example #10
void update_array_statuses(
    
  job_array *owned)

  {
  job_array      *pa;
  job            *pj;
  job            *pjob;
  int             i;
  int             iter = -1;
  unsigned int    running;
  unsigned int    queued;
  unsigned int    held;
  unsigned int    complete;
  char            log_buf[LOCAL_LOG_BUF_SIZE];
  char            jobid[PBS_MAXSVRJOBID + 1];

  while ((pa = next_array_check(&iter, owned)) != NULL)
    {
    running = 0;
    queued = 0;
    held = 0;
    complete = 0;
    
    for (i = 0; i < pa->ai_qs.array_size; i++)
      {
      if (pa->job_ids[i] != NULL)
        {
        if ((pj = svr_find_job(pa->job_ids[i], TRUE)) == NULL)
          {
          free(pa->job_ids[i]);
          pa->job_ids[i] = NULL;
          }
        else
          {
          if (pj->ji_qs.ji_state == JOB_STATE_RUNNING)
            {
            running++;
            }
          else if (pj->ji_qs.ji_state == JOB_STATE_QUEUED)
            {
            queued++;
            }
          else if (pj->ji_qs.ji_state == JOB_STATE_HELD)
            {
            held++;
            }
          else if (pj->ji_qs.ji_state == JOB_STATE_COMPLETE)
            {
            complete++;
            }
          
          unlock_ji_mutex(pj, __func__, "1", LOGLEVEL);
          }
        }
      }
     
    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "%s: unlocking ai_mutex", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
      }

    /* copy the parent id out while the array is still locked */
    strcpy(jobid, pa->ai_qs.parent_id);

    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
    
    if ((pjob = svr_find_job(jobid, TRUE)) != NULL)
      {
      if (running > 0)
        {
        svr_setjobstate(pjob, JOB_STATE_RUNNING, pjob->ji_qs.ji_substate, FALSE);
        }
      else if (held > 0 && queued == 0 && complete == 0)
        {
        svr_setjobstate(pjob, JOB_STATE_HELD, pjob->ji_qs.ji_substate, FALSE);
        }
      else if (complete > 0 && queued == 0 && held == 0)
        {
        svr_setjobstate(pjob, JOB_STATE_COMPLETE, pjob->ji_qs.ji_substate, FALSE);
        }
      else 
        {
        /* default to just calling the array queued */
        svr_setjobstate(pjob, JOB_STATE_QUEUED, pjob->ji_qs.ji_substate, FALSE);
        }

      unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
      }
      
    if (pa == owned)
      {
      lock_ai_mutex(pa, __func__, "1", LOGLEVEL);
      }
    }
  } /* END update_array_statuses() */
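/* Usage sketch (assumption, not from the source): passing the array you
 * already hold as 'owned' keeps the caller's lock state intact, because
 * update_array_statuses() re-locks an owned array before returning. */
void refresh_owned_array(

  char *array_id)

  {
  job_array *pa = get_array(array_id);  /* returns with ai_mutex held */

  if (pa == NULL)
    return;

  update_array_statuses(pa);            /* pa is still locked afterwards */

  unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
  }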
Example #11
job *job_recov(

  char *filename) /* I */   /* pathname to job save file */

  {
  int  fds;
  job  *pj;
  char *pn;
  char  namebuf[MAXPATHLEN];
  char  log_buf[LOCAL_LOG_BUF_SIZE];

#ifndef PBS_MOM
  char       parent_id[PBS_MAXSVRJOBID + 1];
  job_array *pa;
#endif

  pj = job_alloc(); /* allocate & initialize job structure space */

  if (pj == NULL)
    {
    /* FAILURE - cannot alloc memory */

    return(NULL);
    }

  snprintf(namebuf, MAXPATHLEN, "%s%s", path_jobs, filename); /* job directory path, filename */

  fds = open(namebuf, O_RDONLY, 0);

  if (fds < 0)
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "unable to open %s", namebuf);

    log_err(errno, __func__, log_buf);

#ifndef PBS_MOM
    unlock_ji_mutex(pj, __func__, "1", LOGLEVEL);
    free(pj->ji_mutex);
#endif

    free((char *)pj);

    /* FAILURE - cannot open job file */

    return(NULL);
    }

  /* read in job quick save sub-structure */

  if (read_ac_socket(fds, (char *)&pj->ji_qs, sizeof(pj->ji_qs)) != sizeof(pj->ji_qs) &&
      pj->ji_qs.qs_version == PBS_QS_VERSION)
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Unable to read %s", namebuf);

    log_err(errno, __func__, log_buf);

#ifndef PBS_MOM
    unlock_ji_mutex(pj, __func__, "2", LOGLEVEL);
    free(pj->ji_mutex);
#endif

    free((char *)pj);

    close(fds);

    return(NULL);
    }

  /* is ji_qs the version we expect? */

  if (pj->ji_qs.qs_version != PBS_QS_VERSION)
    {
    /* ji_qs is older version */
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
      "%s appears to be from an old version. Attempting to convert.\n",
      namebuf);

    log_err(-1, __func__, log_buf);

    if (job_qs_upgrade(pj, fds, namebuf, pj->ji_qs.qs_version) != 0)
      {
      snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "unable to upgrade %s\n", namebuf);

      log_err(-1, __func__, log_buf);

#ifndef PBS_MOM
      unlock_ji_mutex(pj, __func__, "3", LOGLEVEL);
      free(pj->ji_mutex);
#endif

      free((char *)pj);

      close(fds);

      return(NULL);
      }

    }  /* END if (pj->ji_qs.qs_version != PBS_QS_VERSION) */

  /* Does file name match the internal name? */
  /* This detects ghost files */

  pn = strrchr(namebuf, (int)'/') + 1;

  if (strncmp(pn, pj->ji_qs.ji_fileprefix, strlen(pj->ji_qs.ji_fileprefix)) != 0)
    {
    /* mismatch, discard job */

    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Job Id %s does not match file name for %s",
      pj->ji_qs.ji_jobid,
      namebuf);

    log_err(-1, __func__, log_buf);

#ifndef PBS_MOM
    unlock_ji_mutex(pj, __func__, "4", LOGLEVEL);
    free(pj->ji_mutex);
#endif

    free((char *)pj);

    close(fds);

    return(NULL);
    }

  /* read in working attributes */

  if (recov_attr(
        fds,
        pj,
        job_attr_def,
        pj->ji_wattr,
        JOB_ATR_LAST,
        JOB_ATR_UNKN,
        TRUE) != 0) 
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "unable to recover %s (file is likely corrupted)", namebuf);

    log_err(-1, __func__, log_buf);

#ifndef PBS_MOM
    unlock_ji_mutex(pj, __func__, "5", LOGLEVEL);
    job_free(pj, FALSE);
#else
    mom_job_free(pj);
#endif


    close(fds);

    return(NULL);
    }

#ifndef PBS_MOM
  /* Comment out the mother superior tracking. Will be debugged later 
  if (pj->ji_wattr[JOB_ATR_exec_host].at_val.at_str != NULL)
    {*/
    /* add job to the mother superior list for its node */
/*    char *ms = strdup(pj->ji_wattr[JOB_ATR_exec_host].at_val.at_str);
    char *end = strchr(ms, '/');

    if (end != NULL)
      *end = '\0';

    if ((end = strchr(ms, '+')) != NULL)
      *end = '\0';

    add_to_ms_list(ms, pj);

    free(ms);
    }*/
#endif

#ifdef PBS_MOM
  /* read in tm sockets and ips */

  if (recov_tmsock(fds, pj) != 0)
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
        "warning: tmsockets not recovered from %s (written by an older pbs_mom?)",
        namebuf);

    log_err(-1, __func__, log_buf);
    }

#else /* not PBS_MOM */

  if (strchr(pj->ji_qs.ji_jobid, '[') != NULL)
    {
    /* job is part of an array.  We need to put a link back to the server
    job array struct for this array. We also have to link this job into
    the linked list of jobs belonging to the array. */

    array_get_parent_id(pj->ji_qs.ji_jobid, parent_id);
    pa = get_array(parent_id);
    if (pa == NULL)
      {   
      job_abt(&pj, (char *)"Array job missing array struct, aborting job");
      close(fds);
      return(NULL);
      }

    strcpy(pj->ji_arraystructid, parent_id);

    if (strcmp(parent_id, pj->ji_qs.ji_jobid) == 0)
      {
      pj->ji_is_array_template = TRUE;
      }
    else
      {
      pa->job_ids[(int)pj->ji_wattr[JOB_ATR_job_array_id].at_val.at_long] = strdup(pj->ji_qs.ji_jobid);
      pa->jobs_recovered++;

      /* This is a bit of a kluge, but for some reason if an array job was 
         on hold when the server went down the ji_wattr[JOB_ATR_hold].at_val.at_long
         value is 0 on recovery even though pj->ji_qs.ji_state is JOB_STATE_HELD and
         the substate is JOB_SUBSTATE_HELD
      */
      if ((pj->ji_qs.ji_state == JOB_STATE_HELD) &&
          (pj->ji_qs.ji_substate == JOB_SUBSTATE_HELD))
        {
        pj->ji_wattr[JOB_ATR_hold].at_val.at_long = HOLD_l;
        pj->ji_wattr[JOB_ATR_hold].at_flags = ATR_VFLAG_SET;
        }
      }

    if (pa != NULL)
      {
      unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
      }
    }

#endif

  close(fds);

  pj->ji_commit_done = 1;

  /* all done recovering the job */

  job_save(pj, SAVEJOB_FULL, 0);

  return(pj);
  }  /* END job_recov() */
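/* Recovery-loop sketch (assumption, not from the source): server startup
 * would scan the jobs directory and call job_recov() per file. Since the
 * error paths above unlock pj, a successful return holds ji_mutex and the
 * caller must release it. */
void recover_one_job(

  char *filename)

  {
  job *pj = job_recov(filename);

  if (pj == NULL)
    return;  /* failures were already logged */

  /* ... enqueue / link the recovered job here ... */

  unlock_ji_mutex(pj, __func__, "1", LOGLEVEL);
  }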
Example #12
int req_releasearray(

  void *vp) /* I */

  {
  job                  *pjob;
  job_array            *pa;
  char                 *range;
  int                   rc;
  int                   index;
  struct batch_request *preq = (struct batch_request *)vp;

  pa = get_array(preq->rq_ind.rq_release.rq_objname);
  if (pa == NULL)
    {
    req_reject(PBSE_IVALREQ,0,preq,NULL,"Cannot find array");
    return(PBSE_NONE);
    }

  while (TRUE)
    {
    if (((index = first_job_index(pa)) == -1) ||
        (pa->job_ids[index] == NULL))
      {
      unlock_ai_mutex(pa, __func__, (char *)"1", LOGLEVEL);

      return(PBSE_NONE);
      }

    if ((pjob = svr_find_job(pa->job_ids[index], FALSE)) == NULL)
      {
      free(pa->job_ids[index]);
      pa->job_ids[index] = NULL;
      }
    else
      break;
    }

  if (svr_authorize_jobreq(preq, pjob) == -1)
    {
    req_reject(PBSE_PERM,0,preq,NULL,NULL);

    unlock_ai_mutex(pa, __func__, (char *)"2", LOGLEVEL);
    unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL);

    return(PBSE_NONE);
    }

  unlock_ji_mutex(pjob, __func__, (char *)"2", LOGLEVEL);

  range = preq->rq_extend;
  if ((range != NULL) &&
      (strstr(range,ARRAY_RANGE) != NULL))
    {
    /* parse the array range */
    if ((rc = release_array_range(pa,preq,range)) != 0)
      {
      unlock_ai_mutex(pa, __func__, (char *)"3", LOGLEVEL);

      req_reject(rc,0,preq,NULL,NULL);

      return(PBSE_NONE);
      }
    }
  else if ((rc = release_whole_array(pa,preq)) != 0)
    {
    unlock_ai_mutex(pa, __func__, (char *)"4", LOGLEVEL);

    req_reject(rc,0,preq,NULL,NULL);

    return(PBSE_NONE);
    }
  
  unlock_ai_mutex(pa, __func__, (char *)"5", LOGLEVEL);

  reply_ack(preq);

  return(PBSE_NONE);
  } /* END req_releasearray() */
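/* Sketch (assumption, not from the source): first_job_index() is not shown
 * above; an implementation consistent with its use and the -1 sentinel
 * would scan for the first surviving subjob id. */
int first_job_index(

  job_array *pa)

  {
  int i;

  for (i = 0; i < pa->ai_qs.array_size; i++)
    {
    if (pa->job_ids[i] != NULL)
      return(i);
    }

  return(-1);
  }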
Example #13
int set_array_job_ids(

  job  **pjob,       /* M */
  char  *log_buf,    /* error Buffer */
  size_t buflen)     /* error buffer length */

  {
  int rc = PBSE_NONE;
#ifndef PBS_MOM
  job *pj = *pjob;
  job_array *pa;
  char       parent_id[PBS_MAXSVRJOBID + 1];

  // If this variable isn't set this job isn't actually an array subjob.
  if ((pj->ji_wattr[JOB_ATR_job_array_id].at_flags & ATR_VFLAG_SET) == 0)
    {
    // Check and set if this is the array template job
    char *open_bracket = strchr(pj->ji_qs.ji_jobid, '[');
    if (open_bracket != NULL)
      {
      if (*(open_bracket + 1) == ']')
        pj->ji_is_array_template = true;
      }

    return(rc);
    }

  if (strchr(pj->ji_qs.ji_jobid, '[') != NULL)
    {
    /* job is part of an array.  We need to put a link back to the server
    job array struct for this array. We also have to link this job into
    the linked list of jobs belonging to the array. */

    array_get_parent_id(pj->ji_qs.ji_jobid, parent_id);
    pa = get_array(parent_id);
    if (pa == NULL)
      {
      job_abt(&pj, (char *)"Array job missing array struct, aborting job");
      snprintf(log_buf, buflen, "Array job missing array struct %s", __func__);
      return(-1);
      }

    strcpy(pj->ji_arraystructid, parent_id);

    if (strcmp(parent_id, pj->ji_qs.ji_jobid) == 0)
      {
      pj->ji_is_array_template = true;
      }
    else
      {
      pa->job_ids[(int)pj->ji_wattr[JOB_ATR_job_array_id].at_val.at_long] = strdup(pj->ji_qs.ji_jobid);
      pa->jobs_recovered++;

      /* This is a bit of a kluge, but for some reason if an array job was 
         on hold when the server went down the ji_wattr[JOB_ATR_hold].at_val.at_long
         value is 0 on recovery even though pj->ji_qs.ji_state is JOB_STATE_HELD and
         the substate is JOB_SUBSTATE_HELD
      */
      if ((pj->ji_qs.ji_state == JOB_STATE_HELD) &&
          (pj->ji_qs.ji_substate == JOB_SUBSTATE_HELD))
        {
        pj->ji_wattr[JOB_ATR_hold].at_val.at_long = HOLD_l;
        pj->ji_wattr[JOB_ATR_hold].at_flags = ATR_VFLAG_SET;
        }
      }

    if (pa != NULL)
      {
      unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
      }
    }
#endif /* !PBS_MOM */

  return(rc);
  }
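/* Caller sketch (assumption, not from the source): a recovery path (compare
 * job_recov() above) could delegate the array re-linking like this; the
 * wrapper name is illustrative. */
int relink_recovered_job(

  job **pjob)

  {
  char log_buf[LOCAL_LOG_BUF_SIZE];

  if (set_array_job_ids(pjob, log_buf, sizeof(log_buf)) != PBSE_NONE)
    {
    /* *pjob was aborted; log_buf says why */
    log_err(-1, __func__, log_buf);

    return(-1);
    }

  return(PBSE_NONE);
  }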