Example #1
job *get_next_status_job(

  struct stat_cntl  *cntl,
  int               &job_array_index,
  job_array         *pa,
  all_jobs_iterator *iter)

  {
  job *pjob = NULL;

  if (cntl->sc_type == tjstQueue)
    pjob = next_job(cntl->sc_pque->qu_jobs,iter);
  else if (cntl->sc_type == tjstSummarizeArraysQueue)
    pjob = next_job(cntl->sc_pque->qu_jobs_array_sum,iter);
  else if (cntl->sc_type == tjstSummarizeArraysServer)
    pjob = next_job(&array_summary,iter);
  else if (cntl->sc_type == tjstArray)
    {
    /* increment job_array_index until we find a non-null pointer or hit the end */
    while (++job_array_index < pa->ai_qs.array_size)
      {
      if (pa->job_ids[job_array_index] != NULL)
        {
        if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
          {
          break;
          }
        }
      }
    }
  else
    pjob = next_job(&alljobs, iter);

  return(pjob);
  } // END get_next_status_job()
Example #2
START_TEST(next_job_test)
  {
  struct all_jobs alljobs;
  struct job *result;
  initialize_all_jobs_array(&alljobs);
  result = next_job(NULL,NULL);

  fail_unless(result == NULL, "null input parameters fail");

  result = next_job(&alljobs,NULL);
  fail_unless(result == NULL, "NULL input iterator fail");
  }
END_TEST
Example #3
int handle_requeue_all(

  batch_request *preq)

  {
  int                rc;
  job               *pjob;
  all_jobs_iterator *iter;

  if ((preq->rq_perm & (ATR_DFLAG_MGWR)) == 0)
    {
    rc = PBSE_PERM;
    req_reject(rc, 0, preq, NULL, "You must be a manager to requeue all jobs");
    return(rc);
    }

  alljobs.lock();
  iter = alljobs.get_iterator();
  alljobs.unlock();

  while ((pjob = next_job(&alljobs, iter)) != NULL)
    {
    mutex_mgr job_mutex(pjob->ji_mutex, true);
    requeue_job_without_contacting_mom(*pjob);
    }

  delete iter;

  reply_ack(preq);

  return(PBSE_NONE);
  } /* END handle_requeue_all() */
Example #4
job *next_job(

  struct all_jobs *aj,
  int             *iter)

  {
  job *pjob;

  pthread_mutex_lock(aj->alljobs_mutex);

  pjob = (job *)next_thing(aj->ra,iter);

  pthread_mutex_unlock(aj->alljobs_mutex);

  if (pjob != NULL)
    {
    lock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);

    if (pjob->ji_being_recycled == TRUE)
      {
      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

      pjob = next_job(aj,iter);
      }
    }

  return(pjob);
  } /* END next_job() */
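
Note: the recursive call above that skips recycled jobs is a tail call, so the same traversal can be written as a loop, avoiding stack growth when many consecutive entries are flagged for recycling. A minimal sketch under the same assumptions as the example (next_thing(), lock_ji_mutex(), and unlock_ji_mutex() behave as shown); next_job_iterative is a hypothetical name:

job *next_job_iterative(

  struct all_jobs *aj,
  int             *iter)

  {
  job *pjob;

  for (;;)
    {
    pthread_mutex_lock(aj->alljobs_mutex);

    pjob = (job *)next_thing(aj->ra, iter);

    pthread_mutex_unlock(aj->alljobs_mutex);

    if (pjob == NULL)
      break;

    lock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);

    if (pjob->ji_being_recycled != TRUE)
      break; /* live job: return it locked */

    /* job is being recycled: release it and advance to the next entry */
    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
    }

  return(pjob);
  } /* END next_job_iterative() */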
Example #5
/*
 *	Thread function
 *		Each thread in the pool runs this function from the moment it is
 *		created. Each of them waits for a job to be added to the queue;
 *		when that happens, one of them acquires the job and executes it.
 */
static void* thread_func(void *args) {
	thread_pool_t* pool =(thread_pool_t*) args;

	while(1) {
		pthread_mutex_lock(&pool->mutex);
		while(pool->queue.length == 0) {
			DEBUG("Wating for jobs...");
			pthread_cond_wait(&pool->has_jobs, &pool->mutex);
		}
		pool->n_threads_working++;
		DEBUG("Got a Job!");

		job_t* job = next_job(&pool->queue);
		if(job == NULL)
			continue;

		pthread_mutex_unlock(&pool->mutex);
		job->func(job->arg);

		pthread_mutex_lock(&pool->mutex);
		pool->n_threads_working--;
		pthread_mutex_unlock(&pool->mutex);
	}

	return NULL;
}
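
For context, the producer side that pairs with this consumer loop appends to pool->queue under the same mutex and signals has_jobs. A minimal sketch; thread_pool_add() and enqueue_job() are assumed helper names for illustration, not part of the original pool API:

#include <stdlib.h>

/* Hypothetical producer: the pool fields mirror those used by thread_func
 * above; enqueue_job() is an assumed helper that appends the job and
 * increments pool->queue.length. */
static int thread_pool_add(thread_pool_t* pool, void (*func)(void*), void* arg) {
	job_t* job = malloc(sizeof(*job));
	if(job == NULL)
		return -1;
	job->func = func;
	job->arg = arg;

	pthread_mutex_lock(&pool->mutex);
	enqueue_job(&pool->queue, job);
	/* wake one waiting worker; it re-checks queue.length under the mutex */
	pthread_cond_signal(&pool->has_jobs);
	pthread_mutex_unlock(&pool->mutex);
	return 0;
}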
Example #6
int line_solve(Puzzle *puz, Solution *sol, int contradicting)
{
    extern dir_t cont_dir;
    extern line_t cont_line;
    dir_t dir;
    line_t i;
    int depth;

    while (next_job(puz, &dir, &i, &depth))
    {
        nlines++;
        if ((VB && !VC) || WL(dir,i))
            printf("*** %s %d\n",CLUENAME(puz->type,dir), i);
        if (VB || WL(dir,i))
            dump_line(stdout,puz,sol,dir,i);

        if (contradicting && depth >= contradepth)
        {
            /* At max depth we just check if the line is solvable */
            line_t *pos, *bcl;
            if (!left_solve(puz, sol, dir, i, 0, &pos, &bcl))
            {
                if ((VC&&VV) || WL(dir,i))
                    printf("C: %s %d OK AT DEPTH %d\n",
                           cluename(puz->type,dir),i,depth);
            }
            else
            {
                if ((VC&&VV) || WL(dir,i))
                    printf("C: %s %d FAILED AT DEPTH %d\n",
                           cluename(puz->type,dir),i,depth);
                if (contradicting) {
                    cont_dir= dir;
                    cont_line= i;
                }
                return 0;
            }
        }
        else if (apply_lro(puz, sol, dir, i, depth + 1))
        {
            /* Found a contradiction */
            if (contradicting) {
                cont_dir= dir;
                cont_line= i;
            }
            return 0;
        }

        if (VJ)
        {
            printf("CURRENT JOBS:\n");
            dump_jobs(stdout,puz);
        }
    }
    return 1;
}
Example #7
static void* bdberl_tpool_main(void* arg)
{
    TPool* tpool = (TPool*)arg;

    LOCK(tpool);

    tpool->active_threads++;

    while(1)
    {
        // Check for shutdown...
        if (tpool->shutdown)
        {
            tpool->active_threads--;
            erl_drv_cond_broadcast(tpool->work_cv);
            UNLOCK(tpool);
            return 0;
        }

        // Get the next job
        TPoolJob* job = next_job(tpool);
        if (job)
        {
            // Unlock to avoid blocking others
            UNLOCK(tpool);

            // Invoke the function
            (*(job->main_fn))(job->arg);

            // Relock
            LOCK(tpool);

            // Mark the job as not running (important for cancellation to know it's done)
            job->running = 0;

            // If the job was cancelled, signal the cancellation cv so that anyone waiting on the
            // job knows it's complete
            if (job->canceled)
            {
                erl_drv_cond_broadcast(tpool->cancel_cv);
            }
        
            // Cleanup the job (remove from active list, free, etc.)
            cleanup_job(tpool, job);
        }
        else
        {
            // Wait for a job to come available then jump back to top of loop
            erl_drv_cond_wait(tpool->work_cv, tpool->lock);
        }
    }

    return 0;
}
Example #8
job *find_array_template(
    
  char *arrayid)

  {
  char *at;
  char *comp;
  int   different = FALSE;
  int   iter = -1;

  job  *pj;

  if ((at = strchr(arrayid, (int)'@')) != NULL)
    *at = '\0'; /* strip off @server_name */

  if ((is_svr_attr_set(SRV_ATR_display_job_server_suffix) == TRUE) ||
      (is_svr_attr_set(SRV_ATR_job_suffix_alias) == TRUE))
    {
    comp = get_correct_jobname(arrayid);
    different = TRUE;

    if (comp == NULL)
      return NULL;
    }
  else
    {
    comp = arrayid;
    }

  while ((pj = next_job(&array_summary,&iter)) != NULL)
    {
    if (!strcmp(comp, pj->ji_qs.ji_jobid))
      break;

    unlock_ji_mutex(pj, __func__, NULL, LOGLEVEL);
    }

  if (at)
    *at = '@'; /* restore @server_name */

  if (different)
    free(comp);

  return(pj);  /* may be NULL */
  }   /* END find_array_template() */
Example #9
static void* thr_fn(void *arg) {
    struct worker_t *w = (struct worker_t*)arg;
    struct job_t job;
    unsigned int sleep_time = 0, slept_time;
    time_t start, now;
    char now_str[128], ap_str[128];
    struct tm now_tm, ap_tm;
    sigset_t mask;

    sigfillset(&mask);
    sigdelset(&mask, SIG_WORKER_INTERRUPT);
    sigdelset(&mask, SIG_WORKER_KILL);
    pthread_sigmask(SIG_BLOCK, &mask, NULL);

    while (!__sync_fetch_and_add(&w->exit_loop, 0)) {
        start = time(NULL);
        do {
            if (sleep_time == 0)
                sleep_time = w->interval;
            next_job(w, &job, &sleep_time);
            if (job.id > 0)
                w->wlog("next job[%d] start in %d seconds\n", job.id, sleep_time);
            sleep_time = sleep(sleep_time);
            if (sleep_time > 0)
                w->wlog("interrupt by signal, %d seconds left\n", sleep_time);
        }
        while (sleep_time > 0);

        now = time(NULL);
        slept_time = now - start;
        strftime(now_str, 128, "%T", localtime_r(&now, &now_tm));
        if (job.id > 0) {
            strftime(ap_str, 128, "%T", localtime_r(&job.ap_time, &ap_tm));
            w->wlog("[%s] do job[%d], appointment time[%s], slept %d seconds\n",
                    now_str, job.id, ap_str, slept_time);
            delete_job(w, job.id);
        } else {
            w->wlog("[%s] slept %d seconds\n", now_str, slept_time);
        }
    }

    return NULL;
}
Example #10
job *next_job(

  all_jobs          *aj,
  all_jobs_iterator *iter)

  {
  job *pjob;

  if (aj == NULL)
    {
    log_err(PBSE_BAD_PARAMETER, __func__, "null input pointer to all_jobs struct");
    return(NULL);
    }
  if (iter == NULL)
    {
    log_err(PBSE_BAD_PARAMETER, __func__, "null input iterator");
    return(NULL);
    }

  aj->lock();

  pjob = iter->get_next_item();

  aj->unlock();

  if (pjob != NULL)
    {
    lock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);

    if (pjob->ji_being_recycled == TRUE)
      {
      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

      pjob = next_job(aj,iter);
      }
    }

  return(pjob);
  } /* END next_job() */
Example #11
job *next_job(

  struct all_jobs *aj,
  int             *iter)

  {
  job *pjob;

  if (aj == NULL)
    {
    log_err(PBSE_BAD_PARAMETER, __func__, "null input pointer to all_jobs struct");
    return(NULL);
    }
  if (iter == NULL)
    {
    log_err(PBSE_BAD_PARAMETER, __func__, "null input iterator");
    return(NULL);
    }

  pthread_mutex_lock(aj->alljobs_mutex);

  pjob = (job *)next_thing(aj->ra,iter);

  pthread_mutex_unlock(aj->alljobs_mutex);

  if (pjob != NULL)
    {
    lock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);

    if (pjob->ji_being_recycled == TRUE)
      {
      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

      pjob = next_job(aj,iter);
      }
    }

  return(pjob);
  } /* END next_job() */
Example #12
void handle_truncated_qstat(
    
  bool           exec_only,
  bool           condensed,
  batch_request *preq)

  {
  long                 sentJobCounter = 0;
  long                 qmaxreport;
  all_queues_iterator *queue_iter = NULL;
  pbs_queue           *pque;
  char                 log_buf[LOCAL_LOG_BUF_SIZE];
  job                 *pjob;
  svrattrl            *pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);
  batch_reply         *preply = &preq->rq_reply;
  int                  bad = 0;

  svr_queues.lock();
  queue_iter = svr_queues.get_iterator();
  svr_queues.unlock();

  /* loop through all queues */
  while ((pque = next_queue(&svr_queues, queue_iter)) != NULL)
    {
    long      qjcounter = 0;
    mutex_mgr queue_mutex(pque->qu_mutex, true);

    if ((exec_only == true) &&
        (pque->qu_qs.qu_type != QTYPE_Execution))
      {
      /* ignore routing queues */
      continue;
      }

    if (((pque->qu_attr[QA_ATR_MaxReport].at_flags & ATR_VFLAG_SET) != 0) &&
        (pque->qu_attr[QA_ATR_MaxReport].at_val.at_long >= 0))
      {
      qmaxreport = pque->qu_attr[QA_ATR_MaxReport].at_val.at_long;
      }
    else
      {
      qmaxreport = TMAX_JOB;
      }

    if (LOGLEVEL >= 5)
      {
      snprintf(log_buf, sizeof(log_buf), "Reporting up to %ld idle jobs in queue %s\n",
        qmaxreport,
        pque->qu_qs.qu_name);

      log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_QUEUE,pque->qu_qs.qu_name,log_buf);
      }

    /* loop through jobs in queue */
    all_jobs_iterator *jobiter = NULL;
    pque->qu_jobs->lock();
    jobiter = pque->qu_jobs->get_iterator();
    pque->qu_jobs->unlock();

    while ((pjob = next_job(pque->qu_jobs, jobiter)) != NULL)
      {
      mutex_mgr job_mgr(pjob->ji_mutex, true);

      if ((qjcounter >= qmaxreport) &&
          (pjob->ji_qs.ji_state == JOB_STATE_QUEUED))
        {
        /* max_report of queued jobs reached for queue */
        continue;
        }

      int rc = status_job(pjob, preq, pal, &preply->brp_un.brp_status, condensed, &bad);

      if ((rc != 0) &&
          (rc != PBSE_PERM))
        {
        req_reject(rc, bad, preq, NULL, NULL);

        delete queue_iter;

        return;
        }

      sentJobCounter++;

      if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED)
        qjcounter++;
      } /* END foreach (pjob from pque) */

    if (LOGLEVEL >= 5)
      {
      snprintf(log_buf, sizeof(log_buf), "Reported %ld total jobs for queue %s\n",
        sentJobCounter,
        pque->qu_qs.qu_name);

      log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_QUEUE,pque->qu_qs.qu_name,log_buf);
      }
    } /* END for (pque) */

  reply_send_svr(preq);

  delete queue_iter;

  return;
  } // END handle_truncated_qstat()
Example #13
void svr_shutdown(

  int type) /* I */

  {
  pbs_attribute *pattr;
  job           *pjob;
  long           state = SV_STATE_DOWN;
  int            iter;
  char           log_buf[LOCAL_LOG_BUF_SIZE];

  close(lockfds);

  save_queues();

  /* Lets start by logging shutdown and saving everything */
  get_svr_attr_l(SRV_ATR_State, &state);

  strcpy(log_buf, msg_shutdown_start);

  if (state == SV_STATE_SHUTIMM)
    {
    /* if already shutting down, another Immediate/signal will force it */
    if ((type == SHUT_IMMEDIATE) || (type == SHUT_SIG))
      {
      state = SV_STATE_DOWN;
      set_svr_attr(SRV_ATR_State, &state);

      strcat(log_buf, "Forced");

      log_event(
        PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_DEBUG,
        PBS_EVENTCLASS_SERVER,
        msg_daemonname,
        log_buf);

      return;
      }
    }

  if (type == SHUT_IMMEDIATE)
    {
    state = SV_STATE_SHUTIMM;
    set_svr_attr(SRV_ATR_State, &state);

    strcat(log_buf, "Immediate");
    }
  else if (type == SHUT_DELAY)
    {
    state = SV_STATE_SHUTDEL;
    set_svr_attr(SRV_ATR_State, &state);

    strcat(log_buf, "Delayed");
    }
  else if (type == SHUT_QUICK)
    {
    state = SV_STATE_DOWN; /* set to down to break the pbsd_main loop */
    set_svr_attr(SRV_ATR_State, &state);

    strcat(log_buf, "Quick");
    }
  else
    {
    state = SV_STATE_SHUTIMM;
    set_svr_attr(SRV_ATR_State, &state);

    strcat(log_buf, "By Signal");
    }

  log_event(
    PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_DEBUG,
    PBS_EVENTCLASS_SERVER,
    msg_daemonname,
    log_buf);

  if ((type == SHUT_QUICK) || (type == SHUT_SIG)) /* quick, leave jobs as are */
    {
    return;
    }

  svr_save(&server, SVR_SAVE_QUICK);

  iter = -1;

  while ((pjob = next_job(&alljobs,&iter)) != NULL)
    {
    if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
      {
      pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HOTSTART | JOB_SVFLG_HASRUN;

      pattr = &pjob->ji_wattr[JOB_ATR_checkpoint];

      if ((pattr->at_flags & ATR_VFLAG_SET) &&
          ((csv_find_string(pattr->at_val.at_str, "s") != NULL) ||
           (csv_find_string(pattr->at_val.at_str, "c") != NULL) ||
           (csv_find_string(pattr->at_val.at_str, "shutdown") != NULL)))
        {
        /* do checkpoint of job */

        if (shutdown_checkpoint(&pjob) == 0)
          {
          if (pjob != NULL)
            unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

          continue;
          }
        }

      /* if no checkpoint (not supported, not allowed, or failed), */
      /* rerun if possible, else kill job */

      rerun_or_kill(&pjob, msg_on_shutdown);
      }

    if (pjob != NULL)
      unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
    }

  return;
  }  /* END svr_shutdown() */
Example #14
void *queue_route(

    void *vp)

{
    pbs_queue *pque;
    job       *pjob = NULL;
    char      *queue_name;
    char       log_buf[LOCAL_LOG_BUF_SIZE];

    all_jobs_iterator   *iter = NULL;

    queue_name = (char *)vp;

    if (queue_name == NULL)
    {
        sprintf(log_buf, "NULL queue name");
        log_err(-1, __func__, log_buf);
        return(NULL);
    }

    while (1)
    {
        pthread_mutex_lock(reroute_job_mutex);
        /* Before we attempt to service this queue, make sure we can find it. */
        pque = find_queuebyname(queue_name);
        if (pque == NULL)
        {
            sprintf(log_buf, "Could not find queue %s", queue_name);
            log_err(-1, __func__, log_buf);
            free(queue_name);
            return(NULL);
        }

        mutex_mgr que_mutex(pque->qu_mutex, true);

        pque->qu_jobs->lock();
        iter = pque->qu_jobs->get_iterator();
        pque->qu_jobs->unlock();

        if (LOGLEVEL >= 7)
        {
            snprintf(log_buf, sizeof(log_buf), "routing any ready jobs in queue: %s", queue_name);
            log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_QUEUE, __func__, log_buf);
        }

        while ((pjob = next_job(pque->qu_jobs,iter)) != NULL)
        {
            /* We only want to try if routing has been tried at least once - this is to let
             * req_commit have the first crack at routing always. */

            if (pjob->ji_commit_done == 0) /* when req_commit is done it will set ji_commit_done to 1 */
            {
                unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
                continue;
            }
            /* queue must be unlocked when calling reroute_job */
            que_mutex.unlock();
            reroute_job(pjob);
            unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
            /* need to relock queue when we go to call next_job */
            pque = find_queuebyname(queue_name);
            if (pque == NULL)
            {
                sprintf(log_buf, "Could not find queue %s", queue_name);
                log_err(-1, __func__, log_buf);
                free(queue_name);
                delete iter;
                return(NULL);
            }
            que_mutex.mark_as_locked();
        }

        /* we come out of the while loop with the queue locked.
           We don't want it locked while we sleep */
        que_mutex.unlock();
        pthread_mutex_unlock(reroute_job_mutex);
        delete iter;
        sleep(route_retry_interval);
    }

    free(queue_name);
    return(NULL);
} /* END queue_route() */
Example #15
static void req_stat_job_step2(

  struct stat_cntl *cntl)  /* I/O (free'd on return) */

  {
  svrattrl              *pal;
  job                   *pjob = NULL;

  struct batch_request  *preq;
  struct batch_reply    *preply;
  int                    rc = 0;
  enum TJobStatTypeEnum  type;
  pbs_queue             *pque = NULL;
  int                    exec_only = 0;

  int                    bad = 0;
  long                   DTime;  /* delta time - only report full pbs_attribute list if J->MTime > DTime */
  static svrattrl       *dpal = NULL;
  int                    job_array_index = 0;
  job_array             *pa = NULL;
  char                   log_buf[LOCAL_LOG_BUF_SIZE];
  all_jobs_iterator      *iter;

  preq   = cntl->sc_origrq;
  type   = (enum TJobStatTypeEnum)cntl->sc_type;
  preply = &preq->rq_reply;

  /* See pbs_server_attributes(1B) for details on "poll_jobs" behaviour */

  if (dpal == NULL)
    {
    /* build 'delta' pbs_attribute list */

    svrattrl *tpal;

    tlist_head dalist;

    int aindex;

    int atrlist[] =
      {
      JOB_ATR_jobname,
      JOB_ATR_resc_used,
      JOB_ATR_LAST
      };

    CLEAR_LINK(dalist);

    for (aindex = 0;atrlist[aindex] != JOB_ATR_LAST;aindex++)
      {
      if ((tpal = attrlist_create("", "", 23)) == NULL)
        {
        return;
        }

      tpal->al_valln = atrlist[aindex];

      if (dpal == NULL)
        dpal = tpal;

      append_link(&dalist, &tpal->al_link, tpal);
      }
    }  /* END if (dpal == NULL) */

  if (type == tjstArray)
    {
    pa = get_array(preq->rq_ind.rq_status.rq_id);

    if (pa == NULL)
      {
      req_reject(PBSE_UNKARRAYID, 0, preq, NULL, "unable to find array");
      return;
      }
    }

  {
  all_jobs *ajptr = NULL;

  if (type == tjstQueue)
    ajptr = cntl->sc_pque->qu_jobs;

  else if (type == tjstSummarizeArraysQueue)
    ajptr = cntl->sc_pque->qu_jobs_array_sum;

  else if (type == tjstSummarizeArraysServer)
    ajptr = &array_summary;

  else
    ajptr = &alljobs;

  ajptr->lock();
  iter = ajptr->get_iterator();
  ajptr->unlock();
  }

  /*
   * now ready for part 3, building the status reply,
   * loop through again
   */

  if ((type == tjstSummarizeArraysQueue) || 
      (type == tjstSummarizeArraysServer))
    {
    /* No array can be owned for these options */
    update_array_statuses();
    }


  if (type == tjstJob)
    pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE);

  else if (type == tjstQueue)
    pjob = next_job(cntl->sc_pque->qu_jobs,iter);

  else if (type == tjstSummarizeArraysQueue)
    pjob = next_job(cntl->sc_pque->qu_jobs_array_sum,iter);

  else if (type == tjstSummarizeArraysServer)
    pjob = next_job(&array_summary,iter);

  else if (type == tjstArray)
    {
    job_array_index = -1;
    pjob = NULL;
    /* increment job_array_index until we find a non-null pointer or hit the end */
    while (++job_array_index < pa->ai_qs.array_size)
      {
      if (pa->job_ids[job_array_index] != NULL)
        {
        if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
          {
          break;
          }
        }
      }
    }
  else
    pjob = next_job(&alljobs,iter);

  DTime = 0;

  if (preq->rq_extend != NULL)
    {
    char *ptr;

    /* FORMAT:  { EXECQONLY | DELTA:<EPOCHTIME> } */

    if (strstr(preq->rq_extend, EXECQUEONLY))
      exec_only = 1;

    ptr = strstr(preq->rq_extend, "DELTA:");

    if (ptr != NULL)
      {
      ptr += strlen("delta:");

      DTime = strtol(ptr, NULL, 10);
      }
    }

  if ((type == tjstTruncatedServer) || 
      (type == tjstTruncatedQueue))
    {
    long sentJobCounter;
    long qjcounter;
    long qmaxreport;
    all_queues_iterator *iter = NULL;

    svr_queues.lock();
    iter = svr_queues.get_iterator();
    svr_queues.unlock();

    /* loop through all queues */
    while ((pque = next_queue(&svr_queues,iter)) != NULL)
      {
      qjcounter = 0;

      if ((exec_only == 1) &&
          (pque->qu_qs.qu_type != QTYPE_Execution))
        {
        /* ignore routing queues */
        unlock_queue(pque, __func__, "ignore queue", LOGLEVEL);
        continue;
        }

      if (((pque->qu_attr[QA_ATR_MaxReport].at_flags & ATR_VFLAG_SET) != 0) &&
          (pque->qu_attr[QA_ATR_MaxReport].at_val.at_long >= 0))
        {
        qmaxreport = pque->qu_attr[QA_ATR_MaxReport].at_val.at_long;
        }
      else
        {
        qmaxreport = TMAX_JOB;
        }

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buf,"giving scheduler up to %ld idle jobs in queue %s\n",
          qmaxreport,
          pque->qu_qs.qu_name);

        log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_QUEUE,pque->qu_qs.qu_name,log_buf);
        }

      sentJobCounter = 0;

      /* loop through jobs in queue */
      if (pjob != NULL)
        unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);

      all_jobs_iterator *jobiter = NULL;
      pque->qu_jobs->lock();
      jobiter = pque->qu_jobs->get_iterator();
      pque->qu_jobs->unlock();

      while ((pjob = next_job(pque->qu_jobs,jobiter)) != NULL)
        {
        if ((qjcounter >= qmaxreport) &&
            (pjob->ji_qs.ji_state == JOB_STATE_QUEUED))
          {
          /* max_report of queued jobs reached for queue */
          unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL);

          continue;
          }

        pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

        rc = status_job(
               pjob,
               preq,
               (pjob->ji_wattr[JOB_ATR_mtime].at_val.at_long >= DTime) ? pal : dpal,
               &preply->brp_un.brp_status,
               &bad);

        if ((rc != 0) && (rc != PBSE_PERM))
          {
          req_reject(rc, bad, preq, NULL, NULL);

          unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL);
          unlock_queue(pque, __func__, "perm", LOGLEVEL);

          delete iter;

          return;
          }

        sentJobCounter++;

        if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED)
          qjcounter++;

        unlock_ji_mutex(pjob, __func__, "8", LOGLEVEL);
        }    /* END foreach (pjob from pque) */

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buf,"sent scheduler %ld total jobs for queue %s\n",
          sentJobCounter,
          pque->qu_qs.qu_name);

        log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_QUEUE,pque->qu_qs.qu_name,log_buf);
        }
    
      unlock_queue(pque, __func__, "end while", LOGLEVEL);
      }      /* END for (pque) */

    reply_send_svr(preq);

    delete iter;

    return;
    } /* END if ((type == tjstTruncatedServer) || ...) */

  while (pjob != NULL)
    {
    /* go ahead and build the status reply for this job */

    if (exec_only)
      {
      if (cntl->sc_pque != NULL)
        {
        if (cntl->sc_pque->qu_qs.qu_type != QTYPE_Execution)
          goto nextjob;
        }
      else
        {
        if (pa != NULL)
          pthread_mutex_unlock(pa->ai_mutex);
        pque = get_jobs_queue(&pjob);
        if (pa != NULL)
          pthread_mutex_lock(pa->ai_mutex);

        if ((pjob == NULL) ||
            (pque == NULL))
          goto nextjob;
        
        mutex_mgr pque_mutex = mutex_mgr(pque->qu_mutex, true);
        if (pque->qu_qs.qu_type != QTYPE_Execution)
          {
          goto nextjob;
          }
        }
      }

    pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

    rc = status_job(
           pjob,
           preq,
           pal,
           &preply->brp_un.brp_status,
           &bad);

    if ((rc != 0) && 
        (rc != PBSE_PERM))
      {
      if (pa != NULL)
        {
        unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
        }

      unlock_ji_mutex(pjob, __func__, "9", LOGLEVEL);

      req_reject(rc, bad, preq, NULL, NULL);

      delete iter;

      return;
      }

    /* get next job */

nextjob:

    if (pjob != NULL)
      unlock_ji_mutex(pjob, __func__, "10", LOGLEVEL);

    if (type == tjstJob)
      break;

    if (type == tjstQueue)
      pjob = next_job(cntl->sc_pque->qu_jobs,iter);
    else if (type == tjstSummarizeArraysQueue)
      pjob = next_job(cntl->sc_pque->qu_jobs_array_sum,iter);
    else if (type == tjstSummarizeArraysServer)
      pjob = next_job(&array_summary,iter);
    else if (type == tjstArray)
      {
      pjob = NULL;
      /* increment job_array_index until we find a non-null pointer or hit the end */
      while (++job_array_index < pa->ai_qs.array_size)
        {
        if (pa->job_ids[job_array_index] != NULL)
          {
          if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
            {
            break;
            }
          }
        }
      }
    else
      pjob = next_job(&alljobs,iter);

    rc = 0;
    }  /* END while (pjob != NULL) */

  delete iter;

  if (pa != NULL)
    {
    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
    }
 
  reply_send_svr(preq);

  if (LOGLEVEL >= 7)
    {
    log_event(PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      "req_statjob",
      "Successfully returned the status of queued jobs\n");
    }

  return;
  }  /* END req_stat_job_step2() */
Example #16
File: fifo.c Project: CESNET/torque
int init_scheduling_cycle(server_info *sinfo)
  {
  group_info *user; /* the user for the running jobs of the last cycle */
  queue_info *qinfo; /* used to cycle through the queues to sort the jobs */
  char decayed = 0; /* boolean: have we decayed usage? */
  time_t t;  /* used in decaying fair share */
  int i, j;

  if (cstat.fair_share)
    {
    if (last_running != NULL)
      {
      /* add the usage which was accumulated between the last cycle and this
       * one and calculate a new value
       */

      for (i = 0; i < last_running_size ; i++)
        {
        job_info** jobs;
        user = last_running[i].ginfo;
#if HIGH_PRECISION_FAIRSHARE
        jobs = sinfo -> jobs; /* check all jobs (exiting, completed, running) */
#else
        jobs = sinfo -> running_jobs; /* check only running */
#endif

        for (j = 0; jobs[j] != NULL; j++)
          {
            if (jobs[j] -> is_completed || jobs[j] -> is_exiting ||
              jobs[j] -> is_running)
              if (!strcmp(last_running[i].name, jobs[j] -> name))
                break;
          }

        if (jobs[j] != NULL)
          {
          user -> usage +=
            calculate_usage_value(jobs[j] -> resused) -
            calculate_usage_value(last_running[i].resused);
          }
        }

      /* assign usage into temp usage since temp usage is used for usage
       * calculations.  Temp usage starts at usage and can be modified later.
       */
      for (i = 0; i < last_running_size; i++)
        last_running[i].ginfo -> temp_usage = last_running[i].ginfo -> usage;
      }

    /* The half life for the fair share tree might have passed since the last
     * scheduling cycle.  For that matter, several half lives could have
     * passed.  If this is the case, perform as many decays as necessary
     */

    t = cstat.current_time;

    while (t - last_decay > conf.half_life)
      {
      sched_log(PBSEVENT_DEBUG2, PBS_EVENTCLASS_SERVER, "", "Decaying Fairshare Tree");
      decay_fairshare_tree(conf.group_root);
      t -= conf.half_life;
      decayed = 1;
      }

    if (decayed)
      {
      /* set the time to when the half-life actually should have occurred */
      last_decay = cstat.current_time -
                   (cstat.current_time - last_decay) % conf.half_life;
      }

    if (cstat.current_time - last_sync > conf.sync_time)
      {
      write_usage();
      last_sync = cstat.current_time;
      sched_log(PBSEVENT_DEBUG2, PBS_EVENTCLASS_SERVER, "", "Usage Sync");
      }
    }

  if (cstat.help_starving_jobs)
    cstat.starving_job = update_starvation(sinfo -> jobs);

  /* sort queues by priority if requested */

  if (cstat.sort_queues)
    qsort(sinfo -> queues, sinfo -> num_queues, sizeof(queue_info *),
          cmp_queue_prio_dsc);


  if (cstat.sort_by[0].sort != NO_SORT)
    {
    if (cstat.by_queue || cstat.round_robin)
      {
      for (i = 0; i < sinfo -> num_queues; i++)
        {
        qinfo = sinfo -> queues[i];
        qsort(qinfo -> jobs, qinfo -> sc.total, sizeof(job_info *), cmp_sort);
        }
      }
    else
      qsort(sinfo -> jobs, sinfo -> sc.total, sizeof(job_info *), cmp_sort);
    }

  next_job(sinfo, INITIALIZE);

  return 1;  /* SUCCESS */
  }
Example #17
File: fifo.c Project: CESNET/torque
int scheduling_cycle(

  int sd)

  {
  server_info *sinfo;  /* ptr to the server/queue/job/node info */
  job_info *jinfo;  /* ptr to the job to see if it can run */
  int ret = SUCCESS;  /* return code from is_ok_to_run_job() */
  char log_msg[MAX_LOG_SIZE]; /* used to log a message about the job */
  char comment[MAX_COMMENT_SIZE]; /* used to update comment of job */

  sched_log(PBSEVENT_DEBUG2, PBS_EVENTCLASS_REQUEST, "", "Entering Schedule");

  update_cycle_status();

  /* create the server / queue / job / node structures */

  if ((sinfo = query_server(sd)) == NULL)
    {
    fprintf(stderr, "Problem with creating server data strucutre\n");

    return(0);
    }

  if (init_scheduling_cycle(sinfo) == 0)
    {
    sched_log(
      PBSEVENT_DEBUG,
      PBS_EVENTCLASS_SERVER,
      sinfo -> name,
      "init_scheduling_cycle failed.");

    free_server(sinfo, 1);

    return(0);
    }

  /* main scheduling loop */

  while ((jinfo = next_job(sinfo, 0)))
    {
    sched_log(
      PBSEVENT_DEBUG2,
      PBS_EVENTCLASS_JOB,
      jinfo->name,
      "Considering job to run");

    if ((ret = is_ok_to_run_job(sd, sinfo, jinfo->queue, jinfo)) == SUCCESS)
      {
      run_update_job(sd, sinfo, jinfo->queue, jinfo);
      }
    else
      {
      if (jinfo->can_never_run)
        {
        sched_log(
          PBSEVENT_JOB,
          PBS_EVENTCLASS_JOB,
          jinfo->name,
          "Job Deleted because it would never run");

        pbs_deljob(sd, jinfo->name, "Job could never run");
        }

      jinfo->can_not_run = 1;

      if (translate_job_fail_code(ret, comment, log_msg))
        {
        /* if the comment was not updated, the reason the job cannot run
         * has not changed, so there is no need to log it again
         */

        if (update_job_comment(sd, jinfo, comment) == 0)
          {
          sched_log(
            PBSEVENT_SCHED,
            PBS_EVENTCLASS_JOB,
            jinfo->name,
            log_msg);
          }
        }

      if ((ret != NOT_QUEUED) && cstat.strict_fifo)
        {
        update_jobs_cant_run(
          sd,
          jinfo->queue->jobs,
          jinfo,
          COMMENT_STRICT_FIFO,
          START_AFTER_JOB);
        }
      }
    }

  if (cstat.fair_share)
    update_last_running(sinfo);

  free_server(sinfo, 1); /* free server and queues and jobs */

  sched_log(PBSEVENT_DEBUG2, PBS_EVENTCLASS_REQUEST, "", "Leaving schedule\n");

  return 0;
  }
Example #18
/**
 * @brief Update function called after every event
 *
 * The heart of the scheduler, the actual scheduling algorithm. This will be
 * passed to the event loop as a callback and will be called every time an event
 * is executed. Therefore the code should be lightweight since it will be run
 * very frequently.
 *
 * @TODO:
 *   currently this will only grab a job and create a single agent to execute
 *   the job.
 *
 *   @TODO: allow for runonpfile jobs to have multiple agents based on size
 *   @TODO: allow for job preemption. The scheduler can pause jobs, allow it
 *   @TODO: allow for specific hosts to be chosen.
 */
void scheduler_update(scheduler_t* scheduler)
{
  /* queue used to hold jobs if an exclusive job enters the system */
  static job_t*  job  = NULL;
  static host_t* host = NULL;
  static int lockout = 0;

  /* locals */
  int n_agents = g_tree_nnodes(scheduler->agents);
  int n_jobs   = active_jobs(scheduler->job_list);

  /* check to see if we are in and can exit the startup state */
  if(scheduler->s_startup && n_agents == 0)
  {
    event_signal(database_update_event, NULL);
    scheduler->s_startup = 0;
  }

  /* check if we are able to close the scheduler */
  if(closing && n_agents == 0 && n_jobs == 0)
  {
    event_loop_terminate();
    return;
  }

  if(lockout && n_agents == 0 && n_jobs == 0)
    lockout = 0;

  if(job == NULL && !lockout)
  {
    while((job = peek_job(scheduler->job_queue)) != NULL)
    {
      // check if the agent is required to run on local host
      if(is_meta_special(
          g_tree_lookup(scheduler->meta_agents, job->agent_type), SAG_LOCAL))
      {
        host = g_tree_lookup(scheduler->host_list, LOCAL_HOST);
        if(!(host->running < host->max))
        {
          job = NULL;
          break;
        }
      }
      // check if the job is required to run on a specific machine
      else if((job->required_host != NULL))
      {
        host = g_tree_lookup(scheduler->host_list, job->required_host);
        if(host != NULL)
        { 
          if(!(host->running < host->max))
          {
          job = NULL;
          break;
        }
       } else {
         //log_printf("ERROR %s.%d: jq_pk %d jq_host '%s' not in the agent list!\n",
         //  __FILE__, __LINE__, job->id, job->required_host);
         job->message = "ERROR: jq_host not in the agent list!";
         job_fail_event(scheduler, job);
         job = NULL;
         break;
       }
      }
      // the generic case, this can run anywhere, find a place
      else if((host = get_host(&(scheduler->host_queue), 1)) == NULL)
      {
        job = NULL;
        break;
      }

      next_job(scheduler->job_queue);
      if(is_meta_special(
          g_tree_lookup(scheduler->meta_agents, job->agent_type), SAG_EXCLUSIVE))
      {
        V_SCHED("JOB_INIT: exclusive, postponing initialization\n");
        break;
      }

      V_SCHED("Starting JOB[%d].%s\n", job->id, job->agent_type);
      agent_init(scheduler, host, job);
      job = NULL;
    }
  }

  if(job != NULL && n_agents == 0 && n_jobs == 0)
  {
    agent_init(scheduler, host, job);
    lockout = 1;
    job  = NULL;
    host = NULL;
  }

  if(scheduler->s_pause)
  {
    scheduler->s_startup = 1;
    scheduler->s_pause = 0;
  }
}
Example #19
static void req_stat_job_step2(

  struct stat_cntl *cntl)  /* I/O (free'd on return) */

  {
  svrattrl              *pal;
  job                   *pjob = NULL;

  struct batch_request  *preq;
  struct batch_reply    *preply;
  int                    rc = 0;
  enum TJobStatTypeEnum  type;
  pbs_queue             *pque = NULL;
  int                    exec_only = 0;

  int                    bad = 0;
  long                   DTime;  /* delta time - only report full pbs_attribute list if J->MTime > DTime */
  static svrattrl       *dpal = NULL;
  int                    job_array_index = 0;
  job_array             *pa = NULL;
  char                   log_buf[LOCAL_LOG_BUF_SIZE];
  int                    iter;
  time_t                 time_now = time(NULL);
  long                   poll_jobs = 0;
  char                   job_id[PBS_MAXSVRJOBID+1];
  int                    job_substate = -1;
  time_t                 job_momstattime = -1;

  preq   = cntl->sc_origrq;
  type   = (enum TJobStatTypeEnum)cntl->sc_type;
  preply = &preq->rq_reply;

  /* See pbs_server_attributes(1B) for details on "poll_jobs" behaviour */

  if (dpal == NULL)
    {
    /* build 'delta' pbs_attribute list */

    svrattrl *tpal;

    tlist_head dalist;

    int aindex;

    int atrlist[] =
      {
      JOB_ATR_jobname,
      JOB_ATR_resc_used,
      JOB_ATR_LAST
      };

    CLEAR_LINK(dalist);

    for (aindex = 0;atrlist[aindex] != JOB_ATR_LAST;aindex++)
      {
      if ((tpal = attrlist_create("", "", 23)) == NULL)
        {
        return;
        }

      tpal->al_valln = atrlist[aindex];

      if (dpal == NULL)
        dpal = tpal;

      append_link(&dalist, &tpal->al_link, tpal);
      }
    }  /* END if (dpal == NULL) */

  if (type == tjstArray)
    {
    pa = get_array(preq->rq_ind.rq_status.rq_id);

    if (pa == NULL)
      {
      req_reject(PBSE_UNKARRAYID, 0, preq, NULL, "unable to find array");
      return;
      }
    }

  iter = -1;

  get_svr_attr_l(SRV_ATR_PollJobs, &poll_jobs);
  if (!poll_jobs)
    {
    /* polljobs not set - indicates we may need to obtain fresh data from
       MOM */

    if (cntl->sc_jobid[0] == '\0')
      pjob = NULL;
    else
      pjob = svr_find_job(cntl->sc_jobid, FALSE);

    while (1)
      {
      if (pjob == NULL)
        {
        /* start from the first job */

        if (type == tjstJob)
          {
          pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE);
          }
        else if (type == tjstQueue)
          {
          pjob = next_job(cntl->sc_pque->qu_jobs,&iter);
          }
        else if (type == tjstArray)
          {
          job_array_index = 0;
          /* increment job_array_index until we find a non-null pointer or hit the end */
          while (job_array_index < pa->ai_qs.array_size)
            {
            if (pa->job_ids[job_array_index] != NULL)
              {
              if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
                {
                unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
                break;
                }
              }

            job_array_index++;
            }
          }
        else
          {
          pjob = next_job(&alljobs,&iter);
          }

        }    /* END if (pjob == NULL) */
      else
        {
        strcpy(job_id, pjob->ji_qs.ji_jobid);
        unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

        if (type == tjstJob)
          break;

        if (type == tjstQueue)
          pjob = next_job(cntl->sc_pque->qu_jobs,&iter);
        else if (type == tjstArray)
          {
          pjob = NULL;
          /* increment job_array_index until we find a non-null pointer or hit the end */
          while (++job_array_index < pa->ai_qs.array_size)
            {
            if (pa->job_ids[job_array_index] != NULL)
              {
              if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
                {
                unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
                break;
                }
              }
            }
          }
        else
          pjob = next_job(&alljobs,&iter);
          
        }

      if (pjob == NULL)
        break;

      strcpy(job_id, pjob->ji_qs.ji_jobid);
      job_substate = pjob->ji_qs.ji_substate;
      job_momstattime = pjob->ji_momstat;
      strcpy(cntl->sc_jobid, job_id);
      unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
      pjob = NULL;

      /* PBS_RESTAT_JOB defaults to 30 seconds */
      if ((job_substate == JOB_SUBSTATE_RUNNING) &&
          ((time_now - job_momstattime) > JobStatRate))
        {
        /* go to MOM for status */
        if ((rc = stat_to_mom(job_id, cntl)) == PBSE_MEM_MALLOC)
          break;

        if (rc != 0)
          {
          pjob = svr_find_job(job_id, FALSE);

          rc = 0;

          continue;
          }
        
        if (pa != NULL)
          unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

        return; /* will pick up after mom replies */
        }
      }    /* END while(1) */

    if (rc != 0)
      {
      if (pa != NULL)
        unlock_ai_mutex(pa, __func__, "2", LOGLEVEL);

      reply_free(preply);

      req_reject(rc, 0, preq, NULL, "cannot get update from mom");

      return;
      }
    }    /* END if (!server.sv_attr[SRV_ATR_PollJobs].at_val.at_long) */

  /*
   * now ready for part 3, building the status reply,
   * loop through again
   */

  if ((type == tjstSummarizeArraysQueue) || 
      (type == tjstSummarizeArraysServer))
    {
    /* No array can be owned for these options */
    update_array_statuses();
    }

  if (type == tjstJob)
    pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE);

  else if (type == tjstQueue)
    pjob = next_job(cntl->sc_pque->qu_jobs,&iter);

  else if (type == tjstSummarizeArraysQueue)
    pjob = next_job(cntl->sc_pque->qu_jobs_array_sum,&iter);

  else if (type == tjstSummarizeArraysServer)
    pjob = next_job(&array_summary,&iter);

  else if (type == tjstArray)
    {
    job_array_index = -1;
    pjob = NULL;
    /* increment job_array_index until we find a non-null pointer or hit the end */
    while (++job_array_index < pa->ai_qs.array_size)
      {
      if (pa->job_ids[job_array_index] != NULL)
        {
        if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
          {
          break;
          }
        }
      }
    }
  else
    pjob = next_job(&alljobs,&iter);

  DTime = 0;

  if (preq->rq_extend != NULL)
    {
    char *ptr;

    /* FORMAT:  { EXECQONLY | DELTA:<EPOCHTIME> } */

    if (strstr(preq->rq_extend, EXECQUEONLY))
      exec_only = 1;

    ptr = strstr(preq->rq_extend, "DELTA:");

    if (ptr != NULL)
      {
      ptr += strlen("delta:");

      DTime = strtol(ptr, NULL, 10);
      }
    }


  if ((type == tjstTruncatedServer) || 
      (type == tjstTruncatedQueue))
    {
    long sentJobCounter;
    long qjcounter;
    long qmaxreport;
    int  iter = -1;

    /* loop through all queues */
    while ((pque = next_queue(&svr_queues,&iter)) != NULL)
      {
      qjcounter = 0;

      if ((exec_only == 1) &&
          (pque->qu_qs.qu_type != QTYPE_Execution))
        {
        /* ignore routing queues */
        unlock_queue(pque, __func__, "ignore queue", LOGLEVEL);
        continue;
        }

      if (((pque->qu_attr[QA_ATR_MaxReport].at_flags & ATR_VFLAG_SET) != 0) &&
          (pque->qu_attr[QA_ATR_MaxReport].at_val.at_long >= 0))
        {
        qmaxreport = pque->qu_attr[QA_ATR_MaxReport].at_val.at_long;
        }
      else
        {
        qmaxreport = TMAX_JOB;
        }

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buf,"giving scheduler up to %ld idle jobs in queue %s\n",
          qmaxreport,
          pque->qu_qs.qu_name);

        log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_QUEUE,pque->qu_qs.qu_name,log_buf);
        }

      sentJobCounter = 0;

      /* loop through jobs in queue */
      if (pjob != NULL)
        unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);

      iter = -1;

      while ((pjob = next_job(pque->qu_jobs,&iter)) != NULL)
        {
        if ((qjcounter >= qmaxreport) &&
            (pjob->ji_qs.ji_state == JOB_STATE_QUEUED))
          {
          /* max_report of queued jobs reached for queue */
          unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL);

          continue;
          }

        pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

        rc = status_job(
               pjob,
               preq,
               (pjob->ji_wattr[JOB_ATR_mtime].at_val.at_long >= DTime) ? pal : dpal,
               &preply->brp_un.brp_status,
               &bad);

        if ((rc != 0) && (rc != PBSE_PERM))
          {
          req_reject(rc, bad, preq, NULL, NULL);

          if (pa != NULL)
            {
            unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
            }
          unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL);
          unlock_queue(pque, __func__, "perm", LOGLEVEL);
          return;
          }

        sentJobCounter++;

        if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED)
          qjcounter++;

        unlock_ji_mutex(pjob, __func__, "8", LOGLEVEL);
        }    /* END foreach (pjob from pque) */

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buf,"sent scheduler %ld total jobs for queue %s\n",
          sentJobCounter,
          pque->qu_qs.qu_name);

        log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_QUEUE,pque->qu_qs.qu_name,log_buf);
        }
    
      unlock_queue(pque, __func__, "end while", LOGLEVEL);
      }      /* END for (pque) */
      
    if (pa != NULL)
      unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

    reply_send_svr(preq);

    return;
    }        /* END if ((type == tjstTruncatedServer) || ...) */

  while (pjob != NULL)
    {
    /* go ahead and build the status reply for this job */

    if (exec_only)
      {
      if (cntl->sc_pque != NULL)
        {
        if (cntl->sc_pque->qu_qs.qu_type != QTYPE_Execution)
          goto nextjob;
        }
      else
        {
        if (pa != NULL)
          pthread_mutex_unlock(pa->ai_mutex);
        pque = get_jobs_queue(&pjob);
        if (pa != NULL)
          pthread_mutex_lock(pa->ai_mutex);

        if ((pjob == NULL) ||
            (pque == NULL))
          goto nextjob;
        
        if (pque->qu_qs.qu_type != QTYPE_Execution)
          {
          unlock_queue(pque, __func__, "not exec", LOGLEVEL);
        
          goto nextjob;
          }

        unlock_queue(pque, __func__, "exec", LOGLEVEL);
        }
      }

    pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

    rc = status_job(
           pjob,
           preq,
           pal,
           &preply->brp_un.brp_status,
           &bad);

    if ((rc != 0) && 
        (rc != PBSE_PERM))
      {
      if (pa != NULL)
        {
        unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
        }
      unlock_ji_mutex(pjob, __func__, "9", LOGLEVEL);

      req_reject(rc, bad, preq, NULL, NULL);

      return;
      }

    /* get next job */

nextjob:

    if (pjob != NULL)
      unlock_ji_mutex(pjob, __func__, "10", LOGLEVEL);

    if (type == tjstJob)
      break;

    if (type == tjstQueue)
      pjob = next_job(cntl->sc_pque->qu_jobs,&iter);
    else if (type == tjstSummarizeArraysQueue)
      pjob = next_job(cntl->sc_pque->qu_jobs_array_sum,&iter);
    else if (type == tjstSummarizeArraysServer)
      pjob = next_job(&array_summary,&iter);
    else if (type == tjstArray)
      {
      pjob = NULL;
      /* increment job_array_index until we find a non-null pointer or hit the end */
      while (++job_array_index < pa->ai_qs.array_size)
        {
        if (pa->job_ids[job_array_index] != NULL)
          {
          if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
            {
            break;
            }
          }
        }
      }
    else
      pjob = next_job(&alljobs,&iter);

    rc = 0;
    }  /* END while (pjob != NULL) */

  if (pa != NULL)
    {
    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
    }
 
  reply_send_svr(preq);

  if (LOGLEVEL >= 7)
    {
    log_event(PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      "req_statjob",
      "Successfully returned the status of queued jobs\n");
    }

  return;
  }  /* END req_stat_job_step2() */
Example #20
void *queue_route(

  void *vp)

  {
  pbs_queue *pque;
  job       *pjob = NULL;
  char      *queue_name;
  char      log_buf[LOCAL_LOG_BUF_SIZE];

  int       iter = -1;
  time_t    time_now = time(NULL);

  queue_name = (char *)vp;

  if (queue_name == NULL)
    {
    sprintf(log_buf, "NULL queue name");
    log_err(-1, __func__, log_buf);
    return(NULL);
    }

  if (LOGLEVEL >= 7)
    {
    snprintf(log_buf, sizeof(log_buf), "queue name: %s", queue_name);
    log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_QUEUE, __func__, log_buf);
    }
  
  pthread_mutex_lock(reroute_job_mutex);

  pque = find_queuebyname(queue_name);
  if (pque == NULL)
    {
    sprintf(log_buf, "Could not find queue %s", queue_name);
    log_err(-1, __func__, log_buf);
    free(queue_name);
    pthread_mutex_unlock(reroute_job_mutex);
    return(NULL);
    }

  while ((pjob = next_job(pque->qu_jobs,&iter)) != NULL)
    {
    /* the second condition says we only want to try if routing
     * has been tried once - this is to let req_commit have the 
     * first crack at routing always */
    unlock_queue(pque, __func__, (char *)NULL, 0);
    if ((pjob->ji_qs.ji_un.ji_routet.ji_rteretry <= time_now - ROUTE_RETRY_TIME) &&
        (pjob->ji_qs.ji_un.ji_routet.ji_rteretry != 0))
      {
      reroute_job(pjob, pque);
      unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL);
      }
    else
      unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL);
    }

  free(queue_name);
  unlock_queue(pque, __func__, (char *)NULL, 0);
  pthread_mutex_unlock(reroute_job_mutex);
  return(NULL);
  } /* END queue_route() */
Example #21
void *delete_all_work(

  void *vp)

  {
  batch_request *preq = (batch_request *)vp;

  if (qdel_all_tracker.start_deleting_all_if_possible(preq->rq_user, preq->rq_perm) == false)
    {
    reply_ack(preq);
    return(NULL);
    }

  batch_request *preq_dup = duplicate_request(preq);
  job           *pjob;
  all_jobs_iterator *iter = NULL;
  int            failed_deletes = 0;
  int            total_jobs = 0;
  int            rc = PBSE_NONE;
  char           tmpLine[MAXLINE];
  char          *Msg = preq->rq_extend;
  
  alljobs.lock();
  iter = alljobs.get_iterator();
  alljobs.unlock();
  while ((pjob = next_job(&alljobs, iter)) != NULL)
    {
    // use mutex manager to make sure job mutex locks are properly handled at exit
    mutex_mgr job_mutex(pjob->ji_mutex, true);
 
    if ((rc = forced_jobpurge(pjob, preq_dup)) == PURGE_SUCCESS)
      {
      job_mutex.set_unlock_on_exit(false);

      continue;
      }

    if (pjob->ji_qs.ji_state >= JOB_STATE_EXITING)
      {
      job_mutex.unlock();
      
      if(rc == -1)
        {
        //forced_jobpurge freed preq_dup so reallocate it.
        preq_dup = duplicate_request(preq);
        preq_dup->rq_noreply = TRUE;
        }
      continue;
      }
    
    total_jobs++;
    
    /* mutex is freed below */
    if (rc == PBSE_NONE)
      {
      if ((rc = execute_job_delete(pjob, Msg, preq_dup)) == PBSE_NONE)
        {
        // execute_job_delete() handles mutex so don't unlock on exit
        job_mutex.set_unlock_on_exit(false);
        reply_ack(preq_dup);
        }
       
      /* preq_dup has been freed at this point. Either reallocate it or set it to NULL*/
      if (rc == PURGE_SUCCESS)
        {
        preq_dup = duplicate_request(preq);
        preq_dup->rq_noreply = TRUE;
        }
      else
        preq_dup = NULL;
      }
    
    if (rc != PURGE_SUCCESS)
      {
      /* duplicate the preq so we don't have a problem with double frees */
      preq_dup = duplicate_request(preq);
      preq_dup->rq_noreply = TRUE;
      
      if ((rc == MOM_DELETE) ||
          (rc == ROUTE_DELETE))
        failed_deletes++;
      }
    }

  delete iter;
  
  qdel_all_tracker.done_deleting_all(preq->rq_user, preq->rq_perm);
  
  if (failed_deletes == 0)
    {
    reply_ack(preq);

    /* PURGE SUCCESS means this was qdel -p all. In this case no reply_*() 
     * functions have been called */
    if (rc == PURGE_SUCCESS)
      {
      free_br(preq_dup);
      preq_dup = NULL;
      }
    }
  else
    {
    snprintf(tmpLine,sizeof(tmpLine),"Deletes failed for %d of %d jobs",
      failed_deletes,
      total_jobs);
    
    req_reject(PBSE_SYSTEM, 0, preq, NULL, tmpLine);
    }
    
  /* preq_dup is reallocated at the end of the loop, so free the extra one if
   * it is there */
  if (preq_dup != NULL)
    free_br(preq_dup);

  return(NULL);
  } /* END delete_all_work() */
Example #22
void purge_completed_jobs(

  struct batch_request *preq)  /* I */

  {
  job          *pjob;
  char         *time_str;
  time_t        purge_time = 0;
  int           iter;
  char          log_buf[LOCAL_LOG_BUF_SIZE];

  /* get the time to purge the jobs that completed before */
  time_str = preq->rq_extend;
  time_str += strlen(PURGECOMP);
  purge_time = strtol(time_str,NULL,10);
  
  /*
   * The clean-unreported capability is only for operators and managers.
   * Check if the request is authorized.
   */

  if ((preq->rq_perm & (ATR_DFLAG_OPRD|ATR_DFLAG_OPWR|
                    ATR_DFLAG_MGRD|ATR_DFLAG_MGWR)) == 0)
    {
    req_reject(PBSE_PERM,0,preq,NULL,
      "must have operator or manager privilege to use -c parameter");
    return;
    }
    
  reply_ack(preq);

  if (LOGLEVEL >= 4)
    {
    sprintf(log_buf,"Received purge completed jobs command, purge time is %ld (%s)",
      (long)purge_time, preq->rq_extend);

    log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_REQUEST, __func__, log_buf);
    }

  iter = -1;

  while ((pjob = next_job(&alljobs,&iter)) != NULL) 
    {
    if ((pjob->ji_qs.ji_substate == JOB_SUBSTATE_COMPLETE) &&
        (pjob->ji_wattr[JOB_ATR_comp_time].at_val.at_long <= purge_time) &&
        ((pjob->ji_wattr[JOB_ATR_reported].at_flags & ATR_VFLAG_SET) != 0) &&
        (pjob->ji_wattr[JOB_ATR_reported].at_val.at_long == 0))
      {
      if (LOGLEVEL >= 4)
        {
        sprintf(log_buf,"Reported job is COMPLETED (%ld), setting reported to TRUE",
          pjob->ji_wattr[JOB_ATR_comp_time].at_val.at_long);
        
        log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);
        }
      
      pjob->ji_wattr[JOB_ATR_reported].at_val.at_long = 1;
      pjob->ji_wattr[JOB_ATR_reported].at_flags = ATR_VFLAG_SET | ATR_VFLAG_MODIFY;
          
      job_save(pjob, SAVEJOB_FULL, 0); 
      }

    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
    }


  return;
  } /* END purge_completed_jobs() */
Example #23
int handle_delete_all(

  struct batch_request *preq,
  struct batch_request *preq_tmp,
  char                 *Msg)

  {
  /* don't use the actual request so we can reply about all of the jobs */
  struct batch_request *preq_dup = duplicate_request(preq);
  job                  *pjob;
  int                   iter = -1;
  int                   failed_deletes = 0;
  int                   total_jobs = 0;
  int                   rc = PBSE_NONE;
  char                  tmpLine[MAXLINE];

  preq_dup->rq_noreply = TRUE;
  
  if (preq_tmp != NULL)
    {
    reply_ack(preq_tmp);
    preq->rq_noreply = TRUE; /* set for no more replies */
    }
  
  while ((pjob = next_job(&alljobs, &iter)) != NULL)
    {
    if ((rc = forced_jobpurge(pjob, preq_dup)) == PURGE_SUCCESS)
      {
      continue;
      }

    if (pjob->ji_qs.ji_state >= JOB_STATE_EXITING)
      {
      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
      
      continue;
      }
    
    total_jobs++;
    
    /* mutex is freed below */
    if (rc == PBSE_NONE)
      {
      if ((rc = execute_job_delete(pjob, Msg, preq_dup)) == PBSE_NONE)
        reply_ack(preq_dup);
       
      /* mark this as NULL because it has been freed */
      preq_dup = NULL;
      }
    
    if (rc != PURGE_SUCCESS)
      {
      /* duplicate the preq so we don't have a problem with double frees */
      preq_dup = duplicate_request(preq);
      preq_dup->rq_noreply = TRUE;
      
      if ((rc == MOM_DELETE) ||
          (rc == ROUTE_DELETE))
        failed_deletes++;
      }
    }
  
  if (failed_deletes == 0)
    {
    reply_ack(preq);

    /* PURGE SUCCESS means this was qdel -p all. In this case no reply_*() 
     * functions have been called */
    if (rc == PURGE_SUCCESS)
      {
      free_br(preq_dup);
      preq_dup = NULL;
      }
    }
  else
    {
    snprintf(tmpLine,sizeof(tmpLine),"Deletes failed for %d of %d jobs",
      failed_deletes,
      total_jobs);
    
    req_reject(PBSE_SYSTEM, 0, preq, NULL, tmpLine);
    }
    
  /* preq_dup is reallocated at the end of the loop, so free the extra one if
   * it is there */
  if (preq_dup != NULL)
    free_br(preq_dup);

  return(PBSE_NONE);
  } /* END handle_delete_all() */
Example #24
START_TEST(next_job_test)
  {
  all_jobs alljobs;
  struct job *result;
  result = next_job(NULL,NULL);

  fail_unless(result == NULL, "null input parameters fail");

  result = next_job(&alljobs,NULL);
  fail_unless(result == NULL, "NULL input iterator fail");

  struct job *test_job1 = job_alloc();
  strcpy(test_job1->ji_qs.ji_jobid, "test_job1");
  int rc = insert_job(&alljobs,test_job1);
  fail_unless(rc == PBSE_NONE, "job insert fail1");

  struct job *test_job2 = job_alloc();
  strcpy(test_job2->ji_qs.ji_jobid, "test_job2");
  rc = insert_job(&alljobs,test_job2);
  fail_unless(rc == PBSE_NONE, "job insert fail2");

  struct job *test_job3 = job_alloc();
  strcpy(test_job3->ji_qs.ji_jobid, "test_job3");
  rc = insert_job(&alljobs,test_job3);
  fail_unless(rc == PBSE_NONE, "job insert fai3");

  struct job *test_job4 = job_alloc();
  strcpy(test_job4->ji_qs.ji_jobid, "test_job4");
  rc = insert_job(&alljobs,test_job4);
  fail_unless(rc == PBSE_NONE, "job insert fail4");

  struct job *test_job5 = job_alloc();
  strcpy(test_job5->ji_qs.ji_jobid, "test_job5");
  rc = insert_job(&alljobs,test_job5);
  fail_unless(rc == PBSE_NONE, "job insert fail5");

  /* first traverse the list to see if we get all 5 jobs */
  all_jobs_iterator *iter;
  alljobs.lock();
  iter = alljobs.get_iterator();
  alljobs.unlock();

  job *pjob = next_job(&alljobs,iter);
  int jobcount = 0;

  while(pjob != NULL)
    {
    jobcount++;
    pjob = next_job(&alljobs,iter);
    }

  fail_unless(jobcount == 5, "Expected job counts to be 5, but it was %d",
    jobcount);

  all_jobs_iterator *iter2;
  alljobs.lock();
  iter2 = alljobs.get_iterator();
  alljobs.unlock();

  /* simulate another thread had added more jobs to the alljobs */
  struct job *test_job6 = job_alloc();
  strcpy(test_job6->ji_qs.ji_jobid, "test_job6");
  rc = insert_job(&alljobs,test_job6);
  fail_unless(rc == PBSE_NONE, "job insert fail6");

  pjob = next_job(&alljobs,iter2);
  jobcount = 0;

  while(pjob != NULL)
    {
    jobcount++;
    fail_unless(pjob->ji_qs.ji_jobid[0] != (char)254, 
      "get_next returned a deleted job");
    pjob = next_job(&alljobs,iter2);
    }

  fail_unless(jobcount == 6, "Expected job counts to be 6, but it was %d",
    jobcount);
  }
END_TEST