Example #1
0
int
job_resume (struct jobCard *jp)
{
    static char fname[] = "job_resume";
    int rep;

    if (jp->jobSpecs.actPid)
	return 0;

    if (jobsig(jp, SIGCONT, FALSE) < 0)
        return -1;

    SBD_SET_STATE(jp, JOB_STAT_RUN);

    jp->jobSpecs.reasons = 0;
    jp->jobSpecs.subreasons = 0;
    rep = status_job (BATCH_STATUS_JOB, jp, jp->jobSpecs.jStatus,
		      ERR_NO_ERROR);
    if (rep < 0)
        jp->notReported++;
    else {
	if (jp->notReported > 0)
            jp->notReported = 0;
    }
    if (logclass & (LC_TRACE | LC_SCHED | LC_EXEC))
        ls_syslog(LOG_DEBUG1, "%s: Resume job %s",
                               fname, lsb_jobid2str(jp->jobSpecs.jobId));
    return 0;
}
Example #2
0
/**
 * @brief
 * 		Support function for req_stat_job() and stat_a_jobidname().
 * 		Builds status reply for normal job, Array job, and if requested all of the
 * 		subjobs of the array (but not a single or range of subjobs).
 * @par
 * 		Note,  if dohistjobs is not set and the job is history, no status or error
 * 		is returned.  If an error return is needed, the caller must make that check.
 *
 * @param[in,out]	preq	-	pointer to the stat job batch request, reply updated
 * @param[in]	pjob	-	pointer to the job to be statused
 * @param[in]	dohistjobs	-	flag to include job if it is a history job
 * @param[in]	dosubjobs	-	flag to expand a Array job to include all subjobs
 *
 * @return	int
 * @retval	PBSE_NONE (0)	: no error
 * @retval	non-zero	: PBS error code to return to client
 */
static int
do_stat_of_a_job(struct batch_request *preq, job *pjob, int dohistjobs, int dosubjobs)
{
	int       indx;
	svrattrl *pal;
	int       rc;
	struct batch_reply *preply = &preq->rq_reply;

	/* if history job and not asking for them, just return */
	if ((!dohistjobs) &&
			((pjob->ji_qs.ji_state == JOB_STATE_FINISHED) ||
			(pjob->ji_qs.ji_state == JOB_STATE_MOVED))) {
		return (PBSE_NONE);	/* just return nothing */
	}

	if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_SubJob) == 0) {
		/* this is not a subjob, go ahead and  */
		/* build the status reply for this job */

		pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

		rc = status_job(pjob, preq, pal, &preply->brp_un.brp_status, &bad);
		if (dosubjobs && (pjob->ji_qs.ji_svrflags & JOB_SVFLG_ArrayJob) &&
			((rc == PBSE_NONE) || (rc != PBSE_PERM)) && pjob->ji_ajtrk != NULL) {

		    for (indx=0; indx<pjob->ji_ajtrk->tkm_ct; ++indx) {
			 rc = status_subjob(pjob, preq, pal, indx, &preply->brp_un.brp_status, &bad);
			    if (rc && (rc != PBSE_PERM))
				break;
		    }
		}
		if (rc && (rc != PBSE_PERM)) {
			return (rc);
		}
	}
	return (PBSE_NONE);
}
Example #3
0
static void req_stat_job_step2(

  struct stat_cntl *cntl)  /* I/O (free'd on return) */

  {
  svrattrl              *pal;
  job                   *pjob = NULL;

  struct batch_request  *preq;
  struct batch_reply    *preply;
  int                    rc = 0;
  enum TJobStatTypeEnum  type;
  pbs_queue             *pque = NULL;
  int                    exec_only = 0;

  int                    bad = 0;
  long                   DTime;  /* delta time - only report full pbs_attribute list if J->MTime > DTime */
  static svrattrl       *dpal = NULL;
  int                    job_array_index = 0;
  job_array             *pa = NULL;
  char                   log_buf[LOCAL_LOG_BUF_SIZE];
  all_jobs_iterator      *iter;

  preq   = cntl->sc_origrq;
  type   = (enum TJobStatTypeEnum)cntl->sc_type;
  preply = &preq->rq_reply;

  /* See pbs_server_attributes(1B) for details on "poll_jobs" behaviour */

  if (dpal == NULL)
    {
    /* build 'delta' pbs_attribute list */

    svrattrl *tpal;

    tlist_head dalist;

    int aindex;

    int atrlist[] =
      {
      JOB_ATR_jobname,
      JOB_ATR_resc_used,
      JOB_ATR_LAST
      };

    CLEAR_LINK(dalist);

    for (aindex = 0;atrlist[aindex] != JOB_ATR_LAST;aindex++)
      {
      if ((tpal = attrlist_create("", "", 23)) == NULL)
        {
        return;
        }

      tpal->al_valln = atrlist[aindex];

      if (dpal == NULL)
        dpal = tpal;

      append_link(&dalist, &tpal->al_link, tpal);
      }
    }  /* END if (dpal == NULL) */

  if (type == tjstArray)
    {
    pa = get_array(preq->rq_ind.rq_status.rq_id);

    if (pa == NULL)
      {
      req_reject(PBSE_UNKARRAYID, 0, preq, NULL, "unable to find array");
      return;
      }
    }

  {
  all_jobs *ajptr = NULL;

  if (type == tjstQueue)
    ajptr = cntl->sc_pque->qu_jobs;

  else if (type == tjstSummarizeArraysQueue)
    ajptr = cntl->sc_pque->qu_jobs_array_sum;

  else if (type == tjstSummarizeArraysServer)
    ajptr = &array_summary;

  else
    ajptr = &alljobs;

  ajptr->lock();
  iter = ajptr->get_iterator();
  ajptr->unlock();
  }

  /*
   * now ready for part 3, building the status reply,
   * loop through again
   */

  if ((type == tjstSummarizeArraysQueue) || 
      (type == tjstSummarizeArraysServer))
    {
    /* No array can be owned for these options */
    update_array_statuses();
    }


  if (type == tjstJob)
    pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE);

  else if (type == tjstQueue)
    pjob = next_job(cntl->sc_pque->qu_jobs,iter);

  else if (type == tjstSummarizeArraysQueue)
    pjob = next_job(cntl->sc_pque->qu_jobs_array_sum,iter);

  else if (type == tjstSummarizeArraysServer)
    pjob = next_job(&array_summary,iter);

  else if (type == tjstArray)
    {
    job_array_index = -1;
    pjob = NULL;
    /* increment job_array_index until we find a non-null pointer or hit the end */
    while (++job_array_index < pa->ai_qs.array_size)
      {
      if (pa->job_ids[job_array_index] != NULL)
        {
        if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
          {
          break;
          }
        }
      }
    }
  else
    pjob = next_job(&alljobs,iter);

  DTime = 0;

  if (preq->rq_extend != NULL)
    {
    char *ptr;

    /* FORMAT:  { EXECQONLY | DELTA:<EPOCHTIME> } */

    if (strstr(preq->rq_extend, EXECQUEONLY))
      exec_only = 1;

    ptr = strstr(preq->rq_extend, "DELTA:");

    if (ptr != NULL)
      {
      ptr += strlen("delta:");

      DTime = strtol(ptr, NULL, 10);
      }
    }

  if ((type == tjstTruncatedServer) || 
      (type == tjstTruncatedQueue))
    {
    long sentJobCounter;
    long qjcounter;
    long qmaxreport;
    all_queues_iterator *iter = NULL;

    svr_queues.lock();
    iter = svr_queues.get_iterator();
    svr_queues.unlock();

    /* loop through all queues */
    while ((pque = next_queue(&svr_queues,iter)) != NULL)
      {
      qjcounter = 0;

      if ((exec_only == 1) &&
          (pque->qu_qs.qu_type != QTYPE_Execution))
        {
        /* ignore routing queues */
        unlock_queue(pque, __func__, "ignore queue", LOGLEVEL);
        continue;
        }

      if (((pque->qu_attr[QA_ATR_MaxReport].at_flags & ATR_VFLAG_SET) != 0) &&
          (pque->qu_attr[QA_ATR_MaxReport].at_val.at_long >= 0))
        {
        qmaxreport = pque->qu_attr[QA_ATR_MaxReport].at_val.at_long;
        }
      else
        {
        qmaxreport = TMAX_JOB;
        }

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buf,"giving scheduler up to %ld idle jobs in queue %s\n",
          qmaxreport,
          pque->qu_qs.qu_name);

        log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_QUEUE,pque->qu_qs.qu_name,log_buf);
        }

      sentJobCounter = 0;

      /* loop through jobs in queue */
      if (pjob != NULL)
        unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);

      all_jobs_iterator *jobiter = NULL;
      pque->qu_jobs->lock();
      jobiter = pque->qu_jobs->get_iterator();
      pque->qu_jobs->unlock();

      while ((pjob = next_job(pque->qu_jobs,jobiter)) != NULL)
        {
        if ((qjcounter >= qmaxreport) &&
            (pjob->ji_qs.ji_state == JOB_STATE_QUEUED))
          {
          /* max_report of queued jobs reached for queue */
          unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL);

          continue;
          }

        pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

        rc = status_job(
               pjob,
               preq,
               (pjob->ji_wattr[JOB_ATR_mtime].at_val.at_long >= DTime) ? pal : dpal,
               &preply->brp_un.brp_status,
               &bad);

        if ((rc != 0) && (rc != PBSE_PERM))
          {
          req_reject(rc, bad, preq, NULL, NULL);

          unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL);
          unlock_queue(pque, __func__, "perm", LOGLEVEL);

          delete iter;

          return;
          }

        sentJobCounter++;

        if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED)
          qjcounter++;

        unlock_ji_mutex(pjob, __func__, "8", LOGLEVEL);
        }    /* END foreach (pjob from pque) */

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buf,"sent scheduler %ld total jobs for queue %s\n",
          sentJobCounter,
          pque->qu_qs.qu_name);

        log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_QUEUE,pque->qu_qs.qu_name,log_buf);
        }
    
      unlock_queue(pque, __func__, "end while", LOGLEVEL);
      }      /* END for (pque) */

    reply_send_svr(preq);

    delete iter;

    return;
    } /* END if ((type == tjstTruncatedServer) || ...) */

  while (pjob != NULL)
    {
    /* go ahead and build the status reply for this job */

    if (exec_only)
      {
      if (cntl->sc_pque != NULL)
        {
        if (cntl->sc_pque->qu_qs.qu_type != QTYPE_Execution)
          goto nextjob;
        }
      else
        {
        if (pa != NULL)
          pthread_mutex_unlock(pa->ai_mutex);
        pque = get_jobs_queue(&pjob);
        if (pa != NULL)
          pthread_mutex_lock(pa->ai_mutex);

        if ((pjob == NULL) ||
            (pque == NULL))
          goto nextjob;
        
        mutex_mgr pque_mutex = mutex_mgr(pque->qu_mutex, true);
        if (pque->qu_qs.qu_type != QTYPE_Execution)
          {
          goto nextjob;
          }
        }
      }

    pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

    rc = status_job(
           pjob,
           preq,
           pal,
           &preply->brp_un.brp_status,
           &bad);

    if ((rc != 0) && 
        (rc != PBSE_PERM))
      {
      if (pa != NULL)
        {
        unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
        }

      unlock_ji_mutex(pjob, __func__, "9", LOGLEVEL);

      req_reject(rc, bad, preq, NULL, NULL);

      delete iter;

      return;
      }

    /* get next job */

nextjob:

    if (pjob != NULL)
      unlock_ji_mutex(pjob, __func__, "10", LOGLEVEL);

    if (type == tjstJob)
      break;

    if (type == tjstQueue)
      pjob = next_job(cntl->sc_pque->qu_jobs,iter);
    else if (type == tjstSummarizeArraysQueue)
      pjob = next_job(cntl->sc_pque->qu_jobs_array_sum,iter);
    else if (type == tjstSummarizeArraysServer)
      pjob = next_job(&array_summary,iter);
    else if (type == tjstArray)
      {
      pjob = NULL;
      /* increment job_array_index until we find a non-null pointer or hit the end */
      while (++job_array_index < pa->ai_qs.array_size)
        {
        if (pa->job_ids[job_array_index] != NULL)
          {
          if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
            {
            break;
            }
          }
        }
      }
    else
      pjob = next_job(&alljobs,iter);

    rc = 0;
    }  /* END while (pjob != NULL) */

  delete iter;

  if (pa != NULL)
    {
    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
    }
 
  reply_send_svr(preq);

  if (LOGLEVEL >= 7)
    {
    log_event(PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      "req_statjob",
      "Successfully returned the status of queued jobs\n");
    }

  return;
  }  /* END req_stat_job_step2() */
Example #4
0
static void req_stat_job_step2(

  struct stat_cntl *cntl)  /* I/O (free'd on return) */

  {
  svrattrl              *pal;
  job                   *pjob = NULL;

  struct batch_request  *preq;
  struct batch_reply    *preply;
  int                    rc = 0;
  enum TJobStatTypeEnum  type;
  pbs_queue             *pque = NULL;
  int                    exec_only = 0;

  int                    bad = 0;
  long                   DTime;  /* delta time - only report full pbs_attribute list if J->MTime > DTime */
  static svrattrl       *dpal = NULL;
  int                    job_array_index = 0;
  job_array             *pa = NULL;
  char                   log_buf[LOCAL_LOG_BUF_SIZE];
  int                    iter;
  time_t                 time_now = time(NULL);
  long                   poll_jobs = 0;
  char                   job_id[PBS_MAXSVRJOBID+1];
  int                    job_substate = -1;
  time_t                 job_momstattime = -1;

  preq   = cntl->sc_origrq;
  type   = (enum TJobStatTypeEnum)cntl->sc_type;
  preply = &preq->rq_reply;

  /* See pbs_server_attributes(1B) for details on "poll_jobs" behaviour */

  if (dpal == NULL)
    {
    /* build 'delta' pbs_attribute list */

    svrattrl *tpal;

    tlist_head dalist;

    int aindex;

    int atrlist[] =
      {
      JOB_ATR_jobname,
      JOB_ATR_resc_used,
      JOB_ATR_LAST
      };

    CLEAR_LINK(dalist);

    for (aindex = 0;atrlist[aindex] != JOB_ATR_LAST;aindex++)
      {
      if ((tpal = attrlist_create("", "", 23)) == NULL)
        {
        return;
        }

      tpal->al_valln = atrlist[aindex];

      if (dpal == NULL)
        dpal = tpal;

      append_link(&dalist, &tpal->al_link, tpal);
      }
    }  /* END if (dpal == NULL) */

  if (type == tjstArray)
    {
    pa = get_array(preq->rq_ind.rq_status.rq_id);

    if (pa == NULL)
      {
      req_reject(PBSE_UNKARRAYID, 0, preq, NULL, "unable to find array");
      return;
      }
    }

  iter = -1;

  get_svr_attr_l(SRV_ATR_PollJobs, &poll_jobs);
  if (!poll_jobs)
    {
    /* polljobs not set - indicates we may need to obtain fresh data from
       MOM */

    if (cntl->sc_jobid[0] == '\0')
      pjob = NULL;
    else
      pjob = svr_find_job(cntl->sc_jobid, FALSE);

    while (1)
      {
      if (pjob == NULL)
        {
        /* start from the first job */

        if (type == tjstJob)
          {
          pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE);
          }
        else if (type == tjstQueue)
          {
          pjob = next_job(cntl->sc_pque->qu_jobs,&iter);
          }
        else if (type == tjstArray)
          {
          job_array_index = 0;
          /* increment job_array_index until we find a non-null pointer or hit the end */
          while (job_array_index < pa->ai_qs.array_size)
            {
            if (pa->job_ids[job_array_index] != NULL)
              {
              if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
                {
                unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
                break;
                }
              }

            job_array_index++;
            }
          }
        else
          {
          pjob = next_job(&alljobs,&iter);
          }

        }    /* END if (pjob == NULL) */
      else
        {
        strcpy(job_id, pjob->ji_qs.ji_jobid);
        unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

        if (type == tjstJob)
          break;

        if (type == tjstQueue)
          pjob = next_job(cntl->sc_pque->qu_jobs,&iter);
        else if (type == tjstArray)
          {
          pjob = NULL;
          /* increment job_array_index until we find a non-null pointer or hit the end */
          while (++job_array_index < pa->ai_qs.array_size)
            {
            if (pa->job_ids[job_array_index] != NULL)
              {
              if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
                {
                unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
                break;
                }
              }
            }
          }
        else
          pjob = next_job(&alljobs,&iter);
          
        }

      if (pjob == NULL)
        break;

      strcpy(job_id, pjob->ji_qs.ji_jobid);
      job_substate = pjob->ji_qs.ji_substate;
      job_momstattime = pjob->ji_momstat;
      strcpy(cntl->sc_jobid, job_id);
      unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
      pjob = NULL;

      /* PBS_RESTAT_JOB defaults to 30 seconds */
      if ((job_substate == JOB_SUBSTATE_RUNNING) &&
          ((time_now - job_momstattime) > JobStatRate))
        {
        /* go to MOM for status */
        if ((rc = stat_to_mom(job_id, cntl)) == PBSE_MEM_MALLOC)
          break;

        if (rc != 0)
          {
          pjob = svr_find_job(job_id, FALSE);

          rc = 0;

          continue;
          }
        
        if (pa != NULL)
          unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

        return; /* will pick up after mom replies */
        }
      }    /* END while(1) */

    if (rc != 0)
      {
      if (pa != NULL)
        unlock_ai_mutex(pa, __func__, "2", LOGLEVEL);

      reply_free(preply);

      req_reject(rc, 0, preq, NULL, "cannot get update from mom");

      return;
      }
    }    /* END if (!server.sv_attr[SRV_ATR_PollJobs].at_val.at_long) */

  /*
   * now ready for part 3, building the status reply,
   * loop through again
   */

  if ((type == tjstSummarizeArraysQueue) || 
      (type == tjstSummarizeArraysServer))
    {
    /* No array can be owned for these options */
    update_array_statuses();
    }

  if (type == tjstJob)
    pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE);

  else if (type == tjstQueue)
    pjob = next_job(cntl->sc_pque->qu_jobs,&iter);

  else if (type == tjstSummarizeArraysQueue)
    pjob = next_job(cntl->sc_pque->qu_jobs_array_sum,&iter);

  else if (type == tjstSummarizeArraysServer)
    pjob = next_job(&array_summary,&iter);

  else if (type == tjstArray)
    {
    job_array_index = -1;
    pjob = NULL;
    /* increment job_array_index until we find a non-null pointer or hit the end */
    while (++job_array_index < pa->ai_qs.array_size)
      {
      if (pa->job_ids[job_array_index] != NULL)
        {
        if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
          {
          break;
          }
        }
      }
    }
  else
    pjob = next_job(&alljobs,&iter);

  DTime = 0;

  if (preq->rq_extend != NULL)
    {
    char *ptr;

    /* FORMAT:  { EXECQONLY | DELTA:<EPOCHTIME> } */

    if (strstr(preq->rq_extend, EXECQUEONLY))
      exec_only = 1;

    ptr = strstr(preq->rq_extend, "DELTA:");

    if (ptr != NULL)
      {
      ptr += strlen("delta:");

      DTime = strtol(ptr, NULL, 10);
      }
    }


  if ((type == tjstTruncatedServer) || 
      (type == tjstTruncatedQueue))
    {
    long sentJobCounter;
    long qjcounter;
    long qmaxreport;
    int  iter = -1;

    /* loop through all queues */
    while ((pque = next_queue(&svr_queues,&iter)) != NULL)
      {
      qjcounter = 0;

      if ((exec_only == 1) &&
          (pque->qu_qs.qu_type != QTYPE_Execution))
        {
        /* ignore routing queues */
        unlock_queue(pque, __func__, "ignore queue", LOGLEVEL);
        continue;
        }

      if (((pque->qu_attr[QA_ATR_MaxReport].at_flags & ATR_VFLAG_SET) != 0) &&
          (pque->qu_attr[QA_ATR_MaxReport].at_val.at_long >= 0))
        {
        qmaxreport = pque->qu_attr[QA_ATR_MaxReport].at_val.at_long;
        }
      else
        {
        qmaxreport = TMAX_JOB;
        }

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buf,"giving scheduler up to %ld idle jobs in queue %s\n",
          qmaxreport,
          pque->qu_qs.qu_name);

        log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_QUEUE,pque->qu_qs.qu_name,log_buf);
        }

      sentJobCounter = 0;

      /* loop through jobs in queue */
      if (pjob != NULL)
        unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);

      iter = -1;

      while ((pjob = next_job(pque->qu_jobs,&iter)) != NULL)
        {
        if ((qjcounter >= qmaxreport) &&
            (pjob->ji_qs.ji_state == JOB_STATE_QUEUED))
          {
          /* max_report of queued jobs reached for queue */
          unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL);

          continue;
          }

        pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

        rc = status_job(
               pjob,
               preq,
               (pjob->ji_wattr[JOB_ATR_mtime].at_val.at_long >= DTime) ? pal : dpal,
               &preply->brp_un.brp_status,
               &bad);

        if ((rc != 0) && (rc != PBSE_PERM))
          {
          req_reject(rc, bad, preq, NULL, NULL);

          if (pa != NULL)
            {
            unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
            }
          unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL);
          unlock_queue(pque, __func__, "perm", LOGLEVEL);
          return;
          }

        sentJobCounter++;

        if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED)
          qjcounter++;

        unlock_ji_mutex(pjob, __func__, "8", LOGLEVEL);
        }    /* END foreach (pjob from pque) */

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buf,"sent scheduler %ld total jobs for queue %s\n",
          sentJobCounter,
          pque->qu_qs.qu_name);

        log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_QUEUE,pque->qu_qs.qu_name,log_buf);
        }
    
      unlock_queue(pque, __func__, "end while", LOGLEVEL);
      }      /* END for (pque) */
      
    if (pa != NULL)
      unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

    reply_send_svr(preq);

    return;
    }        /* END if ((type == tjstTruncatedServer) || ...) */

  while (pjob != NULL)
    {
    /* go ahead and build the status reply for this job */

    if (exec_only)
      {
      if (cntl->sc_pque != NULL)
        {
        if (cntl->sc_pque->qu_qs.qu_type != QTYPE_Execution)
          goto nextjob;
        }
      else
        {
        if (pa != NULL)
          pthread_mutex_unlock(pa->ai_mutex);
        pque = get_jobs_queue(&pjob);
        if (pa != NULL)
          pthread_mutex_lock(pa->ai_mutex);

        if ((pjob == NULL) ||
            (pque == NULL))
          goto nextjob;
        
        if (pque->qu_qs.qu_type != QTYPE_Execution)
          {
          unlock_queue(pque, __func__, "not exec", LOGLEVEL);
        
          goto nextjob;
          }

        unlock_queue(pque, __func__, "exec", LOGLEVEL);
        }
      }

    pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

    rc = status_job(
           pjob,
           preq,
           pal,
           &preply->brp_un.brp_status,
           &bad);

    if ((rc != 0) && 
        (rc != PBSE_PERM))
      {
      if (pa != NULL)
        {
        unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
        }
      unlock_ji_mutex(pjob, __func__, "9", LOGLEVEL);

      req_reject(rc, bad, preq, NULL, NULL);

      return;
      }

    /* get next job */

nextjob:

    if (pjob != NULL)
      unlock_ji_mutex(pjob, __func__, "10", LOGLEVEL);

    if (type == tjstJob)
      break;

    if (type == tjstQueue)
      pjob = next_job(cntl->sc_pque->qu_jobs,&iter);
    else if (type == tjstSummarizeArraysQueue)
      pjob = next_job(cntl->sc_pque->qu_jobs_array_sum,&iter);
    else if (type == tjstSummarizeArraysServer)
      pjob = next_job(&array_summary,&iter);
    else if (type == tjstArray)
      {
      pjob = NULL;
      /* increment job_array_index until we find a non-null pointer or hit the end */
      while (++job_array_index < pa->ai_qs.array_size)
        {
        if (pa->job_ids[job_array_index] != NULL)
          {
          if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
            {
            break;
            }
          }
        }
      }
    else
      pjob = next_job(&alljobs,&iter);

    rc = 0;
    }  /* END while (pjob != NULL) */

  if (pa != NULL)
    {
    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
    }
 
  reply_send_svr(preq);

  if (LOGLEVEL >= 7)
    {
    log_event(PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      "req_statjob",
      "Successfully returned the status of queued jobs\n");
    }

  return;
  }  /* END req_stat_job_step2() */
Example #5
0
/**
 * @brief
 * 		status_subjob - status a single subjob (of an Array Job)
 *		Works by statusing the parrent unless subjob is actually running.
 *
 * @param[in,out]	pjob	-	ptr to parent Array
 * @param[in]	preq	-	request structure
 * @param[in]	pal	-	specific attributes to status
 * @param[in]	subj	-	if not = -1 then include subjob [n]
 * @param[in,out]	pstathd	-	RETURN: head of list to append status to
 * @param[out]	bad	-	RETURN: index of first bad attribute
 *
 * @return	int
 * @retval	0	: success
 * @retval	PBSE_PERM	: client is not authorized to status the job
 * @retval	PBSE_SYSTEM	: memory allocation error
 * @retval	PBSE_IVALREQ	: something wrong with the flags
 */
int
status_subjob(job *pjob, struct batch_request *preq, svrattrl *pal, int subj, pbs_list_head *pstathd, int *bad)
{
	int		   limit = (int)JOB_ATR_LAST;
	struct brp_status *pstat;
	job		  *psubjob;	/* ptr to job to status */
	char		   realstate;
	int		   rc = 0;
	int		   oldeligflags = 0;
	int		   oldatypflags = 0;
	int 		   subjob_state = -1;
	char 		   *old_subjob_comment = NULL;

	/* see if the client is authorized to status this job */

	if (! server.sv_attr[(int)SRV_ATR_query_others].at_val.at_long)
		if (svr_authorize_jobreq(preq, pjob))
			return (PBSE_PERM);

	if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_ArrayJob) == 0)
		return PBSE_IVALREQ;

	/* if subjob is running, use real job structure */

	if (get_subjob_state(pjob, subj) == JOB_STATE_RUNNING) {
		psubjob = find_job(mk_subjob_id(pjob, subj));
		if (psubjob)
			status_job(psubjob, preq, pal, pstathd, bad);
		return 0;
	}

	/* otherwise we fake it with info from the parent      */
	/* allocate reply structure and fill in header portion */


	/* for the general case, we don't want to include the parent's */
	/* array related attrbutes as they belong only to the Array    */
	if (pal == NULL)
		limit = JOB_ATR_array;
	pstat = (struct brp_status *)malloc(sizeof(struct brp_status));
	if (pstat == (struct brp_status *)0)
		return (PBSE_SYSTEM);
	CLEAR_LINK(pstat->brp_stlink);
	pstat->brp_objtype = MGR_OBJ_JOB;
	(void)strcpy(pstat->brp_objname, mk_subjob_id(pjob, subj));
	CLEAR_HEAD(pstat->brp_attr);
	append_link(pstathd, &pstat->brp_stlink, pstat);

	/* add attributes to the status reply */

	*bad = 0;

	/*
	 * fake the job state and comment by setting the parent job's state
	 * and comment to that of the subjob
	 */
	subjob_state = get_subjob_state(pjob, subj);
	realstate = pjob->ji_wattr[(int)JOB_ATR_state].at_val.at_char;
	pjob->ji_wattr[(int)JOB_ATR_state].at_val.at_char = statechars[subjob_state];
	pjob->ji_wattr[(int)JOB_ATR_state].at_flags |= ATR_VFLAG_MODCACHE;

	if (subjob_state == JOB_STATE_EXPIRED || subjob_state == JOB_STATE_FINISHED) {
		if (pjob->ji_ajtrk->tkm_tbl[subj].trk_substate == JOB_SUBSTATE_FINISHED) {
			if (pjob->ji_wattr[(int)JOB_ATR_Comment].at_flags & ATR_VFLAG_SET) {
				old_subjob_comment = strdup(pjob->ji_wattr[(int)JOB_ATR_Comment].at_val.at_str);
				if (old_subjob_comment == (char *)0)
					return (PBSE_SYSTEM);
			}
			if (job_attr_def[(int)JOB_ATR_Comment].at_decode(&pjob->ji_wattr[(int)JOB_ATR_Comment],
				(char *)0, (char *)0, "Subjob finished") == PBSE_SYSTEM) {
				free(old_subjob_comment);
				return (PBSE_SYSTEM);
			}
		} else if (pjob->ji_ajtrk->tkm_tbl[subj].trk_substate == JOB_SUBSTATE_FAILED) {
			if (pjob->ji_wattr[(int)JOB_ATR_Comment].at_flags & ATR_VFLAG_SET) {
				old_subjob_comment = strdup(pjob->ji_wattr[(int)JOB_ATR_Comment].at_val.at_str);
				if (old_subjob_comment == (char *)0)
					return (PBSE_SYSTEM);
			}
			if (job_attr_def[(int)JOB_ATR_Comment].at_decode(&pjob->ji_wattr[(int)JOB_ATR_Comment],
				(char *)0, (char *)0, "Subjob failed") == PBSE_SYSTEM) {
				free(old_subjob_comment);
				return (PBSE_SYSTEM);
			}
		} else if (pjob->ji_ajtrk->tkm_tbl[subj].trk_substate == JOB_SUBSTATE_TERMINATED) {
			if (pjob->ji_wattr[(int)JOB_ATR_Comment].at_flags & ATR_VFLAG_SET) {
				old_subjob_comment = strdup(pjob->ji_wattr[(int)JOB_ATR_Comment].at_val.at_str);
				if (old_subjob_comment == (char *)0)
					return (PBSE_SYSTEM);
			}
			if (job_attr_def[(int)JOB_ATR_Comment].at_decode(&pjob->ji_wattr[(int)JOB_ATR_Comment],
				(char *)0, (char *)0, "Subjob terminated") == PBSE_SYSTEM) {
				free(old_subjob_comment);
				return (PBSE_SYSTEM);
			}
		}
	}

	/* when eligible_time_enable is off,				      */
	/* clear the set flag so that eligible_time and accrue_type dont show */
	if (server.sv_attr[(int)SRV_ATR_EligibleTimeEnable].at_val.at_long == 0) {
		oldeligflags = pjob->ji_wattr[(int)JOB_ATR_eligible_time].at_flags;
		pjob->ji_wattr[(int)JOB_ATR_eligible_time].at_flags &= ~ATR_VFLAG_SET;
		pjob->ji_wattr[(int)JOB_ATR_eligible_time].at_flags |= ATR_VFLAG_MODCACHE;

		oldatypflags = pjob->ji_wattr[(int)JOB_ATR_accrue_type].at_flags;
		pjob->ji_wattr[(int)JOB_ATR_accrue_type].at_flags &= ~ATR_VFLAG_SET;
		pjob->ji_wattr[(int)JOB_ATR_accrue_type].at_flags |= ATR_VFLAG_MODCACHE;

		/* Note: ATR_VFLAG_MODCACHE must be set because of svr_cached() does */
		/* 	 not correctly check ATR_VFLAG_SET */
	}

	if (status_attrib(pal, job_attr_def, pjob->ji_wattr, limit,
		preq->rq_perm, &pstat->brp_attr, bad))
		rc =  PBSE_NOATTR;

	/* Set the parent state back to what it really is */

	pjob->ji_wattr[(int)JOB_ATR_state].at_val.at_char = realstate;
	pjob->ji_wattr[(int)JOB_ATR_state].at_flags |= ATR_VFLAG_MODCACHE;

	/* Set the parent comment back to what it really is */
	if (old_subjob_comment != NULL) {
		if (job_attr_def[(int)JOB_ATR_Comment].at_decode(&pjob->ji_wattr[(int)JOB_ATR_Comment],
			(char *)0, (char *)0, old_subjob_comment) == PBSE_SYSTEM) {
			free(old_subjob_comment);
			return (PBSE_SYSTEM);
		}

		free(old_subjob_comment);
	}

	/* reset the flags */
	if (server.sv_attr[(int)SRV_ATR_EligibleTimeEnable].at_val.at_long == 0) {
		pjob->ji_wattr[(int)JOB_ATR_eligible_time].at_flags = oldeligflags;
		pjob->ji_wattr[(int)JOB_ATR_accrue_type].at_flags = oldatypflags;
	}

	return (rc);
}
Example #6
0
static void
chkpntEnd (struct jobCard *jobCard, int w_status, bool_t *freed)
{
    static char fname[] = "chkpntEnd()";
    int savePid, saveStatus;


    if (IS_SUSP(jobCard->jobSpecs.jStatus)
       && !(jobCard->jobSpecs.jStatus & JOB_STAT_MIG))
        jobsig(jobCard, SIGSTOP, TRUE);

    saveStatus = jobCard->jobSpecs.jStatus;
    if (jobCard->jobSpecs.jStatus & JOB_STAT_MIG) {
        if (w_status == 0)  {
            if (!jobCard->missing) {

                jobCard->missing = TRUE;
                need_checkfinish = TRUE;
                return;
            } else if (jobCard->notReported == 0)
                return;

            if (jobCard->cleanupPid == 0) {
                if ((jobCard->cleanupPid = rmJobBufFilesPid(jobCard)) > 0)
                    return;

                ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5709,
                    "%s: Unable to cleanup migrating job <%s>"), /* catgets 5709 */
                    fname, lsb_jobid2str(jobCard->jobSpecs.jobId));
            }

            SBD_SET_STATE(jobCard, JOB_STAT_PEND);
        } else {
            jobCard->jobSpecs.jStatus &= ~JOB_STAT_MIG;
        }
    }

    savePid = jobCard->jobSpecs.actPid;

    if (status_job (BATCH_STATUS_JOB, jobCard, jobCard->jobSpecs.jStatus,
                    w_status == 0 ? ERR_NO_ERROR :
                    ERR_SYSACT_FAIL) < 0) {
        jobCard->jobSpecs.actPid = savePid;
        jobCard->jobSpecs.jStatus = saveStatus;
    } else {
        jobCard->lastChkpntTime = now;
        jobCard->jobSpecs.actPid = 0;
        jobCard->actStatus = ACT_NO;
        jobCard->jobSpecs.actValue = SIG_NULL;

        if (w_status == 0) {

            jobCard->migCnt = 1;
        }

        if (saveStatus & JOB_STAT_MIG) {
            if (w_status == 0) {

                cleanupMigJob(jobCard);
		deallocJobCard(jobCard);
		*freed = TRUE;
            } else
                jobCard->migCnt *= 2;
        }
    }

}
Example #7
0
void
do_jobSetup(XDR * xdrs, int chfd, struct LSFHeader * reqHdr)
{
    static char       fname[] = "do_jobSetup()";
    struct jobSetup   jsetup;
    struct jobCard    *jp = NULL;
    char              found = FALSE;
    struct jobCard    savejp;

    if (logclass & LC_EXEC)
        ls_syslog(LOG_DEBUG, "%s: Entering ...", fname);

    if (!xdr_jobSetup(xdrs, &jsetup, reqHdr)) {
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_jobSetup");
        return;
    }

    for (jp = jobQueHead->forw; (jp != jobQueHead); jp = jp->forw) {
        if (jp->jobSpecs.jobId != jsetup.jobId)
            continue;
        found = TRUE;
        break;
    }

    if (found == FALSE) {
        ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5838,
                                         "%s: Job <%s> is not found"), /* catgets 5838 */
                  fname, lsb_jobid2str(jsetup.jobId));
        replyHdrWithRC(LSBE_NO_JOB, chfd, jsetup.jobId);
        return;
    }
    if (jp->jobSpecs.actPid)
        return;

    memcpy((char *) &savejp, (char *) jp, sizeof(savejp));

    jp->execJobFlag |= JOB_EXEC_QPRE_KNOWN;
    if (jsetup.execJobFlag & JOB_EXEC_QPRE_OK)
        jp->execJobFlag |= JOB_EXEC_QPRE_OK;

    jp->jobSpecs.jobPid = jsetup.jobPid;
    jp->jobSpecs.jobPGid = jsetup.jobPGid;
    jp->jobSpecs.execUid = jsetup.execUid;
    strcpy(jp->jobSpecs.execUsername, jsetup.execUsername);
    jp->execGid = jsetup.execGid;
    strcpy(jp->execUsername, jsetup.execUsername);
    strcpy(jp->jobSpecs.execCwd, jsetup.execCwd);
    strcpy(jp->jobSpecs.execHome, jsetup.execHome);

    if (jsetup.jStatus & JOB_STAT_RUN) {
        if (!(jsetup.jStatus & JOB_STAT_PRE_EXEC))
            jp->jobSpecs.jStatus &= ~JOB_STAT_PRE_EXEC;

	if (status_job(BATCH_STATUS_JOB, jp, jp->jobSpecs.jStatus,
                       ERR_NO_ERROR) < 0) {
            memcpy((char *) jp, (char *) &savejp, sizeof(savejp));
            return;
        }
        jp->execJobFlag |= JOB_EXEC_STARTED;

    } else {
        jp->jobSpecs.reasons = jsetup.reason;
        jp->collectedChild = TRUE;
        jp->notReported = 0;
        jp->exitPid = -1;
        jp->needReportRU = FALSE;
        jp->jobSpecs.jStatus = jsetup.jStatus;
        jp->w_status = jsetup.w_status;
        jp->lsfRusage = jsetup.lsfRusage;
        jp->cpuTime = jsetup.cpuTime;
        if (job_finish(jp, TRUE) < 0) {
            memcpy((char *) jp, (char *) &savejp, sizeof(savejp));
            return;
        }
    }

    if (replyHdrWithRC(LSBE_NO_ERROR, chfd, jsetup.jobId) < 0) {
        ls_syslog(LOG_DEBUG, "%s: Reply header failed for job <%s>",
                  fname, lsb_jobid2str(jsetup.jobId));
    }
    if (logclass & LC_EXEC)
        ls_syslog(LOG_DEBUG1, "%s: JobId %s jstatus %d reason %x  jobPid %d jobPGid %d execUid %d execGid <%d> execUser <%s> execHome <%s> execCwd <%s> execJobFlag %x cpuTime %f w_status %d",
                  fname, lsb_jobid2str(jsetup.jobId), jsetup.jStatus,
                  jsetup.reason, jsetup.jobPid,
                  jsetup.jobPGid, jsetup.execUid, jsetup.execGid,
                  jsetup.execUsername, jsetup.execHome, jsetup.execCwd,
                  jsetup.execJobFlag, jsetup.cpuTime, jsetup.w_status);
}
Example #8
0
void
do_newjob(XDR *xdrs, int chfd, struct LSFHeader *reqHdr)
{
    static char        fname[] = "do_newjob()";
    char               reply_buf[MSGSIZE];
    XDR                xdrs2;
    struct jobSpecs    jobSpecs;
    struct jobReply    jobReply;
    struct jobCard     *jp;
    sbdReplyType       reply;
    struct LSFHeader   replyHdr;
    char               *replyStruct;
    struct lsfAuth     *auth = NULL;

    memset(&jobReply, 0, sizeof(struct jobReply));

    if (!xdr_jobSpecs(xdrs, &jobSpecs, reqHdr)) {
	reply = ERR_BAD_REQ;
	ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_jobSpecs");
	goto sendReply;
    }


    for (jp = jobQueHead->forw; (jp != jobQueHead); jp = jp->forw) {
        if (jp->jobSpecs.jobId == jobSpecs.jobId) {

	    jobReply.jobId = jp->jobSpecs.jobId;
	    jobReply.jobPid = jp->jobSpecs.jobPid;
	    jobReply.jobPGid = jp->jobSpecs.jobPGid;
	    jobReply.jStatus = jp->jobSpecs.jStatus;
	    reply = ERR_NO_ERROR;
	    goto sendReply;
	}
    }

    jp = calloc(1, sizeof(struct jobCard));
    if (jp == NULL) {
	ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname,
                  lsb_jobid2str(jobSpecs.jobId), "calloc");
	reply = ERR_MEM;
	goto sendReply;
    }

    memcpy((char *) &jp->jobSpecs, (char *) &jobSpecs,
	   sizeof(struct jobSpecs));

    jp->jobSpecs.jStatus &= ~JOB_STAT_MIG;
    jp->jobSpecs.startTime = now;
    jp->jobSpecs.reasons = 0;
    jp->jobSpecs.subreasons = 0;
    /* Initialize the core number
     */
    jp->core_num = -1;

    if (jp->jobSpecs.jAttrib & Q_ATTRIB_EXCLUSIVE) {
	if (lockHosts (jp) < 0) {
	    ls_syslog(LOG_ERR, I18N_JOB_FAIL_S, fname,
                      lsb_jobid2str(jp->jobSpecs.jobId), "lockHosts");
            unlockHosts (jp, jp->jobSpecs.numToHosts);
	    reply = ERR_LOCK_FAIL;
	    freeWeek(jp->week);
	    FREEUP(jp);
	    goto sendReply;
        }
    }
    jp->runTime = 0;
    if (initJobCard(jp, &jobSpecs, (int *)&reply) < 0) {

	if (jp->jobSpecs.jAttrib & Q_ATTRIB_EXCLUSIVE) {
	    unlockHosts (jp, jp->jobSpecs.numToHosts);
	}
	FREEUP(jp);
	goto sendReply;
    }

    jp->execJobFlag = 0;

    if (jp->runTime < 0) {
        jp->runTime = 0;
    }
    jp->execGid = 0;
    jp->execUsername[0] = '\0';
    jp->jobSpecs.execUid = -1;
    jp->jobSpecs.execUsername[0] = '\0';

    if (jp->jobSpecs.jobSpoolDir[0] != '\0') {
        char *tmp;

        if ((tmp = getUnixSpoolDir (jp->jobSpecs.jobSpoolDir)) == NULL) {

            jp->jobSpecs.jobSpoolDir[0] = '\0';
        }
    }

    if ((logclass & LC_TRACE) && jp->jobSpecs.jobSpoolDir[0] != 0) {
        ls_syslog(LOG_DEBUG,
                  "%s: the SpoolDir for  job <%s>  is %s \n",
                  fname, lsb_jobid2str(jp->jobSpecs.jobId),
                  jp->jobSpecs.jobSpoolDir);
    }
    if (jp->jobSpecs.options & SUB_PRE_EXEC)
	SBD_SET_STATE(jp, (JOB_STAT_RUN | JOB_STAT_PRE_EXEC))
        else
            SBD_SET_STATE(jp, JOB_STAT_RUN);

    reply = job_exec(jp, chfd);

    if (reply != ERR_NO_ERROR) {
	ls_syslog(LOG_ERR, I18N_JOB_FAIL_S, fname,
                  lsb_jobid2str(jp->jobSpecs.jobId), "job_exec");
	if (jp->jobSpecs.jAttrib & Q_ATTRIB_EXCLUSIVE) {
            unlockHosts (jp, jp->jobSpecs.numToHosts);
	}
	deallocJobCard(jp);
    } else {
	jobReply.jobId = jp->jobSpecs.jobId;
	jobReply.jobPid = jp->jobSpecs.jobPid;
	jobReply.jobPGid = jp->jobSpecs.jobPGid;
	jobReply.jStatus = jp->jobSpecs.jStatus;
    }


sendReply:
    xdr_lsffree(xdr_jobSpecs, (char *)&jobSpecs, reqHdr);
    xdrmem_create(&xdrs2, reply_buf, MSGSIZE, XDR_ENCODE);
    initLSFHeader_(&replyHdr);
    replyHdr.opCode = reply;
    replyStruct = (reply == ERR_NO_ERROR) ? (char *) &jobReply : (char *) NULL;
    if (!xdr_encodeMsg(&xdrs2, replyStruct, &replyHdr, xdr_jobReply, 0, auth)) {
	ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_jobReply");
	lsb_merr(_i18n_msg_get(ls_catd , NL_SETN, 5804,
			       "Fatal error: xdr_jobReply() failed; sbatchd relifing")); /* catgets 5804 */
	relife();
    }

    if (chanWrite_(chfd, reply_buf, XDR_GETPOS(&xdrs2)) <= 0) {
	ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5805,
					 "%s: Sending jobReply (len=%d) to master failed: %m"), /* catgets 5805 */
		  fname, XDR_GETPOS(&xdrs2));
    }

    xdr_destroy(&xdrs2);


    if (reply == ERR_NO_ERROR && !daemonParams[LSB_BSUBI_OLD].paramValue &&
	PURE_INTERACTIVE(&jp->jobSpecs)) {
  	if (status_job (BATCH_STATUS_JOB, jp, jp->jobSpecs.jStatus,
		        ERR_NO_ERROR) < 0) {
            jp->notReported++;
	}
    }

}
Example #9
0
void req_stat_job_step2(

  struct stat_cntl *cntl)  /* I/O (free'd on return) */

  {
  batch_request         *preq = cntl->sc_origrq;
  svrattrl              *pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);
  job                   *pjob = NULL;

  struct batch_reply    *preply = &preq->rq_reply;
  int                    rc = 0;
  enum TJobStatTypeEnum  type = (enum TJobStatTypeEnum)cntl->sc_type;
  bool                   exec_only = false;

  int                    bad = 0;
  /* delta time - only report full pbs_attribute list if J->MTime > DTime */
  int                    job_array_index = -1;
  job_array             *pa = NULL;
  all_jobs_iterator     *iter;

  if (preq->rq_extend != NULL)
    {
    /* FORMAT:  { EXECQONLY } */
    if (strstr(preq->rq_extend, EXECQUEONLY))
      exec_only = true;
    }

  if ((type == tjstTruncatedServer) || 
      (type == tjstTruncatedQueue))
    {
    handle_truncated_qstat(exec_only, cntl->sc_condensed, preq);

    return;
    } /* END if ((type == tjstTruncatedServer) || ...) */
  else if (type == tjstJob)
    {
    pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE);

    if (pjob != NULL)
      {
      if ((rc = status_job(pjob, preq, pal, &preply->brp_un.brp_status, cntl->sc_condensed, &bad)))
        req_reject(rc, bad, preq, NULL, NULL);
      else
        reply_send_svr(preq);

      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
      }
    else
      {
      req_reject(PBSE_JOBNOTFOUND, bad, preq, NULL, NULL);
      }
    }
  else
    {
    if (type == tjstArray)
      {
      pa = get_array(preq->rq_ind.rq_status.rq_id);

      if (pa == NULL)
        {
        req_reject(PBSE_UNKARRAYID, 0, preq, NULL, "unable to find array");
        return;
        }
      }
    else if ((type == tjstSummarizeArraysQueue) || 
             (type == tjstSummarizeArraysServer))
      update_array_statuses();

    iter = get_correct_status_iterator(cntl);

    for (pjob = get_next_status_job(cntl, job_array_index, pa, iter);
         pjob != NULL;
         pjob = get_next_status_job(cntl, job_array_index, pa, iter))
      {
      mutex_mgr job_mutex(pjob->ji_mutex, true);

      /* go ahead and build the status reply for this job */
      if (pjob->ji_being_recycled == true)
        continue;

      if (exec_only)
        {
        if (cntl->sc_pque != NULL)
          {
          if (cntl->sc_pque->qu_qs.qu_type != QTYPE_Execution)
            continue;
          }
        else if (in_execution_queue(pjob, pa) == false)
          continue;
        }

      rc = status_job(pjob, preq, pal, &preply->brp_un.brp_status, cntl->sc_condensed, &bad);

      if ((rc != PBSE_NONE) && 
          (rc != PBSE_PERM))
        {
        if (pa != NULL)
          unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

        req_reject(rc, bad, preq, NULL, NULL);

        delete iter;

        return;
        }
      }  /* END for (pjob != NULL) */

    delete iter;

    if (pa != NULL)
      {
      unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
      }
   
    reply_send_svr(preq);
    }

  if (LOGLEVEL >= 7)
    {
    log_event(PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      "req_statjob",
      "Successfully returned the status of queued jobs\n");
    }

  return;
  }  /* END req_stat_job_step2() */
Example #10
0
void handle_truncated_qstat(
    
  bool           exec_only,
  bool           condensed,
  batch_request *preq)

  {
  long                 sentJobCounter = 0;
  long                 qmaxreport;
  all_queues_iterator *queue_iter = NULL;
  pbs_queue           *pque;
  char                 log_buf[LOCAL_LOG_BUF_SIZE];
  job                 *pjob;
  svrattrl            *pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);
  batch_reply         *preply = &preq->rq_reply;
  int                  bad = 0;

  svr_queues.lock();
  queue_iter = svr_queues.get_iterator();
  svr_queues.unlock();

  /* loop through all queues */
  while ((pque = next_queue(&svr_queues, queue_iter)) != NULL)
    {
    long      qjcounter = 0;
    mutex_mgr queue_mutex(pque->qu_mutex, true);

    if ((exec_only == true) &&
        (pque->qu_qs.qu_type != QTYPE_Execution))
      {
      /* ignore routing queues */
      continue;
      }

    if (((pque->qu_attr[QA_ATR_MaxReport].at_flags & ATR_VFLAG_SET) != 0) &&
        (pque->qu_attr[QA_ATR_MaxReport].at_val.at_long >= 0))
      {
      qmaxreport = pque->qu_attr[QA_ATR_MaxReport].at_val.at_long;
      }
    else
      {
      qmaxreport = TMAX_JOB;
      }

    if (LOGLEVEL >= 5)
      {
      snprintf(log_buf, sizeof(log_buf), "Reporting up to %ld idle jobs in queue %s\n",
        qmaxreport,
        pque->qu_qs.qu_name);

      log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_QUEUE,pque->qu_qs.qu_name,log_buf);
      }

    /* loop through jobs in queue */
    all_jobs_iterator *jobiter = NULL;
    pque->qu_jobs->lock();
    jobiter = pque->qu_jobs->get_iterator();
    pque->qu_jobs->unlock();

    while ((pjob = next_job(pque->qu_jobs, jobiter)) != NULL)
      {
      mutex_mgr job_mgr(pjob->ji_mutex, true);

      if ((qjcounter >= qmaxreport) &&
          (pjob->ji_qs.ji_state == JOB_STATE_QUEUED))
        {
        /* max_report of queued jobs reached for queue */
        continue;
        }

      int rc = status_job(pjob, preq, pal, &preply->brp_un.brp_status, condensed, &bad);

      if ((rc != 0) &&
          (rc != PBSE_PERM))
        {
        req_reject(rc, bad, preq, NULL, NULL);

        delete queue_iter;

        return;
        }

      sentJobCounter++;

      if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED)
        qjcounter++;
      } /* END foreach (pjob from pque) */

    if (LOGLEVEL >= 5)
      {
      snprintf(log_buf, sizeof(log_buf), "Reported %ld total jobs for queue %s\n",
        sentJobCounter,
        pque->qu_qs.qu_name);

      log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_QUEUE,pque->qu_qs.qu_name,log_buf);
      }
    } /* END for (pque) */

  reply_send_svr(preq);

  delete queue_iter;

  return;
  } // END handle_truncated_qstat()
Example #11
0
static void req_stat_job_step2(

  struct stat_cntl *cntl)  /* I/O (freed on return) */

  {
  svrattrl        *pal;
  job         *pjob = NULL;

  struct batch_request *preq;

  struct batch_reply   *preply;
  int          rc = 0;

  enum TJobStatTypeEnum type;

  pbs_queue            *pque = NULL;
  int                   exec_only = 0;

  int                   IsTruncated = 0;

  long                  DTime;  /* delta time - only report full attribute list if J->MTime > DTime */

  static svrattrl      *dpal = NULL;
  
  int job_array_index = 0;
  job_array *pa = NULL;
  

  preq   = cntl->sc_origrq;
  type   = (enum TJobStatTypeEnum)cntl->sc_type;
  preply = &preq->rq_reply;

  /* See pbs_server_attributes(1B) for details on "poll_jobs" behaviour */

  /* NOTE:  If IsTruncated is true, should walk all queues and walk jobs in each queue
            until max_reported is reached (NYI) */

  if (dpal == NULL)
    {
    /* build 'delta' attribute list */

    svrattrl *tpal;

    tlist_head dalist;

    int aindex;

    int atrlist[] =
      {
      JOB_ATR_jobname,
      JOB_ATR_resc_used,
      JOB_ATR_LAST
      };

    CLEAR_LINK(dalist);

    for (aindex = 0;atrlist[aindex] != JOB_ATR_LAST;aindex++)
      {
      if ((tpal = attrlist_create("", "", 23)) == NULL)
        {
        return;
        }

      tpal->al_valln = atrlist[aindex];

      if (dpal == NULL)
        dpal = tpal;

      append_link(&dalist, &tpal->al_link, tpal);
      }
    }  /* END if (dpal == NULL) */

  if (type == tjstArray)
    {
    pa = get_array(preq->rq_ind.rq_status.rq_id);
    }

  if (!server.sv_attr[(int)SRV_ATR_PollJobs].at_val.at_long)
    {
    /* polljobs not set - indicates we may need to obtain fresh data from
       MOM */

    if (cntl->sc_jobid[0] == '\0')
      pjob = NULL;
    else
      pjob = find_job(cntl->sc_jobid);

    while (1)
      {
      if (pjob == NULL)
        {
        /* start from the first job */

        if (type == tjstJob)
          {
          pjob = find_job(preq->rq_ind.rq_status.rq_id);
          }
        else if (type == tjstQueue)
          {
          pjob = (job *)GET_NEXT(cntl->sc_pque->qu_jobs);
          }
        else if (type == tjstArray)
          {
          job_array_index = 0;
          /* increment job_array_index until we find a non-null pointer or hit the end */
          while (job_array_index < pa->ai_qs.array_size && (pjob = pa->jobs[job_array_index]) == NULL)
             job_array_index++;
         
          }
        else
          {
          if ((type == tjstTruncatedServer) || (type == tjstTruncatedQueue))
            IsTruncated = TRUE;

          pjob = (job *)GET_NEXT(svr_alljobs);
          }
        }    /* END if (pjob == NULL) */
      else
        {
        /* get next job */

        if (type == tjstJob)
          break;

        if (type == tjstQueue)
          pjob = (job *)GET_NEXT(pjob->ji_jobque);
        else
          pjob = (job *)GET_NEXT(pjob->ji_alljobs);
          
        if (type == tjstArray)
          {
          pjob = NULL;
          /* increment job_array_index until we find a non-null pointer or hit the end */
          while (++job_array_index < pa->ai_qs.array_size && (pjob = pa->jobs[job_array_index]) == NULL)
            ;
          }
        }

      if (pjob == NULL)
        break;

      /* PBS_RESTAT_JOB defaults to 30 seconds */

      if ((pjob->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING) &&
          ((time_now - pjob->ji_momstat) > JobStatRate))
        {
        /* go to MOM for status */

        strcpy(cntl->sc_jobid, pjob->ji_qs.ji_jobid);

        if ((rc = stat_to_mom(pjob, cntl)) == PBSE_SYSTEM)
          {
          break;
          }

        if (rc != 0)
          {
          rc = 0;

          continue;
          }

        return; /* will pick up after mom replies */
        }
      }    /* END while(1) */

    if (cntl->sc_conn >= 0)
      svr_disconnect(cntl->sc_conn);  /* close connection to MOM */

    if (rc != 0)
      {
      free(cntl);

      reply_free(preply);

      req_reject(rc, 0, preq, NULL, "cannot get update from mom");

      return;
      }
    }    /* END if (!server.sv_attr[(int)SRV_ATR_PollJobs].at_val.at_long) */

  /*
   * now ready for part 3, building the status reply,
   * loop through again
   */

  if (type == tjstSummarizeArraysQueue || type == tjstSummarizeArraysServer)
    {
    update_array_statuses();
    }

  if (type == tjstJob)
    pjob = find_job(preq->rq_ind.rq_status.rq_id);
  else if (type == tjstQueue)
    pjob = (job *)GET_NEXT(cntl->sc_pque->qu_jobs);
  else if (type == tjstSummarizeArraysQueue)
    pjob = (job *)GET_NEXT(cntl->sc_pque->qu_jobs_array_sum);
  else if (type == tjstSummarizeArraysServer)
    pjob = (job *)GET_NEXT(svr_jobs_array_sum);
  else if (type == tjstArray)
    {
    job_array_index = 0;
    pjob = NULL;
    /* increment job_array_index until we find a non-null pointer or hit the end */
    while (job_array_index < pa->ai_qs.array_size && (pjob = pa->jobs[job_array_index]) == NULL)
        job_array_index++;
    }
  else
    pjob = (job *)GET_NEXT(svr_alljobs);

  DTime = 0;

  if (preq->rq_extend != NULL)
    {
    char *ptr;

    /* FORMAT:  { EXECQONLY | DELTA:<EPOCHTIME> } */

    if (strstr(preq->rq_extend, EXECQUEONLY))
      exec_only = 1;

    ptr = strstr(preq->rq_extend, "DELTA:");

    if (ptr != NULL)
      {
      ptr += strlen("delta:");

      DTime = strtol(ptr, NULL, 10);
      }
    }

  free(cntl);

  if ((type == tjstTruncatedServer) || (type == tjstTruncatedQueue))
    {
    long sentJobCounter;
    long qjcounter;
    long qmaxreport;

    /* loop through all queues */

    for (pque = (pbs_queue *)GET_NEXT(svr_queues);
         pque != NULL;
         pque = (pbs_queue *)GET_NEXT(pque->qu_link))
      {
      qjcounter = 0;

      if ((exec_only == 1) &&
          (pque->qu_qs.qu_type != QTYPE_Execution))
        {
        /* ignore routing queues */

        continue;
        }

      if (((pque->qu_attr[QA_ATR_MaxReport].at_flags & ATR_VFLAG_SET) != 0) &&
          (pque->qu_attr[QA_ATR_MaxReport].at_val.at_long >= 0))
        {
        qmaxreport = pque->qu_attr[QA_ATR_MaxReport].at_val.at_long;
        }
      else
        {
        qmaxreport = TMAX_JOB;
        }

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buffer,"giving scheduler up to %ld idle jobs in queue %s\n",
          qmaxreport,
          pque->qu_qs.qu_name);

        log_event(
          PBSEVENT_SYSTEM,
          PBS_EVENTCLASS_QUEUE,
          pque->qu_qs.qu_name,
          log_buffer);
        }

      sentJobCounter = 0;

      /* loop through jobs in queue */

      for (pjob = (job *)GET_NEXT(pque->qu_jobs);
           pjob != NULL;
           pjob = (job *)GET_NEXT(pjob->ji_jobque))
        {
        if ((qjcounter >= qmaxreport) &&
            (pjob->ji_qs.ji_state == JOB_STATE_QUEUED))
          {
          /* max_report of queued jobs reached for queue */

          continue;
          }

        pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

        rc = status_job(
               pjob,
               preq,
               (pjob->ji_wattr[(int)JOB_ATR_mtime].at_val.at_long >= DTime) ? pal : dpal,
               &preply->brp_un.brp_status,
               &bad);

        if ((rc != 0) && (rc != PBSE_PERM))
          {
          req_reject(rc, bad, preq, NULL, NULL);

          return;
          }

        sentJobCounter++;

        if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED)
          qjcounter++;
        }    /* END for (pjob) */

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buffer,"sent scheduler %ld total jobs for queue %s\n",
          sentJobCounter,
          pque->qu_qs.qu_name);

        log_event(
          PBSEVENT_SYSTEM,
          PBS_EVENTCLASS_QUEUE,
          pque->qu_qs.qu_name,
          log_buffer);
        }
      }      /* END for (pque) */

    reply_send(preq);

    return;
    }        /* END if ((type == tjstTruncatedServer) || ...) */

  while (pjob != NULL)
    {
    /* go ahead and build the status reply for this job */

    if (exec_only)
      {
      pque = find_queuebyname(pjob->ji_qs.ji_queue);

      if (pque->qu_qs.qu_type != QTYPE_Execution)
        goto nextjob;
      }

    pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

    rc = status_job(
           pjob,
           preq,
           pal,
           &preply->brp_un.brp_status,
           &bad);

    if ((rc != 0) && (rc != PBSE_PERM))
      {
      req_reject(rc, bad, preq, NULL, NULL);

      return;
      }

    /* get next job */

nextjob:

    if (type == tjstJob)
      break;

    if (type == tjstQueue)
      pjob = (job *)GET_NEXT(pjob->ji_jobque);
    else if (type == tjstSummarizeArraysQueue)
      pjob = (job *)GET_NEXT(pjob->ji_jobque_array_sum);
    else if (type == tjstSummarizeArraysServer)
      pjob = (job *)GET_NEXT(pjob->ji_jobs_array_sum);
    else if (type == tjstArray)
      {
      pjob = NULL;
      /* increment job_array_index until we find a non-null pointer or hit the end */
      while (++job_array_index < pa->ai_qs.array_size && (pjob = pa->jobs[job_array_index]) == NULL)
        ;
      }
    else
      pjob = (job *)GET_NEXT(pjob->ji_alljobs);

    rc = 0;
    }  /* END while (pjob != NULL) */

 
  reply_send(preq);

  if (LOGLEVEL >= 7)
    {
    log_event(PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      "req_statjob",
      "Successfully returned the status of queued jobs\n");
    }

  return;
  }  /* END req_stat_job_step2() */