Esempio n. 1
0
/**
 * poll_job_task
 *
 * The invocation of this routine is triggered from
 * the pbs_server main_loop code.  The check of
 * SRV_ATR_PollJobs appears to be redundant.
 */
void poll_job_task(

  struct work_task *ptask)

  {

  job *pjob;

  pjob = (job *)ptask->wt_parm1;

  if (pjob == NULL)
    {
    /* FAILURE */

    return;
    }

  if (server.sv_attr[(int)SRV_ATR_PollJobs].at_val.at_long &&
      (pjob->ji_qs.ji_state == JOB_STATE_RUNNING))
    {
    stat_mom_job(pjob);
    }

  return;
  }  /* END poll_job_task() */
Esempio n. 2
0
/**
 * poll _job_task
 *
 * The invocation of this routine is triggered from
 * the pbs_server main_loop code.  The check of
 * SRV_ATR_PollJobs appears to be redundant.
 */
void poll_job_task(

  struct work_task *ptask)

  {
  char      *job_id = (char *)ptask->wt_parm1;
  job       *pjob;
  time_t     time_now = time(NULL);
  long       poll_jobs = 0;
  int        job_state = -1;

  if (job_id != NULL)
    {
    pjob  = svr_find_job(job_id, FALSE);
    
    if (pjob != NULL)
      {
      mutex_mgr job_mutex(pjob->ji_mutex, true);

      job_state = pjob->ji_qs.ji_state;
      job_mutex.unlock();

      get_svr_attr_l(SRV_ATR_PollJobs, &poll_jobs);
      if ((poll_jobs) && (job_state == JOB_STATE_RUNNING))
        {
        /* we need to throttle the number of outstanding threads are
           doing job polling. This prevents a problem where pbs_server
           gets hung waiting on I/O from the mom */
        pthread_mutex_lock(poll_job_task_mutex);
        if (current_poll_job_tasks < max_poll_job_tasks)
          {
          current_poll_job_tasks++;
          pthread_mutex_unlock(poll_job_task_mutex);

          stat_mom_job(job_id);

          pthread_mutex_lock(poll_job_task_mutex);
          current_poll_job_tasks--;
          }
        pthread_mutex_unlock(poll_job_task_mutex);

        
        /* add another task */
        set_task(WORK_Timed, time_now + JobStatRate, poll_job_task, strdup(job_id), FALSE);
        }
      }
      
    free(job_id);
    }

  free(ptask->wt_mutex);
  free(ptask);
  }  /* END poll_job_task() */
Esempio n. 3
0
/**
 * poll_job_task
 *
 * Work-task handler that polls the MOM for one running job when the job
 * has not reported recently, and re-queues itself while the job runs.
 *
 * ptask->wt_parm1 carries a heap-allocated job id string.  The task
 * structure (and its mutex) is freed immediately; the job id is freed
 * before returning.
 *
 * For a job in JOB_STATE_RUNNING:
 *   - stat_mom_job() is called when more than SRV_ATR_JobStatRate
 *     seconds have passed since the job's ji_last_reported_time and
 *     SRV_ATR_PollJobs is non-zero;
 *   - a new WORK_Timed task is scheduled for
 *     (entry time + job_stat_rate / 3) with a strdup()'d job id.
 *
 * Per the original author's note, this routine is triggered from the
 * pbs_server main_loop code.
 *
 * @param ptask  work task being run; released by this routine
 */
void poll_job_task(

  struct work_task *ptask)

  {
  char      *job_id = (char *)ptask->wt_parm1;
  job       *pjob;
  time_t     time_now = time(NULL);
  long       poll_jobs = 0;
  long       job_stat_rate;

  /* the task is not used past this point; only the job id is kept */
  free(ptask->wt_mutex);
  free(ptask);

  if (job_id != NULL)
    {
    pjob  = svr_find_job(job_id, FALSE);
    
    if (pjob != NULL)
      {
      /* the job mutex is handed to the manager already locked */
      mutex_mgr job_mutex(pjob->ji_mutex, true);
      int       job_state = pjob->ji_qs.ji_state;

      // only do things for running jobs
      if (job_state == JOB_STATE_RUNNING)
        {
        /* Copy everything needed from the job while its mutex is still
           held; pjob must not be dereferenced once the lock is released. */
        time_t last_reported = pjob->ji_last_reported_time;

        job_mutex.unlock();

        /* NOTE(review): job_stat_rate has no initial value, so it is only
           defined if get_svr_attr_l() writes to it - confirm that it always
           does for SRV_ATR_JobStatRate, or supply a default. */
        get_svr_attr_l(SRV_ATR_JobStatRate, &job_stat_rate);

        /* poll only if the job has been silent for longer than the stat rate */
        if (time(NULL) - last_reported > job_stat_rate)
          {
          get_svr_attr_l(SRV_ATR_PollJobs, &poll_jobs);
          if (poll_jobs)
            stat_mom_job(job_id);
          }

        /* add another task; it receives its own copy of the job id */
        set_task(WORK_Timed, time_now + (job_stat_rate / 3), poll_job_task, strdup(job_id), FALSE);
        }
      }
      
    free(job_id);
    }
  }  /* END poll_job_task() */
Esempio n. 4
0
/**
 * poll_job_task
 *
 * Work-task handler that polls the MOM for one running job and
 * re-queues itself for as long as the job keeps running.
 *
 * ptask->wt_parm1 carries a heap-allocated job id string.  The job is
 * looked up with svr_find_job(); everything that is needed from the job
 * structure is read while the job mutex is held, then the mutex is
 * released.
 *
 * For a job in JOB_STATE_RUNNING:
 *   - if fewer than max_poll_job_tasks polls are outstanding and the job
 *     has both a MOM address and an exec_host string, stat_mom_job() is
 *     called;
 *   - if a poll slot is available but the MOM address or exec_host is
 *     missing, a message is logged and the poll is skipped;
 *   - in every case a new WORK_Timed task is scheduled for
 *     (entry time + JobStatRate) with a strdup()'d copy of the job id.
 *
 * Before returning, the routine frees the job id (when non-NULL),
 * ptask->wt_mutex and ptask itself.
 *
 * Per the original author's note, this routine is triggered from the
 * pbs_server main_loop code.
 *
 * @param ptask  work task being run; released by this routine
 */
void poll_job_task(

  struct work_task *ptask)

  {
  char      *job_id = (char *)ptask->wt_parm1;
  job       *pjob;
  time_t     time_now = time(NULL);
  int        job_state = -1;
  int        mom_info_missing = 0;
  char       log_buf[LOCAL_LOG_BUF_SIZE];

  if (job_id != NULL)
    {
    pjob  = svr_find_job(job_id, FALSE);
    
    if (pjob != NULL)
      {
      /* the job mutex is handed to the manager already locked */
      mutex_mgr job_mutex(pjob->ji_mutex, true);

      job_state = pjob->ji_qs.ji_state;

      if (job_state == JOB_STATE_RUNNING)
        {
        /* Inspect the job, and format the log message that names it,
           while the job mutex is still held; pjob must not be
           dereferenced once the lock is released. */
        if ((pjob->ji_qs.ji_un.ji_exect.ji_momaddr == 0) ||
            (!pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str))
          {
          mom_info_missing = 1;

          snprintf(log_buf, sizeof(log_buf),
            "Job %s missing MOM's information. Skipping polling on this job", pjob->ji_qs.ji_jobid);
          }
        }

      job_mutex.unlock();

      if (job_state == JOB_STATE_RUNNING)
        {
        int slot_available = 0;
        int do_poll = 0;

        /* we need to throttle the number of outstanding threads that are
           doing job polling. This prevents a problem where pbs_server
           gets hung waiting on I/O from the mom.

           The throttle mutex is released exactly once for each time it
           is taken: decisions are made under the lock, while logging and
           the poll itself happen outside it. */
        pthread_mutex_lock(poll_job_task_mutex);

        if (current_poll_job_tasks < max_poll_job_tasks)
          {
          slot_available = 1;

          if (!mom_info_missing)
            {
            current_poll_job_tasks++;
            do_poll = 1;
            }
          }

        pthread_mutex_unlock(poll_job_task_mutex);

        if (do_poll)
          {
          stat_mom_job(job_id);

          /* give the slot back */
          pthread_mutex_lock(poll_job_task_mutex);
          current_poll_job_tasks--;
          pthread_mutex_unlock(poll_job_task_mutex);
          }
        else if (slot_available)
          {
          /* a slot was free but the job cannot be polled without MOM info */
          log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
          }

        /* add another task; it receives its own copy of the job id */
        set_task(WORK_Timed, time_now + JobStatRate, poll_job_task, strdup(job_id), FALSE);
        }
      }
      
    free(job_id);
    }

  free(ptask->wt_mutex);
  free(ptask);
  }  /* END poll_job_task() */
Esempio n. 5
0
/**
 * post_sendmom
 *
 * Work-task callback that processes the result reported by the child
 * that sent a job to its MOM.
 *
 * Inputs taken from the work task:
 *   wt_parm1 - the job (job *)
 *   wt_parm2 - the originating batch request, may be NULL
 *   wt_aux   - the wait status of the child
 *
 * The child's exit code (or 2 when the child did not exit normally)
 * selects the action:
 *   0     - send succeeded: acknowledge the request, record the start
 *           time, assign resources, move a PRERUN job to RUNNING, write
 *           the accounting record, run dependency actions and ask the
 *           MOM for job status.
 *   10    - connection to the MOM timed out: report EOF on the MOM's
 *           stream, free the job's nodes and, unless the job is in the
 *           ABORT substate, reject the request and re-evaluate the job
 *           state.
 *   other - send failed: free the job's nodes; unless the job is in the
 *           ABORT substate, either treat PBSE_JOBEXIST as success
 *           (the job is already running) and refresh its status, or
 *           reject the request and re-evaluate the job state.
 *
 * @param pwt  work task describing the finished send (I)
 */
static void post_sendmom(

  struct work_task *pwt)  /* I */

  {
  const char *id = "post_sendmom";

  int  newstate;
  int  newsub;
  int  r;
  int  stat;
  job *jobp = (job *)pwt->wt_parm1;

  struct batch_request *preq = (struct batch_request *)pwt->wt_parm2;

  char  *MOMName = NULL;

  int    jindex;

  /* fallback dispatch time, used when the job is not in the dispatch table */
  long DTime = time_now - 10000;

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      jobp->ji_qs.ji_jobid,
      "entering post_sendmom");
    }

  stat = pwt->wt_aux;

  if (WIFEXITED(stat))
    {
    r = WEXITSTATUS(stat);
    }
  else
    {
    /* handled by the default case of the switch below */

    r = 2;

    /* cannot get child exit status */

    sprintf(log_buffer, msg_badexit,
            stat);

    strcat(log_buffer, id);

    log_event(
      PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      jobp->ji_qs.ji_jobid,
      log_buffer);
    }

  /* maintain local struct to associate job id with dispatch time */

  /* only the first 20 slots of the dispatch table are searched; a
     matching slot is released by clearing its job pointer */

  for (jindex = 0;jindex < 20;jindex++)
    {
    if (DispatchJob[jindex] == jobp)
      {
      DTime = DispatchTime[jindex];

      DispatchJob[jindex] = NULL;

      MOMName = DispatchNode[jindex];

      break;
      }
    }

  if (LOGLEVEL >= 1)
    {
    /* the elapsed time is cast so the argument matches the %ld conversion */

    sprintf(log_buffer, "child reported %s for job after %ld seconds (dest=%s), rc=%d",
            (r == 0) ? "success" : "failure",
            (long)(time_now - DTime),
            (MOMName != NULL) ? MOMName : "???",
            r);

    log_event(
      PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      jobp->ji_qs.ji_jobid,
      log_buffer);
    }

  switch (r)
    {

    case 0:  /* send to MOM went ok */

      jobp->ji_qs.ji_svrflags &= ~JOB_SVFLG_HOTSTART;

      if (preq != NULL)
        reply_ack(preq);

      /* record start time for accounting */

      jobp->ji_qs.ji_stime = time_now;

      /* update resource usage attributes */

      set_resc_assigned(jobp, INCR);

      if (jobp->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN)
        {
        /* may be EXITING if job finished first */

        svr_setjobstate(jobp, JOB_STATE_RUNNING, JOB_SUBSTATE_RUNNING);

        /* above saves job structure */
        }

      /* accounting log for start or restart */

      if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE)
        account_record(PBS_ACCT_RESTRT, jobp, "Restart from checkpoint");
      else
        account_jobstr(jobp);

      /* if any dependencies, see if action required */

      if (jobp->ji_wattr[(int)JOB_ATR_depend].at_flags & ATR_VFLAG_SET)
        depend_on_exec(jobp);

      /*
       * it is unfortunate, but while the job has gone into execution,
       * there is no way of obtaining the session id except by making
       * a status request of MOM.  (Even if the session id was passed
       * back to the sending child, it couldn't get up to the parent.)
       */

      jobp->ji_momstat = 0;

      stat_mom_job(jobp);

      break;

    case 10:

      /* NOTE: if r == 10, connection to mom timed out.  Mark node down */

      stream_eof(-1, jobp->ji_qs.ji_un.ji_exect.ji_momaddr, 0);

      /* send failed, requeue the job */

      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        jobp->ji_qs.ji_jobid,
        "unable to run job, MOM rejected/timeout");

      free_nodes(jobp);

      if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_ABORT)
        {
        if (preq != NULL)
          req_reject(PBSE_MOMREJECT, 0, preq, MOMName, "connection to mom timed out");

        svr_evaljobstate(jobp, &newstate, &newsub, 1);

        svr_setjobstate(jobp, newstate, newsub);
        }
      else
        {
        if (preq != NULL)
          req_reject(PBSE_BADSTATE, 0, preq, MOMName, "job was aborted by mom");
        }

      break;

    case 1:   /* commit failed */

      /* fallthrough - handled like any other failure code */

    default:

      {
      int JobOK = 0;

      /* send failed, requeue the job */

      sprintf(log_buffer, "unable to run job, MOM rejected/rc=%d",
              r);

      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        jobp->ji_qs.ji_jobid,
        log_buffer);

      free_nodes(jobp);

      if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_ABORT)
        {
        if (preq != NULL)
          {
          char tmpLine[1024];

          if (preq->rq_reply.brp_code == PBSE_JOBEXIST)
            {
            /* job already running, start request failed but return success since
             * desired behavior (job is running) is accomplished */

            JobOK = 1;
            }
          else
            {
            /* bounded write: the MOM name and the substate text come from
               outside this routine and must not overrun tmpLine */

            snprintf(tmpLine, sizeof(tmpLine), "cannot send job to %s, state=%s",
                     (MOMName != NULL) ? MOMName : "mom",
                     PJobSubState[jobp->ji_qs.ji_substate]);

            req_reject(PBSE_MOMREJECT, 0, preq, MOMName, tmpLine);
            }
          }

        if (JobOK == 1)
          {
          /* do not re-establish accounting - completed first time job was started */

          /* update mom-based job status */

          jobp->ji_momstat = 0;

          stat_mom_job(jobp);
          }
        else
          {
          svr_evaljobstate(jobp, &newstate, &newsub, 1);

          svr_setjobstate(jobp, newstate, newsub);
          }
        }
      else
        {
        if (preq != NULL)
          req_reject(PBSE_BADSTATE, 0, preq, MOMName, "send failed - abort");
        }

      break;
      }
    }  /* END switch (r) */

  return;
  }  /* END post_sendmom() */