コード例 #1
0
ファイル: schedule.c プロジェクト: gto11520/torque
/*
 * schedule_restart - handle jobs found in state 'Q' on one of the batch
 * queues listed in schd_BatchQueues.
 *
 * Depending on schd_SCHED_RESTART_ACTION, each such job is either run again
 * via schd_run_job_on(), or moved to the queue named by its 'oqueue' field
 * (the move is skipped when schd_TEST_ONLY is set).  A summary line is
 * logged if at least one matching job was found.
 *
 * Returns the number of jobs that were successfully re-run or moved.
 */
static int
schedule_restart(Job *joblist)
  {
  char      *id = "schedule_restart";
  Job       *cur, *upcoming;
  QueueList *batchq;
  int        n_found = 0;
  int        n_changed = 0;
  int        local_errno = 0;

  for (cur = joblist; cur != NULL; cur = upcoming)
    {
    /*
     * Remember the successor up front, before this job is handed to any
     * routine that operates on it.
     */
    upcoming = cur->next;

    /* Only jobs in the 'Q' state are of interest. */
    if (cur->state != 'Q')
      continue;

    /* Look for a batch queue whose name matches the job's queue name. */
    batchq = schd_BatchQueues;

    while ((batchq != NULL) &&
           (strcmp(batchq->queue->qname, cur->qname) != 0))
      batchq = batchq->next;

    /* Not on a batch queue -- nothing to do for this job. */
    if (batchq == NULL)
      continue;

    n_found++;

    if (schd_SCHED_RESTART_ACTION == SCHD_RESTART_RERUN)
      {
      (void)sprintf(log_buffer, "Restart job '%s' on queue '%s'.",
                    cur->jobid, cur->qname);
      log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER,
                 id, log_buffer);
      DBPRT(("%s: %s\n", id, log_buffer));

      schd_comment_job(cur, schd_JobMsg[JOB_RESTARTED],
                       JOB_COMMENT_REQUIRED);

      /* A zero return from schd_run_job_on() counts as a re-run job. */
      if (schd_run_job_on(cur, cur->queue, schd_SCHED_HOST,
                          LEAVE_JOB_COMMENT) == 0)
        {
        n_changed++;
        }
      else
        {
        (void)sprintf(log_buffer,
                      "Unable to run job '%s' on queue '%s'.", cur->jobid,
                      cur->qname);
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   log_buffer);
        }

      continue;
      }

    /* Any other restart action: send the job back to its 'oqueue'. */

    if (schd_TEST_ONLY)
      {
      /* Test mode: report what would happen, but leave the job alone. */
      DBPRT(("%s: would have moved %s back to queue %s\n", id,
             cur->jobid, schd_SubmitQueue->queue->qname));
      continue;
      }

    if (pbs_movejob_err(connector, cur->jobid, cur->oqueue, NULL, &local_errno) == 0)
      {
      (void)sprintf(log_buffer,
                    "Requeued job '%s' on queue '%s'.", cur->jobid,
                    cur->oqueue);
      log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER,
                 id, log_buffer);
      DBPRT(("%s: %s\n", id, log_buffer));
      schd_comment_job(cur, schd_JobMsg[JOB_RESUBMITTED],
                       JOB_COMMENT_REQUIRED);
      n_changed++;
      }
    else
      {
      (void)sprintf(log_buffer,
                    "failed to move %s to queue %s, %d", cur->jobid,
                    cur->oqueue, local_errno);
      log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                 log_buffer);
      DBPRT(("%s: %s\n", id, log_buffer));
      }
    }

  /* No matching jobs at all: there is nothing to summarize. */
  if (n_found == 0)
    return (n_changed);

  if (schd_SCHED_RESTART_ACTION == SCHD_RESTART_RERUN)
    {
    (void)sprintf(log_buffer,
                  "Re-ran %d jobs (of %d) found queued on run queues.\n",
                  n_changed, n_found);
    }
  else
    {
    (void)sprintf(log_buffer,
                  "Moved %d queued jobs (of %d) from run queues back to '%s'.\n",
                  n_changed, n_found, schd_SubmitQueue->queue->qname);
    }

  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  DBPRT(("%s: %s\n", id, log_buffer));

  return (n_changed);
  }
コード例 #2
0
ファイル: runjob.c プロジェクト: AlbertDeFusco/torque
/*
 * schd_run_job_on - ask PBS to start (or, for a suspended job, resume) a
 * job, then update the scheduler's local record of the job and its queue.
 *
 *   job         - the job to start or resume.
 *   destq       - queue to run the job in.  For a non-suspended job this may
 *                 be NULL or name the job's current queue, in which case the
 *                 job is run where it is.  For a suspended job it is passed
 *                 straight to schd_move_job_to().
 *   exechost    - execution host handed to pbs_runjob_err().
 *   set_comment - when non-zero, a "Started on <date>" comment is placed on
 *                 the job before it is started.
 *
 * Returns 0 when PBS accepted the request, or -1 if moving, starting or
 * resuming the job failed.
 */
int
schd_run_job_on(Job *job, Queue *destq, char *exechost, int set_comment)
  {
  char   *id = "schd_run_job_on";
  char    reason[128], tmp_word[20];
  char   *date;
  const char *run_qname;
  Queue  *srcq = NULL;
  int     moved = 0;
  int     ret = 0;
  int     local_errno = 0;

  /*
   * Queue name used in log messages.  'destq' may legitimately be NULL, so
   * fall back to the queue name recorded on the job itself rather than
   * dereferencing a NULL pointer.
   */
  run_qname = (destq != NULL) ? destq->qname : job->qname;

  /* Get the datestamp from 'ctime()'.  Remove the trailing '\n'. */
  date = ctime(&schd_TimeNow);
  date[strlen(date) - 1] = '\0';

  if (set_comment)
    {
    sprintf(reason, "Started on %s", date);

    if (job->flags & JFLAGS_PRIORITY)
      {
      strcat(reason, " (EXPRESS/high priority job)");
      }

    if (job->flags & JFLAGS_WAITING)
      {
      strcat(reason, " (long-waiting job)");
      }

    schd_comment_job(job, reason, JOB_COMMENT_REQUIRED);
    }

  /* If this is NOT a suspended job... */
  if (!(job->flags & JFLAGS_SUSPENDED))
    {

    /*
     * If a destination Queue is provided, and it is different from the
     * source queue, then ask PBS to move the job to that queue before
     * running it.
     */
    srcq = job->queue;

    if ((destq != NULL) && (strcmp(destq->qname, srcq->qname) != 0))
      {
      if (pbs_movejob_err(connector, job->jobid, destq->qname, NULL, &local_errno))
        {
        (void)sprintf(log_buffer, "move job %s to queue %s failed, %d",
                      job->jobid, destq->qname, local_errno);
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER,
                   id, log_buffer);
        DBPRT(("%s: %s\n", id, log_buffer));
        return (-1);
        }

      schd_move_job_to(job, destq);

      /* Remember the move so it can be undone if the run request fails. */
      moved = 1;
      }

    /*
     * Give the job handle (JOBID) to PBS to run.
     */
    if (pbs_runjob_err(connector, job->jobid, exechost, NULL, &local_errno))
      {
      (void)sprintf(log_buffer, "failed start job %s on queue %s@%s, %d",
                    job->jobid, run_qname, exechost, local_errno);
      log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
      DBPRT(("%s: %s\n", id, log_buffer));

      /*
       * Running failed! If the job was moved above, move it back to the
       * source queue before returning. This prevents jobs being left in
       * execution queues.  When no move took place there is nothing to
       * undo, so no request is sent.
       */

      if (moved)
        {
        DBPRT(("Attempting to move job %s back to queue %s\n",
               job->jobid, srcq->qname));

        if (pbs_movejob_err(connector, job->jobid, srcq->qname, NULL, &local_errno))
          {
          (void)sprintf(log_buffer,
                        "failed to move job %s back to queue %s, %d",
                        job->jobid, srcq->qname, local_errno);
          log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                     log_buffer);
          DBPRT(("%s: %s\n", id, log_buffer));
          }

        schd_move_job_to(job, srcq);
        }

      return (-1);
      }

    strcpy(tmp_word, "started");
    }
  else    /* it IS a suspended job */
    {

    schd_move_job_to(job, destq);
    ret = pbs_sigjob(connector, job->jobid, "resume", NULL);

    if (ret)
      {
      sprintf(log_buffer, "resume of job %s FAILED (%d)",
              job->jobid, ret);

      /* Record the failure like every other error path in this function. */
      log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
      DBPRT(("%s: %s\n", id, log_buffer));
      return (-1);
      }

    job->flags &= ~JFLAGS_SUSPENDED;

    strcpy(tmp_word, "resumed");
    }

  /* PBS accepted the job (and presumably will run it). Log the fact. */
  (void)sprintf(log_buffer, "job %s %s on %s@%s", job->jobid, tmp_word,
                run_qname, exechost);

  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  DBPRT(("%s: %s\n", id, log_buffer));

  /*
   * Change the state of the local representation of the job to "Running".
   */
  job->state = 'R';

  /*
   * Account for the job on this queue's statistics.  'queued' will be
   * bumped up if the queued job was moved to a new destination queue.
   */

  job->queue->queued --;

  job->queue->running ++;

  /* The queue is no longer idle.  Unset the idle timer. */
  job->queue->idle_since = 0;

  return (0);    /* Job successfully started. */
  }