Example #1
int release_whole_array(

  job_array            *pa,   /* I/O */
  struct batch_request *preq) /* I */

  {
  int  i;
  int  rc;
  job *pjob;

  for (i = 0; i < pa->ai_qs.array_size; i++)
    {
    if (pa->job_ids[i] == NULL)
      continue;

    if ((pjob = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
      {
      free(pa->job_ids[i]);
      pa->job_ids[i] = NULL;
      }
    else
      {
      mutex_mgr job_mutex(pjob->ji_mutex, true);

      if ((rc = release_job(preq, pjob, pa)) != 0)
        return(rc);
      }
    }

  /* SUCCESS */
  return(PBSE_NONE);
  } /* END release_whole_array */
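Every example on this page leans on the same RAII idiom: svr_find_job() returns the job with ji_mutex already held, and a stack-allocated mutex_mgr releases it on every return path. The sketch below is a minimal reconstruction of such a wrapper, inferred purely from how these examples use it; the real class ships with TORQUE and differs in detail. In particular, set_lock_on_exit() and set_unlock_on_exit() both appear in the examples and are treated here as equivalent controls on the destructor.

#include <pthread.h>

class mutex_mgr
  {
  pthread_mutex_t *managed_mutex;  /* the mutex owned for this scope */
  bool             unlock_on_exit; /* should the destructor unlock? */

public:

  /* is_locked == true means the caller already holds the mutex (the
   * svr_find_job() case), so the constructor adopts it without locking */
  mutex_mgr(pthread_mutex_t *mutex, bool is_locked = false)

    : managed_mutex(mutex), unlock_on_exit(true)

    {
    if (is_locked == false)
      pthread_mutex_lock(managed_mutex);
    }

  /* release the mutex early; the destructor then does nothing */
  void unlock()
    {
    pthread_mutex_unlock(managed_mutex);
    unlock_on_exit = false;
    }

  /* the examples call these with false when pjob was destroyed inside a
   * callee (relay_to_mom, job_abt) and the mutex must not be touched again */
  void set_lock_on_exit(bool val)   { unlock_on_exit = val; }
  void set_unlock_on_exit(bool val) { unlock_on_exit = val; }

  ~mutex_mgr()
    {
    if (unlock_on_exit)
      pthread_mutex_unlock(managed_mutex);
    }
  };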
Example #2
File: req_rerun.c Project: dkoes/torque
void post_rerun(

  batch_request *preq)

  {
  int   newstate;
  int   newsub;
  job  *pjob;

  char  log_buf[LOCAL_LOG_BUF_SIZE];

  if (preq == NULL)
    return;

  if (preq->rq_reply.brp_code != 0)
    {
    sprintf(log_buf, "rerun signal reject by mom: %s - %d", preq->rq_ind.rq_signal.rq_jid, preq->rq_reply.brp_code);
    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,__func__,log_buf);

    if ((pjob = svr_find_job(preq->rq_ind.rq_signal.rq_jid, FALSE)))
      {
      mutex_mgr job_mutex(pjob->ji_mutex, true);
      
      svr_evaljobstate(*pjob, newstate, newsub, 1);
      svr_setjobstate(pjob, newstate, newsub, FALSE);
      }
    }

  return;
  }  /* END post_rerun() */
Example #3
int req_releasejob(

  batch_request *vp) /* I */

  {
  job           *pjob;
  int            rc;
  batch_request *preq = (batch_request *)vp;

  pjob = chk_job_request(preq->rq_ind.rq_release.rq_objname, preq);

  if (pjob == NULL)
    {
    return(PBSE_NONE);
    }

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  if ((rc = release_job(preq, pjob, NULL)) != 0)
    {
    req_reject(rc,0,preq,NULL,NULL);
    }
  else
    {
    reply_ack(preq);
    }

  return(PBSE_NONE);
  }  /* END req_releasejob() */
Example #4
File: req_rerun.c Project: dkoes/torque
int handle_requeue_all(

  batch_request *preq)

  {
  int                rc;
  job               *pjob;
  all_jobs_iterator *iter;

  if ((preq->rq_perm & (ATR_DFLAG_MGWR)) == 0)
    {
    rc = PBSE_PERM;
    req_reject(rc, 0, preq, NULL, "You must be a manager to requeue all jobs");
    return(rc);
    }

  alljobs.lock();
  iter = alljobs.get_iterator();
  alljobs.unlock();

  while ((pjob = next_job(&alljobs, iter)) != NULL)
    {
    mutex_mgr job_mutex(pjob->ji_mutex, true);
    requeue_job_without_contacting_mom(*pjob);
    }

  delete iter;

  reply_ack(preq);

  return(PBSE_NONE);
  } /* END handle_requeue_all() */
Example #5
void job_delete_nanny(

  struct work_task *pwt)

  {
  job                  *pjob;
  const char          *sigk = "SIGKILL";
  char                 *jobid;

  struct batch_request *newreq;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  time_t                time_now = time(NULL);
  long                  nanny = FALSE;

  /* short-circuit if nanny isn't enabled */
  get_svr_attr_l(SRV_ATR_JobNanny, &nanny);
  if (!nanny)
    {
    jobid = (char *)pwt->wt_parm1;
    
    if (jobid != NULL)
      {
      pjob = svr_find_job(jobid, FALSE);
      
      if (pjob != NULL)
        {
        mutex_mgr job_mutex(pjob->ji_mutex, true);

        sprintf(log_buf, "exiting job '%s' still exists, sending a SIGKILL", pjob->ji_qs.ji_jobid);
        log_err(-1, "job nanny", log_buf);
        
        /* build up a Signal Job batch request */
        if ((newreq = alloc_br(PBS_BATCH_SignalJob)) != NULL)
          {
          strcpy(newreq->rq_ind.rq_signal.rq_jid, pjob->ji_qs.ji_jobid);
          snprintf(newreq->rq_ind.rq_signal.rq_signame, sizeof(newreq->rq_ind.rq_signal.rq_signame), "%s", sigk);
          }
        
        issue_signal(&pjob, sigk, post_job_delete_nanny, newreq);
        
        if (pjob != NULL)
          apply_job_delete_nanny(pjob, time_now + 60);
        else
          job_mutex.set_lock_on_exit(false);
        }
      }
    else
      {
      log_err(ENOMEM, __func__, "Cannot allocate memory");
      }
    }
  
  if (pwt->wt_parm1 != NULL)
    free(pwt->wt_parm1);

  free(pwt->wt_mutex);
  free(pwt);
  } /* END job_delete_nanny() */
Example #6
File: req_modify.c Project: hocks/torque
void *req_modifyjob(

  batch_request *preq) /* I */

  {
  job       *pjob;
  svrattrl  *plist;
  char       log_buf[LOCAL_LOG_BUF_SIZE];

  pjob = chk_job_request(preq->rq_ind.rq_modify.rq_objname, preq);

  if (pjob == NULL)
    {
    return(NULL);
    }

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr);

  if (plist == NULL)
    {
    /* nothing to do */
    reply_ack(preq);

    /* SUCCESS */
    return(NULL);
    }

  job_mutex.unlock();

  /* If async modify, reply now; otherwise reply is handled later */
  if (preq->rq_type == PBS_BATCH_AsyModifyJob)
    {
    /* reply_ack will free preq. We need to copy it before we call reply_ack */
    batch_request *new_preq;

    new_preq = duplicate_request(preq, -1);
    if (new_preq == NULL)
      {
      sprintf(log_buf, "failed to duplicate batch request");
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
      return(NULL);
      }

    get_batch_request_id(new_preq);
    reply_ack(preq);

    new_preq->rq_noreply = TRUE; /* set for no more replies */

    enqueue_threadpool_request((void *(*)(void *))modify_job_work, new_preq);
    } 
  else
    modify_job_work(preq);
  
  return(NULL);
  }  /* END req_modifyjob() */
Example #7
void post_modify_req(

  batch_request *preq)

  {
  job  *pjob;
  char  log_buf[LOCAL_LOG_BUF_SIZE];

  if (preq == NULL)
    return;

  preq->rq_conn = preq->rq_orgconn;  /* restore socket to client */

  if ((preq->rq_reply.brp_code) && (preq->rq_reply.brp_code != PBSE_UNKJOBID))
    {
    sprintf(log_buf, msg_mombadmodify, preq->rq_reply.brp_code);

    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      preq->rq_ind.rq_modify.rq_objname,
      log_buf);

    req_reject(preq->rq_reply.brp_code, 0, preq, NULL, NULL);
    }
  else
    {
    if (preq->rq_reply.brp_code == PBSE_UNKJOBID)
      {
      if ((pjob = svr_find_job(preq->rq_ind.rq_modify.rq_objname, FALSE)) == NULL)
        {
        req_reject(preq->rq_reply.brp_code, 0, preq, NULL, NULL);
        return;
        }
      else
        {
        mutex_mgr job_mutex(pjob->ji_mutex, true);

        if (LOGLEVEL >= 0)
          {
          sprintf(log_buf, "post_modify_req: PBSE_UNKJOBID for job %s in state %s-%s, dest = %s",
            pjob->ji_qs.ji_jobid,
            PJobState[pjob->ji_qs.ji_state],
            PJobSubState[pjob->ji_qs.ji_substate],
            pjob->ji_qs.ji_destin);

          log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);
          }
        }
      }

    reply_ack(preq);
    }

  return;
  }  /* END post_modify_req() */
Example #8
int modify_whole_array(

  job_array            *pa,             /* I/O */
  svrattrl             *plist,          /* I */
  struct batch_request *preq,           /* I */
  int                   checkpoint_req) /* I */

  {
  int   i;
  int   rc = PBSE_NONE;
  int   modify_job_rc = PBSE_NONE;
  job  *pjob;

  for (i = 0; i < pa->ai_qs.array_size; i++)
    {
    if (pa->job_ids[i] == NULL)
      continue;

    if ((pjob = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
      {
      free(pa->job_ids[i]);
      pa->job_ids[i] = NULL;
      }
    else
      {
      /* NO_MOM_RELAY will prevent modify_job from calling relay_to_mom */
      batch_request *array_req = duplicate_request(preq, i);
      mutex_mgr job_mutex(pjob->ji_mutex, true);
      pthread_mutex_unlock(pa->ai_mutex);
      array_req->rq_noreply = TRUE;
      rc = modify_job((void **)&pjob, plist, array_req, checkpoint_req, NO_MOM_RELAY);
      if (rc != PBSE_NONE)
        {
        modify_job_rc = rc;
        }
      pa = get_jobs_array(&pjob);
      
      if (pa == NULL)
        {
        if (pjob == NULL)
          job_mutex.set_lock_on_exit(false);

        return(PBSE_JOB_RECYCLED);
        }

      if (pjob == NULL)
        {
        pa->job_ids[i] = NULL;
        job_mutex.set_lock_on_exit(false);
        continue;
        }
      }
    } /* END foreach job in array */

  return(modify_job_rc);
  } /* END modify_whole_array() */
Example #9
void finish_move_process(

  char          *job_id,
  batch_request *preq,
  long           time,
  const char    *node_name,
  int            status,
  int            type,
  int            mom_err)

  {
  char  log_buf[LOCAL_LOG_BUF_SIZE+1];
  job  *pjob = svr_find_job(job_id, TRUE);

  if (pjob == NULL)
    {
    /* somehow the job has been deleted mid-runjob */
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
      "Job %s was deleted while servicing move request", job_id);

    if (preq != NULL)
      {
      if (mom_err != PBSE_NONE)
        req_reject(mom_err, 0, preq, node_name, log_buf);
      else
        req_reject(PBSE_JOBNOTFOUND, 0, preq, node_name, log_buf);
      }
    }
  else
    {
    mutex_mgr job_mutex(pjob->ji_mutex, true);

    switch (type)
      {
      case MOVE_TYPE_Move:

        finish_moving_processing(pjob, preq, status);
        break;
        
      case MOVE_TYPE_Route:

        finish_routing_processing(pjob, status);
        break;
        
      case MOVE_TYPE_Exec:

        job_mutex.unlock();
        finish_sendmom(job_id, preq, time, node_name, status, mom_err);
        
        break;
      } /* END switch (type) */
    }

  } /* END finish_move_process() */
Example #10
/*
 * record_reservation()
 *
 * @pre-cond: pnode and rsv_id must be valid pointers
 * @post-cond: the reservation will be recorded in pbs_server's tracking mechanism
 * and on the job which has the node reserved, or -1 is returned and the reservation
 * is not recorded.
 * @param - pnode the node which is reporting the reservation
 * @param - rsv_id the id of the reservation being reported
 * @return - PBSE_NONE if the reservation was successfully recorded, -1 otherwise
 */
int record_reservation(

  struct pbsnode *pnode,
  const char     *rsv_id)

  {
  job            *pjob;
  bool            found_job = false;
  char            jobid[PBS_MAXSVRJOBID + 1];

  for (unsigned int i = 0; i < pnode->nd_job_usages.size(); i++)
    {
    /* Cray only allows one job per node, so any valid job will be the job that is 
     * reserving this node. */
    job_usage_info *jui = pnode->nd_job_usages[i];
    strcpy(jobid, jui->jobid);

    unlock_node(pnode, __func__, NULL, LOGLEVEL);

    if ((pjob = svr_find_job(jobid, TRUE)) != NULL)
      {
      mutex_mgr job_mutex(pjob->ji_mutex, true);
      pjob->ji_wattr[JOB_ATR_reservation_id].at_val.at_str = strdup(rsv_id);
      pjob->ji_wattr[JOB_ATR_reservation_id].at_flags = ATR_VFLAG_SET;

      /* add environment variable BATCH_PARTITION_ID */
      char buf[1024];
      snprintf(buf, sizeof(buf), "BATCH_PARTITION_ID=%s", rsv_id);
      pbs_attribute  tempattr;
      clear_attr(&tempattr, &job_attr_def[JOB_ATR_variables]);
      job_attr_def[JOB_ATR_variables].at_decode(&tempattr,
        NULL, NULL, buf, 0);

      job_attr_def[JOB_ATR_variables].at_set(
        &pjob->ji_wattr[JOB_ATR_variables], &tempattr, INCR);

      job_attr_def[JOB_ATR_variables].at_free(&tempattr);

      track_alps_reservation(pjob);
      found_job = true;

      job_mutex.unlock(); 
      lock_node(pnode, __func__, NULL, LOGLEVEL);
      break;
      }
    else
      lock_node(pnode, __func__, NULL, LOGLEVEL);
    }

  if (found_job == false)
    return(-1);

  return(PBSE_NONE);
  } /* END record_reservation() */
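Note the lock-ordering dance above: the node mutex is dropped before the job mutex is taken, and re-acquired afterwards, so the two locks are never held in node-then-job order while other code paths may take them the other way around. The jobid is copied out first because the usage entry can disappear once the node is unlocked. Below is a condensed, hypothetical illustration of the same discipline, reusing the helpers and types from the examples; touch_job_from_node is not a TORQUE function.

void touch_job_from_node(struct pbsnode *pnode, job_usage_info *jui)

  {
  char  jobid[PBS_MAXSVRJOBID + 1];
  job  *pjob;

  strcpy(jobid, jui->jobid);                    /* copy while the node is locked */

  unlock_node(pnode, __func__, NULL, LOGLEVEL); /* drop the node lock first */

  if ((pjob = svr_find_job(jobid, TRUE)) != NULL)
    {
    mutex_mgr job_mutex(pjob->ji_mutex, true);

    /* ... safely modify pjob here; the mutex_mgr destructor unlocks it ... */
    }

  lock_node(pnode, __func__, NULL, LOGLEVEL);   /* re-acquire before returning */
  }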
Example #11
File: req_stat.c Project: bmdaw/torque
/**
 * poll_job_task
 *
 * The invocation of this routine is triggered from
 * the pbs_server main_loop code.  The check of
 * SRV_ATR_PollJobs appears to be redundant.
 */
void poll_job_task(

  struct work_task *ptask)

  {
  char      *job_id = (char *)ptask->wt_parm1;
  job       *pjob;
  time_t     time_now = time(NULL);
  long       poll_jobs = 0;
  int        job_state = -1;

  if (job_id != NULL)
    {
    pjob  = svr_find_job(job_id, FALSE);
    
    if (pjob != NULL)
      {
      mutex_mgr job_mutex(pjob->ji_mutex, true);

      job_state = pjob->ji_qs.ji_state;
      job_mutex.unlock();

      get_svr_attr_l(SRV_ATR_PollJobs, &poll_jobs);
      if ((poll_jobs) && (job_state == JOB_STATE_RUNNING))
        {
        /* we need to throttle the number of outstanding threads doing job
           polling. This prevents a problem where pbs_server gets hung
           waiting on I/O from the mom */
        pthread_mutex_lock(poll_job_task_mutex);
        if (current_poll_job_tasks < max_poll_job_tasks)
          {
          current_poll_job_tasks++;
          pthread_mutex_unlock(poll_job_task_mutex);

          stat_mom_job(job_id);

          pthread_mutex_lock(poll_job_task_mutex);
          current_poll_job_tasks--;
          }
        pthread_mutex_unlock(poll_job_task_mutex);

        
        /* add another task */
        set_task(WORK_Timed, time_now + JobStatRate, poll_job_task, strdup(job_id), FALSE);
        }
      }
      
    free(job_id);
    }

  free(ptask->wt_mutex);
  free(ptask);
  }  /* END poll_job_task() */
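The counter-and-mutex throttle above caps how many threads can be blocked polling moms at once; anything over the cap simply skips this cycle and lets the re-queued task try again later. A stripped-down sketch of the same idiom follows; the names try_begin_poll/end_poll and the cap value are illustrative, not TORQUE's.

#include <pthread.h>

static pthread_mutex_t poll_guard         = PTHREAD_MUTEX_INITIALIZER;
static int             current_poll_tasks = 0;
static const int       max_poll_tasks     = 5; /* illustrative cap */

/* returns true if the caller acquired a polling slot */
bool try_begin_poll(void)

  {
  bool granted = false;

  pthread_mutex_lock(&poll_guard);

  if (current_poll_tasks < max_poll_tasks)
    {
    current_poll_tasks++;
    granted = true;
    }

  pthread_mutex_unlock(&poll_guard);

  return(granted);
  }

/* give the slot back once the blocking stat call has returned */
void end_poll(void)

  {
  pthread_mutex_lock(&poll_guard);
  current_poll_tasks--;
  pthread_mutex_unlock(&poll_guard);
  }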
Example #12
int gpu_has_job(

  struct pbsnode *pnode,
  int             gpuid)

  {
  job  *pjob;
  char *gpu_str;
  char *found_str;
  /* increased so that really high gpu indexes don't bother us */
  char  tmp_str[PBS_MAXHOSTNAME + 10];

  /* check each subnode for a job using a gpuid */
  for (unsigned int i = 0; i < pnode->nd_job_usages.size(); i++)
    {
    // make a copy because we're going to lose the lock below
    job_usage_info jui = pnode->nd_job_usages[i];

    if ((pjob = get_job_from_job_usage_info(&jui, pnode)) != NULL)
      {
      mutex_mgr job_mutex(pjob->ji_mutex, true);

      /* Does this job have this gpuid assigned? */
      if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
          ((pjob->ji_wattr[JOB_ATR_exec_gpus].at_flags & ATR_VFLAG_SET) != 0))
        {
        gpu_str = pjob->ji_wattr[JOB_ATR_exec_gpus].at_val.at_str;

        if (gpu_str != NULL)
          {
          snprintf(tmp_str, sizeof(tmp_str), "%s-gpu/%d",
            pnode->get_name(), gpuid);

          /* look thru the string and see if it has this host and gpuid.
           * exec_gpus string should be in format of
           * <hostname>-gpu/<index>[+<hostname>-gpu/<index>...]
           */

          found_str = strstr(gpu_str, tmp_str);

          if (found_str != NULL)
            {
            return(TRUE);
            }
          }
        }
      }
    } /* END for each job on node */

  return(FALSE);
  } /* END gpu_has_job() */
Example #13
void *req_messagejob(
    
  batch_request *preq) /* I */

  {
  job           *pjob;
  int            rc;
  batch_request *dup_req = NULL;

  if ((pjob = chk_job_request(preq->rq_ind.rq_message.rq_jid, preq)) == NULL)
    return(NULL);

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  /* the job must be running */

  if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
    req_reject(PBSE_BADSTATE, 0, preq, NULL, NULL);
    
    return(NULL);
    }

  if ((rc = copy_batchrequest(&dup_req, preq, 0, -1)) != 0)
    {
    req_reject(PBSE_MEM_MALLOC, 0, preq, NULL, NULL);
    }
  /* pass the request on to MOM */
  /* The dup_req is freed in relay_to_mom (failure)
   * or in issue_Drequest (success) */
  else if ((rc = relay_to_mom(&pjob, dup_req, NULL)) != PBSE_NONE)
    {
    req_reject(rc, 0, preq, NULL, NULL); /* unable to get to MOM */
    free_br(dup_req);
    }
  else
    {
    post_message_req(dup_req);
    free_br(preq);
    }

  /* After MOM acts and replies to us, we pick up in post_message_req() */
  if (pjob == NULL)
    job_mutex.set_lock_on_exit(false);

  return(NULL);
  } /* END req_messagejob() */
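Examples #13, #24, #25, and #30 repeat one ownership pattern for requests relayed to the mom: the caller duplicates preq, frees the duplicate itself when relay_to_mom fails, and on success leaves it to be freed downstream (in issue_Drequest, per the comments above). A hedged sketch of that pattern follows; relay_duplicate_to_mom and handle_mom_reply are hypothetical names, the latter standing in for post_message_req / process_hold_reply / process_checkpoint_reply.

int relay_duplicate_to_mom(job **ppjob, batch_request *preq)

  {
  int            rc;
  batch_request *dup_req;

  if ((dup_req = duplicate_request(preq)) == NULL)
    {
    req_reject(PBSE_SYSTEM, 0, preq, NULL, "failure to allocate memory");
    return(PBSE_SYSTEM);
    }

  if ((rc = relay_to_mom(ppjob, dup_req, NULL)) != PBSE_NONE)
    {
    /* failure path: we still own dup_req and must free it */
    req_reject(rc, 0, preq, NULL, NULL); /* unable to reach the mom */
    free_br(dup_req);
    return(rc);
    }

  /* success path: dup_req is now owned downstream (issue_Drequest) */
  handle_mom_reply(dup_req); /* hypothetical stand-in */

  return(PBSE_NONE);
  }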
Example #14
/**
 * poll_job_task
 *
 * The invocation of this routine is triggered from
 * the pbs_server main_loop code.
 */
void poll_job_task(

  struct work_task *ptask)

  {
  char      *job_id = (char *)ptask->wt_parm1;
  job       *pjob;
  time_t     time_now = time(NULL);
  long       poll_jobs = 0;
  long       job_stat_rate;

  free(ptask->wt_mutex);
  free(ptask);

  if (job_id != NULL)
    {
    pjob  = svr_find_job(job_id, FALSE);
    
    if (pjob != NULL)
      {
      mutex_mgr job_mutex(pjob->ji_mutex, true);
      int       job_state = -1;

      job_state = pjob->ji_qs.ji_state;

      // only do things for running jobs
      if (job_state == JOB_STATE_RUNNING)
        {
        // read the timestamp before dropping the job mutex
        time_t last_reported = pjob->ji_last_reported_time;

        job_mutex.unlock();

        get_svr_attr_l(SRV_ATR_JobStatRate, &job_stat_rate);

        if (time(NULL) - last_reported > job_stat_rate)
          {
          get_svr_attr_l(SRV_ATR_PollJobs, &poll_jobs);
          if (poll_jobs)
            stat_mom_job(job_id);
          }

        /* add another task */
        set_task(WORK_Timed, time_now + (job_stat_rate / 3), poll_job_task, strdup(job_id), FALSE);
        }
      }
      
    free(job_id);
    }
  }  /* END poll_job_task() */
Example #15
void *req_modifyjob(

  batch_request *preq) /* I */

  {
  job       *pjob;
  svrattrl  *plist;

  pjob = chk_job_request(preq->rq_ind.rq_modify.rq_objname, preq);

  if (pjob == NULL)
    {
    return(NULL);
    }

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr);

  if (plist == NULL)
    {
    /* nothing to do */
    reply_ack(preq);

    /* SUCCESS */
    return(NULL);
    }

  job_mutex.unlock();

  /* If async modify, reply now; otherwise reply is handled later */
  if (preq->rq_type == PBS_BATCH_AsyModifyJob)
    {
    reply_ack(preq);

    preq->rq_noreply = TRUE; /* set for no more replies */

    enqueue_threadpool_request((void *(*)(void *))modify_job_work, preq);
    }
  else
    modify_job_work(preq);
  
  return(NULL);
  }  /* END req_modifyjob() */
Example #16
void post_delete_mom2(

  struct work_task *pwt)

  {
  char        *jobid;
  const char *sigk = "SIGKILL";
  char         log_buf[LOCAL_LOG_BUF_SIZE];
  job         *pjob;

  jobid = (char *)pwt->wt_parm1;
  free(pwt->wt_mutex);
  free(pwt);
  
  if (jobid == NULL)
    {
    log_err(ENOMEM, __func__, "Cannot allocate memory");
    return;
    }

  pjob = svr_find_job(jobid, FALSE);
  free(jobid);

  if (pjob != NULL)
    {
    mutex_mgr job_mutex(pjob->ji_mutex, true);

    if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
      {
      issue_signal(&pjob, sigk, free_br, NULL);
      
      if (pjob != NULL)
        {
        sprintf(log_buf, msg_delrunjobsig, sigk);
        log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);
        }
      }
    
    if (pjob == NULL)
      job_mutex.set_lock_on_exit(false);
    }
  }  /* END post_delete_mom2() */
Example #17
/*
 * record_reservation()
 *
 * @pre-cond: pnode and rsv_id must be valid pointers
 * @post-cond: the reservation will be recorded in pbs_server's tracking mechanism
 * and on the job which has the node reserved, or -1 is returned and the reservation
 * is not recorded.
 * @param - pnode the node which is reporting the reservation
 * @param - rsv_id the id of the reservation being reported
 * @return - PBSE_NONE if the reservation was successfully recorded, -1 otherwise
 */
int record_reservation(

  struct pbsnode *pnode,
  const char     *rsv_id)

  {
  job            *pjob;
  bool            found_job = false;
  char            jobid[PBS_MAXSVRJOBID + 1];

  for (unsigned int i = 0; i < pnode->nd_job_usages.size(); i++)
    {
    /* Cray only allows one job per node, so any valid job will be the job that is 
     * reserving this node. */
    job_usage_info *jui = pnode->nd_job_usages[i];
    strcpy(jobid, jui->jobid);

    unlock_node(pnode, __func__, NULL, LOGLEVEL);

    if ((pjob = svr_find_job(jobid, TRUE)) != NULL)
      {
      mutex_mgr job_mutex(pjob->ji_mutex, true);
      pjob->ji_wattr[JOB_ATR_reservation_id].at_val.at_str = strdup(rsv_id);
      pjob->ji_wattr[JOB_ATR_reservation_id].at_flags = ATR_VFLAG_SET;

      track_alps_reservation(pjob);
      found_job = true;

      job_mutex.unlock(); 
      lock_node(pnode, __func__, NULL, LOGLEVEL);
      break;
      }
    else
      lock_node(pnode, __func__, NULL, LOGLEVEL);
    }

  if (found_job == false)
    return(-1);

  return(PBSE_NONE);
  } /* END record_reservation() */
Example #18
void *modify_job_work(

  batch_request *vp) /* I */

  {
  job           *pjob;
  svrattrl      *plist;
  int            checkpoint_req = FALSE;
  batch_request *preq = (struct batch_request *)vp;
  
  pjob = svr_find_job(preq->rq_ind.rq_modify.rq_objname, FALSE);

  if (pjob == NULL)
    {
    req_reject(PBSE_JOBNOTFOUND, 0, preq, NULL, "Job unexpectedly deleted");
    return(NULL);
    }

  mutex_mgr job_mutex(pjob->ji_mutex, true);
  
  /* pbs_mom sets the extend string to trigger copying of checkpoint files */
  if (preq->rq_extend != NULL)
    {
    if (strcmp(preq->rq_extend,CHECKPOINTHOLD) == 0)
      {
      checkpoint_req = CHK_HOLD;
      }
    else if (strcmp(preq->rq_extend,CHECKPOINTCONT) == 0)
      {
      checkpoint_req = CHK_CONT;
      }
    }

  plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr);

  /* modify_job will free preq and respond to it */
  modify_job((void **)&pjob, plist, preq, checkpoint_req, 0);

  return(NULL);
  } /* END modify_job_work() */
Example #19
int update_substate_if_needed(

  char *job_id,
  bool &change_substate_on_attempt_to_queue)

  {
  if (change_substate_on_attempt_to_queue == true)
    {
    job *pjob = svr_find_job(job_id, TRUE);

    if (pjob != NULL)
      {
      mutex_mgr job_mutex(pjob->ji_mutex, true);
      pjob->ji_qs.ji_substate = JOB_SUBSTATE_TRNOUT;
      job_save(pjob, SAVEJOB_QUICK, 0);
      }
    else
      {
      return(PBSE_JOB_RECYCLED);
      }
    }

  return(PBSE_NONE);
  } /* END update_substate_if_needed() */
Example #20
int get_mom_node_version(
  
  const char *job_id, 
  int        &version)

  {
  job *pjob;
  pbsnode *pnode;

  pjob = svr_find_job(job_id, TRUE);
  if (pjob == NULL)
    return(PBSE_UNKJOBID);

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  pnode = find_nodebyname(pjob->ji_qs.ji_destin);
  if (pnode == NULL)
    return(PBSE_UNKNODE);

  mutex_mgr node_mutex(&pnode->nd_mutex, true);
  version = pnode->get_version();

  return(PBSE_NONE);
  }
Example #21
int req_movejob(

  batch_request *req) /* I */

  {
  job       *jobp;
  char       log_buf[LOCAL_LOG_BUF_SIZE];
  int        local_errno = 0;

  jobp = chk_job_request(req->rq_ind.rq_move.rq_jid, req);

  if (jobp == NULL)
    {
    return(PBSE_NONE);
    }

  mutex_mgr job_mutex(jobp->ji_mutex, true);

  if (LOGLEVEL >= 7)
    {
    sprintf(log_buf, "%s", jobp->ji_qs.ji_jobid);
    LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
    }
  
  if ((jobp->ji_qs.ji_state != JOB_STATE_QUEUED) &&
      (jobp->ji_qs.ji_state != JOB_STATE_HELD) &&
      (jobp->ji_qs.ji_state != JOB_STATE_WAITING))
    {
#ifndef NDEBUG
    sprintf(log_buf, "%s %d %s", pbse_to_txt(PBSE_BADSTATE), jobp->ji_qs.ji_state, __func__);

    log_event(PBSEVENT_DEBUG,PBS_EVENTCLASS_JOB,jobp->ji_qs.ji_jobid,log_buf);
#endif /* NDEBUG */

    req_reject(PBSE_BADSTATE, 0, req, NULL, NULL);

    return(PBSE_NONE);
    }

  /*
   * svr_movejob() does the real work, handles both local and
   * network moves
   */
  
  /* We have found that sometimes the destination queue and the 
     parent queue are the same. If so we do not need to do
     anything else */
  if (strcmp(jobp->ji_qs.ji_queue, req->rq_ind.rq_move.rq_destin) == 0)
    {
    sprintf(log_buf, "Job %s already in queue %s", jobp->ji_qs.ji_jobid, jobp->ji_qs.ji_queue);
    if (LOGLEVEL >= 7)
      {
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
      }
    
    req_reject(PBSE_JOB_ALREADY_IN_QUEUE, 0, req, NULL, log_buf);
    return(PBSE_NONE);
    }

  switch (svr_movejob(jobp, req->rq_ind.rq_move.rq_destin, &local_errno, req))
    {

    case 0:

      /* success */
      snprintf(log_buf, sizeof(log_buf), "%s", msg_movejob);
      snprintf(log_buf + strlen(log_buf), sizeof(log_buf) - strlen(log_buf), msg_manager,
        req->rq_ind.rq_move.rq_destin, req->rq_user, req->rq_host);

      log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,jobp->ji_qs.ji_jobid,log_buf);

      reply_ack(req);

      break;

    case -1:

    case 1:

      /* fail */

      /* NOTE:  can pass detailed response to requestor (NYI) */

      req_reject(local_errno, 0, req, NULL, NULL);

      break;

    case 2:

      /* deferred, will be handled by    */
      /* post_movejob() when the child completes */

      /* NO-OP */

      break;
    }  /* END switch (svr_movejob(jobp,req->rq_ind.rq_move.rq_destin,req)) */

  return(PBSE_NONE);
  }  /* END req_movejob() */
Example #22
void stat_update(
    
  struct batch_request *preq,
  struct stat_cntl     *cntl)

  {
  job                  *pjob;
  struct batch_reply   *preply;
  struct brp_status    *pstatus;
  svrattrl             *sattrl;
  int                   oldsid;
  int                   bad = 0;
  time_t                time_now = time(NULL);
  char                 *msg_ptr = NULL;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];

  preply = &preq->rq_reply;

  if (preply->brp_un.brp_txt.brp_str != NULL)
    {
    msg_ptr = strstr(preply->brp_un.brp_txt.brp_str, PBS_MSG_EQUAL);
  
    if (msg_ptr != NULL)
      msg_ptr += strlen(PBS_MSG_EQUAL);
    }

  if (preply->brp_choice == BATCH_REPLY_CHOICE_Status)
    {
    pstatus = (struct brp_status *)GET_NEXT(preply->brp_un.brp_status);

    while (pstatus != NULL)
      {
      if ((pjob = svr_find_job(pstatus->brp_objname, FALSE)) != NULL)
        {
        mutex_mgr job_mutex(pjob->ji_mutex, true);

        sattrl = (svrattrl *)GET_NEXT(pstatus->brp_attr);

        oldsid = pjob->ji_wattr[JOB_ATR_session_id].at_val.at_long;

        modify_job_attr(
          pjob,
          sattrl,
          ATR_DFLAG_MGWR | ATR_DFLAG_SvWR,
          &bad);

        if (oldsid != pjob->ji_wattr[JOB_ATR_session_id].at_val.at_long)
          {
          /* first save since running job (or the sid has changed), */
          /* must save session id    */

          job_save(pjob, SAVEJOB_FULL, 0);
          }

#ifdef USESAVEDRESOURCES
        else
          {
          /* save so we can recover resources used */
          job_save(pjob, SAVEJOB_FULL, 0);
          }
#endif    /* USESAVEDRESOURCES */

        pjob->ji_momstat = time_now;
        }

      pstatus = (struct brp_status *)GET_NEXT(pstatus->brp_stlink);
      }  /* END while (pstatus != NULL) */
    }    /* END if (preply->brp_choice == BATCH_REPLY_CHOICE_Status) */
  else if ((preply->brp_choice == BATCH_REPLY_CHOICE_Text) &&
           (preply->brp_code == PBSE_UNKJOBID) &&
           (msg_ptr != NULL) &&
           (!strcmp(msg_ptr,  preq->rq_ind.rq_status.rq_id)))
    {
    /* we sent a stat request, but mom says it doesn't know anything about
       the job */
    if ((pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE)) != NULL)
      {
      /* job really isn't running any more - mom doesn't know anything about
         it. This can happen if a diskless node reboots and the mom_priv/jobs
         directory is cleared. Set its state to queued so job_abt doesn't
         think it is still running */
      mutex_mgr job_mutex(pjob->ji_mutex, true);
      
      snprintf(log_buf, sizeof(log_buf),
        "mother superior no longer recognizes %s as a valid job, aborting. Last reported time was %ld",
        preq->rq_ind.rq_status.rq_id, pjob->ji_last_reported_time);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
      
      svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_ABORT, FALSE);
      rel_resc(pjob);
      job_mutex.set_unlock_on_exit(false);
      job_abt(&pjob, "Job does not exist on node");

      /* TODO, if the job is rerunnable we should set its state back to queued */
      }
    }
  else
    {
    snprintf(log_buf, sizeof(log_buf),
      "Poll job request failed for job %s", preq->rq_ind.rq_status.rq_id);
    log_err(preply->brp_code, __func__, log_buf);
    }
  
  cntl->sc_conn = -1;

  if (cntl->sc_post)
    cntl->sc_post(cntl); /* continue where we left off */

  /* If sc_post has a value it is req_stat_job_step2, which expects
   * cntl to be freed after the call.
   */
  free(cntl); /* a bit of a kludge but it saves an extra func */

  return;
  }  /* END stat_update() */
Example #23
int stat_to_mom(

  char             *job_id,
  struct stat_cntl *cntl)  /* M */

  {
  struct batch_request *newrq;
  int                   rc = PBSE_NONE;
  unsigned long         addr;
  char                  log_buf[LOCAL_LOG_BUF_SIZE+1];
  struct pbsnode       *node;
  int                   handle = -1;
  unsigned long         job_momaddr = -1;
  unsigned short        job_momport = -1;
  char                 *job_momname = NULL;
  job                  *pjob = NULL;

  if ((pjob = svr_find_job(job_id, FALSE)) == NULL)
    return(PBSE_JOBNOTFOUND);

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  if ((pjob->ji_qs.ji_un.ji_exect.ji_momaddr == 0) || 
      (!pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str))
    {
    job_mutex.unlock();
    snprintf(log_buf, sizeof(log_buf),
      "Job %s missing MOM's information. Skipping statting on this job", pjob->ji_qs.ji_jobid);
    log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
    return PBSE_BAD_PARAMETER;
    }

  job_momaddr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr;
  job_momport = pjob->ji_qs.ji_un.ji_exect.ji_momport;
  job_momname = strdup(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str);
  job_mutex.unlock();

  if (job_momname == NULL)
    return PBSE_MEM_MALLOC;

  if ((newrq = alloc_br(PBS_BATCH_StatusJob)) == NULL)
    {
    free(job_momname);
    return PBSE_MEM_MALLOC;
    }

  if (cntl->sc_type == 1)
    snprintf(newrq->rq_ind.rq_status.rq_id, sizeof(newrq->rq_ind.rq_status.rq_id), "%s", job_id);
  else
    newrq->rq_ind.rq_status.rq_id[0] = '\0';  /* get stat of all */

  CLEAR_HEAD(newrq->rq_ind.rq_status.rq_attr);

  /* if MOM is down just return stale information */
  addr = job_momaddr;

  node = tfind_addr(addr,job_momport,job_momname);
  free(job_momname);

  if (node == NULL)
    return PBSE_UNKNODE;
  if ((node->nd_state & INUSE_DOWN)||(node->nd_power_state != POWER_STATE_RUNNING))
    {
    if (LOGLEVEL >= 6)
      {
      snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
          "node '%s' is allocated to job but in state 'down'",
          node->nd_name);

      log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_JOB,job_id,log_buf);
      }

    unlock_node(node, __func__, "no rely mom", LOGLEVEL);
    free_br(newrq);

    return PBSE_NORELYMOM;
    }

  /* get connection to MOM */
  unlock_node(node, __func__, "before svr_connect", LOGLEVEL);
  handle = svr_connect(job_momaddr, job_momport, &rc, NULL, NULL);

  if (handle >= 0)
    {
    if ((rc = issue_Drequest(handle, newrq, true)) == PBSE_NONE)
      {
      stat_update(newrq, cntl);
      }
    }
  else
    rc = PBSE_CONNECT;

  if (rc == PBSE_SYSTEM)
    rc = PBSE_MEM_MALLOC;

  free_br(newrq);

  return(rc);
  }  /* END stat_to_mom() */
Example #24
int req_holdjob(

  batch_request *vp) /* I */

  {
  long          *hold_val;
  int            newstate;
  int            newsub;
  long           old_hold;
  job           *pjob;
  char          *pset;
  int            rc;
  pbs_attribute  temphold;
  pbs_attribute *pattr;
  batch_request *preq = (struct batch_request *)vp;
  char           log_buf[LOCAL_LOG_BUF_SIZE];
  batch_request *dup_req = NULL;

  pjob = chk_job_request(preq->rq_ind.rq_hold.rq_orig.rq_objname, preq);

  if (pjob == NULL)
    {
    return(PBSE_NONE);
    }

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  /* cannot do anything until we decode the holds to be set */
  if ((rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, (const char **)&pset,
                     &temphold)) != 0)
    {
    req_reject(rc, 0, preq, NULL, NULL);

    return(PBSE_NONE);
    }

  /* if other than HOLD_u is being set, must have privil */

  if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0)
    {
    req_reject(rc, 0, preq, NULL, NULL);

    return(PBSE_NONE);
    }

  hold_val = &pjob->ji_wattr[JOB_ATR_hold].at_val.at_long;

  old_hold = *hold_val;
  *hold_val |= temphold.at_val.at_long;
  pjob->ji_wattr[JOB_ATR_hold].at_flags |= ATR_VFLAG_SET;
  sprintf(log_buf, msg_jobholdset, pset, preq->rq_user, preq->rq_host);

  pattr = &pjob->ji_wattr[JOB_ATR_checkpoint];

  if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
      ((pattr->at_flags & ATR_VFLAG_SET) &&
       ((csv_find_string(pattr->at_val.at_str, "s") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "c") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "enabled") != NULL))))
    {

    /* have MOM attempt checkpointing */

    /*
    ** The jobid in the request always has the server suffix attached,
    ** which is dropped when the server attribute
    ** 'display_job_server_suffix' is FALSE, and so it will be in the
    ** MOM's records. Therefore, the jobid must be passed to the MOM in
    ** the server's form so she can find the job to hold.
    */
    if (strncmp(pjob->ji_qs.ji_jobid, 
          preq->rq_ind.rq_hold.rq_orig.rq_objname, PBS_MAXSVRJOBID))
       snprintf(preq->rq_ind.rq_hold.rq_orig.rq_objname, 
          sizeof(preq->rq_ind.rq_hold.rq_orig.rq_objname), "%s", 
          pjob->ji_qs.ji_jobid);
    if ((dup_req = duplicate_request(preq)) == NULL)
      {
      req_reject(rc, 0, preq, NULL, "memory allocation failure");
      }
    /* The dup_req is freed in relay_to_mom (failure)
     * or in issue_Drequest (success) */
    else if ((rc = relay_to_mom(&pjob, dup_req, NULL)) != PBSE_NONE)
      {
      free_br(dup_req);
      *hold_val = old_hold;  /* reset to the old value */
      req_reject(rc, 0, preq, NULL, "relay to mom failed");

      if (pjob == NULL)
        job_mutex.set_unlock_on_exit(false);
      }
    else
      {
      if (pjob != NULL)
        {
        pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_FILE;
        
        job_save(pjob, SAVEJOB_QUICK, 0);
        
        /* fill in log_buf again, since relay_to_mom changed it */
        sprintf(log_buf, msg_jobholdset, pset, preq->rq_user, preq->rq_host);
        
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
        pjob = NULL;
        reply_ack(preq);
        }
      else
        job_mutex.set_unlock_on_exit(false);

      process_hold_reply(dup_req);
      }
    }
#ifdef ENABLE_BLCR
  else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * This system is configured with BLCR checkpointing to be used,
     * but this Running job does not have checkpointing enabled,
     * so we reject the request
     */

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    req_reject(PBSE_IVALREQ, 0, preq, NULL,
      "job not held since checkpointing is expected but not enabled for job");
    }
#endif
  else
    {
    /* everything went well, may need to update the job state */
    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    if (old_hold != *hold_val)
      {
      /* indicate attributes changed     */
      pjob->ji_modified = 1;

      svr_evaljobstate(*pjob, newstate, newsub, 0);

      svr_setjobstate(pjob, newstate, newsub, FALSE);
      }

    reply_ack(preq);
    }

  return(PBSE_NONE);
  }  /* END req_holdjob() */
Example #25
void mom_cleanup_checkpoint_hold(

  struct work_task *ptask)

  {
  int            rc = 0;
  job           *pjob;
  char          *jobid;

  batch_request *preq;
  char           log_buf[LOCAL_LOG_BUF_SIZE];
  time_t         time_now = time(NULL);

  jobid = (char *)ptask->wt_parm1;
  free(ptask->wt_mutex);
  free(ptask);

  if (jobid == NULL)
    {
    log_err(ENOMEM, __func__, "Cannot allocate memory");
    return;
    }

  pjob = svr_find_job(jobid, FALSE);
  if (pjob == NULL)
    {
    if (LOGLEVEL >= 3)
      {
      sprintf(log_buf,
        "%s:failed to find job\n",
        __func__);

      log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,jobid,log_buf);
      }
    free(jobid);
    return;
    }
  free(jobid);

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  if (LOGLEVEL >= 7)
    {
    sprintf(log_buf,
      "checking mom cleanup job state is %s-%s\n",
      PJobState[pjob->ji_qs.ji_state],
      PJobSubState[pjob->ji_qs.ji_substate]);

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);
    }

  /* 
   * if the job is no longer running then we have received the job obit
   * and need to request the mom to clean up after the job
   */

  if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
    if ((preq = alloc_br(PBS_BATCH_DeleteJob)) == NULL)
      {
      log_err(-1, __func__, "unable to allocate DeleteJob request - big trouble!");
      }
    else
      {
      strcpy(preq->rq_ind.rq_delete.rq_objname, pjob->ji_qs.ji_jobid);
      /* The preq is freed in relay_to_mom (failure)
       * or in issue_Drequest (success) */
      if ((rc = relay_to_mom(&pjob, preq, NULL)) != PBSE_NONE)
        {
        if (pjob != NULL)
          {
          snprintf(log_buf,sizeof(log_buf),
            "Unable to relay information to mom for job '%s'\n",
            pjob->ji_qs.ji_jobid);
          
          log_err(rc, __func__, log_buf);
          }
        else
          job_mutex.set_lock_on_exit(false);

        free_br(preq);

        return;
        }
      else
        free_br(preq);

      if ((LOGLEVEL >= 7) &&
          (pjob != NULL))
        {
        log_event(
          PBSEVENT_JOB,
          PBS_EVENTCLASS_JOB,
          pjob->ji_qs.ji_jobid,
          "requested mom cleanup");
        }
      }
    }
  else
    {
    set_task(WORK_Timed, time_now + 1, mom_cleanup_checkpoint_hold, strdup(pjob->ji_qs.ji_jobid), FALSE);
    }

  if (pjob == NULL)
    job_mutex.set_lock_on_exit(false);
  } /* END mom_cleanup_checkpoint_hold() */
Example #26
int send_job_work(

  char           *job_id,
  const char     *node_name, /* I */
  int             type,      /* I */
  int            *my_err,    /* O */
  batch_request  *preq)      /* M */

  {
  int                   rc = LOCUTION_FAIL;
  int                   ret = PBSE_NONE;
  int                   local_errno = 0;
  tlist_head            attrl;

  int                   encode_type;
  int                   mom_err = PBSE_NONE;
  int                   resc_access_perm;
  std::string           script_name;
  char                 *pc;
  char                  stdout_path[MAXPATHLEN + 1];
  char                  stderr_path[MAXPATHLEN + 1];
  char                  chkpt_path[MAXPATHLEN + 1];
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  long                  start_time = time(NULL);
  bool                  attempt_to_queue_job = false;
  bool                  change_substate_on_attempt_to_queue = false;
  bool                  need_to_send_job_script = false;
  bool                  job_has_run = false;
  job                  *pjob = NULL;
  char                  job_destin[PBS_MAXROUTEDEST+1];

  bool                  Timeout = false;
  
  unsigned long         job_momaddr = -1;
  unsigned short        job_momport = -1;

  if ((pjob = svr_find_job(job_id, TRUE)) == NULL)
    {
    *my_err = PBSE_JOBNOTFOUND;
    req_reject(-1, 0, preq, NULL, NULL);
    return(PBSE_JOBNOTFOUND);
    }

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  if (strlen(pjob->ji_qs.ji_destin) != 0)
    strcpy(job_destin, pjob->ji_qs.ji_destin);
  else
    job_destin[0] = '\0';

  job_momaddr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr;
  job_momport = pjob->ji_qs.ji_un.ji_exect.ji_momport;

  if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT)
    need_to_send_job_script = TRUE;

  if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_HASRUN)
    job_has_run = TRUE;

  if ((job_destin[0] != '\0') && 
      (type != MOVE_TYPE_Exec))
    {
    if ((pc = strchr(job_destin, '@')) != NULL)
      {
      job_momaddr = get_hostaddr(&local_errno, pc + 1);
      job_momport = pbs_server_port_dis;
      }
    }

  /* encode job attributes to be moved */
  CLEAR_HEAD(attrl);

  /* select attributes/resources to send based on move type */
  if (type == MOVE_TYPE_Exec)
    {
    /* moving job to MOM - ie job start */

    resc_access_perm = ATR_DFLAG_MOM;
    encode_type = ATR_ENCODE_MOM;
    }
  else
    {
    /* moving job to alternate server? */
    resc_access_perm =
      ATR_DFLAG_USWR |
      ATR_DFLAG_OPWR |
      ATR_DFLAG_MGWR |
      ATR_DFLAG_SvRD;

    encode_type = ATR_ENCODE_SVR;

    /* clear default resource settings */
    ret = svr_dequejob(pjob, FALSE);
    if (ret)
      {
      job_mutex.set_unlock_on_exit(false);
      return(ret);
      }
    }

  encode_attributes(attrl, pjob, resc_access_perm, encode_type);

  rc = get_job_script_path(pjob, script_name);

  if (rc != PBSE_NONE)
    {
    if (rc == PBSE_JOB_RECYCLED)
      job_mutex.set_unlock_on_exit(false);
  
    free_server_attrs(&attrl);

    return(rc);
    }
  
  if (job_has_run)
    {
    if ((get_job_file_path(pjob, StdOut, stdout_path, sizeof(stdout_path)) != 0) ||
        (get_job_file_path(pjob, StdErr, stderr_path, sizeof(stderr_path)) != 0) ||
        (get_job_file_path(pjob, Checkpoint, chkpt_path, sizeof(chkpt_path)) != 0))
      {
      job_mutex.unlock();
      goto send_job_work_end;
      }
    }

  /* if the job is substate JOB_SUBSTATE_TRNOUTCM it means we are 
   * recovering after being down or a late failure so we just want 
   * to send the "ready-to-commit/commit" */
  if (pjob->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUTCM)
    {
    attempt_to_queue_job = true;

    if (pjob->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUT)
      change_substate_on_attempt_to_queue = true;
    }
  
  job_mutex.unlock();
  
  rc = send_job_over_network_with_retries(job_id,
                                          job_destin,
                                          attrl,
                                          attempt_to_queue_job,
                                          change_substate_on_attempt_to_queue,
                                          Timeout,
                                          script_name.c_str(),
                                          need_to_send_job_script,
                                          job_has_run,
                                          job_momaddr,
                                          job_momport,
                                          stdout_path,
                                          stderr_path,
                                          chkpt_path,
                                          type,
                                          my_err,
                                          &mom_err);

  if (Timeout == TRUE)
    {
    /* 10 indicates that the job migrate timed out; the server will mark
     * the node down and abort the job - see post_sendmom() */
    sprintf(log_buf, "child timed-out attempting to start job %s", job_id);
    log_ext(*my_err, __func__, log_buf, LOG_WARNING);
    rc = LOCUTION_REQUEUE;
    }
  else if (rc != LOCUTION_SUCCESS)
    {
    if (should_retry_route(*my_err) == -1)
      {
      sprintf(log_buf, "child failed and will not retry job %s", job_id);
      log_err(*my_err, __func__, log_buf);
      rc = LOCUTION_FAIL;
      }
    else
      rc = LOCUTION_REQUEUE;
    }
  
  if (type == MOVE_TYPE_Exec)
    {
    if (node_name != NULL)
      update_failure_counts(node_name, rc);
    else
      update_failure_counts(job_destin, rc);
    }

send_job_work_end:
  finish_move_process(job_id, preq, start_time, node_name, rc, type, mom_err);
  free_server_attrs(&attrl);

  return(rc);
  } /* END send_job_work() */
Example #27
void req_stat_job_step2(

  struct stat_cntl *cntl)  /* I/O (free'd on return) */

  {
  batch_request         *preq = cntl->sc_origrq;
  svrattrl              *pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);
  job                   *pjob = NULL;

  struct batch_reply    *preply = &preq->rq_reply;
  int                    rc = 0;
  enum TJobStatTypeEnum  type = (enum TJobStatTypeEnum)cntl->sc_type;
  bool                   exec_only = false;

  int                    bad = 0;
  /* delta time - only report full pbs_attribute list if J->MTime > DTime */
  int                    job_array_index = -1;
  job_array             *pa = NULL;
  all_jobs_iterator     *iter;

  if (preq->rq_extend != NULL)
    {
    /* FORMAT:  { EXECQONLY } */
    if (strstr(preq->rq_extend, EXECQUEONLY))
      exec_only = true;
    }

  if ((type == tjstTruncatedServer) || 
      (type == tjstTruncatedQueue))
    {
    handle_truncated_qstat(exec_only, cntl->sc_condensed, preq);

    return;
    } /* END if ((type == tjstTruncatedServer) || ...) */
  else if (type == tjstJob)
    {
    pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE);

    if (pjob != NULL)
      {
      if ((rc = status_job(pjob, preq, pal, &preply->brp_un.brp_status, cntl->sc_condensed, &bad)))
        req_reject(rc, bad, preq, NULL, NULL);
      else
        reply_send_svr(preq);

      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
      }
    else
      {
      req_reject(PBSE_JOBNOTFOUND, bad, preq, NULL, NULL);
      }
    }
  else
    {
    if (type == tjstArray)
      {
      pa = get_array(preq->rq_ind.rq_status.rq_id);

      if (pa == NULL)
        {
        req_reject(PBSE_UNKARRAYID, 0, preq, NULL, "unable to find array");
        return;
        }
      }
    else if ((type == tjstSummarizeArraysQueue) || 
             (type == tjstSummarizeArraysServer))
      update_array_statuses();

    iter = get_correct_status_iterator(cntl);

    for (pjob = get_next_status_job(cntl, job_array_index, pa, iter);
         pjob != NULL;
         pjob = get_next_status_job(cntl, job_array_index, pa, iter))
      {
      mutex_mgr job_mutex(pjob->ji_mutex, true);

      /* go ahead and build the status reply for this job */
      if (pjob->ji_being_recycled == true)
        continue;

      if (exec_only)
        {
        if (cntl->sc_pque != NULL)
          {
          if (cntl->sc_pque->qu_qs.qu_type != QTYPE_Execution)
            continue;
          }
        else if (in_execution_queue(pjob, pa) == false)
          continue;
        }

      rc = status_job(pjob, preq, pal, &preply->brp_un.brp_status, cntl->sc_condensed, &bad);

      if ((rc != PBSE_NONE) && 
          (rc != PBSE_PERM))
        {
        if (pa != NULL)
          unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

        req_reject(rc, bad, preq, NULL, NULL);

        delete iter;

        return;
        }
      }  /* END for (pjob != NULL) */

    delete iter;

    if (pa != NULL)
      {
      unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
      }
   
    reply_send_svr(preq);
    }

  if (LOGLEVEL >= 7)
    {
    log_event(PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      "req_statjob",
      "Successfully returned the status of queued jobs\n");
    }

  return;
  }  /* END req_stat_job_step2() */
Example #28
File: req_stat.c Project: hocks/torque
/**
 * poll_job_task
 *
 * The invocation of this routine is triggered from
 * the pbs_server main_loop code.
 */
void poll_job_task(

  struct work_task *ptask)

  {
  char      *job_id = (char *)ptask->wt_parm1;
  job       *pjob;
  time_t     time_now = time(NULL);
  int        job_state = -1;
  char       log_buf[LOCAL_LOG_BUF_SIZE];

  if (job_id != NULL)
    {
    pjob  = svr_find_job(job_id, FALSE);
    
    if (pjob != NULL)
      {
      mutex_mgr job_mutex(pjob->ji_mutex, true);

      job_state = pjob->ji_qs.ji_state;

      /* capture the mom information before dropping the job mutex */
      unsigned long job_momaddr   = pjob->ji_qs.ji_un.ji_exect.ji_momaddr;
      bool          has_exec_host =
        (pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str != NULL);

      job_mutex.unlock();

      if (job_state == JOB_STATE_RUNNING)
        {
        /* we need to throttle the number of outstanding threads doing job
           polling. This prevents a problem where pbs_server gets hung
           waiting on I/O from the mom */
        pthread_mutex_lock(poll_job_task_mutex);
        if (current_poll_job_tasks < max_poll_job_tasks)
          {
          if ((job_momaddr == 0) ||
              (has_exec_host == false))
            {
            pthread_mutex_unlock(poll_job_task_mutex);
            snprintf(log_buf, sizeof(log_buf),
              "Job %s missing MOM's information. Skipping polling on this job", job_id);
            log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
            }
          else
            {
            current_poll_job_tasks++;
            pthread_mutex_unlock(poll_job_task_mutex);

            stat_mom_job(job_id);

            pthread_mutex_lock(poll_job_task_mutex);
            current_poll_job_tasks--;
            }
          }
        pthread_mutex_unlock(poll_job_task_mutex);

        
        /* add another task */
        set_task(WORK_Timed, time_now + JobStatRate, poll_job_task, strdup(job_id), FALSE);
        }
      }
      
    free(job_id);
    }

  free(ptask->wt_mutex);
  free(ptask);
  }  /* END poll_job_task() */
Example #29
void process_hold_reply(

  batch_request *preq)

  {
  job                  *pjob;
  pbs_attribute         temphold;

  int                   newstate;
  int                   newsub;
  int                   rc;
  char                 *pset;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];

  /* preq was handled previously */
  if (preq == NULL)
    return;

  preq->rq_conn = preq->rq_orgconn;  /* restore client socket */

  if ((pjob = svr_find_job(preq->rq_ind.rq_hold.rq_orig.rq_objname, FALSE)) == NULL)
    {
    log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB,
              preq->rq_ind.rq_hold.rq_orig.rq_objname,
              msg_postmomnojob);
    req_reject(PBSE_UNKJOBID, 0, preq, NULL, msg_postmomnojob);
    }
  else
    {
    mutex_mgr job_mutex(pjob->ji_mutex, true);

    if (preq->rq_reply.brp_code != 0)
      {
      rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, (const char **)&pset, &temphold);
      
      if (rc == 0)
        {
        rc = job_attr_def[JOB_ATR_hold].at_set(&pjob->ji_wattr[JOB_ATR_hold],
            &temphold, DECR);
        }
      
      pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING;  /* reset it */
      
      pjob->ji_modified = 1;    /* indicate attributes changed */
      svr_evaljobstate(*pjob, newstate, newsub, 0);
      svr_setjobstate(pjob, newstate, newsub, FALSE); /* saves job */
      
      if (preq->rq_reply.brp_code != PBSE_NOSUP)
        {
        sprintf(log_buf, msg_mombadhold, preq->rq_reply.brp_code);
        log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        req_reject(preq->rq_reply.brp_code, 0, preq, NULL, log_buf);
        }
      else
        {
        reply_ack(preq);
        }
      }
    else
      {
      /* record that MOM has a checkpoint file */
      
      /* PBS_CHECKPOINT_MIGRATEABLE is defined as zero therefore this code will never fire.
       * And if these flags are not set, start_exec will not try to run the job from
       * the checkpoint image file.
       */
      pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHECKPOINT_FILE;
      
      if (preq->rq_reply.brp_auxcode)  /* checkpoint can be moved */
        {
        pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHECKPOINT_FILE;
        pjob->ji_qs.ji_svrflags |=  JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_MIGRATEABLE;
        }

      pjob->ji_modified = 1;    /* indicate attributes changed     */
      
      svr_evaljobstate(*pjob, newstate, newsub, 0);
      svr_setjobstate(pjob, newstate, newsub, FALSE); /* saves job */
      
      account_record(PBS_ACCT_CHKPNT, pjob, "Checkpointed and held"); /* note in accounting file */
      reply_ack(preq);
      }
    }

  } /* END process_hold_reply() */
Example #30
void *req_checkpointjob(

  batch_request *preq) /* I */

  {
  job           *pjob;
  int            rc;
  pbs_attribute *pattr;
  char           log_buf[LOCAL_LOG_BUF_SIZE];
  batch_request *dup_req = NULL;

  if ((pjob = chk_job_request(preq->rq_ind.rq_manager.rq_objname, preq)) == NULL)
    {
    return(NULL);
    }

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  pattr = &pjob->ji_wattr[JOB_ATR_checkpoint];

  if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
      ((pattr->at_flags & ATR_VFLAG_SET) &&
       ((csv_find_string(pattr->at_val.at_str, "s") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "c") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "enabled") != NULL))))
    {
    /* have MOM attempt checkpointing */

    if ((dup_req = duplicate_request(preq)) == NULL)
      {
      req_reject(PBSE_SYSTEM, 0, preq, NULL, "failure to allocate memory");
      }

    /* The dup_req is freed in relay_to_mom (failure)
     * or in issue_Drequest (success) */
    else if ((rc = relay_to_mom(&pjob, dup_req, NULL)) != PBSE_NONE)
      {
      req_reject(rc, 0, preq, NULL, NULL);
      free_br(dup_req);

      if (pjob == NULL)
        job_mutex.set_unlock_on_exit(false);
      }
    else
      {
      if (pjob != NULL)
        {
        pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHECKPOINT_FILE;
        
        job_save(pjob, SAVEJOB_QUICK, 0);
        sprintf(log_buf, "checkpoint requested for job %s", pjob->ji_qs.ji_jobid);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
        pjob = NULL;
        }
      else
        job_mutex.set_unlock_on_exit(false);

      process_checkpoint_reply(dup_req);
      }
    }
  else
    {
    /* Job does not have checkpointing enabled, so reject the request */

    sprintf(log_buf, "job %s is not checkpointable", pjob->ji_qs.ji_jobid);
    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    req_reject(PBSE_IVALREQ, 0, preq, NULL, "job is not checkpointable");
    }

  return(NULL);
  }  /* END req_checkpointjob() */