Example #1
int status_job(

  job                  *pjob,    /* ptr to job to status */
  struct batch_request *preq,
  svrattrl             *pal,     /* specific attributes to status */
  tlist_head           *pstathd, /* RETURN: head of list to append status to */
  int                  *bad)     /* RETURN: index of first bad pbs_attribute */

  {
  struct brp_status *pstat;
  int                IsOwner = 0;
  long               query_others = 0;

  /* see if the client is authorized to status this job */
  if (svr_authorize_jobreq(preq, pjob) == 0)
    IsOwner = 1;

  get_svr_attr_l(SRV_ATR_query_others, &query_others);
  if (!query_others)
    {
    if (IsOwner == 0)
      {
      return(PBSE_PERM);
      }
    }

  /* allocate reply structure and fill in header portion */
  if ((pstat = calloc(1, sizeof(struct brp_status))) == NULL)
    {
    return(PBSE_SYSTEM);
    }

  CLEAR_LINK(pstat->brp_stlink);

  pstat->brp_objtype = MGR_OBJ_JOB;

  strcpy(pstat->brp_objname, pjob->ji_qs.ji_jobid);

  CLEAR_HEAD(pstat->brp_attr);

  append_link(pstathd, &pstat->brp_stlink, pstat);

  /* add attributes to the status reply */
  *bad = 0;

  if (status_attrib(
        pal,
        job_attr_def,
        pjob->ji_wattr,
        JOB_ATR_LAST,
        preq->rq_perm,
        &pstat->brp_attr,
        bad,
        IsOwner))
    {
    return(PBSE_NOATTR);
    }

  return (0);
  }  /* END status_job() */
Example #2
static int forced_jobpurge(

  job                  *pjob,
  struct batch_request *preq)

  {
  long owner_purge = FALSE;
  /* check about possibly purging the job */
  if (preq->rq_extend != NULL)
    {
    if (!strncmp(preq->rq_extend, delpurgestr, strlen(delpurgestr)))
      {
      get_svr_attr_l(SRV_ATR_OwnerPurge, &owner_purge);

      if (((preq->rq_perm & (ATR_DFLAG_OPRD | ATR_DFLAG_OPWR | ATR_DFLAG_MGRD | ATR_DFLAG_MGWR)) != 0) ||
          ((svr_chk_owner(preq, pjob) == 0) && (owner_purge)))
        {
        force_purge_work(pjob);

        return(PURGE_SUCCESS);
        }
      else
        {
        /* FAILURE */
        req_reject(PBSE_PERM, 0, preq, NULL, NULL);

        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

        return(-1);
        }
      }
    }

  return(PBSE_NONE);
  }  /* END forced_jobpurge() */
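
A minimal, self-contained sketch of the rq_extend prefix test that forced_jobpurge() starts with. The literal "delpurge" stands in for the server's delpurgestr constant, whose real value is defined elsewhere in the sources, so treat it as an assumption:

#include <stdio.h>
#include <string.h>

/* assumed stand-in for the server's delpurgestr constant */
static const char *delpurgestr = "delpurge";

/* return 1 if the request extension asks for a forced purge */
static int extension_requests_purge(const char *rq_extend)
  {
  if (rq_extend == NULL)
    return(0);

  /* same prefix comparison forced_jobpurge() performs */
  return(strncmp(rq_extend, delpurgestr, strlen(delpurgestr)) == 0);
  }

int main(void)
  {
  printf("%d %d %d\n",
    extension_requests_purge("delpurge"),   /* 1 */
    extension_requests_purge("something"),  /* 0 */
    extension_requests_purge(NULL));        /* 0 */
  return(0);
  }
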
Example #3
/**
 * poll_job_task
 *
 * The invocation of this routine is triggered from
 * the pbs_server main_loop code.
 */
void poll_job_task(

  struct work_task *ptask)

  {
  char      *job_id = (char *)ptask->wt_parm1;
  job       *pjob;
  time_t     time_now = time(NULL);
  long       poll_jobs = 0;
  long       job_stat_rate = 0;

  free(ptask->wt_mutex);
  free(ptask);

  if (job_id != NULL)
    {
    pjob  = svr_find_job(job_id, FALSE);
    
    if (pjob != NULL)
      {
      mutex_mgr job_mutex(pjob->ji_mutex, true);
      int       job_state = -1;

      job_state = pjob->ji_qs.ji_state;

      // only do things for running jobs
      if (job_state == JOB_STATE_RUNNING)
        {
        job_mutex.unlock();

        get_svr_attr_l(SRV_ATR_JobStatRate, &job_stat_rate);

        if (time(NULL) - pjob->ji_last_reported_time > job_stat_rate)
          {
          get_svr_attr_l(SRV_ATR_PollJobs, &poll_jobs);
          if (poll_jobs)
            stat_mom_job(job_id);
          }

        /* add another task */
        set_task(WORK_Timed, time_now + (job_stat_rate / 3), poll_job_task, strdup(job_id), FALSE);
        }
      }
      
    free(job_id);
    }
  }  /* END poll_job_task() */
Example #4
static void job_delete_nanny(

  struct work_task *pwt)

  {
  job                  *pjob;
  char                 *sigk = "SIGKILL";
  char                 *jobid;

  struct batch_request *newreq;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  time_t                time_now = time(NULL);
  long                  nanny = FALSE;

  /* short-circuit if nanny isn't enabled */
  get_svr_attr_l(SRV_ATR_JobNanny, &nanny);
  if (!nanny)
    {
    jobid = (char *)pwt->wt_parm1;
    
    if (jobid != NULL)
      {
      pjob = svr_find_job(jobid, FALSE);
      
      if (pjob != NULL)
        {
        sprintf(log_buf, "exiting job '%s' still exists, sending a SIGKILL", pjob->ji_qs.ji_jobid);
        log_err(-1, "job nanny", log_buf);
        
        /* build up a Signal Job batch request */
        if ((newreq = alloc_br(PBS_BATCH_SignalJob)) != NULL)
          {
          strcpy(newreq->rq_ind.rq_signal.rq_jid, pjob->ji_qs.ji_jobid);
          snprintf(newreq->rq_ind.rq_signal.rq_signame, sizeof(newreq->rq_ind.rq_signal.rq_signame), "%s", sigk);
          }
        
        issue_signal(&pjob, sigk, post_job_delete_nanny, newreq);
        
        if (pjob != NULL)
          {
          apply_job_delete_nanny(pjob, time_now + 60);
  
          unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
          }
        }
      }
    else
      {
      log_err(ENOMEM, __func__, "Cannot allocate memory");
      }
    }
  
  if (pwt->wt_parm1 != NULL)
    free(pwt->wt_parm1);

  free(pwt->wt_mutex);
  free(pwt);
  } /* END job_delete_nanny() */
void post_job_delete_nanny(

  batch_request *preq_sig)

  {
  int                   rc;
  job                  *pjob;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  long                  nanny = 0;

  if (preq_sig == NULL)    
    return;

  rc       = preq_sig->rq_reply.brp_code;

  get_svr_attr_l(SRV_ATR_JobNanny, &nanny);
  if (!nanny)
    {
    /* the admin disabled nanny within the last minute or so */
    free_br(preq_sig);

    return;
    }

  /* extract job id from task */
  pjob = svr_find_job(preq_sig->rq_ind.rq_signal.rq_jid, FALSE);

  if (pjob == NULL)
    {
    sprintf(log_buf, "job delete nanny: the job disappeared (this is a BUG!)");

    log_event(PBSEVENT_ERROR,PBS_EVENTCLASS_JOB,preq_sig->rq_ind.rq_signal.rq_jid,log_buf);
    }
  else if (rc == PBSE_UNKJOBID)
    {
    sprintf(log_buf, "job delete nanny returned, but does not exist on mom");

    log_event(PBSEVENT_ERROR,PBS_EVENTCLASS_JOB,preq_sig->rq_ind.rq_signal.rq_jid,log_buf);

    free_nodes(pjob);

    set_resc_assigned(pjob, DECR);
  
    free_br(preq_sig);

    svr_job_purge(pjob);

    return;
    }
  
  unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

  /* free task */
  free_br(preq_sig);

  return;
  } /* END post_job_delete_nanny() */
Example #6
/**
 * poll_job_task
 *
 * The invocation of this routine is triggered from
 * the pbs_server main_loop code.  The check of
 * SRV_ATR_PollJobs appears to be redundant.
 */
void poll_job_task(

  struct work_task *ptask)

  {
  char      *job_id = (char *)ptask->wt_parm1;
  job       *pjob;
  time_t     time_now = time(NULL);
  long       poll_jobs = 0;
  int        job_state = -1;

  if (job_id != NULL)
    {
    pjob  = svr_find_job(job_id, FALSE);
    
    if (pjob != NULL)
      {
      mutex_mgr job_mutex(pjob->ji_mutex, true);

      job_state = pjob->ji_qs.ji_state;
      job_mutex.unlock();

      get_svr_attr_l(SRV_ATR_PollJobs, &poll_jobs);
      if ((poll_jobs) && (job_state == JOB_STATE_RUNNING))
        {
        /* we need to throttle the number of outstanding threads
           doing job polling. This prevents a problem where pbs_server
           gets hung waiting on I/O from the mom */
        pthread_mutex_lock(poll_job_task_mutex);
        if (current_poll_job_tasks < max_poll_job_tasks)
          {
          current_poll_job_tasks++;
          pthread_mutex_unlock(poll_job_task_mutex);

          stat_mom_job(job_id);

          pthread_mutex_lock(poll_job_task_mutex);
          current_poll_job_tasks--;
          }
        pthread_mutex_unlock(poll_job_task_mutex);

        
        /* add another task */
        set_task(WORK_Timed, time_now + JobStatRate, poll_job_task, strdup(job_id), FALSE);
        }
      }
      
    free(job_id);
    }

  free(ptask->wt_mutex);
  free(ptask);
  }  /* END poll_job_task() */
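
The mutex-protected counter above caps how many threads can be inside stat_mom_job() at once, so pbs_server cannot pile up threads blocked on MOM I/O. A standalone sketch of that throttle pattern, with hypothetical names (task_mutex, current_tasks, max_tasks) in place of the server globals:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t task_mutex = PTHREAD_MUTEX_INITIALIZER;
static int             current_tasks = 0;
static const int       max_tasks = 5;

static void fake_poll(void) { /* stands in for the slow status request */ }

/* run do_poll() only if a slot is free; return 1 if it ran */
static int throttled_poll(void (*do_poll)(void))
  {
  int ran = 0;

  pthread_mutex_lock(&task_mutex);
  if (current_tasks < max_tasks)
    {
    current_tasks++;                  /* reserve a slot */
    pthread_mutex_unlock(&task_mutex);

    do_poll();                        /* potentially slow I/O */

    pthread_mutex_lock(&task_mutex);
    current_tasks--;                  /* release the slot */
    ran = 1;
    }

  pthread_mutex_unlock(&task_mutex);  /* covers both branches, as above */

  return(ran);
  }

int main(void)
  {
  printf("ran=%d\n", throttled_poll(fake_poll));
  return(0);
  }
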
Example #7
int set_slot_limit(

  char      *request, /* I */
  job_array *pa)      /* O */

  {
  char *pcnt;
  long  max_limit;

  /* check for a max slot limit */
  if (get_svr_attr_l(SRV_ATR_MaxSlotLimit, &max_limit) != PBSE_NONE)
    max_limit = NO_SLOT_LIMIT;

  if ((pcnt = strchr(request,'%')) != NULL)
    {
    /* remove '%' from the request, or else it can't be parsed */
    while (*pcnt == '%')
      {
      *pcnt = '\0';
      pcnt++;
      }

    /* read the number if one is given */
    if (strlen(pcnt) > 0)
      {
      pa->ai_qs.slot_limit = atoi(pcnt);
      if ((max_limit != NO_SLOT_LIMIT) &&
          (max_limit < pa->ai_qs.slot_limit))
        {
        return(INVALID_SLOT_LIMIT);
        }
      }
    else
      {
      pa->ai_qs.slot_limit = max_limit;
      }
    }
  else
    {
    pa->ai_qs.slot_limit = max_limit;
    }

  return(0);
  } /* END set_slot_limit() */
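
A hedged, standalone sketch of the '%' handling that set_slot_limit() performs on an array request. The sample string "1-100%5" and the NO_SLOT_LIMIT sentinel below are illustrative assumptions:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NO_SLOT_LIMIT -1  /* assumed sentinel, mirroring the routine above */

/* parse the slot limit from a writable copy of an array request string */
static long parse_slot_limit(char *request, long max_limit)
  {
  char *pcnt;

  if ((pcnt = strchr(request, '%')) == NULL)
    return(max_limit);        /* no explicit limit requested */

  /* strip the '%' so the range part can still be parsed separately */
  while (*pcnt == '%')
    {
    *pcnt = '\0';
    pcnt++;
    }

  if (strlen(pcnt) > 0)
    return(atol(pcnt));       /* explicit limit follows the '%' */

  return(max_limit);          /* bare '%' falls back to the server maximum */
  }

int main(void)
  {
  char req[] = "1-100%5";

  printf("%ld\n", parse_slot_limit(req, NO_SLOT_LIMIT));  /* prints 5 */
  return(0);
  }
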
int apply_job_delete_nanny(

  struct job *pjob,
  int         delay)  /* I */

  {
  enum work_type    tasktype;
  long              nanny = FALSE;

  /* short-circuit if nanny isn't enabled or we have a delete nanny */
  get_svr_attr_l(SRV_ATR_JobNanny, &nanny);
  if ((nanny == FALSE) ||
      (pjob->ji_has_delete_nanny == TRUE))
    {
    return(PBSE_NONE);
    }

  if (delay == 0)
    {
    tasktype = WORK_Immed;
    }
  else if (delay > 0)
    {
    tasktype = WORK_Timed;
    }
  else
    {
    log_err(-1, __func__, "negative delay requested for nanny");

    return(-1);
    }

  pjob->ji_has_delete_nanny = TRUE;

  /* add a nanny task at the requested time */
  set_task(tasktype, delay, job_delete_nanny, strdup(pjob->ji_qs.ji_jobid), FALSE);

  return(PBSE_NONE);
  } /* END apply_job_delete_nanny() */
std::string get_path_jobdata(

  const char *jobid,
  const char *basepath)

  {
  std::string ret_path("");
  long        use_jobs_subdirs = FALSE;

  if ((jobid == NULL) || (basepath == NULL))
    return(ret_path);

  ret_path = basepath;

  // get use_jobs_subdirs value if set 
  get_svr_attr_l(SRV_ATR_use_jobs_subdirs, &use_jobs_subdirs);

  // if we are using divided subdirectories in server_priv/{jobs,arrays}
  //  then adjust path
  if ((use_jobs_subdirs == TRUE) && isdigit(*jobid))
    {
    char *p = (char *)jobid + 1;

    // point p to the first non-digit in string
    while (isdigit(*p))
      p++;

    // move back 1 char to the last digit of the job id
    p--;

    // append the last digit of the numeric part of the job id on the string
    ret_path.push_back(*p);

    // append slash
    ret_path.push_back('/');
    }

  return(ret_path);
  }
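
A self-contained sketch of the last-digit subdirectory selection that get_path_jobdata() performs when use_jobs_subdirs is enabled. The job-id format "1234.server" and the base path used below are illustrative assumptions:

#include <ctype.h>
#include <stdio.h>
#include <string.h>

/* write "<basepath><last digit of the numeric job-id prefix>/" into out */
static void jobdata_subdir(const char *jobid, const char *basepath, char *out, size_t outlen)
  {
  const char *p = jobid;
  size_t      len;

  snprintf(out, outlen, "%s", basepath);

  if (!isdigit((unsigned char)*p))
    return;                          /* non-numeric id: keep the plain base path */

  /* walk to the last digit of the numeric prefix */
  while (isdigit((unsigned char)p[1]))
    p++;

  len = strlen(out);

  if (len + 2 < outlen)
    {
    out[len]     = *p;               /* append that digit ... */
    out[len + 1] = '/';              /* ... and a trailing slash */
    out[len + 2] = '\0';
    }
  }

int main(void)
  {
  char path[80];

  jobdata_subdir("1234.server", "/var/spool/torque/server_priv/jobs/", path, sizeof(path));
  printf("%s\n", path);              /* .../jobs/4/ */
  return(0);
  }
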
Example #10
int  can_queue_new_job(

  char *user_name,
  job  *pjob)

  {
  long         max_queuable = -1;
  int          can_queue_another = TRUE;
  unsigned int num_queued = 0;
  unsigned int num_to_add;

  get_svr_attr_l(SRV_ATR_MaxUserQueuable, &max_queuable);

  if (max_queuable >= 0)
    {
    num_to_add = count_jobs_submitted(pjob);
    num_queued = get_num_queued(&users, user_name);

    if (num_queued + num_to_add > (unsigned int)max_queuable)
      can_queue_another = FALSE;
    }
  
  return(can_queue_another);
  } /* END can_queue_new_job() */
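
can_queue_new_job() keeps max_queuable as a signed long and only casts it to unsigned after checking it is non-negative, which avoids the usual signed/unsigned comparison trap. A small sketch of that guard:

#include <stdio.h>

/* return 1 if adding num_to_add jobs stays within max_queuable (-1 means unlimited) */
static int within_user_limit(unsigned int num_queued, unsigned int num_to_add, long max_queuable)
  {
  if (max_queuable < 0)
    return(1);                       /* no limit configured */

  /* cast is safe here: max_queuable is known to be non-negative */
  return(num_queued + num_to_add <= (unsigned int)max_queuable);
  }

int main(void)
  {
  printf("%d %d %d\n",
    within_user_limit(4, 1, 5),      /* 1: exactly at the limit */
    within_user_limit(5, 1, 5),      /* 0: would exceed it */
    within_user_limit(5, 1, -1));    /* 1: unlimited */
  return(0);
  }
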
Example #11
int modify_job_attr(

  job      *pjob,  /* I (modified) */
  svrattrl *plist, /* I */
  int       perm,
  int      *bad)   /* O */

  {
  int            allow_unkn = -1;
  long           i;
  pbs_attribute  newattr[JOB_ATR_LAST];
  pbs_attribute *pattr;
  int            rc;
  char           log_buf[LOCAL_LOG_BUF_SIZE];
  pbs_queue     *pque;

  if ((pque = get_jobs_queue(&pjob)) != NULL)
    {
    if (pque->qu_qs.qu_type != QTYPE_Execution)
      allow_unkn = JOB_ATR_UNKN;

    unlock_queue(pque, __func__, NULL, LOGLEVEL);
    }
  else if (pjob->ji_parent_job != NULL)
    {
    allow_unkn = JOB_ATR_UNKN;
    }
  else
    {
    log_err(PBSE_JOBNOTFOUND, __func__, "Job lost while acquiring queue 5");
    return(PBSE_JOBNOTFOUND);
    }

  pattr = pjob->ji_wattr;

  /* call attr_atomic_set to decode and set a copy of the attributes */

  rc = attr_atomic_set(
         plist,        /* I */
         pattr,        /* I */
         newattr,      /* O */
         job_attr_def, /* I */
         JOB_ATR_LAST,
         allow_unkn,   /* I */
         perm,         /* I */
         bad);         /* O */

  /* if resource limits are being changed ... */

  if ((rc == 0) &&
      (newattr[JOB_ATR_resource].at_flags & ATR_VFLAG_SET))
    {
    if ((perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0)
      {
      /* If job is running, only manager/operator can raise limits */

      if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
        {
        long lim = TRUE;
        int comp_resc_lt;
       
        get_svr_attr_l(SRV_ATR_QCQLimits, &lim);
        comp_resc_lt = comp_resc2(&pjob->ji_wattr[JOB_ATR_resource],
                                      &newattr[JOB_ATR_resource],
                                      lim,
                                      NULL,
                                      LESS);

        if (comp_resc_lt != 0)
          {
          rc = PBSE_PERM;
          }
        }

      /* Also check against queue and system limits */

      if (rc == 0)
        {
        if ((pque = get_jobs_queue(&pjob)) != NULL)
          {
          rc = chk_resc_limits( &newattr[JOB_ATR_resource], pque, NULL);
          unlock_queue(pque, __func__, NULL, LOGLEVEL);
          }
        else if (pjob == NULL)
          {
          log_err(PBSE_JOBNOTFOUND, __func__, "Job lost while acquiring queue 6");
          return(PBSE_JOBNOTFOUND);
          }
        else
          rc = PBSE_QUENOTAVAILABLE;
        }
      }
    }    /* END if ((rc == 0) && ...) */

  /* special check on permissions for hold */

  if ((rc == 0) &&
      (newattr[JOB_ATR_hold].at_flags & ATR_VFLAG_MODIFY))
    {
    i = newattr[JOB_ATR_hold].at_val.at_long ^
        (pattr + JOB_ATR_hold)->at_val.at_long;

    rc = chk_hold_priv(i, perm);
    }

  if (rc == 0)
    {
    for (i = 0;i < JOB_ATR_LAST;i++)
      {
      if (newattr[i].at_flags & ATR_VFLAG_MODIFY)
        {
        if (job_attr_def[i].at_action)
          {
          rc = job_attr_def[i].at_action(
                 &newattr[i],
                 pjob,
                 ATR_ACTION_ALTER);

          if (rc)
            break;
          }
        }
      }    /* END for (i) */

    if ((rc == 0) &&
        ((newattr[JOB_ATR_userlst].at_flags & ATR_VFLAG_MODIFY) ||
         (newattr[JOB_ATR_grouplst].at_flags & ATR_VFLAG_MODIFY)))
      {
      /* need to reset execution uid and gid */

      rc = set_jobexid(pjob, newattr, NULL);
      }

    if ((rc == 0) &&
        (newattr[JOB_ATR_outpath].at_flags & ATR_VFLAG_MODIFY))
      {
      /* need to recheck if JOB_ATR_outpath is a special case of host only */

      if (newattr[JOB_ATR_outpath].at_val.at_str[strlen(newattr[JOB_ATR_outpath].at_val.at_str) - 1] == ':')
        {
        dynamic_string *ds = get_dynamic_string(-1, NULL);
        newattr[JOB_ATR_outpath].at_val.at_str = prefix_std_file(pjob, ds, (int)'o');

        /* don't call free_dynamic_string() */
        free(ds);
        }
      /*
       * if the output path was specified and ends with a '/'
       * then append the standard file name
       */
      else if (newattr[JOB_ATR_outpath].at_val.at_str[strlen(newattr[JOB_ATR_outpath].at_val.at_str) - 1] == '/')
        {
        dynamic_string *ds = get_dynamic_string(-1, NULL);
        newattr[JOB_ATR_outpath].at_val.at_str[strlen(newattr[JOB_ATR_outpath].at_val.at_str) - 1] = '\0';
        
        replace_attr_string(&newattr[JOB_ATR_outpath],
          (add_std_filename(pjob, newattr[JOB_ATR_outpath].at_val.at_str, (int)'o', ds)));

        /* don't call free_dynamic_string() because we still want to use the allocated string */
        free(ds);
        }
      }

    if ((rc == 0) &&
        (newattr[JOB_ATR_errpath].at_flags & ATR_VFLAG_MODIFY))
      {
      /* need to recheck if JOB_ATR_errpath is a special case of host only */

      if (newattr[JOB_ATR_errpath].at_val.at_str[strlen(newattr[JOB_ATR_errpath].at_val.at_str) - 1] == ':')
        {
        dynamic_string *ds = get_dynamic_string(-1, NULL);
        newattr[JOB_ATR_errpath].at_val.at_str = prefix_std_file(pjob, ds, (int)'e');

        /* don't call free_dynamic_string() */
        free(ds);
        }
      /*
       * if the error path was specified and ends with a '/'
       * then append the standard file name
       */
      else if (newattr[JOB_ATR_errpath].at_val.at_str[strlen(newattr[JOB_ATR_errpath].at_val.at_str) - 1] == '/')
        {
        dynamic_string *ds = get_dynamic_string(-1, NULL);
        newattr[JOB_ATR_errpath].at_val.at_str[strlen(newattr[JOB_ATR_errpath].at_val.at_str) - 1] = '\0';
        
        replace_attr_string(&newattr[JOB_ATR_errpath],
          (add_std_filename(pjob, newattr[JOB_ATR_errpath].at_val.at_str,(int)'e', ds)));

        /* don't call free_dynamic_string() */
        free(ds);
        }
      }

    }  /* END if (rc == 0) */

  if (rc != 0)
    {
    for (i = 0;i < JOB_ATR_LAST;i++)
      job_attr_def[i].at_free(newattr + i);

    /* FAILURE */

    return(rc);
    }  /* END if (rc != 0) */

  /* OK, now copy the new values into the job attribute array */

  for (i = 0;i < JOB_ATR_LAST;i++)
    {
    if (newattr[i].at_flags & ATR_VFLAG_MODIFY)
      {
      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "attr %s modified", job_attr_def[i].at_name);

        log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);
        }

      job_attr_def[i].at_free(pattr + i);

      if ((newattr[i].at_type == ATR_TYPE_LIST) ||
          (newattr[i].at_type == ATR_TYPE_RESC))
        {
        list_move(
          &newattr[i].at_val.at_list,
          &(pattr + i)->at_val.at_list);
        }
      else
        {
        *(pattr + i) = newattr[i];
        }

      (pattr + i)->at_flags = newattr[i].at_flags;
      }
    }    /* END for (i) */

  /* note, the newattr[] attributes are on the stack, they go away automatically */

  pjob->ji_modified = 1;

  return(0);
  }  /* END modify_job_attr() */
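
The hold-permission check in modify_job_attr() XORs the new and old hold values so that only the bits actually being changed are tested against the requester's privilege. A tiny sketch of that changed-bits idea, using made-up hold flags rather than the server's real HOLD_* values:

#include <stdio.h>

/* hypothetical hold bits, standing in for the real HOLD_* constants */
#define HOLD_USER     0x1
#define HOLD_OPERATOR 0x2
#define HOLD_SYSTEM   0x4

int main(void)
  {
  long old_hold = HOLD_USER | HOLD_SYSTEM;      /* 0x5 */
  long new_hold = HOLD_USER | HOLD_OPERATOR;    /* 0x3 */

  /* XOR keeps only the differing bits: OPERATOR (added) and SYSTEM (dropped) */
  long changed = new_hold ^ old_hold;

  printf("changed = 0x%lx\n", changed);         /* 0x6 */
  return(0);
  }
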
Example #12
static void req_stat_job_step2(

  struct stat_cntl *cntl)  /* I/O (free'd on return) */

  {
  svrattrl              *pal;
  job                   *pjob = NULL;

  struct batch_request  *preq;
  struct batch_reply    *preply;
  int                    rc = 0;
  enum TJobStatTypeEnum  type;
  pbs_queue             *pque = NULL;
  int                    exec_only = 0;

  int                    bad = 0;
  long                   DTime;  /* delta time - only report full pbs_attribute list if J->MTime > DTime */
  static svrattrl       *dpal = NULL;
  int                    job_array_index = 0;
  job_array             *pa = NULL;
  char                   log_buf[LOCAL_LOG_BUF_SIZE];
  int                    iter;
  time_t                 time_now = time(NULL);
  long                   poll_jobs = 0;
  char                   job_id[PBS_MAXSVRJOBID+1];
  int                    job_substate = -1;
  time_t                 job_momstattime = -1;

  preq   = cntl->sc_origrq;
  type   = (enum TJobStatTypeEnum)cntl->sc_type;
  preply = &preq->rq_reply;

  /* See pbs_server_attributes(1B) for details on "poll_jobs" behaviour */

  if (dpal == NULL)
    {
    /* build 'delta' pbs_attribute list */

    svrattrl *tpal;

    tlist_head dalist;

    int aindex;

    int atrlist[] =
      {
      JOB_ATR_jobname,
      JOB_ATR_resc_used,
      JOB_ATR_LAST
      };

    CLEAR_LINK(dalist);

    for (aindex = 0;atrlist[aindex] != JOB_ATR_LAST;aindex++)
      {
      if ((tpal = attrlist_create("", "", 23)) == NULL)
        {
        return;
        }

      tpal->al_valln = atrlist[aindex];

      if (dpal == NULL)
        dpal = tpal;

      append_link(&dalist, &tpal->al_link, tpal);
      }
    }  /* END if (dpal == NULL) */

  if (type == tjstArray)
    {
    pa = get_array(preq->rq_ind.rq_status.rq_id);

    if (pa == NULL)
      {
      req_reject(PBSE_UNKARRAYID, 0, preq, NULL, "unable to find array");
      return;
      }
    }

  iter = -1;

  get_svr_attr_l(SRV_ATR_PollJobs, &poll_jobs);
  if (!poll_jobs)
    {
    /* polljobs not set - indicates we may need to obtain fresh data from
       MOM */

    if (cntl->sc_jobid[0] == '\0')
      pjob = NULL;
    else
      pjob = svr_find_job(cntl->sc_jobid, FALSE);

    while (1)
      {
      if (pjob == NULL)
        {
        /* start from the first job */

        if (type == tjstJob)
          {
          pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE);
          }
        else if (type == tjstQueue)
          {
          pjob = next_job(cntl->sc_pque->qu_jobs,&iter);
          }
        else if (type == tjstArray)
          {
          job_array_index = 0;
          /* increment job_array_index until we find a non-null pointer or hit the end */
          while (job_array_index < pa->ai_qs.array_size)
            {
            if (pa->job_ids[job_array_index] != NULL)
              {
              if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
                {
                unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
                break;
                }
              }

            job_array_index++;
            }
          }
        else
          {
          pjob = next_job(&alljobs,&iter);
          }

        }    /* END if (pjob == NULL) */
      else
        {
        strcpy(job_id, pjob->ji_qs.ji_jobid);
        unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

        if (type == tjstJob)
          break;

        if (type == tjstQueue)
          pjob = next_job(cntl->sc_pque->qu_jobs,&iter);
        else if (type == tjstArray)
          {
          pjob = NULL;
          /* increment job_array_index until we find a non-null pointer or hit the end */
          while (++job_array_index < pa->ai_qs.array_size)
            {
            if (pa->job_ids[job_array_index] != NULL)
              {
              if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
                {
                unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
                break;
                }
              }
            }
          }
        else
          pjob = next_job(&alljobs,&iter);
          
        }

      if (pjob == NULL)
        break;

      strcpy(job_id, pjob->ji_qs.ji_jobid);
      job_substate = pjob->ji_qs.ji_substate;
      job_momstattime = pjob->ji_momstat;
      strcpy(cntl->sc_jobid, job_id);
      unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
      pjob = NULL;

      /* PBS_RESTAT_JOB defaults to 30 seconds */
      if ((job_substate == JOB_SUBSTATE_RUNNING) &&
          ((time_now - job_momstattime) > JobStatRate))
        {
        /* go to MOM for status */
        if ((rc = stat_to_mom(job_id, cntl)) == PBSE_MEM_MALLOC)
          break;

        if (rc != 0)
          {
          pjob = svr_find_job(job_id, FALSE);

          rc = 0;

          continue;
          }
        
        if (pa != NULL)
          unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

        return; /* will pick up after mom replies */
        }
      }    /* END while(1) */

    if (rc != 0)
      {
      if (pa != NULL)
        unlock_ai_mutex(pa, __func__, "2", LOGLEVEL);

      reply_free(preply);

      req_reject(rc, 0, preq, NULL, "cannot get update from mom");

      return;
      }
    }    /* END if (!server.sv_attr[SRV_ATR_PollJobs].at_val.at_long) */

  /*
   * now ready for part 3, building the status reply,
   * loop through again
   */

  if ((type == tjstSummarizeArraysQueue) || 
      (type == tjstSummarizeArraysServer))
    {
    /* No array can be owned for these options */
    update_array_statuses();
    }

  if (type == tjstJob)
    pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE);

  else if (type == tjstQueue)
    pjob = next_job(cntl->sc_pque->qu_jobs,&iter);

  else if (type == tjstSummarizeArraysQueue)
    pjob = next_job(cntl->sc_pque->qu_jobs_array_sum,&iter);

  else if (type == tjstSummarizeArraysServer)
    pjob = next_job(&array_summary,&iter);

  else if (type == tjstArray)
    {
    job_array_index = -1;
    pjob = NULL;
    /* increment job_array_index until we find a non-null pointer or hit the end */
    while (++job_array_index < pa->ai_qs.array_size)
      {
      if (pa->job_ids[job_array_index] != NULL)
        {
        if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
          {
          break;
          }
        }
      }
    }
  else
    pjob = next_job(&alljobs,&iter);

  DTime = 0;

  if (preq->rq_extend != NULL)
    {
    char *ptr;

    /* FORMAT:  { EXECQONLY | DELTA:<EPOCHTIME> } */

    if (strstr(preq->rq_extend, EXECQUEONLY))
      exec_only = 1;

    ptr = strstr(preq->rq_extend, "DELTA:");

    if (ptr != NULL)
      {
      ptr += strlen("delta:");

      DTime = strtol(ptr, NULL, 10);
      }
    }


  if ((type == tjstTruncatedServer) || 
      (type == tjstTruncatedQueue))
    {
    long sentJobCounter;
    long qjcounter;
    long qmaxreport;
    int  iter = -1;

    /* loop through all queues */
    while ((pque = next_queue(&svr_queues,&iter)) != NULL)
      {
      qjcounter = 0;

      if ((exec_only == 1) &&
          (pque->qu_qs.qu_type != QTYPE_Execution))
        {
        /* ignore routing queues */
        unlock_queue(pque, __func__, "ignore queue", LOGLEVEL);
        continue;
        }

      if (((pque->qu_attr[QA_ATR_MaxReport].at_flags & ATR_VFLAG_SET) != 0) &&
          (pque->qu_attr[QA_ATR_MaxReport].at_val.at_long >= 0))
        {
        qmaxreport = pque->qu_attr[QA_ATR_MaxReport].at_val.at_long;
        }
      else
        {
        qmaxreport = TMAX_JOB;
        }

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buf,"giving scheduler up to %ld idle jobs in queue %s\n",
          qmaxreport,
          pque->qu_qs.qu_name);

        log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_QUEUE,pque->qu_qs.qu_name,log_buf);
        }

      sentJobCounter = 0;

      /* loop through jobs in queue */
      if (pjob != NULL)
        unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);

      iter = -1;

      while ((pjob = next_job(pque->qu_jobs,&iter)) != NULL)
        {
        if ((qjcounter >= qmaxreport) &&
            (pjob->ji_qs.ji_state == JOB_STATE_QUEUED))
          {
          /* max_report of queued jobs reached for queue */
          unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL);

          continue;
          }

        pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

        rc = status_job(
               pjob,
               preq,
               (pjob->ji_wattr[JOB_ATR_mtime].at_val.at_long >= DTime) ? pal : dpal,
               &preply->brp_un.brp_status,
               &bad);

        if ((rc != 0) && (rc != PBSE_PERM))
          {
          req_reject(rc, bad, preq, NULL, NULL);

          if (pa != NULL)
            {
            unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
            }
          unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL);
          unlock_queue(pque, __func__, "perm", LOGLEVEL);
          return;
          }

        sentJobCounter++;

        if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED)
          qjcounter++;

        unlock_ji_mutex(pjob, __func__, "8", LOGLEVEL);
        }    /* END foreach (pjob from pque) */

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buf,"sent scheduler %ld total jobs for queue %s\n",
          sentJobCounter,
          pque->qu_qs.qu_name);

        log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_QUEUE,pque->qu_qs.qu_name,log_buf);
        }
    
      unlock_queue(pque, __func__, "end while", LOGLEVEL);
      }      /* END for (pque) */
      
    if (pa != NULL)
      unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

    reply_send_svr(preq);

    return;
    }        /* END if ((type == tjstTruncatedServer) || ...) */

  while (pjob != NULL)
    {
    /* go ahead and build the status reply for this job */

    if (exec_only)
      {
      if (cntl->sc_pque != NULL)
        {
        if (cntl->sc_pque->qu_qs.qu_type != QTYPE_Execution)
          goto nextjob;
        }
      else
        {
        if (pa != NULL)
          pthread_mutex_unlock(pa->ai_mutex);
        pque = get_jobs_queue(&pjob);
        if (pa != NULL)
          pthread_mutex_lock(pa->ai_mutex);

        if ((pjob == NULL) ||
            (pque == NULL))
          goto nextjob;
        
        if (pque->qu_qs.qu_type != QTYPE_Execution)
          {
          unlock_queue(pque, __func__, "not exec", LOGLEVEL);
        
          goto nextjob;
          }

        unlock_queue(pque, __func__, "exec", LOGLEVEL);
        }
      }

    pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

    rc = status_job(
           pjob,
           preq,
           pal,
           &preply->brp_un.brp_status,
           &bad);

    if ((rc != 0) && 
        (rc != PBSE_PERM))
      {
      if (pa != NULL)
        {
        unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
        }
      unlock_ji_mutex(pjob, __func__, "9", LOGLEVEL);

      req_reject(rc, bad, preq, NULL, NULL);

      return;
      }

    /* get next job */

nextjob:

    if (pjob != NULL)
      unlock_ji_mutex(pjob, __func__, "10", LOGLEVEL);

    if (type == tjstJob)
      break;

    if (type == tjstQueue)
      pjob = next_job(cntl->sc_pque->qu_jobs,&iter);
    else if (type == tjstSummarizeArraysQueue)
      pjob = next_job(cntl->sc_pque->qu_jobs_array_sum,&iter);
    else if (type == tjstSummarizeArraysServer)
      pjob = next_job(&array_summary,&iter);
    else if (type == tjstArray)
      {
      pjob = NULL;
      /* increment job_array_index until we find a non-null pointer or hit the end */
      while (++job_array_index < pa->ai_qs.array_size)
        {
        if (pa->job_ids[job_array_index] != NULL)
          {
          if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
            {
            break;
            }
          }
        }
      }
    else
      pjob = next_job(&alljobs,&iter);

    rc = 0;
    }  /* END while (pjob != NULL) */

  if (pa != NULL)
    {
    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
    }
 
  reply_send_svr(preq);

  if (LOGLEVEL >= 7)
    {
    log_event(PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      "req_statjob",
      "Successfully returned the status of queued jobs\n");
    }

  return;
  }  /* END req_stat_job_step2() */
Example #13
int req_stat_job(

  struct batch_request *preq)  /* ptr to the decoded request */

  {
  struct stat_cntl     *cntl; /* see svrfunc.h  */
  char                 *name;
  job                  *pjob = NULL;
  pbs_queue            *pque = NULL;
  int                   rc = PBSE_NONE;
  long                  poll_jobs = 0;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];

  enum TJobStatTypeEnum type = tjstNONE;

  /*
   * first, validate the name of the requested object, either
   * a job, a queue, or the whole server.
   */
  if (LOGLEVEL >= 7)
    {
    sprintf(log_buf, "note");
    LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
    }


  /* FORMAT:  name = { <JOBID> | <QUEUEID> | '' } */

  name = preq->rq_ind.rq_status.rq_id;

  if (preq->rq_extend != NULL)
    {
    /* evaluate pbs_job_stat() 'extension' field */

    if (!strncasecmp(preq->rq_extend, "truncated", strlen("truncated")))
      {
      /* truncate response by 'max_report' */

      type = tjstTruncatedServer;
      }
    else if (!strncasecmp(preq->rq_extend, "summarize_arrays", strlen("summarize_arrays")))
      {
      type = tjstSummarizeArraysServer;
      }

    }    /* END if (preq->rq_extend != NULL) */

  if (isdigit((int)*name))
    {
    /* status a single job */

    if (is_array(name))
      {
      if (type != tjstSummarizeArraysServer)
        {
        type = tjstArray;
        }
      }
    else
      {
      type = tjstJob;

      if ((pjob = svr_find_job(name, FALSE)) == NULL)
        {
        rc = PBSE_UNKJOBID;
        }
      else
        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
      }
    }
  else if (isalpha(name[0]))
    {
    if (type == tjstNONE)
      type = tjstQueue;
    else if (type == tjstSummarizeArraysServer)
      type = tjstSummarizeArraysQueue;
    else
      type = tjstTruncatedQueue;

    /* if found, this mutex is released later */
    if ((pque = find_queuebyname(name)) == NULL)
      {
      rc = PBSE_UNKQUE;
      }
    }
  else if ((*name == '\0') || (*name == '@'))
    {
    /* status all jobs at server */

    if (type == tjstNONE)
      type = tjstServer;
    }
  else
    {
    rc = PBSE_IVALREQ;
    }

  if (rc != 0)
    {
    /* is invalid - an error */
    req_reject(rc, 0, preq, NULL, NULL);

    return(rc);
    }

  preq->rq_reply.brp_choice = BATCH_REPLY_CHOICE_Status;

  CLEAR_HEAD(preq->rq_reply.brp_un.brp_status);

  cntl = (struct stat_cntl *)calloc(1, sizeof(struct stat_cntl));

  if (cntl == NULL)
    {
    if (pque != NULL) 
      unlock_queue(pque, "req_stat_job", "no memory cntl", LOGLEVEL);
    req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

    return(PBSE_SYSTEM);
    }

  if ((type == tjstTruncatedQueue) ||
      (type == tjstTruncatedServer))
    {
    if (pque != NULL)
      {
      unlock_queue(pque, __func__, "", LOGLEVEL);
      pque = NULL;
      }
    }

  cntl->sc_type   = (int)type;
  cntl->sc_conn   = -1;
  cntl->sc_pque   = pque;
  cntl->sc_origrq = preq;
  cntl->sc_post   = req_stat_job_step2;
  cntl->sc_jobid[0] = '\0'; /* cause "start from beginning" */

  get_svr_attr_l(SRV_ATR_PollJobs, &poll_jobs);
  if (poll_jobs)
    cntl->sc_post = 0; /* we're not going to make clients wait */

  req_stat_job_step2(cntl); /* go to step 2, see if running is current */

  if (pque != NULL)
    unlock_queue(pque, "req_stat_job", "success", LOGLEVEL);

  free(cntl);
  return(PBSE_NONE);
  }  /* END req_stat_job() */
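
req_stat_job() decides what is being statused purely from the first character of the request name: a digit means a job or array id, a letter means a queue, and an empty name or '@' means the whole server. A standalone sketch of that classification:

#include <ctype.h>
#include <stdio.h>

enum stat_target { TARGET_JOB, TARGET_QUEUE, TARGET_SERVER, TARGET_INVALID };

/* classify a status request name the way req_stat_job() does */
static enum stat_target classify_name(const char *name)
  {
  if (isdigit((unsigned char)name[0]))
    return(TARGET_JOB);              /* job or job array id */

  if (isalpha((unsigned char)name[0]))
    return(TARGET_QUEUE);            /* queue name */

  if ((name[0] == '\0') || (name[0] == '@'))
    return(TARGET_SERVER);           /* all jobs at the server */

  return(TARGET_INVALID);
  }

int main(void)
  {
  printf("%d %d %d %d\n",
    classify_name("1234.server"),    /* 0: job */
    classify_name("batch"),          /* 1: queue */
    classify_name(""),               /* 2: server */
    classify_name("#bad"));          /* 3: invalid */
  return(0);
  }
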
Example #14
void svr_mailowner(

  job   *pjob,      /* I */
  int    mailpoint, /* note, single character  */
  int    force,     /* if set to MAIL_FORCE, force mail delivery */
  const char  *text)      /* (optional) additional message text */

  {
  static const char   *memory_err = "Cannot allocate memory to send email";

  char                  mailto[1024];
  char                 *domain = NULL;
  int                   i;
  mail_info            *mi;
  long                  no_force = FALSE;

  struct array_strings *pas;
  memset(mailto, 0, sizeof(mailto));

  get_svr_attr_str(SRV_ATR_MailDomain, &domain);
  if ((domain != NULL) &&
      (!strcasecmp("never", domain)))
    {
    /* never send user mail under any conditions */
    if (LOGLEVEL >= 3) 
      {
      log_event(PBSEVENT_ERROR | PBSEVENT_ADMIN | PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        "Not sending email: Mail domain set to 'never'\n");
      }

    return;
    }

  if (LOGLEVEL >= 3)
    {
    char tmpBuf[LOG_BUF_SIZE];

    snprintf(tmpBuf, LOG_BUF_SIZE, "preparing to send '%c' mail for job %s to %s (%.64s)\n",
             (char)mailpoint,
             pjob->ji_qs.ji_jobid,
             pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str,
             (text != NULL) ? text : "---");

    log_event(
      PBSEVENT_ERROR | PBSEVENT_ADMIN | PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      tmpBuf);
    }

  /*
   * if force is true, force the mail out regardless of mailpoint
   * unless server no_mail_force attribute is set to true
   */
  get_svr_attr_l(SRV_ATR_NoMailForce, &no_force);

  if ((force != MAIL_FORCE) ||
      (no_force == TRUE))
    {

    if (pjob->ji_wattr[JOB_ATR_mailpnts].at_flags & ATR_VFLAG_SET)
      {
      if (*(pjob->ji_wattr[JOB_ATR_mailpnts].at_val.at_str) ==  MAIL_NONE)
        {
        /* do not send mail. No mail requested on job */
        log_event(PBSEVENT_JOB,
                  PBS_EVENTCLASS_JOB,
                  pjob->ji_qs.ji_jobid,
                  "Not sending email: job requested no e-mail");
        return;
        }
      /* see if user specified mail of this type */
      if (strchr(
            pjob->ji_wattr[JOB_ATR_mailpnts].at_val.at_str,
            mailpoint) == NULL)
        {
        /* do not send mail */
        log_event(PBSEVENT_ERROR | PBSEVENT_ADMIN | PBSEVENT_JOB,
          PBS_EVENTCLASS_JOB,
          pjob->ji_qs.ji_jobid,
          "Not sending email: User does not want mail of this type.\n");

        return;
        }
      }
    else if (mailpoint != MAIL_ABORT) /* not set, default to abort */
      {
      log_event(PBSEVENT_ERROR | PBSEVENT_ADMIN | PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        "Not sending email: Default mailpoint does not include this type.\n");

      return;
      }
    }

  mi = (mail_info *)calloc(1, sizeof(mail_info));

  if (mi == NULL)
    {
    log_err(ENOMEM, __func__, memory_err);
    return;
    }

  /* Who does the mail go to?  If mail-list, them; else owner */
  mailto[0] = '\0';

  if (pjob->ji_wattr[JOB_ATR_mailuser].at_flags & ATR_VFLAG_SET)
    {
    /* has mail user list, send to them rather than owner */

    pas = pjob->ji_wattr[JOB_ATR_mailuser].at_val.at_arst;

    if (pas != NULL)
      {
      for (i = 0;i < pas->as_usedptr;i++)
        {
        if ((strlen(mailto) + strlen(pas->as_string[i]) + 2) < sizeof(mailto))
          {
          strcat(mailto, pas->as_string[i]);
          strcat(mailto, " ");
          }
        }
      }
    }
  else
    {
    /* no mail user list, just send to owner */

    if (domain != NULL)
      {
      snprintf(mailto, sizeof(mailto), "%s@%s",
        pjob->ji_wattr[JOB_ATR_euser].at_val.at_str, domain);

      if (LOGLEVEL >= 5) 
        {
        char tmpBuf[LOG_BUF_SIZE];

        snprintf(tmpBuf,sizeof(tmpBuf),
          "Updated mailto from job owner and mail domain: '%s'\n",
          mailto);
        log_event(PBSEVENT_ERROR | PBSEVENT_ADMIN | PBSEVENT_JOB,
          PBS_EVENTCLASS_JOB,
          pjob->ji_qs.ji_jobid,
          tmpBuf);
        }
      }
    else
      {
#ifdef TMAILDOMAIN
      snprintf(mailto, sizeof(mailto), "%s@%s",
        pjob->ji_wattr[JOB_ATR_euser].at_val.at_str, TMAILDOMAIN);
#else /* TMAILDOMAIN */
      snprintf(mailto, sizeof(mailto), "%s", pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str);
#endif /* TMAILDOMAIN */

      if (LOGLEVEL >= 5)
        {
        char tmpBuf[LOG_BUF_SIZE];

        snprintf(tmpBuf,sizeof(tmpBuf),
          "Updated mailto from job owner: '%s'\n",
          mailto);
        log_event(PBSEVENT_ERROR | PBSEVENT_ADMIN | PBSEVENT_JOB,
          PBS_EVENTCLASS_JOB,
          pjob->ji_qs.ji_jobid,
          tmpBuf);
        }
      }
    }

  /* initialize the mail information */

  if ((mi->mailto = strdup(mailto)) == NULL)
    {
    log_err(ENOMEM, __func__, memory_err);
    free(mi);
    return;
    }

  mi->mail_point = mailpoint;

  if (pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str != NULL)
    {
    mi->exec_host = strdup(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str);

    if (mi->exec_host == NULL)
      {
      log_err(ENOMEM, __func__, memory_err);
      free(mi);
      return;
      }
    }
  else
    mi->exec_host = NULL;

  if ((mi->jobid = strdup(pjob->ji_qs.ji_jobid)) == NULL)
    {
    log_err(ENOMEM, __func__, memory_err);
    free(mi);
    return;
    }

  if (pjob->ji_wattr[JOB_ATR_jobname].at_val.at_str != NULL)
    {
    mi->jobname = strdup(pjob->ji_wattr[JOB_ATR_jobname].at_val.at_str);

    if (mi->jobname == NULL)
      {
      log_err(ENOMEM, __func__, memory_err);
      free(mi);
      return;
      }
    }
  else
    mi->jobname = NULL;

  if (text)
    {
    if ((mi->text = strdup(text)) == NULL)
      {
      free(mi);
      log_err(ENOMEM, __func__, memory_err);
      return;
      }
    }
  else
    mi->text = NULL;

  /* have a thread do the work of sending the mail */
  enqueue_threadpool_request(send_the_mail,mi);

  return;
  }  /* END svr_mailowner() */
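
The mail-user loop in svr_mailowner() only appends an address when it, plus a separating space and the terminator, still fits in mailto[]. A standalone sketch of that bounded append; the example addresses are purely illustrative:

#include <stdio.h>
#include <string.h>

/* append addr plus a trailing space to dest, but only if it fits */
static int append_addr(char *dest, size_t destlen, const char *addr)
  {
  if (strlen(dest) + strlen(addr) + 2 >= destlen)
    return(0);                       /* would overflow: skip this address */

  strcat(dest, addr);
  strcat(dest, " ");
  return(1);
  }

int main(void)
  {
  char        mailto[32] = "";
  const char *users[] = { "alice@example.com", "bob@example.com" };
  size_t      i;

  for (i = 0; i < sizeof(users) / sizeof(users[0]); i++)
    append_addr(mailto, sizeof(mailto), users[i]);

  printf("'%s'\n", mailto);          /* only the addresses that fit are kept */
  return(0);
  }
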
Example #15
int req_rerunjob(
   
  struct batch_request *preq)

  {
  int     rc = PBSE_NONE;
  job    *pjob;

  int     Force;
  int     MgrRequired = TRUE;
  char    log_buf[LOCAL_LOG_BUF_SIZE];

  /* check if requestor is admin, job owner, etc */

  if ((pjob = chk_job_request(preq->rq_ind.rq_rerun, preq)) == 0)
    {
    /* FAILURE */

    /* chk_job_request calls req_reject() */

    rc = PBSE_SYSTEM;
    return rc; /* This needs to be fixed to return an accurate error */
    }

  /* the job must be running or completed */

  if (pjob->ji_qs.ji_state >= JOB_STATE_EXITING)
    {
    if (pjob->ji_wattr[JOB_ATR_checkpoint_name].at_flags & ATR_VFLAG_SET)
      {
      /* allow end-users to rerun checkpointed jobs */

      MgrRequired = FALSE;
      }
    }
  else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* job is running */

    /* NO-OP */
    }
  else
    {
    /* FAILURE - job is in bad state */
    rc = PBSE_BADSTATE;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s is in a bad state",
        preq->rq_ind.rq_rerun);
    req_reject(rc, 0, preq, NULL, log_buf);
    unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
    return rc;
    }

  if ((MgrRequired == TRUE) &&
      ((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0))
    {
    /* FAILURE */

    rc = PBSE_PERM;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
        "additional permissions required (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)");
    req_reject(rc, 0, preq, NULL, log_buf);
    unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
    return rc;
    }

  /* the job must be rerunnable */

  if (pjob->ji_wattr[JOB_ATR_rerunable].at_val.at_long == 0)
    {
    /* NOTE:  should force override this constraint? maybe (???) */
    /*          no, the user is saying that the job will break, and
                IEEE Std 1003.1 specifically says rerun is to be rejected
                if rerunable==FALSE -garrick */

    rc = PBSE_NORERUN;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s not rerunnable",
        preq->rq_ind.rq_rerun);
    req_reject(rc, 0, preq, NULL, log_buf);
    unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
    return rc;
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* ask MOM to kill off the job if it is running */
    static const char *rerun = "rerun";
    char              *extra = strdup(rerun);

    rc = issue_signal(&pjob, "SIGKILL", post_rerun, extra);
    }
  else
    { 
    if (pjob->ji_wattr[JOB_ATR_hold].at_val.at_long == HOLD_n)
      {
      svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_QUEUED, FALSE);
      }
    else
      {
      svr_setjobstate(pjob, JOB_STATE_HELD, JOB_SUBSTATE_HELD, FALSE);
      }

    /* reset some job attributes */
    
    pjob->ji_wattr[JOB_ATR_comp_time].at_flags &= ~ATR_VFLAG_SET;
    pjob->ji_wattr[JOB_ATR_reported].at_flags &= ~ATR_VFLAG_SET;

    set_statechar(pjob);

    rc = -1;
    }

  if (preq->rq_extend && !strncasecmp(preq->rq_extend, RERUNFORCE, strlen(RERUNFORCE)))
    Force = 1;
  else
    Force = 0;

  switch (rc)
    {

    case - 1:

      /* completed job was requeued */

      /* clear out job completion time if there is one */
      break;

    case 0:

      /* requeue request successful */

      if (pjob != NULL)
        pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

      break;

    case PBSE_SYSTEM: /* This may not be accurate...*/
      rc = PBSE_MEM_MALLOC;
      snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Cannot allocate memory");
      req_reject(rc, 0, preq, NULL, log_buf);
      return rc;
      break;

    default:

      if (Force == 0)
        {
        rc = PBSE_MOMREJECT;
        snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Rejected by mom");
        req_reject(rc, 0, preq, NULL, log_buf);
        if (pjob != NULL)
          unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);
        return rc;
        }
      else
        {
        int           newstate;
        int           newsubst;
        unsigned int  dummy;
        char         *tmp;
        long          cray_enabled = FALSE;
       
        if (pjob != NULL)
          {
          get_svr_attr_l(SRV_ATR_CrayEnabled, &cray_enabled);

          if ((cray_enabled == TRUE) &&
              (pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str != NULL))
            tmp = parse_servername(pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str, &dummy);
          else
            tmp = parse_servername(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, &dummy);
          
          /* Cannot communicate with MOM, forcibly requeue job.
             This is a relatively disgusting thing to do */
          
          sprintf(log_buf, "rerun req to %s failed (rc=%d), forcibly requeueing job",
            tmp, rc);

          free(tmp);
  
          log_event(
            PBSEVENT_ERROR | PBSEVENT_ADMIN | PBSEVENT_JOB,
            PBS_EVENTCLASS_JOB,
            pjob->ji_qs.ji_jobid,
            log_buf);
          
          log_err(-1, __func__, log_buf);
          
          strcat(log_buf, ", previous output files may be lost");
  
          svr_mailowner(pjob, MAIL_OTHER, MAIL_FORCE, log_buf);
  
          svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_RERUN3, FALSE);
  
          rel_resc(pjob); /* free resc assigned to job */
          
          if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_HOTSTART) == 0)
            {
            /* in case of server shutdown, don't clear exec_host */
            /* will use it on hotstart when next comes up        */
            
            job_attr_def[JOB_ATR_exec_host].at_free(&pjob->ji_wattr[JOB_ATR_exec_host]);
  
            job_attr_def[JOB_ATR_session_id].at_free(&pjob->ji_wattr[JOB_ATR_session_id]);
            
            job_attr_def[JOB_ATR_exec_gpus].at_free(&pjob->ji_wattr[JOB_ATR_exec_gpus]);          
            }
          
          pjob->ji_modified = 1;    /* force full job save */
          
          pjob->ji_momhandle = -1;
          pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_StagedIn;
          
          svr_evaljobstate(pjob, &newstate, &newsubst, 0);
          svr_setjobstate(pjob, newstate, newsubst, FALSE);
          }
        }

      break;
    }  /* END switch (rc) */

  /* So job has run and is to be rerun (not restarted) */
  if (pjob == NULL)
    {
    rc = PBSE_JOB_RERUN;
    }
  else
    {
    pjob->ji_qs.ji_svrflags = (pjob->ji_qs.ji_svrflags &
        ~(JOB_SVFLG_CHECKPOINT_FILE |JOB_SVFLG_CHECKPOINT_MIGRATEABLE |
          JOB_SVFLG_CHECKPOINT_COPIED)) | JOB_SVFLG_HASRUN;
    
    sprintf(log_buf, msg_manager, msg_jobrerun, preq->rq_user, preq->rq_host);
    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    reply_ack(preq);
  
    /* note in accounting file */
    account_record(PBS_ACCT_RERUN, pjob, NULL);
    unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL);
    }

  return rc;
  }  /* END req_rerunjob() */
int process_request(

  struct tcp_chan *chan) /* file descriptor (socket) to get request */

  {
  int                   rc = PBSE_NONE;
  struct batch_request *request = NULL;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  long                  acl_enable = FALSE;
  long                  state = SV_STATE_DOWN;

  time_t                time_now = time(NULL);
  int                   free_request = TRUE;
  char                  tmpLine[MAXLINE];
  char                 *auth_err = NULL;
  enum conn_type        conn_active;
  unsigned short        conn_socktype;
  unsigned short        conn_authen;
  unsigned long         conn_addr;
  int                   sfds = chan->sock;

  pthread_mutex_lock(svr_conn[sfds].cn_mutex);
  conn_active = svr_conn[sfds].cn_active;
  conn_socktype = svr_conn[sfds].cn_socktype;
  conn_authen = svr_conn[sfds].cn_authen;
  conn_addr = svr_conn[sfds].cn_addr;
  svr_conn[sfds].cn_lasttime = time_now;
  pthread_mutex_unlock(svr_conn[sfds].cn_mutex);

  if ((request = alloc_br(0)) == NULL)
    {
    snprintf(tmpLine, sizeof(tmpLine),
        "cannot allocate memory for request from %lu",
        conn_addr);
    req_reject(PBSE_MEM_MALLOC, 0, request, NULL, tmpLine);
    free_request = FALSE;
    rc = PBSE_SYSTEM;
    goto process_request_cleanup;
    }

  request->rq_conn = sfds;

  /*
   * Read in the request and decode it to the internal request structure.
   */
  if (conn_active == FromClientDIS || conn_active == ToServerDIS)
    {
#ifdef ENABLE_UNIX_SOCKETS

    if ((conn_socktype & PBS_SOCK_UNIX) &&
        (conn_authen != PBS_NET_CONN_AUTHENTICATED))
      {
      /* get_creds interestingly always returns 0 */
      get_creds(sfds, conn_credent[sfds].username, conn_credent[sfds].hostname);
      }

#endif /* END ENABLE_UNIX_SOCKETS */
    rc = dis_request_read(chan, request);
    }
  else
    {
    char out[80];

    snprintf(tmpLine, MAXLINE, "request on invalid type of connection: %d, sock type: %d, from address %s", 
                conn_active,conn_socktype, netaddr_long(conn_addr, out));
    log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_REQUEST,
      "process_req", tmpLine);
    snprintf(tmpLine, sizeof(tmpLine),
        "request on invalid type of connection (%d) from %s",
        conn_active,
        netaddr_long(conn_addr, out));
    req_reject(PBSE_BADHOST, 0, request, NULL, tmpLine);
    free_request = FALSE;
    rc = PBSE_BADHOST;
    goto process_request_cleanup;
    }

  if (rc == -1)
    {
    /* FAILURE */
    /* premature end of file */
    rc = PBSE_PREMATURE_EOF;
    goto process_request_cleanup;
    }

  if ((rc == PBSE_SYSTEM) || (rc == PBSE_INTERNAL) || (rc == PBSE_SOCKET_CLOSE))
    {
    /* FAILURE */
    /* read error, likely cannot send reply so just disconnect */
    /* ??? not sure about this ??? */
    goto process_request_cleanup;
    }

  if (rc > 0)
    {
    /* FAILURE */

    /*
     * request didn't decode, either garbage or unknown
     * request type, in either case, return reject-reply
     */

    req_reject(rc, 0, request, NULL, "cannot decode message");
    free_request = FALSE;
    goto process_request_cleanup;
    }

  if (get_connecthost(sfds, request->rq_host, PBS_MAXHOSTNAME) != 0)
    {
    sprintf(log_buf, "%s: %lu",
      pbse_to_txt(PBSE_BADHOST),
      conn_addr);

    log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_REQUEST, "", log_buf);

    snprintf(tmpLine, sizeof(tmpLine),
        "cannot determine hostname for connection from %lu",
        conn_addr);

    req_reject(PBSE_BADHOST, 0, request, NULL, tmpLine);
    free_request = FALSE;
    rc = PBSE_BADHOST;
    goto process_request_cleanup;
    }

  if (LOGLEVEL >= 1)
    {
    sprintf(log_buf,
      msg_request,
      reqtype_to_txt(request->rq_type),
      request->rq_user,
      request->rq_host,
      sfds);

    log_event(PBSEVENT_DEBUG2, PBS_EVENTCLASS_REQUEST, "", log_buf);
    }

  /* is the request from a host acceptable to the server */
  if (conn_socktype & PBS_SOCK_UNIX)
    {
    strcpy(request->rq_host, server_name);
    }

  get_svr_attr_l(SRV_ATR_acl_host_enable, &acl_enable);
  if (acl_enable)
    {
    /* acl enabled, check it; always allow myself and nodes */
    struct array_strings *pas = NULL;
    struct pbsnode       *isanode;

    get_svr_attr_arst(SRV_ATR_acl_hosts, &pas);
    isanode = PGetNodeFromAddr(conn_addr);

    if ((isanode == NULL) &&
        (strcmp(server_host, request->rq_host) != 0) &&
        (acl_check_my_array_string(pas, request->rq_host, ACL_Host) == 0))
      {
      char tmpLine[MAXLINE];
      snprintf(tmpLine, sizeof(tmpLine), "request not authorized from host %s",
               request->rq_host);

      req_reject(PBSE_BADHOST, 0, request, NULL, tmpLine);
      free_request = FALSE;
      rc = PBSE_BADHOST;
      goto process_request_cleanup;
      }

    if (isanode != NULL)
      unlock_node(isanode, "process_request", NULL, LOGLEVEL);
    }

  /*
   * determine source (user client or another server) of request.
   * set the permissions granted to the client
   */

  if (conn_authen == PBS_NET_CONN_FROM_PRIVIL)
    {
    /* request came from another server */

    request->rq_fromsvr = 1;

    request->rq_perm =
      ATR_DFLAG_USRD | ATR_DFLAG_USWR |
      ATR_DFLAG_OPRD | ATR_DFLAG_OPWR |
      ATR_DFLAG_MGRD | ATR_DFLAG_MGWR |
      ATR_DFLAG_SvWR;
    }
  else
    {
    /* request not from another server */
    conn_credent[sfds].timestamp = time_now;

    request->rq_fromsvr = 0;

    /*
     * Client must be authenticated by an Authenticate User Request, if not,
     * reject request and close connection.  -- The following is retained for
     * compat with old cmds -- The exception to this is of course the Connect
     * Request which cannot have been authenticated, because it contains the
     * needed ticket; so trap it here.  Of course, there is no prior
     * authentication on the Authenticate User request either, but it comes
     * over a reserved port and appears from another server, hence is
     * automatically granted authentication.
     *
     * The above is only true with inet sockets.  With unix domain sockets, the
     * user creds were read before the first dis_request_read call above.
     * We automatically granted authentication because we can trust the socket
     * creds.  Authorization is still granted in svr_get_privilege below
     */

    if (request->rq_type == PBS_BATCH_Connect)
      {
      req_connect(request);

      if (conn_socktype == PBS_SOCK_INET)
        {
        rc = PBSE_IVALREQ;
        req_reject(rc, 0, request, NULL, NULL);
        free_request = FALSE;
        goto process_request_cleanup;
        }

      }

    if (conn_socktype & PBS_SOCK_UNIX)
      {
      pthread_mutex_lock(svr_conn[sfds].cn_mutex);
      svr_conn[sfds].cn_authen = PBS_NET_CONN_AUTHENTICATED;
      pthread_mutex_unlock(svr_conn[sfds].cn_mutex);
      }

    if (ENABLE_TRUSTED_AUTH == TRUE )
      rc = PBSE_NONE;  /* bypass the authentication of the user--trust the client completely */
    else if (munge_on)
      {
      /* If munge_on is true we will validate the connection now */
      if (request->rq_type == PBS_BATCH_AltAuthenUser)
        {
        rc = req_altauthenuser(request);
        free_request = FALSE;
        goto process_request_cleanup;
        }
      else
        {
        rc = authenticate_user(request, &conn_credent[sfds], &auth_err);
        }
      }
    else if (conn_authen != PBS_NET_CONN_AUTHENTICATED)
      /* skip checking user if we did not get an authenticated credential */
      rc = PBSE_BADCRED;
    else
      rc = authenticate_user(request, &conn_credent[sfds], &auth_err);

    if (rc != 0)
      {
      req_reject(rc, 0, request, NULL, auth_err);
      if (auth_err != NULL)
        free(auth_err);
      free_request = FALSE;
      goto process_request_cleanup;
      }

    /*
     * pbs_mom and checkpoint restart scripts both need the authority to do
     * alters and releases on checkpointable jobs.  Allow manager permission
     * for root on the jobs execution node.
     */
     
    if (((request->rq_type == PBS_BATCH_ModifyJob) ||
        (request->rq_type == PBS_BATCH_ReleaseJob)) &&
        (strcmp(request->rq_user, PBS_DEFAULT_ADMIN) == 0))
      {
      job *pjob;
      char *dptr;
      int skip = FALSE;
      char short_host[PBS_MAXHOSTNAME+1];

      /* make short host name */

      strcpy(short_host, request->rq_host);
      if ((dptr = strchr(short_host, '.')) != NULL)
        {
        *dptr = '\0';
        }
      
      if ((pjob = svr_find_job(request->rq_ind.rq_modify.rq_objname, FALSE)) != (job *)0)
        {
        if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
          {

          if ((pjob->ji_wattr[JOB_ATR_checkpoint].at_flags & ATR_VFLAG_SET) &&
              ((csv_find_string(pjob->ji_wattr[JOB_ATR_checkpoint].at_val.at_str, "s") != NULL) ||
               (csv_find_string(pjob->ji_wattr[JOB_ATR_checkpoint].at_val.at_str, "c") != NULL) ||
               (csv_find_string(pjob->ji_wattr[JOB_ATR_checkpoint].at_val.at_str, "enabled") != NULL)) &&
              (strstr(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, short_host) != NULL))
            {
            request->rq_perm = svr_get_privilege(request->rq_user, server_host);
            skip = TRUE;
            }

          }
        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
        }
      
      if (!skip)
        {
        request->rq_perm = svr_get_privilege(request->rq_user, request->rq_host);
        }
      }
    else
      {
      request->rq_perm = svr_get_privilege(request->rq_user, request->rq_host);
      }
    }  /* END else (conn_authen == PBS_NET_CONN_FROM_PRIVIL) */

  /* if server shutting down, disallow new jobs and new running */
  get_svr_attr_l(SRV_ATR_State, &state);

  if (state > SV_STATE_RUN)
    {
    switch (request->rq_type)
      {
      case PBS_BATCH_AsyrunJob:
      case PBS_BATCH_JobCred:
      case PBS_BATCH_MoveJob:
      case PBS_BATCH_QueueJob:
      case PBS_BATCH_RunJob:
      case PBS_BATCH_StageIn:
      case PBS_BATCH_jobscript:

        req_reject(PBSE_SVRDOWN, 0, request, NULL, NULL);
        rc = PBSE_SVRDOWN;
        free_request = FALSE;
        goto process_request_cleanup;
        /*NOTREACHED*/

        break;
      }
    }

  /*
   * dispatch the request to the correct processing function.
   * The processing function must call reply_send() to free
   * the request structure.
   */

  rc = dispatch_request(sfds, request);

  return(rc);

process_request_cleanup:

  if (free_request == TRUE)
    free_br(request);

  return(rc);
  }  /* END process_request() */

int process_status_info(

  const char               *nd_name,
  std::vector<std::string> &status_info)

  {
  const char     *name = nd_name;
  struct pbsnode *current;
  long            mom_job_sync = FALSE;
  long            auto_np = FALSE;
  long            down_on_error = FALSE;
  int             dont_change_state = FALSE;
  pbs_attribute   temp;
  int             rc = PBSE_NONE;
  bool            send_hello = false;

  get_svr_attr_l(SRV_ATR_MomJobSync, &mom_job_sync);
  get_svr_attr_l(SRV_ATR_AutoNodeNP, &auto_np);
  get_svr_attr_l(SRV_ATR_DownOnError, &down_on_error);

  /* Before filling the "temp" pbs_attribute, initialize it.
   * The second and third parameter to decode_arst are never
   * used, so just leave them empty. (GBS) */
  memset(&temp, 0, sizeof(temp));

  if ((rc = decode_arst(&temp, NULL, NULL, NULL, 0)) != PBSE_NONE)
    {
    log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_NODE, __func__, "cannot initialize attribute");
    return(rc);
    }

  /* if original node cannot be found do not process the update */
  if ((current = find_nodebyname(nd_name)) == NULL)
    return(PBSE_NONE);

  //A node we put to sleep is up and running.
  if (current->nd_power_state != POWER_STATE_RUNNING)
    {
    //Make sure a stray update that arrived after we changed the state has had
    //time to pass by before marking the node as running again.
    if ((current->nd_power_state_change_time + NODE_POWER_CHANGE_TIMEOUT) < time(NULL))
      {
      current->nd_power_state = POWER_STATE_RUNNING;
      write_node_power_state();
      }
    }

  /* loop over each string */
  for (unsigned int i = 0; i != status_info.size(); i++)
    {
    const char *str = status_info[i].c_str();
    /* these two options are for switching nodes */
    if (!strncmp(str, NUMA_KEYWORD, strlen(NUMA_KEYWORD)))
      {
      /* if we've already processed some, save this before moving on */
      if (i != 0)
        save_node_status(current, &temp);
      
      dont_change_state = FALSE;

      if ((current = get_numa_from_str(str, current)) == NULL)
        break;
      else
        continue;
      }
    else if (!strncmp(str, "node=", strlen("node=")))
      {
      /* if we've already processed some, save this before moving on */
      if (i != 0)
        save_node_status(current, &temp);

      dont_change_state = FALSE;

      if ((current = get_node_from_str(str, name, current)) == NULL)
        break;
      else
        {
        if (current->nd_mom_reported_down == TRUE)
          {
          /* There is a race condition if using a mom hierarchy and manually
           * shutting down a non-level 1 mom: if its message that the mom is
           * shutting down gets there before its last status update, the node
           * can incorrectly be set as free again. For that reason, only set
           * a mom back up if it's reporting for itself. */
          if (strcmp(name, str + strlen("node=")) != 0)
            dont_change_state = TRUE;
          else
            current->nd_mom_reported_down = FALSE;
          }

        continue;
        }
      }

    /* add the info to the "temp" pbs_attribute */
    else if (!strcmp(str, START_GPU_STATUS))
      {
      is_gpustat_get(current, i, status_info);
      str = status_info[i].c_str();
      }
    else if (!strcmp(str, START_MIC_STATUS))
      {
      process_mic_status(current, i, status_info);
      str = status_info[i].c_str();
      }
#ifdef PENABLE_LINUX_CGROUPS
    else if (!strncmp(str, "layout", 6))
      {
      if (current->nd_layout == NULL)
        {
        current->nd_layout = new Machine(status_info[i]);
        }

      continue;
      }
#endif
    else if (!strcmp(str, "first_update=true"))
      {
      /* mom is requesting that we send the mom hierarchy file to her */
      //remove_hello(&hellos, current->nd_id);
      send_hello = true;
      
      /* reset gpu data in case mom reconnects with changed gpus */
      clear_nvidia_gpus(current);
      }
    else if ((rc = decode_arst(&temp, NULL, NULL, str, 0)) != PBSE_NONE)
      {
      DBPRT(("is_stat_get: cannot add attributes\n"));

      free_arst(&temp);

      break;
      }

    if (!strncmp(str, "state", 5))
      {
      if (dont_change_state == FALSE)
        process_state_str(current, str);
      }
    else if ((allow_any_mom == TRUE) &&
             (!strncmp(str, "uname", 5))) 
      {
      process_uname_str(current, str);
      }
    else if (!strncmp(str, "me", 2))  /* shorter str compare than "message" */
      {
      if ((!strncmp(str, "message=ERROR", 13)) &&
          (down_on_error == TRUE))
        {
        update_node_state(current, INUSE_DOWN);
        dont_change_state = TRUE;
        set_note_error(current, str);
        }
      }
    else if (!strncmp(str,"macaddr=",8))
      {
      update_node_mac_addr(current,str + 8);
      }
    else if ((mom_job_sync == TRUE) &&
             (!strncmp(str, "jobdata=", 8)))
      {
      /* update job attributes based on what the MOM gives us */      
      update_job_data(current, str + strlen("jobdata="));
      }
    else if ((mom_job_sync == TRUE) &&
             (!strncmp(str, "jobs=", 5)))
      {
      /* walk job list reported by mom */
      size_t         len = strlen(str) + strlen(current->nd_name) + 2;
      char          *jobstr = (char *)calloc(1, len);
      sync_job_info *sji = (sync_job_info *)calloc(1, sizeof(sync_job_info));

      if ((jobstr != NULL) &&
          (sji != NULL))
        {
        sprintf(jobstr, "%s:%s", current->nd_name, str+5);
        sji->input = jobstr;
        sji->timestamp = time(NULL);

        /* sji must be freed in sync_node_jobs */
        enqueue_threadpool_request(sync_node_jobs, sji, task_pool);
        }
      else
        {
        if (jobstr != NULL)
          {
          free(jobstr);
          }
        if (sji != NULL)
          {
          free(sji);
          }
        }
      }
    else if (auto_np)
      {
      if (!(strncmp(str, "ncpus=", 6)))
        {
        handle_auto_np(current, str);
        }
      }
    } /* END processing strings */

  if (current != NULL)
    {
    save_node_status(current, &temp);
    unlock_node(current, __func__, NULL, LOGLEVEL);
    }
  
  if ((rc == PBSE_NONE) &&
      (send_hello == true))
    rc = SEND_HELLO;
    
  return(rc);
  } /* END process_status_info() */
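
/*
 * Illustrative sketch, not part of the TORQUE source: the status_info
 * vector handed to process_status_info() is a flat list of
 * "keyword=value" strings reported by a mom, with a "node=<name>" marker
 * introducing each node in the update. The node name, counts, and job ids
 * below are hypothetical, shown only to make the parsing loop above easier
 * to follow.
 */
static const char *example_mom_status[] =
  {
  "node=node01",                 /* switch the update to node01              */
  "first_update=true",           /* mom requests the hierarchy file          */
  "state=free",                  /* handled by process_state_str()           */
  "ncpus=16",                    /* consumed only when auto_np is enabled    */
  "jobs=123.server 124.server",  /* handed off to sync_node_jobs()           */
  "message=ERROR disk full"      /* marks the node down if down_on_error set */
  };
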
int process_alps_status(

  char           *nd_name,
  dynamic_string *status_info)

  {
  char           *str;
  char           *ccu_p = NULL;
  char           *current_node_id = NULL;
  char            node_index_buf[MAXLINE];
  int             node_index = 0;
  struct pbsnode *parent;
  struct pbsnode *current = NULL;
  int             rc;
  pbs_attribute   temp;
  hash_table_t   *rsv_ht;
  char            log_buf[LOCAL_LOG_BUF_SIZE];

  memset(&temp, 0, sizeof(temp));

  if ((rc = decode_arst(&temp, NULL, NULL, NULL, 0)) != PBSE_NONE)
    {
    log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_NODE, __func__, "cannot initialize attribute");
    return(rc);
    }

  /* if we can't find the parent node, ignore the update */
  if ((parent = find_nodebyname(nd_name)) == NULL)
    return(PBSE_NONE);

  /* keep track of reservations so that they're only processed once per update */
  rsv_ht = create_hash(INITIAL_RESERVATION_HOLDER_SIZE);

  /* loop over each string */
  for (str = status_info->str; str != NULL && *str != '\0'; str += strlen(str) + 1)
    {
    if (!strncmp(str, "node=", strlen("node=")))
      {
      if (str != status_info->str)
        {
        snprintf(node_index_buf, sizeof(node_index_buf), "node_index=%d", node_index++);
        decode_arst(&temp, NULL, NULL, node_index_buf, 0);
        
        if (current != NULL)
          save_node_status(current, &temp);
        }

      if ((current = determine_node_from_str(str, parent, current)) == NULL)
        break;
      else
        continue;
      }

    if (current == NULL)
      continue;

    /* process the gpu status information separately */
    if (!strcmp(CRAY_GPU_STATUS_START, str))
      {
      process_gpu_status(current, &str);
      
      continue;
      }
    else if (!strncmp(reservation_id, str, strlen(reservation_id)))
      {
      char *just_rsv_id = str + strlen(reservation_id);

      if (get_value_hash(rsv_ht, just_rsv_id) == -1)
        {
        add_hash(rsv_ht, 1, strdup(just_rsv_id));

        /* sub-functions will attempt to lock a job, so we must unlock the
         * reporter node */
        unlock_node(parent, __func__, NULL, LOGLEVEL);

        process_reservation_id(current, str);

        current_node_id = strdup(current->nd_name);
        unlock_node(current, __func__, NULL, LOGLEVEL);

        /* re-lock the parent */
        if ((parent = find_nodebyname(nd_name)) == NULL)
          {
          /* reporter node disappeared - this shouldn't be possible */
          log_err(PBSE_UNKNODE, __func__, "Alps reporter node disappeared while recording a reservation");
          free_arst(&temp);
          free_all_keys(rsv_ht);
          free_hash(rsv_ht);
          free(current_node_id);
          return(PBSE_NONE);
          }

        if ((current = find_node_in_allnodes(&parent->alps_subnodes, current_node_id)) == NULL)
          {
          /* current node disappeared, this shouldn't be possible either */
          unlock_node(parent, __func__, NULL, LOGLEVEL);
          snprintf(log_buf, sizeof(log_buf), "Current node '%s' disappeared while recording a reservation",
            current_node_id);
          log_err(PBSE_UNKNODE, __func__, log_buf);
          free_arst(&temp);
          free_all_keys(rsv_ht);
          free_hash(rsv_ht);
          free(current_node_id);
          return(PBSE_NONE);
          }

        free(current_node_id);
        current_node_id = NULL;
        }
      }
    /* save this as is to the status strings */
    else if ((rc = decode_arst(&temp, NULL, NULL, str, 0)) != PBSE_NONE)
      {
      free_arst(&temp);
      free_all_keys(rsv_ht);
      free_hash(rsv_ht);
      return(rc);
      }

    /* perform any special processing */
    if (!strncmp(str, ccu_eq, ac_ccu_eq_len))
      {
      /* save compute unit count in case we need it */
      /* note: this string (ccu_eq (CCU=)) needs to be found before cprocs_eq (CPROCS=) */
      /*  for the node */
      ccu_p = str;
      }
    else if (!strncmp(str, cproc_eq, ac_cproc_eq_len))
      {
      int ncpus;
      long svr_nppcu_value = 0;

      /*
       * Get the server nppcu value which determines how Hyper-Threaded
       * cores are reported. When server nppcu value is:
       *
       *  0 - Let ALPS choose whether or not to use Hyper-Threaded cores 
       *      (report all cores)
       *  1 - Do not use Hyper-Threaded cores
       *      (report only physical cores (compute unit count))
       *  2 - Use Hyper-Threaded cores
       *      (report all cores)
       */
      get_svr_attr_l(SRV_ATR_nppcu, &svr_nppcu_value);

      if (svr_nppcu_value == NPPCU_NO_USE_HT && ccu_p != NULL)
        {
        /* no HT (nppcu==1), so use compute unit count */
        ncpus = atoi(ccu_p + ac_ccu_eq_len);

        /* use CPROC value if we are using APBASIL protocol < 1.3 */
        if (ncpus == 0)
          ncpus = atoi(str + ac_cproc_eq_len);

        /* reset the pointer */
        ccu_p = NULL;
        }
      else
        {
        /* let ALPS choose (nppcu==0) or use HT (nppcu==2), use actual processor count */
        ncpus = atoi(str + ac_cproc_eq_len);
        }

      set_ncpus(current, parent, ncpus);
      }
    else if (!strncmp(str, state, strlen(state)))
      {
      set_state(current, str);
      }

    } /* END processing the status update */

  if (current != NULL)
    {
    snprintf(node_index_buf, sizeof(node_index_buf), "node_index=%d", node_index++);
    decode_arst(&temp, NULL, NULL, node_index_buf, 0);
    save_node_status(current, &temp);
    unlock_node(current, __func__, NULL, LOGLEVEL);
    }

  unlock_node(parent, __func__, NULL, LOGLEVEL);

  free_all_keys(rsv_ht);
  free_hash(rsv_ht);

  return(PBSE_NONE);
  } /* END process_alps_status() */
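
/*
 * Minimal sketch, not part of the TORQUE source: the nppcu decision made
 * above for the CCU=/CPROCS= pair, extracted into a standalone helper.
 * NPPCU_NO_USE_HT is the constant referenced above (assumed to be 1 here);
 * the argument values are hypothetical. With nppcu == 1 the compute unit
 * count (CCU=) is preferred, falling back to CPROCS= for older BASIL
 * protocols that report 0; otherwise the full processor count is used.
 */
static int choose_ncpus(long nppcu, int ccu_count, int cproc_count)
  {
  if ((nppcu == 1 /* NPPCU_NO_USE_HT */) && (ccu_count > 0))
    return(ccu_count);   /* physical cores only, hyper-threads ignored */

  return(cproc_count);   /* nppcu 0 or 2: report every core */
  }
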
Example #19
0
int setup_array_struct(
    
  job *pjob)

  {
  job_array          *pa;
  array_request_node *rn;

  int                 bad_token_count;
  int                 array_size;
  int                 rc;
  char                log_buf[LOCAL_LOG_BUF_SIZE];
  long                max_array_size;

  pa = (job_array *)calloc(1, sizeof(job_array));

  if (pa == NULL)
    {
    log_err(-1, __func__, "failed to allocate memory for the job_array");
    return(PBSE_MEM_MALLOC);
    }

  pa->ai_qs.struct_version = ARRAY_QS_STRUCT_VERSION;
  
  strcpy(pa->ai_qs.parent_id, pjob->ji_qs.ji_jobid);
  strcpy(pa->ai_qs.fileprefix, pjob->ji_qs.ji_fileprefix);
  snprintf(pa->ai_qs.owner, sizeof(pa->ai_qs.owner), "%s", pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str);
  snprintf(pa->ai_qs.submit_host, sizeof(pa->ai_qs.submit_host), "%s", get_variable(pjob, pbs_o_host));

  pa->ai_qs.num_cloned = 0;
  CLEAR_HEAD(pa->request_tokens);

  pa->ai_mutex = (pthread_mutex_t *)calloc(1, sizeof(pthread_mutex_t));
  pthread_mutex_init(pa->ai_mutex,NULL);
  lock_ai_mutex(pa, __func__, NULL, LOGLEVEL);

  if (job_save(pjob, SAVEJOB_FULL, 0) != 0)
    {
    /* the array is deleted in svr_job_purge */
    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
    svr_job_purge(pjob);
    /* Does job array need to be removed? */

    if (LOGLEVEL >= 6)
      {
      log_record(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        (pjob != NULL) ? pjob->ji_qs.ji_jobid : "NULL",
        "cannot save job");
      }

    return(1);
    }

  if ((rc = set_slot_limit(pjob->ji_wattr[JOB_ATR_job_array_request].at_val.at_str, pa)))
    {
    long max_limit = 0;
    get_svr_attr_l(SRV_ATR_MaxSlotLimit, &max_limit);
    array_delete(pa);

    snprintf(log_buf,sizeof(log_buf),
      "Array %s requested a slot limit above the max limit %ld, rejecting\n",
      pa->ai_qs.parent_id,
      max_limit);

    log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_JOB,pa->ai_qs.parent_id,log_buf);

    return(INVALID_SLOT_LIMIT);
    }

  pa->ai_qs.jobs_running = 0;
  pa->ai_qs.num_started = 0;
  pa->ai_qs.num_failed = 0;
  pa->ai_qs.num_successful = 0;
  
  bad_token_count = parse_array_request(
                      pjob->ji_wattr[JOB_ATR_job_array_request].at_val.at_str,
                      &(pa->request_tokens));

  /* get the number of elements that should be allocated in the array */
  rn = (array_request_node *)GET_NEXT(pa->request_tokens);
  array_size = 0;
  pa->ai_qs.num_jobs = 0;
  while (rn != NULL) 
    {
    if (rn->end > array_size)
      array_size = rn->end;
    /* calculate the actual number of jobs (different from array size) */
    pa->ai_qs.num_jobs += rn->end - rn->start + 1;

    rn = (array_request_node *)GET_NEXT(rn->request_tokens_link);
    }

  /* size of array is the biggest index + 1 */
  array_size++; 

  if (get_svr_attr_l(SRV_ATR_MaxArraySize, &max_array_size) == PBSE_NONE)
    {
    if (max_array_size < pa->ai_qs.num_jobs)
      {
      array_delete(pa);

      return(ARRAY_TOO_LARGE);
      }
    }

  /* initialize the array */
  pa->job_ids = (char **)calloc(array_size, sizeof(char *));
  if (pa->job_ids == NULL)
    {
    sprintf(log_buf, "Failed to alloc job_ids: job %s", pjob->ji_qs.ji_jobid);
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
    return(PBSE_MEM_MALLOC);
    }


  /* remember array_size */
  pa->ai_qs.array_size = array_size;

  CLEAR_HEAD(pa->ai_qs.deps);

  array_save(pa);

  if (bad_token_count > 0)
    {
    array_delete(pa);
    return 2;
    }

  pjob->ji_arraystruct = pa;

  insert_array(pa);

  unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

  return(PBSE_NONE);
  } /* END setup_array_struct() */
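
/*
 * Worked example (hypothetical request string): for an array request of
 * "1-3,7", parse_array_request() above yields two token ranges, 1-3 and 7-7,
 * so
 *
 *   num_jobs   = (3 - 1 + 1) + (7 - 7 + 1) = 4   jobs actually requested
 *   array_size = 7 + 1                     = 8   slots allocated in pa->job_ids
 *
 * i.e. the job_ids table is sized by the largest index plus one, while
 * num_jobs counts only the indices named in the request.
 */
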
Example #20
0
int finalize_rerunjob(struct batch_request *preq,job *pjob,int rc)
  {
  int     Force;
  char    log_buf[LOCAL_LOG_BUF_SIZE];

  if (preq->rq_extend && !strncasecmp(preq->rq_extend, RERUNFORCE, strlen(RERUNFORCE)))
    Force = 1;
  else
    Force = 0;

  switch (rc)
    {

    case -1:

      /* completed job was requeued */

      /* clear out job completion time if there is one */
      break;

    case 0:

      /* requeue request successful */

      if (pjob != NULL)
        pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

      break;

    case PBSE_SYSTEM: /* This may not be accurate...*/
      rc = PBSE_MEM_MALLOC;
      snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Can not allocate memory");
      req_reject(rc, 0, preq, NULL, log_buf);
      if (pjob != NULL)
        unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);
      return rc;
      break;

    default:

      if (Force == 0)
        {
        rc = PBSE_MOMREJECT;
        snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Rejected by mom");
        req_reject(rc, 0, preq, NULL, log_buf);
        if (pjob != NULL)
          unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);
        return rc;
        }
      else
        {
        int           newstate;
        int           newsubst;
        unsigned int  dummy;
        char         *tmp;
        long          cray_enabled = FALSE;
       
        if (pjob != NULL)
          {
          get_svr_attr_l(SRV_ATR_CrayEnabled, &cray_enabled);

          if ((cray_enabled == TRUE) &&
              (pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str != NULL))
            tmp = parse_servername(pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str, &dummy);
          else
            tmp = parse_servername(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, &dummy);
          
          /* Cannot communicate with MOM, forcibly requeue job.
             This is a relatively disgusting thing to do */
          
          sprintf(log_buf, "rerun req to %s failed (rc=%d), forcibly requeueing job",
            tmp, rc);

          free(tmp);
  
          log_event(
            PBSEVENT_ERROR | PBSEVENT_ADMIN | PBSEVENT_JOB,
            PBS_EVENTCLASS_JOB,
            pjob->ji_qs.ji_jobid,
            log_buf);
          
          log_err(-1, __func__, log_buf);
          
          strcat(log_buf, ", previous output files may be lost");
  
          svr_mailowner(pjob, MAIL_OTHER, MAIL_FORCE, log_buf);
  
          svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_RERUN3, FALSE);
  
          rel_resc(pjob); /* free resc assigned to job */
          
          if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_HOTSTART) == 0)
            {
            /* in case of server shutdown, don't clear exec_host */
            /* will use it on hotstart when next comes up        */
            
            job_attr_def[JOB_ATR_exec_host].at_free(&pjob->ji_wattr[JOB_ATR_exec_host]);
  
            job_attr_def[JOB_ATR_session_id].at_free(&pjob->ji_wattr[JOB_ATR_session_id]);
            
            job_attr_def[JOB_ATR_exec_gpus].at_free(&pjob->ji_wattr[JOB_ATR_exec_gpus]);          
            }
          
          pjob->ji_modified = 1;    /* force full job save */
          
          pjob->ji_momhandle = -1;
          pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_StagedIn;
          
          svr_evaljobstate(pjob, &newstate, &newsubst, 0);
          svr_setjobstate(pjob, newstate, newsubst, FALSE);
          }
        }

      break;
    }  /* END switch (rc) */

  /* So job has run and is to be rerun (not restarted) */
  if (pjob == NULL)
    {
    rc = PBSE_JOB_RERUN;
    }
  else
    {
    pjob->ji_qs.ji_svrflags = (pjob->ji_qs.ji_svrflags &
        ~(JOB_SVFLG_CHECKPOINT_FILE |JOB_SVFLG_CHECKPOINT_MIGRATEABLE |
          JOB_SVFLG_CHECKPOINT_COPIED)) | JOB_SVFLG_HASRUN;
    
    sprintf(log_buf, msg_manager, msg_jobrerun, preq->rq_user, preq->rq_host);
    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    reply_ack(preq);
  
    /* note in accounting file */
    account_record(PBS_ACCT_RERUN, pjob, NULL);
    unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL);
    }

  return rc;
  }  /* END finalize_rerunjob() */
Example #21
0
int status_job(

  job           *pjob, /* ptr to job to status */
  batch_request *preq,
  svrattrl      *pal, /* specific attributes to status */
  tlist_head    *pstathd, /* RETURN: head of list to append status to */
  bool           condensed,
  int           *bad) /* RETURN: index of first bad pbs_attribute */

  {
  struct brp_status *pstat;
  int                IsOwner = 0;
  long               query_others = 0;
  long               condensed_timeout = JOB_CONDENSED_TIMEOUT;

  /* Make sure procct is removed from the job 
     resource attributes */
  remove_procct(pjob);

  /* see if the client is authorized to status this job */
  if (svr_authorize_jobreq(preq, pjob) == 0)
    IsOwner = 1;

  get_svr_attr_l(SRV_ATR_query_others, &query_others);
  if (!query_others)
    {
    if (IsOwner == 0)
      {
      return(PBSE_PERM);
      }
    }
  
  get_svr_attr_l(SRV_ATR_job_full_report_time, &condensed_timeout);

  // if the job has been modified within the timeout, send the full output
  if ((condensed == true) &&
      (time(NULL) < pjob->ji_mod_time + condensed_timeout))
    condensed = false;

  /* allocate reply structure and fill in header portion */
  if ((pstat = (struct brp_status *)calloc(1, sizeof(struct brp_status))) == NULL)
    {
    return(PBSE_SYSTEM);
    }

  CLEAR_LINK(pstat->brp_stlink);

  pstat->brp_objtype = MGR_OBJ_JOB;

  strcpy(pstat->brp_objname, pjob->ji_qs.ji_jobid);

  CLEAR_HEAD(pstat->brp_attr);

  append_link(pstathd, &pstat->brp_stlink, pstat);

  /* add attributes to the status reply */

  *bad = 0;

  if (status_attrib(
        pal,
        job_attr_def,
        pjob->ji_wattr,
        JOB_ATR_LAST,
        preq->rq_perm,
        &pstat->brp_attr,
        condensed,
        bad,
        IsOwner))
    {
    return(PBSE_NOATTR);
    }

  return (0);
  }  /* END status_job() */
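
/*
 * Minimal sketch, not part of the TORQUE source: the condensed-status
 * decision made above, expressed as a standalone predicate. A condensed
 * reply is kept only when the job has gone unmodified for at least
 * job_full_report_time seconds; otherwise the full attribute set is sent
 * so clients see the recent change.
 */
#include <time.h>

static int keep_condensed(time_t job_mod_time, long full_report_timeout)
  {
  return(time(NULL) >= job_mod_time + full_report_timeout);
  }
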
Example #22
0
void rerun_or_kill(

  job  **pjob_ptr, /* I (modified/freed) */
  char  *text)     /* I */

  {
  long       server_state = SV_STATE_DOWN;
  char       log_buf[LOCAL_LOG_BUF_SIZE];
  pbs_queue *pque;
  job       *pjob = *pjob_ptr;

  get_svr_attr_l(SRV_ATR_State, &server_state);
  if (pjob->ji_wattr[JOB_ATR_rerunable].at_val.at_long)
    {
    /* job is rerunable, mark it to be requeued */

    issue_signal(&pjob, "SIGKILL", free_br, NULL);

    if (pjob != NULL)
      {
      pjob->ji_qs.ji_substate  = JOB_SUBSTATE_RERUN;
      if ((pque = get_jobs_queue(&pjob)) != NULL)
        {
        snprintf(log_buf, sizeof(log_buf), "%s%s%s", msg_init_queued, pque->qu_qs.qu_name, text);

        unlock_queue(pque, __func__, NULL, LOGLEVEL);
        }
      }
    }
  else if (server_state != SV_STATE_SHUTDEL)
    {
    /* job not rerunable, immediate shutdown - kill it off */
    snprintf(log_buf, sizeof(log_buf), "%s%s", msg_job_abort, text);

    /* need to record log message before purging job */

    log_event(
      PBSEVENT_SYSTEM | PBSEVENT_JOB | PBSEVENT_DEBUG,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buf);

    job_abt(pjob_ptr, log_buf);

    return;
    }
  else
    {
    /* delayed shutdown, leave job running */
    snprintf(log_buf, sizeof(log_buf), "%s%s", msg_leftrunning, text);
    }

  if (pjob != NULL)
    {
    log_event(
      PBSEVENT_SYSTEM | PBSEVENT_JOB | PBSEVENT_DEBUG,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buf);
    }

  return;
  }  /* END rerun_or_kill() */
Example #23
0
/*
 * get_correct_jobname() - builds the job name that should be used when
 * searching for a job; necessary because of SRV_ATR_display_job_server_suffix
 * and SRV_ATR_job_suffix_alias
 *
 * allocates and returns the correct job name
 * @param jobid (I) - the jobid as passed in (NUM.SERVER_NAME)
 * @return a pointer to the correct job name (alloc'd)
 */
char *get_correct_jobname(

  const char *jobid) /* I */

  {
  char *correct = NULL;
  char *dot;
  char *work_jobid = strdup(jobid);
  /* first suffix could be the server name or the alias */
  char *first_suffix = NULL;

  /* second suffix can only be the alias */
  char *second_suffix = NULL;
  int   server_suffix = TRUE;

  int len;

  long  display_suffix = TRUE;
  char *alias = NULL;

  get_svr_attr_l(SRV_ATR_display_job_server_suffix, &display_suffix);
  if (display_suffix == FALSE)
    server_suffix = FALSE;

  if ((dot = strchr((char *)work_jobid,'.')) != NULL)
    {
    first_suffix = dot + 1;

    if ((dot = strchr(first_suffix,'.')) != NULL)
      {
      second_suffix = dot + 1;
      }
  
    dot = NULL;
    }

  /* check current settings */
  get_svr_attr_str(SRV_ATR_job_suffix_alias, &alias);
  if ((alias != NULL) &&
      (server_suffix == TRUE))
    {
    /* display the server suffix and the alias */

    /* check if alias is already there */
    if (second_suffix != NULL)
      {
      if (strcmp(second_suffix,alias) == 0)
        {
        correct = strdup(jobid);

        if (correct == NULL)
          log_err(-1, __func__, "ERROR:    Fatal - Cannot allocate memory\n");
      
        free(work_jobid);

        return(correct);
        }
      }
    else if (first_suffix == NULL)
      {
      /* alloc memory and sprintf; add 3 for two '.' characters and the NULL terminator */
      len = strlen(work_jobid) + strlen(server_name) + strlen(alias) + 3;
      correct = (char *)calloc(1, len);

      if (correct == NULL)
        {
        log_err(-1, __func__, "ERROR:    Fatal - Cannot allocate memory\n");
        free(work_jobid);
        return(NULL);
        }

      snprintf(correct,len,"%s.%s.%s",
        work_jobid,server_name,alias);
      }
    else
      {
      /* add 2 for null terminator and '.' */
      len = strlen(alias) + 2 + strlen(work_jobid);

      correct = (char *)calloc(1, len);

      if (correct == NULL)
        {
        log_err(-1, __func__, "ERROR:    Fatal - Cannot allocate memory\n");
        free(work_jobid);
        return(NULL);
        }

      snprintf(correct,len,"%s.%s",work_jobid,alias);
      }
    } /* END if (server_suffix && alias) */
  else if (server_suffix == TRUE)
    {
    /* just the server suffix */

    if (first_suffix != NULL)
      {
      correct = strdup(work_jobid);

      if (correct == NULL)
        {
        log_err(-1, __func__, "ERROR:    Fatal - Cannot allocate memory\n");
        free(work_jobid);
        return(NULL);
        }
      }
    else
      {
      len = strlen(work_jobid) + strlen(server_name) + 2;

      correct = (char *)calloc(1, len);

      if (correct == NULL)
        {
        log_err(-1, __func__, "ERROR:    Fatal - Cannot allocate memory\n");
        free(work_jobid);
        return(NULL);
        }

      snprintf(correct,len,"%s.%s",
        work_jobid,server_name);
      }
    } /* END if (just server_suffix) */
  else if (alias != NULL)
    {
    /* just the alias, not the server */

    if (first_suffix == NULL)
      {
      len = strlen(work_jobid) + strlen(alias) + 2;

      correct = (char *)calloc(1, len);

      if (correct == NULL)
        {
        log_err(-1, __func__, "ERROR:    Fatal - Cannot allocate memory\n");
        free(work_jobid);
        return(NULL);
        }

      snprintf(correct,len,"%s.%s",work_jobid,alias);
      }
    else
      {
      len = strlen(alias) + 2;

      dot = first_suffix - 1;
      *dot = '\0';

      len += strlen(work_jobid);
      correct = (char *)calloc(1, len);

      if (correct == NULL)
        {
        log_err(-1, __func__, "ERROR:    Fatal - Cannot allocate memory\n");
        free(work_jobid);
        return(NULL);
        }

      snprintf(correct,len,"%s.%s",
        work_jobid,
        alias);

      *dot = '.';
      }
    } /* END else if (just alias) */
  else
    {
    /* no server suffix nor alias */
    if (first_suffix != NULL)
      {
      dot = first_suffix - 1;
      *dot = '\0';
      }

    len = strlen(work_jobid) + 1;
    correct = (char *)calloc(1, len);

    if (correct == NULL)
      {
      log_err(-1, __func__, "ERROR:    Fatal - Cannot allocate memory\n");
      free(work_jobid);
      return(NULL);
      }

    snprintf(correct,len,"%s",work_jobid);

    if (first_suffix != NULL)
      *dot = '.';
    }

  free(work_jobid);

  return(correct);
  } /* END get_correct_jobname() */
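
/*
 * Illustrative sketch, not part of the TORQUE source: the result of
 * get_correct_jobname() for a hypothetical server_name of "server" and a
 * job_suffix_alias of "alias", matching the four branches above when the
 * caller passes "123.server".
 *
 *   display_job_server_suffix   job_suffix_alias   result
 *   ------------------------------------------------------------
 *   true                        unset              "123.server"
 *   true                        "alias"            "123.server.alias"
 *   false                       "alias"            "123.alias"
 *   false                       unset              "123"
 *
 * The returned string is always freshly allocated; the caller owns it.
 */
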
Example #24
0
void svr_shutdown(

  int type) /* I */

  {
  pbs_attribute *pattr;
  job           *pjob;
  long           state = SV_STATE_DOWN;
  int            iter;
  char           log_buf[LOCAL_LOG_BUF_SIZE];

  close(lockfds);

  save_queues();

  /* Lets start by logging shutdown and saving everything */
  get_svr_attr_l(SRV_ATR_State, &state);

  strcpy(log_buf, msg_shutdown_start);

  if (state == SV_STATE_SHUTIMM)
    {
    /* if already shutting down, another Immed/sig will force it */
    if ((type == SHUT_IMMEDIATE) || (type == SHUT_SIG))
      {
      state = SV_STATE_DOWN;
      set_svr_attr(SRV_ATR_State, &state);

      strcat(log_buf, "Forced");

      log_event(
        PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_DEBUG,
        PBS_EVENTCLASS_SERVER,
        msg_daemonname,
        log_buf);

      return;
      }
    }

  if (type == SHUT_IMMEDIATE)
    {
    state = SV_STATE_SHUTIMM;
    set_svr_attr(SRV_ATR_State, &state);

    strcat(log_buf, "Immediate");
    }
  else if (type == SHUT_DELAY)
    {
    state = SV_STATE_SHUTDEL;
    set_svr_attr(SRV_ATR_State, &state);

    strcat(log_buf, "Delayed");
    }
  else if (type == SHUT_QUICK)
    {
    state = SV_STATE_DOWN; /* set to down to brk pbsd_main loop */
    set_svr_attr(SRV_ATR_State, &state);

    strcat(log_buf, "Quick");
    }
  else
    {
    state = SV_STATE_SHUTIMM;
    set_svr_attr(SRV_ATR_State, &state);

    strcat(log_buf, "By Signal");
    }

  log_event(
    PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_DEBUG,
    PBS_EVENTCLASS_SERVER,
    msg_daemonname,
    log_buf);

  if ((type == SHUT_QUICK) || (type == SHUT_SIG)) /* quick, leave jobs as are */
    {
    return;
    }

  svr_save(&server, SVR_SAVE_QUICK);

  iter = -1;

  while ((pjob = next_job(&alljobs,&iter)) != NULL)
    {
    if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
      {
      pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HOTSTART | JOB_SVFLG_HASRUN;

      pattr = &pjob->ji_wattr[JOB_ATR_checkpoint];

      if ((pattr->at_flags & ATR_VFLAG_SET) &&
          ((csv_find_string(pattr->at_val.at_str, "s") != NULL) ||
           (csv_find_string(pattr->at_val.at_str, "c") != NULL) ||
           (csv_find_string(pattr->at_val.at_str, "shutdown") != NULL)))
        {
        /* do checkpoint of job */

        if (shutdown_checkpoint(&pjob) == 0)
          {
          if (pjob != NULL)
            unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

          continue;
          }
        }

      /* if no checkpoint (not supported, not allowed, or it fails), */
      /* rerun if possible, else kill job */

      rerun_or_kill(&pjob, msg_on_shutdown);
      }

    if (pjob != NULL)
      unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
    }

  return;
  }  /* END svr_shutdown() */
Example #25
0
int authenticate_user(

  struct batch_request *preq,  /* I */
  struct credential    *pcred,
  char   **autherr) /* O */

  {
  int    rc;
  char   uath[PBS_MAXUSER + PBS_MAXHOSTNAME + 1];
  time_t time_now = time(NULL);
  char   error_msg[1024];
  bool   acl_enabled = false;

#ifdef MUNGE_AUTH
 
  if (strncmp(preq->rq_user, pcred->username, PBS_MAXUSER))
    {
    /* extra check for munge */
    struct array_strings *my_acl = NULL;
    char uh[PBS_MAXUSER + PBS_MAXHOSTNAME + 2];

    sprintf(uh, "%s@%s", preq->rq_user, pcred->hostname);
    
    get_svr_attr_arst(SRV_ATR_authusers, &my_acl); 
    if ((acl_check_my_array_string(my_acl, uh, ACL_User_Host)) == 0)
      {
      *autherr = strdup("User not in authorized user list.");
      sprintf(error_msg, "%s Requested user %s: requested from host %s",
                     *autherr, preq->rq_user, preq->rq_host);
      log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, __func__, error_msg);
      return(PBSE_BADCRED);
      }
    }
#else
  if (strncmp(preq->rq_user, pcred->username, PBS_MAXUSER))
    {
    *autherr = strdup("Users do not match");
    sprintf(error_msg, "%s: Requested user %s: credential user %s: requested from host %s",
                   *autherr, preq->rq_user, pcred->username, preq->rq_host);
    log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, __func__, error_msg);
    return(PBSE_BADCRED);
    }
#endif

  if (strncmp(preq->rq_host, pcred->hostname, PBS_MAXHOSTNAME))
    {
    struct sockaddr_in *sai1;
    struct sockaddr_in *sai2;
    struct addrinfo    *addr_info1 = NULL;
    struct addrinfo    *addr_info2 = NULL;

    sai1 = get_cached_addrinfo(preq->rq_host);
    sai2 = get_cached_addrinfo(pcred->hostname);

    if ((sai1 == NULL) &&
        (pbs_getaddrinfo(preq->rq_host, NULL, &addr_info1) == PBSE_NONE))
      {
      sai1 = (struct sockaddr_in *)addr_info1->ai_addr;
      }

    if ((sai2 == NULL) &&
        (pbs_getaddrinfo(pcred->hostname, NULL, &addr_info2) == PBSE_NONE))
      {
      sai2 = (struct sockaddr_in *)addr_info2->ai_addr;
      }

    if ((sai1 == NULL) ||
        (sai2 == NULL) ||
        (memcmp(sai1, sai2, sizeof(struct sockaddr_in))))
      {
      *autherr = strdup("Hosts do not match");
      
      sprintf(error_msg, "%s: Requested host %s: credential host: %s",
        *autherr, preq->rq_host, pcred->hostname);
      log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, __func__, error_msg);
    
      return(PBSE_BADCRED);
      }
    }

  if (pcred->timestamp)
    {
    long lifetime = 0;
    if (get_svr_attr_l(SRV_ATR_CredentialLifetime, &lifetime) == PBSE_NONE)
      {
      /* use configured value if set */
      }
    else 
      {
      /* if not use the default */
      lifetime = CREDENTIAL_LIFETIME;
      }

    /* negative values mean that credentials have an infinite lifetime */
    if (lifetime > -1)
      {
      if ((pcred->timestamp - CREDENTIAL_TIME_DELTA > time_now) ||
          (pcred->timestamp + lifetime < time_now))
        {
        return(PBSE_EXPIRED);
        }
      }

    }

  /* If Server's Acl_User enabled, check if user in list */
  get_svr_attr_b(SRV_ATR_AclUserEnabled, &acl_enabled);
  if (acl_enabled)
    {
    struct array_strings *acl_users = NULL;
    snprintf(uath, sizeof(uath), "%s@%s", preq->rq_user, preq->rq_host);
    
    get_svr_attr_arst(SRV_ATR_AclUsers, &acl_users);
    if (acl_check_my_array_string(acl_users, uath, ACL_User) == 0)
      {
      int       my_err;
      pbs_net_t connect_addr = get_hostaddr(&my_err, preq->rq_host);
      pbs_net_t server_addr = get_hostaddr(&my_err, server_host);

#ifdef __CYGWIN__

      if ((!IamAdminByName(preq->rq_user)) || 
          (connect_addr != server_addr))
        {
        return(PBSE_PERM);
        }
#else /* __CYGWIN__ */
#ifdef PBS_ROOT_ALWAYS_ADMIN
      if ((strcmp(preq->rq_user, PBS_DEFAULT_ADMIN) != 0) ||
          (connect_addr != server_addr))
        {
        return(PBSE_PERM);
        }

#else /* PBS_ROOT_ALWAYS_ADMIN */
      return(PBSE_PERM);

#endif /* PBS_ROOT_ALWAYS_ADMIN */
#endif /* __CYGWIN__ */

      }
    }

  /* A site stub for additional checking */

  rc = site_allow_u(preq->rq_user, preq->rq_host);

  return(rc);
  }  /* END authenticate_user() */
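
/*
 * Minimal sketch, not part of the TORQUE source: the credential age check
 * performed above. A credential stamped at time t is accepted while
 *
 *   t - CREDENTIAL_TIME_DELTA  <=  now  <=  t + lifetime
 *
 * where lifetime is the server's CredentialLifetime attribute if set and
 * CREDENTIAL_LIFETIME otherwise, and any negative lifetime disables the
 * expiration check entirely.
 */
#include <time.h>

static int credential_still_valid(time_t stamp, long lifetime, long delta)
  {
  time_t now = time(NULL);

  if (lifetime < 0)
    return(1);                 /* negative lifetime: credentials never expire */

  return((stamp - delta <= now) && (now <= stamp + lifetime));
  }
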
Example #26
0
/**
 * update_array_values()
 *
 * updates internal bookkeeping values for job arrays
 * @param pa - array to update
 * @param old_state - the state the job was in before the event
 * @param event - code for what event just happened
 * @param job_id - id of the job the event happened on
 * @param job_atr_hold - the job's hold attribute value
 * @param job_exit_status - the job's exit status
 */
void update_array_values(

  job_array            *pa,        /* I */
  int                   old_state, /* I */
  enum ArrayEventsEnum  event,     /* I */
  char                 *job_id,
  long                  job_atr_hold,
  int                   job_exit_status)

  {
  long  moab_compatible;

  switch (event)
    {
    case aeQueue:

      /* NYI, nothing needs to be done for this yet */

      break;

    case aeRun:

      if (old_state != JOB_STATE_RUNNING)
        {
        pa->ai_qs.jobs_running++;
        pa->ai_qs.num_started++;
        }

      break;

    case aeTerminate:

      if (old_state == JOB_STATE_RUNNING)
        {
        if (pa->ai_qs.jobs_running > 0)
          pa->ai_qs.jobs_running--;
        }

      if (job_exit_status == 0)
        {
        pa->ai_qs.num_successful++;
        pa->ai_qs.jobs_done++;
        }
      else
        {
        pa->ai_qs.num_failed++;
        pa->ai_qs.jobs_done++;
        }

      array_save(pa);

      /* update slot limit hold if necessary */
      if (get_svr_attr_l(SRV_ATR_MoabArrayCompatible, &moab_compatible) != PBSE_NONE)
        moab_compatible = FALSE;

      if (moab_compatible != FALSE)
        {
        /* only need to update if the job wasn't previously held */
        if ((job_atr_hold & HOLD_l) == FALSE)
          {
          int  i;
          int  newstate;
          int  newsub;
          job *pj;

          /* find the first held job and release its hold */
          for (i = 0; i < pa->ai_qs.array_size; i++)
            {
            if (pa->job_ids[i] == NULL)
              continue;

            if (!strcmp(pa->job_ids[i], job_id))
              continue;

            if ((pj = svr_find_job(pa->job_ids[i], TRUE)) == NULL)
              {
              free(pa->job_ids[i]);
              pa->job_ids[i] = NULL;
              }
            else
              {
              if (pj->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l)
                {
                pj->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;
                
                if (pj->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
                  {
                  pj->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
                  }
                
                svr_evaljobstate(pj, &newstate, &newsub, 1);
                svr_setjobstate(pj, newstate, newsub, FALSE);
                job_save(pj, SAVEJOB_FULL, 0);
                unlock_ji_mutex(pj, __func__, "1", LOGLEVEL);
                
                break;
                }

              unlock_ji_mutex(pj, __func__, "2", LOGLEVEL);
              }
            }
          }
        }

      break;

    default:

      /* log error? */

      break;
    }

  set_array_depend_holds(pa);
  array_save(pa);

  } /* END update_array_values() */
Example #27
0
int acct_job(

  job            *pjob, /* I */
  dynamic_string *ds)   /* O */

  {
  int         rc;
  long        cray_enabled = FALSE;
  int         resc_access_perm = READ_ONLY;
  char        local_buf[MAXLINE*4];
  pbs_queue  *pque;

  tlist_head  attrlist;
  svrattrl   *pal;

  if (pjob == NULL)
    {
    return(PBSE_NONE);
    }

  CLEAR_HEAD(attrlist);

  /* user */

  /* acct_job is only called from account_jobstr and account_jobend. BufSize
     should be PBS_ACCT_MAX_RCD + 1 in size. */
  sprintf(local_buf, "user=%s ",
    pjob->ji_wattr[JOB_ATR_euser].at_val.at_str);
  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* group */
  sprintf(local_buf, "group=%s ",
    pjob->ji_wattr[JOB_ATR_egroup].at_val.at_str);
  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* account */
  if (pjob->ji_wattr[JOB_ATR_account].at_flags & ATR_VFLAG_SET)
    {
    sprintf(local_buf, "account=%s ",
      pjob->ji_wattr[JOB_ATR_account].at_val.at_str);
    if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
      return(rc);
    }

  /* job name */
  sprintf(local_buf, "jobname=%s ",
    pjob->ji_wattr[JOB_ATR_jobname].at_val.at_str);
  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  if ((pque = get_jobs_queue(&pjob)) != NULL)
    {
    /* queue name */
    sprintf(local_buf, "queue=%s ", pque->qu_qs.qu_name);
    unlock_queue(pque, __func__, NULL, LOGLEVEL);

    if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
      return(rc);
    }
  else if (pjob == NULL)
    {
    log_err(PBSE_JOBNOTFOUND, __func__, "Job lost while acquiring queue 1");
    return(PBSE_JOBNOTFOUND);
    }

  /* create time */
  sprintf(local_buf, "ctime=%ld ",
    pjob->ji_wattr[JOB_ATR_ctime].at_val.at_long);
  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* queued time */
  sprintf(local_buf, "qtime=%ld ",
    pjob->ji_wattr[JOB_ATR_qtime].at_val.at_long);
  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* eligible time, how long ready to run */
  sprintf(local_buf, "etime=%ld ",
    pjob->ji_wattr[JOB_ATR_etime].at_val.at_long);
  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* execution start time */
  sprintf(local_buf, "start=%ld ",
    (long)pjob->ji_qs.ji_stime);
  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* user */
  sprintf(local_buf, "owner=%s ",
    pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str);
  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);
 
  /* For large clusters strings can get pretty long. We need to see if there
     is a need to allocate a bigger buffer */
  /* execution host name */
  if (pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str != NULL)
    {
    append_dynamic_string(ds, "exec_host=");
    append_dynamic_string(ds, pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str);
    if ((rc = append_dynamic_string(ds, " ")) != PBSE_NONE)
      return(rc);
    }

  get_svr_attr_l(SRV_ATR_CrayEnabled, &cray_enabled);
  if ((cray_enabled == TRUE) &&
      (pjob->ji_wattr[JOB_ATR_login_node_id].at_flags & ATR_VFLAG_SET))
    {
    append_dynamic_string(ds, "login_node=");
    append_dynamic_string(ds, pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str);
    if ((rc = append_dynamic_string(ds, " ")) != PBSE_NONE)
      return(rc);
    }

  /* now encode the job's resource_list pbs_attribute */
  job_attr_def[JOB_ATR_resource].at_encode(
    &pjob->ji_wattr[JOB_ATR_resource],
    &attrlist,
    job_attr_def[JOB_ATR_resource].at_name,
    NULL,
    ATR_ENCODE_CLIENT,
    resc_access_perm);

  while ((pal = GET_NEXT(attrlist)) != NULL)
    {
    /* exec_host can use a lot of buffer space. Use a dynamic string */
    append_dynamic_string(ds, pal->al_name);

    if (pal->al_resc != NULL)
      {
      append_dynamic_string(ds, ".");
      append_dynamic_string(ds, pal->al_resc);
      }

    append_dynamic_string(ds, "=");
    append_dynamic_string(ds, pal->al_value);
    if ((rc = append_dynamic_string(ds, " ")) != PBSE_NONE)
      return(rc);

    delete_link(&pal->al_link);
    free(pal);
    }  /* END while (pal != NULL) */

#ifdef ATTR_X_ACCT

  /* x attributes */
  if (pjob->ji_wattr[JOB_SITE_ATR_x].at_flags & ATR_VFLAG_SET)
    {
    sprintf(local_buf, "x=%s ",
            pjob->ji_wattr[JOB_SITE_ATR_x].at_val.at_str);
    if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
      return(rc);
    }

#endif

  /* SUCCESS */

  return(PBSE_NONE);
  }  /* END acct_job() */
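
/*
 * Illustrative sketch, not part of the TORQUE source: the kind of
 * space-separated record fragment acct_job() appends to ds. Every value
 * below is hypothetical; the real record ends with whatever Resource_List
 * entries the encode loop above produces.
 *
 *   user=alice group=staff jobname=STDIN queue=batch ctime=1400000000
 *   qtime=1400000000 etime=1400000005 start=1400000010 owner=alice@host1
 *   exec_host=node01/0 Resource_List.walltime=01:00:00 Resource_List.nodes=1
 */
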
int set_jobexid(

  job           *pjob,    /* I */
  pbs_attribute *attrry,  /* I */
  char          *EMsg)    /* O (optional,minsize=1024) */

  {
  int             addflags = 0;
  pbs_attribute  *pattr;
  char          **pmem;

  struct group   *gpent;
  int             free_puser = FALSE;
  char           *puser = NULL;
  char           *at;
  char           *usr_at_host = NULL;
  int             len;


  struct passwd  *pwent = NULL;
  char           *pgrpn;
  int             free_pgrpn = TRUE;
  char            gname[PBS_MAXGRPN + 1];
#ifdef _CRAY

  struct udb     *pudb;
#endif

  char            tmpLine[1024 + 1];
  char            log_buf[LOCAL_LOG_BUF_SIZE];

  long            disable_id_check = 0;
  int             CheckID;  /* boolean */

  if (EMsg != NULL)
    EMsg[0] = '\0';

  /* use the passed User_List if set, may be a newly modified one     */
  /* if not set, fall back to the job's actual User_List, may be same */
  if (get_svr_attr_l(SRV_ATR_DisableServerIdCheck, &disable_id_check) != PBSE_NONE)
    CheckID = 1;
  else
    CheckID = !disable_id_check;

  if (CheckID == 0)
    {
    /* NOTE: use owner, not userlist - should this be changed? */
    /* Yes, changed 10/17/2007 */

    if (pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str != NULL)
      {
      /* start of change to use userlist instead of owner 10/17/2007 */

      if ((attrry + JOB_ATR_userlst)->at_flags & ATR_VFLAG_SET)
        pattr = attrry + JOB_ATR_userlst;
      else
        pattr = &pjob->ji_wattr[JOB_ATR_userlst];

      if (pjob->ji_wattr[JOB_ATR_proxy_user].at_flags & ATR_VFLAG_SET)
        {
        puser = pjob->ji_wattr[JOB_ATR_proxy_user].at_val.at_str;

        /* set the job's owner as the new user, appending @host if
         * the job's owner has that */
        at = strchr(pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str,'@');
        len = strlen(puser) + 1;
        if (at != NULL)
          {
          len += strlen(at);
          usr_at_host = (char *)calloc(len, sizeof(char));
          snprintf(usr_at_host,len,"%s%s",
            puser,
            at);
          }
        else
          {
          usr_at_host = (char *)calloc(len, sizeof(char));

          snprintf(usr_at_host,len,"%s",
            puser);
          }

        free(pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str);
        pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str = usr_at_host;
        }
      else if ((puser = geteusernam(pjob, pattr)) == NULL)
        {
        if (EMsg != NULL)
          snprintf(EMsg, 1024, "cannot locate user name in job");

        return(PBSE_BADUSER);
        }

      free_puser = TRUE;

      sprintf(tmpLine, "%s", puser);

      /* end of change to use userlist instead of owner 10/17/2007 */
      }
    else
      {
      strcpy(tmpLine, "???");
      }
    }  /* END if (CheckID == 0) */
  else
    {
    int perm;

    if ((attrry + JOB_ATR_userlst)->at_flags & ATR_VFLAG_SET)
      pattr = attrry + JOB_ATR_userlst;
    else
      pattr = &pjob->ji_wattr[JOB_ATR_userlst];

    free_puser = TRUE;

    if ((puser = geteusernam(pjob, pattr)) == NULL)
      {
      if (EMsg != NULL)
        snprintf(EMsg, 1024, "cannot locate user name in job");

      return(PBSE_BADUSER);
      }

    pwent = getpwnam_ext(puser);

    perm = svr_get_privilege(puser,get_variable(pjob,(char *)"PBS_O_HOST"));

    if (pwent == NULL)
      {
      snprintf(log_buf,sizeof(log_buf),
        "User %s does not exist in server password file\n",
        puser);

      log_err(errno, __func__, log_buf);

      if (EMsg != NULL)
        snprintf(EMsg,1024,"%s",log_buf);

      free(puser);

      return(PBSE_BADUSER);
      }

    if ((pwent->pw_uid == 0) ||
        (perm & ATR_DFLAG_MGWR))
      {
      struct array_strings *pas = NULL;
      get_svr_attr_arst(SRV_ATR_AclRoot, &pas);

      /* add check here for virtual user */
      if (pjob->ji_wattr[JOB_ATR_proxy_user].at_flags & ATR_VFLAG_SET)
        {
        free(puser);
        free_puser = FALSE;

        puser = pjob->ji_wattr[JOB_ATR_proxy_user].at_val.at_str;

        pwent = getpwnam_ext(puser);

        if (pwent == NULL)
          {
          snprintf(log_buf,sizeof(log_buf),
            "User %s does not exist in server password file\n",
            puser);

          log_err(errno, __func__, log_buf);

          if (EMsg != NULL)
            snprintf(EMsg,1024,"%s",log_buf);

          return(PBSE_BADUSER);
          }

        /* set the job's owner as the new user */
        at = strchr(pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str,'@');
        len = strlen(puser) + 1;
        if (at != NULL)
          {
          len += strlen(at);
          usr_at_host = (char *)calloc(len, sizeof(char));
          snprintf(usr_at_host,len,"%s%s",
            puser,
            at);
          }
        else
          {
          usr_at_host = (char *)calloc(len, sizeof(char));

          snprintf(usr_at_host,len,"%s",
            puser);
          }

        free(pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str);
        pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str = usr_at_host;
        }
      else if (get_svr_attr_arst(SRV_ATR_AclRoot, &pas) == PBSE_NONE)
        {
        if (acl_check_my_array_string(pas, pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str, ACL_User) == 0)
          {
          if (EMsg != NULL)
            snprintf(EMsg, 1024, "root user %s fails ACL check",
                     puser);

          if (free_puser == TRUE)
            free(puser);

          return(PBSE_BADUSER); /* root not allowed */
          }
        }
      else if (pwent->pw_uid == 0)
        {
        if (EMsg != NULL)
          snprintf(EMsg, 1024, "root user %s not allowed",
                   puser);
          
        if (free_puser == TRUE)
          free(puser);

        return(PBSE_BADUSER); /* root not allowed */
        }
      }    /* END if (pwent->pw_uid == 0) */
    else if (pjob->ji_wattr[JOB_ATR_proxy_user].at_flags & ATR_VFLAG_SET)
      {
      /* cannot submit a proxy job if not root or a manager */
      if (EMsg != NULL)
        {
        snprintf(EMsg, 1024,
          "User '%s' is attempting to submit a proxy job for user '%s' but is not a manager",
          puser,
          pjob->ji_wattr[JOB_ATR_proxy_user].at_val.at_str);
        }

      snprintf(log_buf, sizeof(log_buf),
        "User '%s' is attempting to submit a proxy job for user '%s' but is not a manager",
        puser,
        pjob->ji_wattr[JOB_ATR_proxy_user].at_val.at_str);
      log_err(PBSE_BADUSER, __func__, log_buf);
          
      if (free_puser == TRUE)
        free(puser);

      return(PBSE_BADUSER);
      }

    if (site_check_user_map(pjob, puser, EMsg, LOGLEVEL) == -1)
      {
      if (free_puser == TRUE)
        free(puser);

      return(PBSE_BADUSER);
      }

    snprintf(tmpLine, sizeof(tmpLine), "%s", puser);
    }  /* END else (CheckID == 0) */

  pattr = attrry + JOB_ATR_euser;

  job_attr_def[JOB_ATR_euser].at_free(pattr);

  job_attr_def[JOB_ATR_euser].at_decode(pattr, NULL, NULL, tmpLine, 0);

#ifdef _CRAY

  /* on cray check UDB (user data base) for permission to batch it */

  if ((pwent != NULL) && (puser != NULL))
    {
    pudb = getudbuid(pwent->pw_uid);

    endudb();

    if (pudb == UDB_NULL)
      {
      if (EMsg != NULL)
        snprintf(EMsg, 1024, "user %s not located in user data base",
                 puser);

      if (free_puser == TRUE)
        free(puser);

      return(PBSE_BADUSER);
      }

    if (pudb->ue_permbits & (PERMBITS_NOBATCH | PERMBITS_RESTRICTED))
      {
      if (free_puser == TRUE)
        free(puser);

      return(PBSE_QACESS);
      }

    /* if account (qsub -A) not specified, set default from UDB */

    pattr = attrry + JOB_ATR_account;

    if ((pattr->at_flags & ATR_VFLAG_SET) == 0)
      {
      job_attr_def[JOB_ATR_account].at_decode(
        pattr,
        NULL,
        NULL,
        (char *)acid2nam(pudb->ue_acids[0]),
        0);
      }
    }    /* END if ((pwent != NULL) && (puser != NULL)) */

#endif /* _CRAY */

  /*
   * now figure out the group name under which the job should execute
   * PBS requires that each group have an entry in the group file,
   * see the admin guide for the reason why...
   *
   * use the passed group_list if set, may be a newly modified one
   * if not set, fall back to the job's actual group_list, may be same
   */

  if ((attrry + JOB_ATR_grouplst)->at_flags & ATR_VFLAG_SET)
    pattr = attrry + JOB_ATR_grouplst;
  else
    pattr = &pjob->ji_wattr[JOB_ATR_grouplst];

  /* extract user-specified egroup if it exists */

  pgrpn = getegroup(pjob, pattr);

  if (pgrpn == NULL)
    {
    free_pgrpn = FALSE;

    if ((pwent != NULL) || ((pwent = getpwnam_ext(puser)) != NULL))
      {
      /* egroup not specified - use user login group */

      gpent = getgrgid(pwent->pw_gid);

      if (gpent != NULL)
        {
        pgrpn = gpent->gr_name;           /* use group name */
        }
      else
        {
        sprintf(gname, "%ld",
                (long)pwent->pw_gid);

        pgrpn = gname;            /* turn gid into string */
        }
      }
    else if (CheckID == 0)
      {
      strcpy(gname, "???");

      pgrpn = gname;
      }
    else
      {
      log_err(errno, __func__, (char *)"getpwnam failed");

      if (EMsg != NULL)
        snprintf(EMsg, 1024, "user does not exist in server password file");

      if (free_puser == TRUE)
        free(puser);

      return(PBSE_BADUSER);
      }

    /*
     * setting the DEFAULT flag is a "kludgy" way to keep MOM from
     * having to do an unneeded look up of the group file.
     * We needed to have JOB_ATR_egroup set for the server but
     * MOM only wants it if it is not the login group, so there!
     */

    addflags = ATR_VFLAG_DEFLT;
    }  /* END if ((pgrpn = getegroup(pjob,pattr))) */
  else if (CheckID == 0)
    {
    /* egroup specified - do not validate group within server */

    /* NO-OP */
    }
  else
    {
    /* user specified a group, group must exist and either */
    /* must be user's primary group or the user must be in it */

    gpent = getgrnam_ext(pgrpn);

    if (gpent == NULL)
      {
      if (CheckID == 0)
        {
        strcpy(gname, "???");

        pgrpn = gname;
        }
      else
        if (EMsg != NULL)
          snprintf(EMsg, 1024, "cannot locate group %s in server group file",
                   pgrpn);

      if (free_puser == TRUE)
        free(puser);

      free(pgrpn);

      return(PBSE_BADGRP);  /* no such group */
      }

    if (gpent->gr_gid != pwent->pw_gid)
      {
      /* not primary group */

      pmem = gpent->gr_mem;

      while (*pmem != NULL)
        {
        if (!strcmp(puser, *pmem))
          break;

        ++pmem;
        }

      if (*pmem == NULL)
        {
        /* requested group not allowed */
        snprintf(log_buf,sizeof(log_buf),
          "user %s is not a member of group %s in server password file",
          puser,
          pgrpn);

        log_err(-1, __func__, log_buf);

        if (EMsg != NULL)
          snprintf(EMsg, 1024, "%s",log_buf);

        if (free_puser == TRUE)
          free(puser);

        free(pgrpn);

        return(PBSE_BADGRP); /* user not in group */
        }
      }
    }    /* END if ((pgrpn = getegroup(pjob,pattr))) */

  /* set new group */

  pattr = attrry + JOB_ATR_egroup;

  job_attr_def[JOB_ATR_egroup].at_free(pattr);

  job_attr_def[JOB_ATR_egroup].at_decode(pattr, NULL, NULL, pgrpn, 0);

  pattr->at_flags |= addflags;

  /* SUCCESS */
  if (free_puser == TRUE)
    free(puser);

  if (free_pgrpn == TRUE)
    free(pgrpn);

  return(0);
  }  /* END set_jobexid() */
Example #29
0
int execute_job_delete(

  job                  *pjob,            /* M */
  char                 *Msg,             /* I */
  struct batch_request *preq)            /* I */

  {
  struct work_task *pwtnew;

  int               rc;
  char             *sigt = "SIGTERM";

  int               has_mutex = TRUE;
  char              log_buf[LOCAL_LOG_BUF_SIZE];
  time_t            time_now = time(NULL);
  long              force_cancel = FALSE;
  long              array_compatible = FALSE;

  chk_job_req_permissions(&pjob,preq);

  if (pjob == NULL)
    {
    /* preq is rejected in chk_job_req_permissions here */
    return(-1);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /* see note in req_delete - not sure this is possible still,
     * but the deleted code is irrelevant now. I will leave this
     * part --dbeer */
    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

    return(-1);
    }

  if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN1 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN2 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN3 )
    {
    /* If JOB_SUBSTATE_PRERUN is being sent to MOM, wait until she gets it going */
    /* retry in one second                                                       */
    /* If JOB_SUBSTATE_RERUN, RERUN1, RERUN2 or RERUN3, the
       job is being requeued. Wait until it finishes */

    static time_t  cycle_check_when = 0;
    static char    cycle_check_jid[PBS_MAXSVRJOBID + 1];

    if (cycle_check_when != 0)
      {
      if (!strcmp(pjob->ji_qs.ji_jobid, cycle_check_jid) &&
          (time_now - cycle_check_when > 10))
        {
        /* state not updated after 10 seconds */

        /* did the mom ever get it? delete it anyways... */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;

        goto jump;
        }

      if (time_now - cycle_check_when > 20)
        {
        /* give up after 20 seconds */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;
        }
      }    /* END if (cycle_check_when != 0) */

    if (cycle_check_when == 0)
      {
      /* new PRERUN job located */

      cycle_check_when = time_now;
      strcpy(cycle_check_jid, pjob->ji_qs.ji_jobid);
      }

    sprintf(log_buf, "job cannot be deleted, state=PRERUN, requeuing delete request");

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    pwtnew = set_task(WORK_Timed,time_now + 1,post_delete_route,preq,FALSE);
    
    unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);

    if (pwtnew == NULL)
      {
      req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

      return(-1);
      }
    else
      {
      return(ROUTE_DELETE);
      }
    }  /* END if (substate is PRERUN or one of the RERUN substates) */

jump:

  /*
   * Log delete and if requesting client is not job owner, send mail.
   */

  sprintf(log_buf, "requestor=%s@%s", preq->rq_user, preq->rq_host);


  /* NOTE:  should annotate accounting record with extend message (NYI) */
  account_record(PBS_ACCT_DEL, pjob, log_buf);

  sprintf(log_buf, msg_manager, msg_deletejob, preq->rq_user, preq->rq_host);

  log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

  /* NOTE:  should incorporate job delete message */

  if (Msg != NULL)
    {
    /* have text message in request extension, add it */
    strcat(log_buf, "\n");
    strcat(log_buf, Msg);
    }

  if ((svr_chk_owner(preq, pjob) != 0) &&
      (pjob->ji_has_delete_nanny == FALSE))
    {
    /* only send email if the owner did not delete the job and a delete of
       this job has not been previously attempted */

    svr_mailowner(pjob, MAIL_DEL, MAIL_FORCE, log_buf);
    /*
     * If we sent mail and already sent the extra message
     * then reset message so we don't trigger a redundant email
     * in job_abt()
     */

    if (Msg != NULL)
      {
      Msg = NULL;
      }
    }

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, change restart comment if failed */

    change_restart_comment_if_needed(pjob);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * setup a nanny task to make sure the job is actually deleted (see the
     * comments at job_delete_nanny()).
     */

    if (pjob->ji_has_delete_nanny == TRUE)
      {
      unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

      req_reject(PBSE_IVALREQ, 0, preq, NULL, "job cancel in progress");

      return(-1);
      }

    apply_job_delete_nanny(pjob, time_now + 60);

    /*
     * Send signal request to MOM.  The server will automagically
     * pick up and "finish" off the client request when MOM replies.
     */
    get_batch_request_id(preq);

    if ((rc = issue_signal(&pjob, sigt, post_delete_mom1, strdup(preq->rq_id))))
      {
      /* can't send to MOM */

      req_reject(rc, 0, preq, NULL, NULL);
      }

    /* normally will ack reply when mom responds */
    if (pjob != NULL)
      {
      sprintf(log_buf, msg_delrunjobsig, sigt);
      log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);
  
      unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
      }

    return(-1);
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* make a cleanup task if set */
  get_svr_attr_l(SRV_ATR_JobForceCancelTime, &force_cancel);
  if (force_cancel > 0)
    {
    char *dup_jobid = strdup(pjob->ji_qs.ji_jobid);
 
    set_task(WORK_Timed, time_now + force_cancel, ensure_deleted, dup_jobid, FALSE);    
    }

  /* if configured, and this job didn't have a slot limit hold, free a job
   * held with the slot limit hold */
  get_svr_attr_l(SRV_ATR_MoabArrayCompatible, &array_compatible);
  if ((array_compatible != FALSE) &&
      ((pjob->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) == FALSE))
    {
    if ((pjob->ji_arraystruct != NULL) &&
        (pjob->ji_is_array_template == FALSE))
      {
      int        i;
      int        newstate;
      int        newsub;
      job       *tmp;
      job_array *pa = get_jobs_array(&pjob);

      if (pjob == NULL)
        return(-1);

      for (i = 0; i < pa->ai_qs.array_size; i++)
        {
        if (pa->job_ids[i] == NULL)
          continue;

        if (!strcmp(pa->job_ids[i], pjob->ji_qs.ji_jobid))
          continue;

        if ((tmp = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
          {
          free(pa->job_ids[i]);
          pa->job_ids[i] = NULL;
          }
        else
          {
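          /* release the slot-limit hold on the first held sibling job,
           * re-evaluate its state, and save it */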
          if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l)
            {
            tmp->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;
            
            if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
              {
              tmp->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
              }
            
            svr_evaljobstate(tmp, &newstate, &newsub, 1);
            svr_setjobstate(tmp, newstate, newsub, FALSE);
            job_save(tmp, SAVEJOB_FULL, 0);

            unlock_ji_mutex(tmp, __func__, "5", LOGLEVEL);
            
            break;
            }

          unlock_ji_mutex(tmp, __func__, "6", LOGLEVEL);
          }
        }

      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "%s: unlocking ai_mutex", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }
      pthread_mutex_unlock(pa->ai_mutex);
      }
    } /* END MoabArrayCompatible check */

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, do end job processing */
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE);

    /* force new connection */
    pjob->ji_momhandle = -1;

    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "calling on_job_exit from %s", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      }

    set_task(WORK_Immed, 0, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in files; remove them */

    remove_stagein(&pjob);

    if (pjob != NULL)
      job_abt(&pjob, Msg);

    has_mutex = FALSE;
    }
  else
    {
    /*
     * the job is not transiting (though it may have been) and
     * is not running, so put it into a complete state.
     */
    struct pbs_queue *pque;
    int  KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE, FALSE);

    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      pque->qu_numcompleted++;

      unlock_queue(pque, __func__, NULL, LOGLEVEL);
      
      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "calling on_job_exit from %s", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }
    
      pthread_mutex_lock(server.sv_attr_mutex);
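      /* completed jobs are retained for keep_completed seconds; the queue
       * setting, when present, takes precedence over the server default */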
      KeepSeconds = attr_ifelse_long(
                    &pque->qu_attr[QE_ATR_KeepCompleted],
                    &server.sv_attr[SRV_ATR_KeepCompleted],
                    0);
      pthread_mutex_unlock(server.sv_attr_mutex);
      }
    else
      KeepSeconds = 0;

    if (pjob != NULL)
      {
      set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
      }
    else
      has_mutex = FALSE;
    }  /* END else (job not running, not checkpointed, no staged-in files) */

  if (has_mutex == TRUE)
    unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL);

  return(PBSE_NONE);
  } /* END execute_job_delete() */
Example #30
0
int process_alps_status(

  const char               *nd_name,
  std::vector<std::string> &status_info)

  {
  const char    *ccu_p = NULL;
  char           *current_node_id = NULL;
  struct pbsnode *parent;
  struct pbsnode *current = NULL;
#ifdef PENABLE_LINUX_CGROUPS
  int             numa_nodes = 0;
  int             sockets = 0;
#endif
  std::string     temp;
  container::item_container<const char *> rsv_ht;
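  /* rsv_ht remembers which reservation ids have already been processed in
   * this status update so each one is handled only once */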
  char            log_buf[LOCAL_LOG_BUF_SIZE];

  /* if we can't find the parent node, ignore the update */
  if ((parent = find_nodebyname(nd_name)) == NULL)
    return(PBSE_NONE);

  /* loop over each string */
  for (unsigned int i = 0; i < status_info.size(); i++)
    {
    const char *str = status_info[i].c_str();

    if (!strncmp(str, "node=", strlen("node=")))
      {
      if (i != 0)
        {
        if (current != NULL)
          save_node_status(current, temp);
      
        temp.clear();
        }

      if ((current = determine_node_from_str(str, parent, current)) == NULL)
        break;
      else
        {
#ifdef PENABLE_LINUX_CGROUPS
        sockets = 0;
        numa_nodes = 0;
#endif

        continue;
        }
      }

    if (current == NULL)
      continue;

    /* process the gpu status information separately */
    if (!strcmp(CRAY_GPU_STATUS_START, str))
      {
      process_gpu_status(current, i, status_info);
      continue;
      }
    else if (!strncmp(reservation_id, str, strlen(reservation_id)))
      {
      const char *just_rsv_id = str + strlen(reservation_id);
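      /* just_rsv_id now points past the reservation_id prefix to the bare id */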

      rsv_ht.lock();
      if (rsv_ht.find(just_rsv_id) == NULL)
        {
        rsv_ht.insert(just_rsv_id,just_rsv_id);
        rsv_ht.unlock();

        /* sub-functions will attempt to lock a job, so we must unlock the
         * reporter node */
        parent->unlock_node(__func__, NULL, LOGLEVEL);

        process_reservation_id(current, str);

        current_node_id = strdup(current->get_name());
        current->unlock_node(__func__, NULL, LOGLEVEL);

        /* re-lock the parent */
        if ((parent = find_nodebyname(nd_name)) == NULL)
          {
          /* reporter node disappeared - this shouldn't be possible */
          log_err(PBSE_UNKNODE, __func__, "Alps reporter node disappeared while recording a reservation");
          free(current_node_id);
          return(PBSE_NONE);
          }

        if ((current = find_node_in_allnodes(parent->alps_subnodes, current_node_id)) == NULL)
          {
          /* current node disappeared, this shouldn't be possible either */
          parent->unlock_node(__func__, NULL, LOGLEVEL);
          snprintf(log_buf, sizeof(log_buf), "Current node '%s' disappeared while recording a reservation",
            current_node_id);
          log_err(PBSE_UNKNODE, __func__, log_buf);
          free(current_node_id);
          return(PBSE_NONE);
          }

        free(current_node_id);
        current_node_id = NULL;
        }
      else
        {
        rsv_ht.unlock();
        }
      }
    /* save this as is to the status strings */
    else
      {
      if (temp.size() > 0)
        temp += ",";
      temp += str;
      }

    /* perform any special processing */
    if (!strncmp(str, ccu_eq, ac_ccu_eq_len))
      {
      /* save compute unit count in case we need it */
      /* note: this string (ccu_eq (CCU=)) needs to be found before cproc_eq (CPROCS=) */
      /*  for the node */
      ccu_p = str;
      }
    else if (!strncmp(str, cproc_eq, ac_cproc_eq_len))
      {
      int ncpus;
      long svr_nppcu_value = 0;

      /*
       * Get the server nppcu value which determines how Hyper-Threaded
       * cores are reported. When server nppcu value is:
       *
       *  0 - Let ALPS choose whether or not to use Hyper-Threaded cores 
       *      (report all cores)
       *  1 - Do not use Hyper-Threaded cores
       *      (report only physical cores, i.e. the compute unit count)
       *  2 - Use Hyper-Threaded cores
       *      (report all cores)
       */
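      /* for example: with CCU=16 and CPROCS=32 in the status, nppcu=1 reports
       * ncpus=16 while nppcu=0 or nppcu=2 reports ncpus=32 */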
      get_svr_attr_l(SRV_ATR_nppcu, &svr_nppcu_value);

      if (svr_nppcu_value == NPPCU_NO_USE_HT && ccu_p != NULL)
        {
        /* no HT (nppcu==1), so use compute unit count */
        ncpus = atoi(ccu_p + ac_ccu_eq_len);

        /* use CPROC value if we are using APBASIL protocol < 1.3 */
        if (ncpus == 0)
          ncpus = atoi(str + ac_cproc_eq_len);

        /* reset the pointer */
        ccu_p = NULL;
        }
      else
        {
        /* let ALPS choose (nppcu==0) or use HT (nppcu==2), use actual processor count */
        ncpus = atoi(str + ac_cproc_eq_len);
        }

      set_ncpus(current, parent, ncpus);

#ifdef PENABLE_LINUX_CGROUPS
      if (numa_nodes == 0)
        numa_nodes = 1;

      if ((current->nd_layout.is_initialized() == false) ||
          (current->nd_layout.getTotalThreads() != current->nd_slots.get_total_execution_slots()))
        {
        Machine m(current->nd_slots.get_total_execution_slots(), numa_nodes, sockets);
        current->nd_layout = m;
        }
#endif
      }
    else if (!strncmp(str, state, strlen(state)))
      {
      set_state(current, str);
      }
#ifdef PENABLE_LINUX_CGROUPS
    else if (!strncmp(str, "totmem", 6))
      {
      set_total_memory(current, str);
      }
    else if (!strncmp(str, numas, 10))
      {
      // 11 is strlen("numa_nodes=")
      numa_nodes = strtol(str + 11, NULL, 10);
      }
    else if (!strncmp(str, "socket", 6))
      {
      // 7 is strlen("socket=")
      sockets = strtol(str + 7, NULL, 10);
      }
#endif

    } /* END processing the status update */

  if (current != NULL)
    {
    save_node_status(current, temp);
    current->unlock_node(__func__, NULL, LOGLEVEL);
    }

  parent->unlock_node(__func__, NULL, LOGLEVEL);

  return(PBSE_NONE);
  } /* END process_alps_status() */
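
A hedged usage sketch for process_alps_status(): the status_info argument is a flat
list of "key=value" strings, with each node's block introduced by a "node=" entry.
The function name example_alps_update() and the reporter-node name below are
illustrative assumptions; only the node=, CCU= and CPROCS= keys are taken from the
function above.

#include <string>
#include <vector>

void example_alps_update()

  {
  std::vector<std::string> status_info;

  /* one node block: id, compute-unit count, processor count */
  status_info.push_back("node=24");
  status_info.push_back("CCU=16");
  status_info.push_back("CPROCS=32");

  /* hypothetical ALPS reporter node name */
  process_alps_status("alps_reporter", status_info);
  }  /* END example_alps_update() */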