Example #1
job *svr_find_job(

  char *jobid,      /* I */
  int   get_subjob) /* I */

  {
  char *at;
  char *comp;
  int   different = FALSE;
  char *dash = NULL;
  char *dot = NULL;
  char  without_dash[PBS_MAXSVRJOBID + 1];

  job  *pj = NULL;

  if (LOGLEVEL >= 10)
    LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, jobid);

  if ((at = strchr(jobid, '@')) != NULL)
    *at = '\0'; /* strip off @server_name */

  /* jobid-0.server indicates the external sub-job of a heterogeneous
   * job. For this case we want to get jobid.server, find that, and
   * then get the external sub-job */
  if (get_subjob == TRUE)
    {
    dot = strchr(jobid, '.');

    if (((dash = strchr(jobid, '-')) != NULL) &&
        (dot != NULL) &&
        (dash < dot))
      {
      *dash = '\0';
      snprintf(without_dash, sizeof(without_dash), "%s%s", jobid, dash + 2);
      jobid = without_dash;
      }
    else
      dash = NULL;
    }

  if ((is_svr_attr_set(SRV_ATR_display_job_server_suffix)) ||
      (is_svr_attr_set(SRV_ATR_job_suffix_alias)))
    {
    comp = get_correct_jobname(jobid);
    different = TRUE;

    if (comp == NULL)
      return(NULL);
    }
  else
    {
    comp = jobid;
    }

  if (strstr(jobid,"[]") == NULL)
    {
    /* if we're searching for the external sub-job we want find_job_by_array
     * to return the parent, but if we're searching for the cray subjob then
     * we want find_job_by_array to return the sub-job */
    pj = find_job_by_array(&alljobs, comp, (dash != NULL) ? FALSE : get_subjob);
    }

  /* when jobs are routed to a remote server they are removed from the
   * regular job list first and from the array summary afterward, so if
   * the job was not found above (or it is an array job), look for it
   * in the array summary as well */
  if (pj == NULL)
    {
    /* see the comment on the above call to find_job_by_array() */
    pj = find_job_by_array(&array_summary, comp, (dash != NULL) ? FALSE : get_subjob);
    }

  if (at)
    *at = '@'; /* restore @server_name */

  if ((get_subjob == TRUE) &&
      (pj != NULL))
    {
    if (dash != NULL)
      {
      *dash = '-';
      
      if (pj->ji_external_clone != NULL)
        {
        pj = pj->ji_external_clone;
        
        lock_ji_mutex(pj, __func__, NULL, 0);
        unlock_ji_mutex(pj->ji_parent_job, __func__, NULL, 0);

        if (pj->ji_being_recycled == TRUE)
          {
          unlock_ji_mutex(pj, __func__, NULL, 0);
          pj = NULL;
          }
        }
      else
        {
        unlock_ji_mutex(pj, __func__, NULL, 0);
        pj = NULL;
        }
      }
    }

  if (different)
    free(comp);

  return(pj);  /* may be NULL */
  }   /* END svr_find_job() */
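
A caller-side sketch (not from the original source) of the locking contract implied above: svr_find_job() returns the job with ji_mutex held, so the caller owns the unlock. The job id is a hypothetical placeholder.

  job *pjob = svr_find_job((char *)"42.myserver", FALSE); /* hypothetical id */

  if (pjob != NULL)
    {
    /* ... inspect or modify the locked job here ... */

    unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);
    }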
Example #2
int req_rerunjob(
   
  struct batch_request *preq)

  {
  int     rc = PBSE_NONE;
  job    *pjob;

  int     Force;
  int     MgrRequired = TRUE;
  char    log_buf[LOCAL_LOG_BUF_SIZE];

  /* check if requestor is admin, job owner, etc */

  if ((pjob = chk_job_request(preq->rq_ind.rq_rerun, preq)) == 0)
    {
    /* FAILURE */

    /* chk_job_request calls req_reject() */

    rc = PBSE_SYSTEM;
    return rc; /* This needs to be fixed to return an accurate error */
    }

  /* the job must be running or completed */

  if (pjob->ji_qs.ji_state >= JOB_STATE_EXITING)
    {
    if (pjob->ji_wattr[JOB_ATR_checkpoint_name].at_flags & ATR_VFLAG_SET)
      {
      /* allow end-users to rerun checkpointed jobs */

      MgrRequired = FALSE;
      }
    }
  else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* job is running */

    /* NO-OP */
    }
  else
    {
    /* FAILURE - job is in bad state */
    rc = PBSE_BADSTATE;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s is in a bad state",
        preq->rq_ind.rq_rerun);
    req_reject(rc, 0, preq, NULL, log_buf);
    unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
    return rc;
    }

  if ((MgrRequired == TRUE) &&
      ((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0))
    {
    /* FAILURE */

    rc = PBSE_PERM;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
        "additional permissions required (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)");
    req_reject(rc, 0, preq, NULL, log_buf);
    unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
    return rc;
    }

  /* the job must be rerunnable */

  if (pjob->ji_wattr[JOB_ATR_rerunable].at_val.at_long == 0)
    {
    /* NOTE:  should force override this constraint? maybe (???) */
    /*          no, the user is saying that the job will break, and
                IEEE Std 1003.1 specifically says rerun is to be rejected
                if rerunable==FALSE -garrick */

    rc = PBSE_NORERUN;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s not rerunnable",
        preq->rq_ind.rq_rerun);
    req_reject(rc, 0, preq, NULL, log_buf);
    unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
    return rc;
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* ask MOM to kill off the job if it is running */
    static const char *rerun = "rerun";
    char              *extra = strdup(rerun);

    rc = issue_signal(&pjob, "SIGKILL", post_rerun, extra);
    }
  else
    { 
    if (pjob->ji_wattr[JOB_ATR_hold].at_val.at_long == HOLD_n)
      {
      svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_QUEUED, FALSE);
      }
    else
      {
      svr_setjobstate(pjob, JOB_STATE_HELD, JOB_SUBSTATE_HELD, FALSE);
      }

    /* reset some job attributes */
    
    pjob->ji_wattr[JOB_ATR_comp_time].at_flags &= ~ATR_VFLAG_SET;
    pjob->ji_wattr[JOB_ATR_reported].at_flags &= ~ATR_VFLAG_SET;

    set_statechar(pjob);

    rc = -1;
    }

  if (preq->rq_extend && !strncasecmp(preq->rq_extend, RERUNFORCE, strlen(RERUNFORCE)))
    Force = 1;
  else
    Force = 0;

  switch (rc)
    {

    case -1:

      /* completed job was requeued */

      /* clear out job completion time if there is one */
      break;

    case 0:

      /* requeue request successful */

      if (pjob != NULL)
        pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

      break;

    case PBSE_SYSTEM: /* This may not be accurate...*/
      rc = PBSE_MEM_MALLOC;
      snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Can not allocate memory");
      req_reject(rc, 0, preq, NULL, log_buf);
      return rc;
      break;

    default:

      if (Force == 0)
        {
        rc = PBSE_MOMREJECT;
        snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Rejected by mom");
        req_reject(rc, 0, preq, NULL, log_buf);
        if (pjob != NULL)
          unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);
        return rc;
        }
      else
        {
        int           newstate;
        int           newsubst;
        unsigned int  dummy;
        char         *tmp;
        long          cray_enabled = FALSE;
       
        if (pjob != NULL)
          {
          get_svr_attr_l(SRV_ATR_CrayEnabled, &cray_enabled);

          if ((cray_enabled == TRUE) &&
              (pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str != NULL))
            tmp = parse_servername(pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str, &dummy);
          else
            tmp = parse_servername(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, &dummy);
          
          /* Cannot communicate with MOM, forcibly requeue job.
             This is a relatively disgusting thing to do */
          
          sprintf(log_buf, "rerun req to %s failed (rc=%d), forcibly requeueing job",
            tmp, rc);

          free(tmp);
  
          log_event(
            PBSEVENT_ERROR | PBSEVENT_ADMIN | PBSEVENT_JOB,
            PBS_EVENTCLASS_JOB,
            pjob->ji_qs.ji_jobid,
            log_buf);
          
          log_err(-1, __func__, log_buf);
          
          strcat(log_buf, ", previous output files may be lost");
  
          svr_mailowner(pjob, MAIL_OTHER, MAIL_FORCE, log_buf);
  
          svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_RERUN3, FALSE);
  
          rel_resc(pjob); /* free resc assigned to job */
          
          if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_HOTSTART) == 0)
            {
            /* in case of server shutdown, don't clear exec_host */
            /* will use it on hotstart when next comes up        */
            
            job_attr_def[JOB_ATR_exec_host].at_free(&pjob->ji_wattr[JOB_ATR_exec_host]);
  
            job_attr_def[JOB_ATR_session_id].at_free(&pjob->ji_wattr[JOB_ATR_session_id]);
            
            job_attr_def[JOB_ATR_exec_gpus].at_free(&pjob->ji_wattr[JOB_ATR_exec_gpus]);          
            }
          
          pjob->ji_modified = 1;    /* force full job save */
          
          pjob->ji_momhandle = -1;
          pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_StagedIn;
          
          svr_evaljobstate(pjob, &newstate, &newsubst, 0);
          svr_setjobstate(pjob, newstate, newsubst, FALSE);
          }
        }

      break;
    }  /* END switch (rc) */

  /* So job has run and is to be rerun (not restarted) */
  if (pjob == NULL)
    {
    rc = PBSE_JOB_RERUN;
    }
  else
    {
    pjob->ji_qs.ji_svrflags = (pjob->ji_qs.ji_svrflags &
        ~(JOB_SVFLG_CHECKPOINT_FILE |JOB_SVFLG_CHECKPOINT_MIGRATEABLE |
          JOB_SVFLG_CHECKPOINT_COPIED)) | JOB_SVFLG_HASRUN;
    
    sprintf(log_buf, msg_manager, msg_jobrerun, preq->rq_user, preq->rq_host);
    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    reply_ack(preq);
  
    /* note in accounting file */
    account_record(PBS_ACCT_RERUN, pjob, NULL);
    unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL);
    }

  return rc;
  }  /* END req_rerunjob() */
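
The Force flag above hinges on a case-insensitive prefix match against the request extension. A standalone sketch of just that test; the value given for RERUNFORCE is an assumption for illustration (the real macro lives in the server headers).

#include <string.h>
#include <strings.h>

#define RERUNFORCE "force" /* assumed value, for illustration only */

static int is_forced_rerun(const char *rq_extend)
  {
  return ((rq_extend != NULL) &&
          (strncasecmp(rq_extend, RERUNFORCE, strlen(RERUNFORCE)) == 0));
  }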
Example #3
void req_stat_job_step2(

  struct stat_cntl *cntl)  /* I/O (free'd on return) */

  {
  batch_request         *preq = cntl->sc_origrq;
  svrattrl              *pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);
  job                   *pjob = NULL;

  struct batch_reply    *preply = &preq->rq_reply;
  int                    rc = 0;
  enum TJobStatTypeEnum  type = (enum TJobStatTypeEnum)cntl->sc_type;
  bool                   exec_only = false;

  int                    bad = 0;
  /* delta time - only report full pbs_attribute list if J->MTime > DTime */
  int                    job_array_index = -1;
  job_array             *pa = NULL;
  all_jobs_iterator     *iter;

  if (preq->rq_extend != NULL)
    {
    /* FORMAT:  { EXECQONLY } */
    if (strstr(preq->rq_extend, EXECQUEONLY))
      exec_only = true;
    }

  if ((type == tjstTruncatedServer) || 
      (type == tjstTruncatedQueue))
    {
    handle_truncated_qstat(exec_only, cntl->sc_condensed, preq);

    return;
    } /* END if ((type == tjstTruncatedServer) || ...) */
  else if (type == tjstJob)
    {
    pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE);

    if (pjob != NULL)
      {
      if ((rc = status_job(pjob, preq, pal, &preply->brp_un.brp_status, cntl->sc_condensed, &bad)))
        req_reject(rc, bad, preq, NULL, NULL);
      else
        reply_send_svr(preq);

      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
      }
    else
      {
      req_reject(PBSE_JOBNOTFOUND, bad, preq, NULL, NULL);
      }
    }
  else
    {
    if (type == tjstArray)
      {
      pa = get_array(preq->rq_ind.rq_status.rq_id);

      if (pa == NULL)
        {
        req_reject(PBSE_UNKARRAYID, 0, preq, NULL, "unable to find array");
        return;
        }
      }
    else if ((type == tjstSummarizeArraysQueue) || 
             (type == tjstSummarizeArraysServer))
      update_array_statuses();

    iter = get_correct_status_iterator(cntl);

    for (pjob = get_next_status_job(cntl, job_array_index, pa, iter);
         pjob != NULL;
         pjob = get_next_status_job(cntl, job_array_index, pa, iter))
      {
      mutex_mgr job_mutex(pjob->ji_mutex, true);

      /* go ahead and build the status reply for this job */
      if (pjob->ji_being_recycled == true)
        continue;

      if (exec_only)
        {
        if (cntl->sc_pque != NULL)
          {
          if (cntl->sc_pque->qu_qs.qu_type != QTYPE_Execution)
            continue;
          }
        else if (in_execution_queue(pjob, pa) == false)
          continue;
        }

      rc = status_job(pjob, preq, pal, &preply->brp_un.brp_status, cntl->sc_condensed, &bad);

      if ((rc != PBSE_NONE) && 
          (rc != PBSE_PERM))
        {
        if (pa != NULL)
          unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

        req_reject(rc, bad, preq, NULL, NULL);

        delete iter;

        return;
        }
      }  /* END for (pjob != NULL) */

    delete iter;

    if (pa != NULL)
      {
      unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
      }
   
    reply_send_svr(preq);
    }

  if (LOGLEVEL >= 7)
    {
    log_event(PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      "req_statjob",
      "Successfully returned the status of queued jobs\n");
    }

  return;
  }  /* END req_stat_job_step2() */
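
The status loop above leans on mutex_mgr, an RAII wrapper: constructing it with (mutex, true) adopts an already-held lock, and the destructor releases it, so 'continue' and early 'return' paths cannot leak ji_mutex. A minimal sketch of the idiom; the iterator names are hypothetical.

  for (pjob = first_locked_job(); pjob != NULL; pjob = next_locked_job())
    {
    mutex_mgr job_mutex(pjob->ji_mutex, true); /* adopt the held lock */

    if (pjob->ji_being_recycled == true)
      continue; /* destructor releases ji_mutex here */

    /* ... build the status reply for this job ... */
    }  /* destructor also releases on every other exit from the block */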
int relay_to_mom(

  job                   **pjob_ptr,
  struct batch_request   *request, /* the request to send */
  void                  (*func)(struct work_task *))

  {
  int             handle; /* a client style connection handle */
  int             rc;
  int             local_errno = 0;
  pbs_net_t       addr;
  unsigned short  port;
  job            *pjob = *pjob_ptr;
  char            jobid[PBS_MAXSVRJOBID + 1];
  char *job_momname = NULL;

  struct pbsnode *node;
  char            log_buf[LOCAL_LOG_BUF_SIZE];

  /* if MOM is down don't try to connect */
  addr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr;
  port = pjob->ji_qs.ji_un.ji_exect.ji_momport;
  job_momname = strdup(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str);
  if (job_momname == NULL)
    return PBSE_MEM_MALLOC;

  if ((node = tfind_addr(addr, port, job_momname)) == NULL)
    {
    free(job_momname);
    return(PBSE_NORELYMOM);
    }
  free(job_momname);

  if ((node != NULL) &&
      (node->nd_state & INUSE_DOWN))
    {
    unlock_node(node, __func__, "no rely mom", LOGLEVEL);
    return(PBSE_NORELYMOM);
    }

  if (LOGLEVEL >= 7)
    {
    char *tmp = netaddr_pbs_net_t(pjob->ji_qs.ji_un.ji_exect.ji_momaddr);
    sprintf(log_buf, "momaddr=%s",tmp);

    log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf);

    free(tmp);
    }

  unlock_node(node, __func__, "after svr_connect", LOGLEVEL);
  handle = svr_connect(
           pjob->ji_qs.ji_un.ji_exect.ji_momaddr,
           pjob->ji_qs.ji_un.ji_exect.ji_momport,
           &local_errno,
           NULL,
           NULL,
           ToServerDIS);
    

  if (handle < 0)
    {
    log_event(PBSEVENT_ERROR,PBS_EVENTCLASS_REQUEST,"",msg_norelytomom);

    return(PBSE_NORELYMOM);
    }

  strcpy(jobid, pjob->ji_qs.ji_jobid);
  unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);

  request->rq_orgconn = request->rq_conn; /* save client socket */

  rc = issue_Drequest(handle, request);

  *pjob_ptr = svr_find_job(jobid, TRUE);

  return(rc);
  }  /* END relay_to_mom() */
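
relay_to_mom() condenses to a pattern that recurs throughout these examples: copy the job id, drop ji_mutex before blocking network I/O, then re-acquire the job by id, because it may have been requeued or recycled while unlocked. The skeleton, assuming the declarations above:

  strcpy(jobid, pjob->ji_qs.ji_jobid);              /* remember the identity       */
  unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);  /* never block holding a mutex */

  rc = issue_Drequest(handle, request);             /* blocking network I/O        */

  *pjob_ptr = svr_find_job(jobid, TRUE);            /* re-acquire; may be NULL     */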
int req_signaljob(

  void *vp)  /* I */

  {
  struct batch_request *preq = (struct batch_request *)vp;
  job                  *pjob;
  int                   rc;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  struct batch_request *dup_req = NULL;

  /* preq free'd in error cases */
  if ((pjob = chk_job_request(preq->rq_ind.rq_signal.rq_jid, preq)) == 0)
    {
    return(PBSE_NONE);
    }

  /* the job must be running */

  if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
    req_reject(PBSE_BADSTATE, 0, preq, NULL, NULL);

    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
    return(PBSE_NONE);
    }

  /* Special pseudo signals for suspend and resume require op/mgr */

  if (!strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_RESUME) ||
      !strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_SUSPEND))
    {
    if ((preq->rq_perm & (ATR_DFLAG_OPRD | ATR_DFLAG_OPWR | ATR_DFLAG_MGRD | ATR_DFLAG_MGWR)) == 0)
      {
      /* for suspend/resume, must be mgr/op */
      req_reject(PBSE_PERM, 0, preq, NULL, NULL);
      
      unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
      return(PBSE_NONE);
      }
  
    }

  /* save job ptr for post_signal_req() */
  preq->rq_extra = strdup(pjob->ji_qs.ji_jobid);

  /* FIXME: need a race-free check for available free subnodes before
   * resuming a suspended job */

#ifdef DONOTSUSPINTJOB
  /* interactive jobs don't resume correctly so don't allow a suspend */

  if (!strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_SUSPEND) &&
      (pjob->ji_wattr[JOB_ATR_interactive].at_flags & ATR_VFLAG_SET) &&
      (pjob->ji_wattr[JOB_ATR_interactive].at_val.at_long > 0))
    {
    req_reject(PBSE_JOBTYPE, 0, preq, NULL, NULL);

    unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
    return(PBSE_NONE);
    }

#endif

  if (LOGLEVEL >= 6)
    {
    sprintf(log_buf, "relaying signal request to mom %lu", pjob->ji_qs.ji_un.ji_exect.ji_momaddr);

    log_record(PBSEVENT_SCHED,PBS_EVENTCLASS_REQUEST,"req_signaljob",log_buf);
    }

  /* send reply for asynchronous suspend */
  if (preq->rq_type == PBS_BATCH_AsySignalJob)
    {
    reply_ack(preq);
    preq->rq_noreply = TRUE;
    }

  /* pass the request on to MOM */

  if ((rc = copy_batchrequest(&dup_req, preq, 0, -1)) != 0)
    {
    req_reject(rc, 0, preq, NULL, "can not allocate memory");
    unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
    }
  /* The dup_req is freed in relay_to_mom (failure)
   * or in issue_Drequest (success) */
  else 
    {
    rc = relay_to_mom(&pjob, dup_req, NULL);

    if (pjob != NULL)
      unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);

    if (rc != PBSE_NONE)
      {
      free_br(dup_req);
      req_reject(rc, 0, preq, NULL, NULL);  /* unable to get to MOM */
      }
    else
      {
      post_signal_req(dup_req);
      free_br(preq);
      }
    }

  /* If successful we ack after mom replies to us, we pick up in post_signal_req() */

  return(PBSE_NONE);
  }  /* END req_signaljob() */
void *delete_all_work(

  void *vp)

  {
  batch_request *preq = (batch_request *)vp;
  batch_request *preq_dup = duplicate_request(preq);
  job           *pjob;
  int            iter = -1;
  int            failed_deletes = 0;
  int            total_jobs = 0;
  int            rc = PBSE_NONE;
  char           tmpLine[MAXLINE];
  char          *Msg = preq->rq_extend;
  
  while ((pjob = next_job(&alljobs, &iter)) != NULL)
    {
    if ((rc = forced_jobpurge(pjob, preq_dup)) == PURGE_SUCCESS)
      {
      continue;
      }

    if (pjob->ji_qs.ji_state >= JOB_STATE_EXITING)
      {
      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
      
      continue;
      }
    
    total_jobs++;
    
    /* mutex is freed below */
    if (rc == PBSE_NONE)
      {
      if ((rc = execute_job_delete(pjob, Msg, preq_dup)) == PBSE_NONE)
        reply_ack(preq_dup);
       
      /* mark this as NULL because it has been freed */
      if (rc == PURGE_SUCCESS)
        {
        preq_dup = duplicate_request(preq);
        preq_dup->rq_noreply = TRUE;
        }
      else
        preq_dup = NULL;
      }
    
    if (rc != PURGE_SUCCESS)
      {
      /* duplicate the preq so we don't have a problem with double frees */
      preq_dup = duplicate_request(preq);
      preq_dup->rq_noreply = TRUE;
      
      if ((rc == MOM_DELETE) ||
          (rc == ROUTE_DELETE))
        failed_deletes++;
      }
    }
  
  if (failed_deletes == 0)
    {
    reply_ack(preq);

    /* PURGE SUCCESS means this was qdel -p all. In this case no reply_*() 
     * functions have been called */
    if (rc == PURGE_SUCCESS)
      {
      free_br(preq_dup);
      preq_dup = NULL;
      }
    }
  else
    {
    snprintf(tmpLine,sizeof(tmpLine),"Deletes failed for %d of %d jobs",
      failed_deletes,
      total_jobs);
    
    req_reject(PBSE_SYSTEM, 0, preq, NULL, tmpLine);
    }
    
  /* preq_dup is re-created at the end of each loop iteration, so free
   * the extra one if it is left over */
  if (preq_dup != NULL)
    free_br(preq_dup);

  return(NULL);
  } /* END delete_all_work() */
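
delete_all_work() illustrates a one-request-per-reply discipline: whichever call answers the client (reply_ack() or req_reject()) consumes the batch_request it is handed, so the loop re-creates preq_dup after every hand-off instead of reusing a freed pointer. One iteration, condensed from the code above:

  /* hand preq_dup to the delete path; on most outcomes it is consumed there */
  rc = execute_job_delete(pjob, Msg, preq_dup);

  /* re-arm for the next iteration and suppress duplicate client replies */
  preq_dup = duplicate_request(preq);
  preq_dup->rq_noreply = TRUE;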
void finish_routing_processing(

  job *pjob,
  int  status)

  {
  int          newstate;
  int          newsub;

  if (pjob == NULL)
    return;

  if (LOGLEVEL >= 10)
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, pjob->ji_qs.ji_jobid);

  switch (status)
    {
    case LOCUTION_SUCCESS:  /* normal return, job was routed */

      if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn)
        remove_stagein(&pjob);

      if (pjob != NULL)
        {
        if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_COPIED)
          remove_checkpoint(&pjob);

        if (pjob != NULL)
          svr_job_purge(pjob); /* need to remove server job struct */
        }

      break;

    case LOCUTION_FAIL:  /* permanent rejection (or signal) */

      if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_ABORT)
        {
        /* job delete in progress, just set to queued status */
        svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_ABORT, FALSE);
        
        svr_mailowner(pjob, 'a', TRUE, "Couldn't route job to remote server");

        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

        return;
        }

      add_dest(pjob);  /* else mark destination as bad */

      /* fall through */

    default: /* try routing again */
       
      svr_mailowner(pjob, 'a', TRUE, "Couldn't route job to remote server");

      /* force re-eval of job state out of Transit */

      svr_evaljobstate(*pjob, newstate, newsub, 1);
      svr_setjobstate(pjob, newstate, newsub, FALSE);

      if ((status = job_route(pjob)) == PBSE_ROUTEREJ)
        job_abt(&pjob, pbse_to_txt(PBSE_ROUTEREJ));
      else if (status != 0)
        job_abt(&pjob, msg_routexceed);
      else
        unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);


      break;
    }  /* END switch (status) */

  return;
  } /* END finish_routing_processing() */
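
finish_routing_processing() re-tests pjob after remove_stagein() and remove_checkpoint() because both take a job ** and NULL it out when they end up purging the job. A hypothetical helper following the same contract; should_purge() is a stand-in predicate.

void maybe_purge(

  job **pjob_ptr) /* M */

  {
  if (should_purge(*pjob_ptr)) /* hypothetical predicate */
    {
    svr_job_purge(*pjob_ptr);  /* frees the job struct */
    *pjob_ptr = NULL;          /* tell the caller it is gone */
    }
  } /* END maybe_purge() */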
int req_holdjob(

  batch_request *vp) /* I */

  {
  long          *hold_val;
  int            newstate;
  int            newsub;
  long           old_hold;
  job           *pjob;
  char          *pset;
  int            rc;
  pbs_attribute  temphold;
  pbs_attribute *pattr;
  batch_request *preq = (struct batch_request *)vp;
  char           log_buf[LOCAL_LOG_BUF_SIZE];
  batch_request *dup_req = NULL;

  pjob = chk_job_request(preq->rq_ind.rq_hold.rq_orig.rq_objname, preq);

  if (pjob == NULL)
    {
    return(PBSE_NONE);
    }

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  /* cannot do anything until we decode the holds to be set */
  if ((rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, (const char **)&pset,
                     &temphold)) != 0)
    {
    req_reject(rc, 0, preq, NULL, NULL);

    return(PBSE_NONE);
    }

  /* if anything other than HOLD_u is being set, the requester must have privilege */

  if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0)
    {
    req_reject(rc, 0, preq, NULL, NULL);

    return(PBSE_NONE);
    }

  hold_val = &pjob->ji_wattr[JOB_ATR_hold].at_val.at_long;

  old_hold = *hold_val;
  *hold_val |= temphold.at_val.at_long;
  pjob->ji_wattr[JOB_ATR_hold].at_flags |= ATR_VFLAG_SET;
  sprintf(log_buf, msg_jobholdset, pset, preq->rq_user, preq->rq_host);

  pattr = &pjob->ji_wattr[JOB_ATR_checkpoint];

  if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
      ((pattr->at_flags & ATR_VFLAG_SET) &&
       ((csv_find_string(pattr->at_val.at_str, "s") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "c") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "enabled") != NULL))))
    {

    /* have MOM attempt checkpointing */

    /*
    ** The jobid in the request always has the server suffix attached,
    ** which is dropped when the server attribute
    ** 'display_job_server_suffix' is FALSE, as it will be in the MOM's
    ** copy. Therefore, the server's form of the jobid must be passed to
    ** the MOM so she can find the job to hold.
    */
    if (strncmp(pjob->ji_qs.ji_jobid, 
          preq->rq_ind.rq_hold.rq_orig.rq_objname, PBS_MAXSVRJOBID))
       snprintf(preq->rq_ind.rq_hold.rq_orig.rq_objname, 
          sizeof(preq->rq_ind.rq_hold.rq_orig.rq_objname), "%s", 
          pjob->ji_qs.ji_jobid);
    if ((dup_req = duplicate_request(preq)) == NULL)
      {
      req_reject(rc, 0, preq, NULL, "memory allocation failure");
      }
    /* The dup_req is freed in relay_to_mom (failure)
     * or in issue_Drequest (success) */
    else if ((rc = relay_to_mom(&pjob, dup_req, NULL)) != PBSE_NONE)
      {
      free_br(dup_req);
      *hold_val = old_hold;  /* reset to the old value */
      req_reject(rc, 0, preq, NULL, "relay to mom failed");

      if (pjob == NULL)
        job_mutex.set_unlock_on_exit(false);
      }
    else
      {
      if (pjob != NULL)
        {
        pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_FILE;
        
        job_save(pjob, SAVEJOB_QUICK, 0);
        
        /* fill in log_buf again, since relay_to_mom changed it */
        sprintf(log_buf, msg_jobholdset, pset, preq->rq_user, preq->rq_host);
        
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
        pjob = NULL;
        reply_ack(preq);
        }
      else
        job_mutex.set_unlock_on_exit(false);

      process_hold_reply(dup_req);
      }
    }
#ifdef ENABLE_BLCR
  else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * This system is configured with BLCR checkpointing to be used,
     * but this Running job does not have checkpointing enabled,
     * so we reject the request
     */

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    req_reject(PBSE_IVALREQ, 0, preq, NULL,
      "job not held since checkpointing is expected but not enabled for job");
    }
#endif
  else
    {
    /* everything went well, may need to update the job state */
    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    if (old_hold != *hold_val)
      {
      /* indicate attributes changed     */
      pjob->ji_modified = 1;

      svr_evaljobstate(*pjob, newstate, newsub, 0);

      svr_setjobstate(pjob, newstate, newsub, FALSE);
      }

    reply_ack(preq);
    }

  return(PBSE_NONE);
  }  /* END req_holdjob() */
void *req_checkpointjob(

  batch_request *preq) /* I */

  {
  job           *pjob;
  int            rc;
  pbs_attribute *pattr;
  char           log_buf[LOCAL_LOG_BUF_SIZE];
  batch_request *dup_req = NULL;

  if ((pjob = chk_job_request(preq->rq_ind.rq_manager.rq_objname, preq)) == NULL)
    {
    return(NULL);
    }

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  pattr = &pjob->ji_wattr[JOB_ATR_checkpoint];

  if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
      ((pattr->at_flags & ATR_VFLAG_SET) &&
       ((csv_find_string(pattr->at_val.at_str, "s") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "c") != NULL) ||
        (csv_find_string(pattr->at_val.at_str, "enabled") != NULL))))
    {
    /* have MOM attempt checkpointing */

    if ((dup_req = duplicate_request(preq)) == NULL)
      {
      req_reject(PBSE_SYSTEM, 0, preq, NULL, "failure to allocate memory");
      }

    /* The dup_req is freed in relay_to_mom (failure)
     * or in issue_Drequest (success) */
    else if ((rc = relay_to_mom(&pjob, dup_req, NULL)) != PBSE_NONE)
      {
      req_reject(rc, 0, preq, NULL, NULL);
      free_br(dup_req);

      if (pjob == NULL)
        job_mutex.set_unlock_on_exit(false);
      }
    else
      {
      if (pjob != NULL)
        {
        pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHECKPOINT_FILE;
        
        job_save(pjob, SAVEJOB_QUICK, 0);
        snprintf(log_buf, sizeof(log_buf), "checkpoint request relayed to mom");
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
        pjob = NULL;
        }
      else
        job_mutex.set_unlock_on_exit(false);

      process_checkpoint_reply(dup_req);
      }
    }
  else
    {
    /* Job does not have checkpointing enabled, so reject the request */

    snprintf(log_buf, sizeof(log_buf), "job is not checkpointable");
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

    req_reject(PBSE_IVALREQ, 0, preq, NULL, "job is not checkpointable");
    }

  return(NULL);
  }  /* END req_checkpointjob() */
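
req_holdjob() and req_checkpointjob() repeat the same test for a running job with checkpointing enabled. A hypothetical helper that factors it out, using only the fields and calls already shown above:

static int mom_can_checkpoint(

  job *pjob) /* I */

  {
  pbs_attribute *pattr = &pjob->ji_wattr[JOB_ATR_checkpoint];

  return ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
          (pattr->at_flags & ATR_VFLAG_SET) &&
          ((csv_find_string(pattr->at_val.at_str, "s") != NULL) ||
           (csv_find_string(pattr->at_val.at_str, "c") != NULL) ||
           (csv_find_string(pattr->at_val.at_str, "enabled") != NULL)));
  } /* END mom_can_checkpoint() */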
Example #10
job *job_recov(

  char *filename) /* I */   /* pathname to job save file */

  {
  int  fds;
  job  *pj;
  char *pn;
  char  namebuf[MAXPATHLEN];
  char  log_buf[LOCAL_LOG_BUF_SIZE];

#ifndef PBS_MOM
  char       parent_id[PBS_MAXSVRJOBID + 1];
  job_array *pa;
#endif

  pj = job_alloc(); /* allocate & initialize job structure space */

  if (pj == NULL)
    {
    /* FAILURE - cannot alloc memory */

    return(NULL);
    }

  snprintf(namebuf, MAXPATHLEN, "%s%s", path_jobs, filename); /* job directory path, filename */

  fds = open(namebuf, O_RDONLY, 0);

  if (fds < 0)
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "unable to open %s", namebuf);

    log_err(errno, __func__, log_buf);

#ifndef PBS_MOM
    unlock_ji_mutex(pj, __func__, "1", LOGLEVEL);
    free(pj->ji_mutex);
#endif

    free((char *)pj);

    /* FAILURE - cannot open job file */

    return(NULL);
    }

  /* read in job quick save sub-structure */

  if (read_ac_socket(fds, (char *)&pj->ji_qs, sizeof(pj->ji_qs)) != sizeof(pj->ji_qs) &&
      pj->ji_qs.qs_version == PBS_QS_VERSION)
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Unable to read %s", namebuf);

    log_err(errno, __func__, log_buf);

#ifndef PBS_MOM
    unlock_ji_mutex(pj, __func__, "2", LOGLEVEL);
    free(pj->ji_mutex);
#endif

    free((char *)pj);

    close(fds);

    return(NULL);
    }

  /* is ji_qs the version we expect? */

  if (pj->ji_qs.qs_version != PBS_QS_VERSION)
    {
    /* ji_qs is older version */
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
      "%s appears to be from an old version. Attempting to convert.\n",
      namebuf);

    log_err(-1, __func__, log_buf);

    if (job_qs_upgrade(pj, fds, namebuf, pj->ji_qs.qs_version) != 0)
      {
      snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "unable to upgrade %s\n", namebuf);

      log_err(-1, __func__, log_buf);

#ifndef PBS_MOM
      unlock_ji_mutex(pj, __func__, "3", LOGLEVEL);
      free(pj->ji_mutex);
#endif

      free((char *)pj);

      close(fds);

      return(NULL);
      }

    }  /* END if (pj->ji_qs.qs_version != PBS_QS_VERSION) */

  /* Does file name match the internal name? */
  /* This detects ghost files */

  pn = strrchr(namebuf, (int)'/') + 1;

  if (strncmp(pn, pj->ji_qs.ji_fileprefix, strlen(pj->ji_qs.ji_fileprefix)) != 0)
    {
    /* mismatch, discard job */

    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Job Id %s does not match file name for %s",
      pj->ji_qs.ji_jobid,
      namebuf);

    log_err(-1, __func__, log_buf);

#ifndef PBS_MOM
    unlock_ji_mutex(pj, __func__, "4", LOGLEVEL);
    free(pj->ji_mutex);
#endif

    free((char *)pj);

    close(fds);

    return(NULL);
    }

  /* read in working attributes */

  if (recov_attr(
        fds,
        pj,
        job_attr_def,
        pj->ji_wattr,
        JOB_ATR_LAST,
        JOB_ATR_UNKN,
        TRUE) != 0) 
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "unable to recover %s (file is likely corrupted)", namebuf);

    log_err(-1, __func__, log_buf);

#ifndef PBS_MOM
    unlock_ji_mutex(pj, __func__, "5", LOGLEVEL);
    job_free(pj, FALSE);
#else
    mom_job_free(pj);
#endif


    close(fds);

    return(NULL);
    }

#ifndef PBS_MOM
  /* Comment out the mother superior tracking. Will be debugged later 
  if (pj->ji_wattr[JOB_ATR_exec_host].at_val.at_str != NULL)
    {*/
    /* add job to the mother superior list for its node */
/*    char *ms = strdup(pj->ji_wattr[JOB_ATR_exec_host].at_val.at_str);
    char *end = strchr(ms, '/');

    if (end != NULL)
      *end = '\0';

    if ((end = strchr(ms, '+')) != NULL)
      *end = '\0';

    add_to_ms_list(ms, pj);

    free(ms);
    }*/
#endif

#ifdef PBS_MOM
  /* read in tm sockets and ips */

  if (recov_tmsock(fds, pj) != 0)
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
        "warning: tmsockets not recovered from %s (written by an older pbs_mom?)",
        namebuf);

    log_err(-1, __func__, log_buf);
    }

#else /* not PBS_MOM */

  if (strchr(pj->ji_qs.ji_jobid, '[') != NULL)
    {
    /* job is part of an array.  We need to put a link back to the server
    job array struct for this array. We also have to link this job into
    the linked list of jobs belonging to the array. */

    array_get_parent_id(pj->ji_qs.ji_jobid, parent_id);
    pa = get_array(parent_id);
    if (pa == NULL)
      {   
      job_abt(&pj, (char *)"Array job missing array struct, aborting job");
      close(fds);
      return NULL;
      }

    strcpy(pj->ji_arraystructid, parent_id);

    if (strcmp(parent_id, pj->ji_qs.ji_jobid) == 0)
      {
      pj->ji_is_array_template = TRUE;
      }
    else
      {
      pa->job_ids[(int)pj->ji_wattr[JOB_ATR_job_array_id].at_val.at_long] = strdup(pj->ji_qs.ji_jobid);
      pa->jobs_recovered++;

      /* This is a bit of a kluge, but for some reason if an array job was 
         on hold when the server went down the ji_wattr[JOB_ATR_hold].at_val.at_long
         value is 0 on recovery even though pj->ji_qs.ji_state is JOB_STATE_HELD and
         the substate is JOB_SUBSTATE_HELD
      */
      if ((pj->ji_qs.ji_state == JOB_STATE_HELD) &&
          (pj->ji_qs.ji_substate == JOB_SUBSTATE_HELD))
        {
        pj->ji_wattr[JOB_ATR_hold].at_val.at_long = HOLD_l;
        pj->ji_wattr[JOB_ATR_hold].at_flags = ATR_VFLAG_SET;
        }
      }

    if (pa != NULL)
      {
      unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
      }
    }

#endif

  close(fds);

  pj->ji_commit_done = 1;

  /* all done recovering the job */

  job_save(pj, SAVEJOB_FULL, 0);

  return(pj);
  }  /* END job_recov() */
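
job_recov() reduces to five steps, and every failure path above releases the job structure and closes the file descriptor. A condensed skeleton of the happy path (error handling elided), using the same calls:

  fds = open(namebuf, O_RDONLY, 0);                            /* 1. open the save file */

  read_ac_socket(fds, (char *)&pj->ji_qs, sizeof(pj->ji_qs));  /* 2. quick-save area    */

  if (pj->ji_qs.qs_version != PBS_QS_VERSION)
    job_qs_upgrade(pj, fds, namebuf, pj->ji_qs.qs_version);    /* 3. convert old format */

  recov_attr(fds, pj, job_attr_def, pj->ji_wattr,
             JOB_ATR_LAST, JOB_ATR_UNKN, TRUE);                /* 4. working attributes */

  close(fds);
  job_save(pj, SAVEJOB_FULL, 0);                               /* 5. re-save as current */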
int  req_locatejob(

  struct batch_request *preq)

  {
  int     rc = PBSE_NONE;
  char   *at;
  int     i;
  job    *pjob;
  char   *location = NULL;

  if ((at = strchr(preq->rq_ind.rq_locate, (int)'@')))
    *at = '\0';  /* strip off @server_name */

  pjob = svr_find_job(preq->rq_ind.rq_locate, FALSE);

  if (pjob)
    {
    unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL);

    location = server_name;
    }
  else
    {
    for (i = 0; i < server.sv_tracksize; i++)
      {
      if ((server.sv_track + i)->tk_mtime &&
          !strcmp((server.sv_track + i)->tk_jobid, preq->rq_ind.rq_locate))
        {
        location = (server.sv_track + i)->tk_location;
        break;
        }
      }
    }

  if (location != NULL)
    {
    preq->rq_reply.brp_code = 0;
    preq->rq_reply.brp_auxcode = 0;
    preq->rq_reply.brp_choice = BATCH_REPLY_CHOICE_Locate;
    snprintf(preq->rq_reply.brp_un.brp_locate, sizeof(preq->rq_reply.brp_un.brp_locate), "%s", location);

    reply_send_svr(preq);
    }
  else
    {
    if (LOGLEVEL >= 7)
      {
      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        preq->rq_ind.rq_locate,
        "cannot find job in server tracking list");
      }

    rc = PBSE_UNKJOBID;
    req_reject(rc, 0, preq, NULL, NULL);
    }

  return rc;
  }  /* END req_locatejob() */
int relay_to_mom(

    job                   **pjob_ptr,
    struct batch_request   *request, /* the request to send */
    void                  (*func)(struct work_task *))

{
    int             handle; /* a client style connection handle */
    int             rc;
    int             local_errno = 0;
    pbs_net_t       addr;
    unsigned short  port;
    job            *pjob = *pjob_ptr;
    char            jobid[PBS_MAXSVRJOBID + 1];
    char           *job_momname = NULL;

    struct pbsnode *node;
    char            log_buf[LOCAL_LOG_BUF_SIZE];
    std::string     node_name;

    if (pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str == NULL)
    {
        snprintf(log_buf, sizeof(log_buf),
                 "attempting to send a request to %s's mom but no exec_host list?",
                 pjob->ji_qs.ji_jobid);
        log_err(PBSE_BADSTATE, __func__, log_buf);

        return(PBSE_BADSTATE);
    }

    /* if MOM is down don't try to connect */
    addr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr;
    port = pjob->ji_qs.ji_un.ji_exect.ji_momport;
    job_momname = strdup(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str);
    if (job_momname == NULL)
        return PBSE_MEM_MALLOC;

    if ((node = tfind_addr(addr, port, job_momname)) == NULL)
    {
        free(job_momname);
        return(PBSE_NORELYMOM);
    }
    free(job_momname);

    if ((node != NULL) &&
            ((node->nd_state & INUSE_NOT_READY)||
             (node->nd_power_state != POWER_STATE_RUNNING)))
    {
        node->unlock_node(__func__, "no relay mom", LOGLEVEL);
        return(PBSE_NORELYMOM);
    }

    if (LOGLEVEL >= 7)
    {
        char *tmp = netaddr_pbs_net_t(pjob->ji_qs.ji_un.ji_exect.ji_momaddr);
        sprintf(log_buf, "momaddr=%s",tmp);

        log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf);

        free(tmp);
    }

    node_name = node->get_name();

    node->unlock_node(__func__, "after svr_connect", LOGLEVEL);

    strcpy(jobid, pjob->ji_qs.ji_jobid);
    unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);
    *pjob_ptr = NULL;

    handle = svr_connect(addr, port, &local_errno, NULL, NULL);

    if (handle < 0)
    {
        update_failure_counts(node_name.c_str(), -1);
        log_event(PBSEVENT_ERROR,PBS_EVENTCLASS_REQUEST,"",msg_norelytomom);

        return(PBSE_NORELYMOM);
    }

    request->rq_orgconn = request->rq_conn; /* save client socket */

    rc = issue_Drequest(handle, request, true);

    if (request->rq_reply.brp_code == PBSE_TIMEOUT)
        update_failure_counts(node_name.c_str(), PBSE_TIMEOUT);
    else
        update_failure_counts(node_name.c_str(), 0);

    *pjob_ptr = svr_find_job(jobid, TRUE);

    return(rc);
}  /* END relay_to_mom() */
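
Compared with the earlier relay_to_mom(), this revision clears *pjob_ptr before the blocking call, so a caller inspecting it mid-flight can never dereference a stale job, and it brackets the request with failure-count bookkeeping. The changed core, condensed:

    unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);
    *pjob_ptr = NULL;                       /* callers see "no job" during I/O */

    rc = issue_Drequest(handle, request, true);

    *pjob_ptr = svr_find_job(jobid, TRUE);  /* re-acquire; may be NULL */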
void process_hold_reply(

  batch_request *preq)

  {
  job                  *pjob;
  pbs_attribute         temphold;

  int                   newstate;
  int                   newsub;
  int                   rc;
  char                 *pset;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];

  /* preq was handled previously */
  if (preq == NULL)
    return;

  preq->rq_conn = preq->rq_orgconn;  /* restore client socket */

  if ((pjob = svr_find_job(preq->rq_ind.rq_hold.rq_orig.rq_objname, FALSE)) == NULL)
    {
    log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB,
              preq->rq_ind.rq_hold.rq_orig.rq_objname,
              msg_postmomnojob);
    req_reject(PBSE_UNKJOBID, 0, preq, NULL, msg_postmomnojob);

    return;
    }
  else if (preq->rq_reply.brp_code != 0)
    {

    rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &pset, &temphold);

    if (rc == 0)
      {
      rc = job_attr_def[JOB_ATR_hold].at_set(&pjob->ji_wattr[JOB_ATR_hold],
           &temphold, DECR);
      }

    pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING;  /* reset it */

    pjob->ji_modified = 1;    /* indicate attributes changed */
    svr_evaljobstate(pjob, &newstate, &newsub, 0);
    svr_setjobstate(pjob, newstate, newsub, FALSE); /* saves job */

    if (preq->rq_reply.brp_code != PBSE_NOSUP)
      {
      sprintf(log_buf, msg_mombadhold, preq->rq_reply.brp_code);
      log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      req_reject(preq->rq_reply.brp_code, 0, preq, NULL, log_buf);
      }
    else
      {
      reply_ack(preq);
      }
    }
  else
    {
    /* record that MOM has a checkpoint file */

    /* PBS_CHECKPOINT_MIGRATEABLE is defined as zero, so this code will
     * never fire. And if these flags are not set, start_exec will not try
     * to run the job from the checkpoint image file.
     */

    pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHECKPOINT_FILE;

    if (preq->rq_reply.brp_auxcode)  /* checkpoint can be moved */
      {
      pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHECKPOINT_FILE;
      pjob->ji_qs.ji_svrflags |=  JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_MIGRATEABLE;
      }

    pjob->ji_modified = 1;    /* indicate attributes changed     */

    svr_evaljobstate(pjob, &newstate, &newsub, 0);
    svr_setjobstate(pjob, newstate, newsub, FALSE); /* saves job */

    account_record(PBS_ACCT_CHKPNT, pjob, "Checkpointed and held"); /* note in accounting file */
    reply_ack(preq);
    }

  unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL);

  } /* END process_hold_reply() */
int req_releasearray(

  void *vp) /* I */

  {
  job                  *pjob;
  job_array            *pa;
  char                 *range;
  int                   rc;
  int                   index;
  struct batch_request *preq = (struct batch_request *)vp;

  pa = get_array(preq->rq_ind.rq_release.rq_objname);
  if (pa == NULL)
    {
    req_reject(PBSE_IVALREQ,0,preq,NULL,"Cannot find array");
    return(PBSE_NONE);
    }

  while (TRUE)
    {
    if (((index = first_job_index(pa)) == -1) ||
        (pa->job_ids[index] == NULL))
      {
      unlock_ai_mutex(pa, __func__, (char *)"1", LOGLEVEL);

      return(PBSE_NONE);
      }

    if ((pjob = svr_find_job(pa->job_ids[index], FALSE)) == NULL)
      {
      free(pa->job_ids[index]);
      pa->job_ids[index] = NULL;
      }
    else
      break;
    }

  if (svr_authorize_jobreq(preq, pjob) == -1)
    {
    req_reject(PBSE_PERM,0,preq,NULL,NULL);

    unlock_ai_mutex(pa, __func__, (char *)"2", LOGLEVEL);
    unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL);

    return(PBSE_NONE);
    }

  unlock_ji_mutex(pjob, __func__, (char *)"2", LOGLEVEL);

  range = preq->rq_extend;
  if ((range != NULL) &&
      (strstr(range,ARRAY_RANGE) != NULL))
    {
    /* parse the array range */
    if ((rc = release_array_range(pa,preq,range)) != 0)
      {
      unlock_ai_mutex(pa, __func__, (char *)"3", LOGLEVEL);

      req_reject(rc,0,preq,NULL,NULL);

      return(PBSE_NONE);
      }
    }
  else if ((rc = release_whole_array(pa,preq)) != 0)
    {
    unlock_ai_mutex(pa, __func__, (char *)"4", LOGLEVEL);

    req_reject(rc,0,preq,NULL,NULL);

    return(PBSE_NONE);
    }
  
  unlock_ai_mutex(pa, __func__, (char *)"5", LOGLEVEL);

  reply_ack(preq);

  return(PBSE_NONE);
  } /* END req_releasearray() */
Example #15
void purge_completed_jobs(

  struct batch_request *preq)  /* I */

  {
  job          *pjob;
  char         *time_str;
  time_t        purge_time = 0;
  int           iter;
  char          log_buf[LOCAL_LOG_BUF_SIZE];

  /* get the time to purge the jobs that completed before */
  time_str = preq->rq_extend;
  time_str += strlen(PURGECOMP);
  purge_time = strtol(time_str,NULL,10);
  
  /*
   * The clean-unreported capability is only for operators and managers.
   * Check if the request is authorized.
   */

  if ((preq->rq_perm & (ATR_DFLAG_OPRD|ATR_DFLAG_OPWR|
                    ATR_DFLAG_MGRD|ATR_DFLAG_MGWR)) == 0)
    {
    req_reject(PBSE_PERM,0,preq,NULL,
      "must have operator or manager privilege to use -c parameter");
    return;
    }

  if (LOGLEVEL >= 4)
    {
    sprintf(log_buf,"Received purge completed jobs command, purge time is %ld (%s)",
      (long)purge_time, preq->rq_extend);

    log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_REQUEST, __func__, log_buf);
    }
    
  reply_ack(preq);

  iter = -1;

  while ((pjob = next_job(&alljobs,&iter)) != NULL) 
    {
    if ((pjob->ji_qs.ji_substate == JOB_SUBSTATE_COMPLETE) &&
        (pjob->ji_wattr[JOB_ATR_comp_time].at_val.at_long <= purge_time) &&
        ((pjob->ji_wattr[JOB_ATR_reported].at_flags & ATR_VFLAG_SET) != 0) &&
        (pjob->ji_wattr[JOB_ATR_reported].at_val.at_long == 0))
      {
      if (LOGLEVEL >= 4)
        {
        sprintf(log_buf,"Reported job is COMPLETED (%ld), setting reported to TRUE",
          pjob->ji_wattr[JOB_ATR_comp_time].at_val.at_long);
        
        log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);
        }
      
      pjob->ji_wattr[JOB_ATR_reported].at_val.at_long = 1;
      pjob->ji_wattr[JOB_ATR_reported].at_flags = ATR_VFLAG_SET | ATR_VFLAG_MODIFY;
          
      job_save(pjob, SAVEJOB_FULL, 0); 
      }

    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
    }


  return;
  } /* END purge_completed_jobs() */
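
The purge time above is parsed out of the request extension, which is the PURGECOMP keyword immediately followed by an epoch time in seconds. A standalone sketch of that parsing; the literal given for PURGECOMP is an assumption for illustration.

#include <stdlib.h>
#include <string.h>
#include <time.h>

#define PURGECOMP "purgecomp" /* assumed literal, for illustration only */

static time_t parse_purge_time(const char *rq_extend)
  {
  if ((rq_extend == NULL) ||
      (strncmp(rq_extend, PURGECOMP, strlen(PURGECOMP)) != 0))
    return((time_t)0);

  return((time_t)strtol(rq_extend + strlen(PURGECOMP), NULL, 10));
  }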
Example #16
/*
 * delete_array_range()
 *
 * deletes a range from a specific array
 *
 * @param pa - the array whose jobs are deleted
 * @param range_str - the user-given range to delete 
 * @return - the number of jobs skipped, -1 if range error 
 */
int delete_array_range(

  job_array *pa,
  char      *range_str)

  {
  tlist_head          tl;
  array_request_node *rn;
  array_request_node *to_free;
  job                *pjob;
  char               *range;

  int                 i;
  int                 num_skipped = 0;
  int                 deleted;

  /* get just the numeric range specified; '=' should
   * always be there since we put it there in qdel */
  range = strchr(range_str,'=');

  if (range == NULL)
    return(-1); /* malformed range specification */

  range++; /* move past the '=' */

  CLEAR_HEAD(tl);
  if (parse_array_request(range,&tl) > 0)
    {
    /* don't delete jobs if range error */

    return(-1);
    }

  rn = (array_request_node*)GET_NEXT(tl);

  while (rn != NULL)
    {
    for (i = rn->start; i <= rn->end; i++)
      {
      /* don't stomp on other memory: bounds-check before indexing */
      if (i >= pa->ai_qs.array_size)
        continue;

      if (pa->job_ids[i] == NULL)
        continue;

      if ((pjob = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
        {
        free(pa->job_ids[i]);
        pa->job_ids[i] = NULL;
        }
      else
        {
        if (pjob->ji_qs.ji_state >= JOB_STATE_EXITING)
          {
          /* invalid state for request,  skip */
          unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
          continue;
          }

        pthread_mutex_unlock(pa->ai_mutex);
        deleted = attempt_delete(pjob);

        if (deleted == FALSE)
          {
          /* if the job was deleted, this mutex would be taken care of
           * elsewhere. When the delete fails, release it here */
          unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

          num_skipped++;
          }

        pthread_mutex_lock(pa->ai_mutex);
        }
      }

    to_free = rn;
    rn = (array_request_node*)GET_NEXT(rn->request_tokens_link);

    /* release mem */
    free(to_free);
    }

  return(num_skipped);
  }
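
delete_array_range() shares an idiom with req_releasearray() above and release_array_range() below: when svr_find_job() cannot find the id stored in an array slot, the entry is stale and is reaped on the spot. A hypothetical helper capturing that idiom with the calls already shown:

static job *find_array_slot_job(

  job_array *pa, /* M */
  int        i)  /* I */

  {
  job *pjob;

  if (pa->job_ids[i] == NULL)
    return(NULL);

  if ((pjob = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
    {
    /* stale entry: the job is gone, so reap the slot */
    free(pa->job_ids[i]);
    pa->job_ids[i] = NULL;
    }

  return(pjob); /* locked if non-NULL */
  } /* END find_array_slot_job() */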
Example #17
int execute_job_delete(

  job                  *pjob,            /* M */
  char                 *Msg,             /* I */
  struct batch_request *preq)            /* I */

  {
  struct work_task *pwtnew;

  int               rc;
  const char      *sigt = "SIGTERM";

  char              log_buf[LOCAL_LOG_BUF_SIZE];
  time_t            time_now = time(NULL);
  long              force_cancel = FALSE;
  long              array_compatible = FALSE;

  chk_job_req_permissions(&pjob,preq);

  if (pjob == NULL)
    {
    /* preq is rejected in chk_job_req_permissions here */
    return(-1);
    }

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  if (LOGLEVEL >= 10)
    log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_QUEUE, __func__, pjob->ji_qs.ji_jobid);

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /* see note in req_delete - not sure this is possible still,
     * but the deleted code is irrelevant now. I will leave this
     * part --dbeer */
    return(-1);
    }

  if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN1 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN2 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN3 )
    {
    /* If JOB_SUBSTATE_PRERUN is being sent to MOM, wait till she gets it
     * going and retry in one second. If JOB_SUBSTATE_RERUN, RERUN1, RERUN2
     * or RERUN3, the job is being requeued; wait until that finishes */

    static time_t  cycle_check_when = 0;
    static char    cycle_check_jid[PBS_MAXSVRJOBID + 1];

    if (cycle_check_when != 0)
      {
      if (!strcmp(pjob->ji_qs.ji_jobid, cycle_check_jid) &&
          (time_now - cycle_check_when > 10))
        {
        /* state not updated after 10 seconds */

        /* did the mom ever get it? delete it anyways... */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;

        goto jump;
        }

      if (time_now - cycle_check_when > 20)
        {
        /* give up after 20 seconds */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;
        }
      }    /* END if (cycle_check_when != 0) */

    if (cycle_check_when == 0)
      {
      /* new PRERUN job located */

      cycle_check_when = time_now;
      strcpy(cycle_check_jid, pjob->ji_qs.ji_jobid);
      }

    sprintf(log_buf, "job cannot be deleted, state=PRERUN, requeuing delete request");

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    pwtnew = set_task(WORK_Timed,time_now + 1,post_delete_route,preq,FALSE);

    if (pwtnew == NULL)
      {
      req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

      return(-1);
      }
    else
      {
      return(ROUTE_DELETE);
      }
    }  /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */

jump:

  /*
   * Log delete and if requesting client is not job owner, send mail.
   */

  sprintf(log_buf, "requestor=%s@%s", preq->rq_user, preq->rq_host);


  /* NOTE:  should annotate accounting record with extend message (NYI) */
  account_record(PBS_ACCT_DEL, pjob, log_buf);

  sprintf(log_buf, msg_manager, msg_deletejob, preq->rq_user, preq->rq_host);

  log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

  /* NOTE:  should incorporate job delete message */

  if (Msg != NULL)
    {
    /* have text message in request extension, add it */
    int len = strlen(log_buf);
    snprintf(log_buf + len, sizeof(log_buf) - len, "\n%s", Msg);
    }

  if ((svr_chk_owner(preq, pjob) != 0) &&
      (pjob->ji_has_delete_nanny == FALSE))
    {
    /* only send email if the owner did not delete the job and a delete
       has not previously been attempted */

    svr_mailowner(pjob, MAIL_DEL, MAIL_FORCE, log_buf);
    /*
     * If we sent mail and already sent the extra message
     * then reset message so we don't trigger a redundant email
     * in job_abt()
    */

    if (Msg != NULL)
      {
      Msg = NULL;
      }
    }

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, change restart comment if failed */

    change_restart_comment_if_needed(pjob);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * setup a nanny task to make sure the job is actually deleted (see the
     * comments at job_delete_nanny()).
     */

    if (pjob->ji_has_delete_nanny == TRUE)
      {
      req_reject(PBSE_IVALREQ, 0, preq, NULL, "job cancel in progress");

      return(-1);
      }

    apply_job_delete_nanny(pjob, time_now + 60);

    /*
     * Send signal request to MOM.  The server will automagically
     * pick up and "finish" off the client request when MOM replies.
     */
    get_batch_request_id(preq);

    if ((rc = issue_signal(&pjob, sigt, post_delete_mom1, strdup(preq->rq_id))))
      {
      /* can't send to MOM */

      req_reject(rc, 0, preq, NULL, NULL);
      }

    /* normally will ack reply when mom responds */
    if (pjob != NULL)
      {
      sprintf(log_buf, msg_delrunjobsig, sigt);
      log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);
      }
    else
      job_mutex.set_lock_on_exit(false);

    return(-1);
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* make a cleanup task if set */
  get_svr_attr_l(SRV_ATR_JobForceCancelTime, &force_cancel);
  if (force_cancel > 0)
    {
    char *dup_jobid = strdup(pjob->ji_qs.ji_jobid);
 
    set_task(WORK_Timed, time_now + force_cancel, ensure_deleted, dup_jobid, FALSE);    
    }

  /* if configured, and this job didn't have a slot limit hold, free a job
   * held with the slot limit hold */
  get_svr_attr_l(SRV_ATR_MoabArrayCompatible, &array_compatible);
  if ((array_compatible != FALSE) &&
      ((pjob->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) == FALSE))
    {
    if ((pjob->ji_arraystructid[0] != '\0') &&
        (pjob->ji_is_array_template == FALSE))
      {
      int        i;
      int        newstate;
      int        newsub;
      job       *tmp;
      job_array *pa = get_jobs_array(&pjob);

      if (pjob == NULL)
        {
        job_mutex.set_lock_on_exit(false);
        return(-1);
        }

      for (i = 0; i < pa->ai_qs.array_size; i++)
        {
        if (pa->job_ids[i] == NULL)
          continue;

        if (!strcmp(pa->job_ids[i], pjob->ji_qs.ji_jobid))
          continue;

        if ((tmp = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
          {
          free(pa->job_ids[i]);
          pa->job_ids[i] = NULL;
          }
        else
          {
          if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l)
            {
            tmp->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;
            
            if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
              {
              tmp->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
              }
            
            svr_evaljobstate(tmp, &newstate, &newsub, 1);
            svr_setjobstate(tmp, newstate, newsub, FALSE);
            job_save(tmp, SAVEJOB_FULL, 0);

            unlock_ji_mutex(tmp, __func__, "5", LOGLEVEL);
            
            break;
            }

          unlock_ji_mutex(tmp, __func__, "6", LOGLEVEL);
          }
        }

      unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
      }
    } /* END MoabArrayCompatible check */

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, do end job processing */
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE);

    /* force new connection */
    pjob->ji_momhandle = -1;

    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "calling on_job_exit from %s", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      }

    set_task(WORK_Immed, 0, on_job_exit_task, strdup(pjob->ji_qs.ji_jobid), FALSE);
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in file, should remove them */
    remove_stagein(&pjob);

    job_mutex.set_lock_on_exit(false);

    if (pjob != NULL)
      job_abt(&pjob, Msg);
    }
  else
    {
    /*
     * the job is not transiting (though it may have been) and
     * is not running, so put it into a complete state.
     */
    struct pbs_queue *pque;
    int  KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE, FALSE);

    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      pque->qu_numcompleted++;

      unlock_queue(pque, __func__, NULL, LOGLEVEL);
      
      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "calling on_job_exit from %s", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }
    
      pthread_mutex_lock(server.sv_attr_mutex);
      KeepSeconds = attr_ifelse_long(
                    &pque->qu_attr[QE_ATR_KeepCompleted],
                    &server.sv_attr[SRV_ATR_KeepCompleted],
                    0);
      pthread_mutex_unlock(server.sv_attr_mutex);
      }
    else
      KeepSeconds = 0;

    if (pjob != NULL)
      {
      set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit_task, strdup(pjob->ji_qs.ji_jobid), FALSE);
      }
    else
      job_mutex.set_lock_on_exit(false);
    }  /* END else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0) */

  return(PBSE_NONE);
  } /* END execute_job_delete() */
Example #18
int release_array_range(

  job_array            *pa,
  struct batch_request *preq,
  char                 *range_str)

  {
  tlist_head tl;
  int i;
  int rc;
  job                *pjob;

  array_request_node *rn;
  array_request_node *to_free;
  
  char *range = strchr(range_str,'=');
  if (range == NULL)
    return(PBSE_IVALREQ);

  range++; /* move past the '=' */
  
  CLEAR_HEAD(tl);
  
  if (parse_array_request(range,&tl) > 0)
    {
    /* don't hold the jobs if range error */
    
    return(PBSE_IVALREQ);
    }
  
  /* hold just that range from the array */
  rn = (array_request_node*)GET_NEXT(tl);
  
  while (rn != NULL)
    {
    for (i = rn->start; i <= rn->end; i++)
      {
      /* don't stomp on other memory */
      if (i >= pa->ai_qs.array_size)
        continue;

      if (pa->job_ids[i] == NULL)
        continue;

      if ((pjob = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
        {
        free(pa->job_ids[i]);
        pa->job_ids[i] = NULL;
        }
      else
        {
        if ((rc = release_job(preq,pjob)))
          {
          unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
          return(rc);
          }
        unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
        }
      }
    
    /* release mem */
    to_free = rn;
    rn = (array_request_node*)GET_NEXT(rn->request_tokens_link);
    free(to_free);
    }

  return(PBSE_NONE);
  } /* END release_array_range() */
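
The loop above has a shape worth noting: bounds-check each index against the array size, skip dead slots, act on the live jobs, then free the node just walked. A self-contained sketch of that walk, with range_node standing in for array_request_node:

#include <stdio.h>
#include <stdlib.h>

struct range_node
  {
  int                start;
  int                end;
  struct range_node *next;
  };

static void visit_ranges(

  struct range_node *rn,
  int                array_size)

  {
  struct range_node *to_free;
  int                i;

  while (rn != NULL)
    {
    for (i = rn->start; i <= rn->end; i++)
      {
      /* don't stomp on other memory */
      if (i >= array_size)
        continue;

      printf("would release sub-job %d\n", i);
      }

    /* release mem */
    to_free = rn;
    rn = rn->next;
    free(to_free);
    }
  }

int main(void)

  {
  struct range_node *rn = (struct range_node *)malloc(sizeof(struct range_node));

  if (rn == NULL)
    return(1);

  rn->start = 3;
  rn->end   = 12;
  rn->next  = NULL;

  visit_ranges(rn, 10); /* indices 10..12 are skipped by the bounds guard */
  return(0);
  }
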
int local_move(

  job                  *pjob,
  int                  *my_err,
  struct batch_request *req)

  {
  pbs_queue *dest_que = NULL;
  char      *destination = pjob->ji_qs.ji_destin;
  int        mtype;
  char       log_buf[LOCAL_LOG_BUF_SIZE];
  char       job_id[PBS_MAXSVRJOBID+1];
  int        rc;
  bool       reservation = false;

  /* Sometimes multiple threads try to route the same job. Protect against this
   * by making sure that the destination queue and the current queue are different.
   * If they are the same, consider the move already done correctly. */
  if (!strcmp(pjob->ji_qs.ji_queue, pjob->ji_qs.ji_destin))
    return(PBSE_NONE);

  if (LOGLEVEL >= 8)
    {
    sprintf(log_buf, "%s", pjob->ji_qs.ji_jobid);
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
    }

  /*
   * if being moved at specific request of administrator, then
   * checks on queue availability, etc. are skipped;
   * otherwise all checks are enforced.
   */
  if (req == 0)
    {
    mtype = MOVE_TYPE_Route; /* route */
    }
  else if (req->rq_perm & (ATR_DFLAG_MGRD | ATR_DFLAG_MGWR))
    {
    mtype = MOVE_TYPE_MgrMv; /* privileged move */
    }
  else
    {
    mtype = MOVE_TYPE_Move; /* non-privileged move */
    }

  strcpy(job_id, pjob->ji_qs.ji_jobid);
  unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);

  dest_que = find_queuebyname(destination);
  if (dest_que == NULL)
    {
    /* this should never happen */
    sprintf(log_buf, "queue %s does not exist\n", pjob->ji_qs.ji_queue);
    log_err(-1, __func__, log_buf);

    *my_err = PBSE_UNKQUE;
    return(-1);
    }

  mutex_mgr dest_que_mutex = mutex_mgr(dest_que->qu_mutex, true);
  if ((pjob = svr_find_job(job_id, TRUE)) == NULL)
    {
    /* job disappeared while locking queue */
    return(PBSE_JOB_RECYCLED);
    }

  /* check the destination */
  if ((*my_err = svr_chkque(pjob, dest_que, get_variable(pjob, pbs_o_host), mtype, NULL)))
    {
    /* should routing to this queue be retried? */
    return(should_retry_route(*my_err));
    }

  reservation = have_reservation(pjob, dest_que);
  /* dequeue job from present queue, update destination and */
  /* queue_rank for new queue and enqueue into destination  */
  dest_que_mutex.unlock();
  rc = svr_dequejob(pjob, FALSE); 
  if (rc)
    return(rc);

  snprintf(pjob->ji_qs.ji_queue, sizeof(pjob->ji_qs.ji_queue), "%s", destination);

  pjob->ji_wattr[JOB_ATR_qrank].at_val.at_long = ++queue_rank;
    
  if ((*my_err = svr_enquejob(pjob, FALSE, NULL, reservation, false)) == PBSE_JOB_RECYCLED)
    return(-1);

  if (*my_err != PBSE_NONE)
    {
    return(-1); /* should never ever get here */
    }

  if (pjob != NULL)
    {
    pjob->ji_lastdest = 0; /* reset in case of another route */
    
    job_save(pjob, SAVEJOB_FULL, 0);
    }

  return(PBSE_NONE);
  }  /* END local_move() */
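
local_move() copies the job id, drops the job mutex before locking the destination queue, and then re-finds the job by id, because the job can be recycled while it is unlocked. A compilable sketch of that save-id / unlock / relock / re-find discipline; fake_job, fake_queue, and find_job_sketch() are illustrative stand-ins, not the TORQUE API.

#include <pthread.h>
#include <string.h>

#define MAXID 64

struct fake_job
  {
  char            id[MAXID];
  pthread_mutex_t mtx;
  };

struct fake_queue
  {
  pthread_mutex_t mtx;
  };

static struct fake_job the_job = { "42.server", PTHREAD_MUTEX_INITIALIZER };

/* stand-in for svr_find_job(): returns the job locked, or NULL if gone */
static struct fake_job *find_job_sketch(

  const char *id)

  {
  if (strcmp(the_job.id, id) != 0)
    return(NULL);

  pthread_mutex_lock(&the_job.mtx);
  return(&the_job);
  }

static int move_job_sketch(

  struct fake_job   *pjob,
  struct fake_queue *dest)

  {
  char job_id[MAXID];

  /* remember the id, then drop the job lock before taking the queue lock */
  strcpy(job_id, pjob->id);
  pthread_mutex_unlock(&pjob->mtx);

  pthread_mutex_lock(&dest->mtx);

  /* the job may have been recycled while it was unlocked: re-find it */
  if ((pjob = find_job_sketch(job_id)) == NULL)
    {
    pthread_mutex_unlock(&dest->mtx);
    return(-1);
    }

  /* ... move the job with both locks held ... */

  pthread_mutex_unlock(&dest->mtx);
  pthread_mutex_unlock(&pjob->mtx);
  return(0);
  }

int main(void)

  {
  struct fake_queue dest = { PTHREAD_MUTEX_INITIALIZER };

  pthread_mutex_lock(&the_job.mtx); /* caller holds the job lock on entry */
  return(move_job_sketch(&the_job, &dest));
  }
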
Example #20
int modify_array_range(

  job_array *pa,              /* I/O */
  char      *range,           /* I */
  svrattrl  *plist,           /* I */
  struct batch_request *preq, /* I */
  int        checkpoint_req)  /* I */

  {
  char                log_buf[LOCAL_LOG_BUF_SIZE];
  tlist_head          tl;
  int                 i;
  int                 rc;
  int                 mom_relay = 0;
  job                *pjob;

  array_request_node *rn;
  array_request_node *to_free;
  
  CLEAR_HEAD(tl);
  
  if (parse_array_request(range,&tl) > 0)
    {
    /* don't modify the jobs if there's a range error */
    
    return(FAILURE);
    }
  else 
    {
    /* modify just that range from the array */
    rn = (array_request_node*)GET_NEXT(tl);
    
    while (rn != NULL)
      {
      for (i = rn->start; i <= rn->end; i++)
        {
        if ((i >= pa->ai_qs.array_size) ||
            (pa->job_ids[i] == NULL))
          continue;

        if ((pjob = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
          {
          free(pa->job_ids[i]);
          pa->job_ids[i] = NULL;
          }
        else
          {
          pthread_mutex_unlock(pa->ai_mutex);
          rc = modify_job((void **)&pjob, plist, preq, checkpoint_req, NO_MOM_RELAY);
          pa = get_jobs_array(&pjob);
          
          if (pjob != NULL)
            {
            if (rc == PBSE_RELAYED_TO_MOM)
              {
              struct batch_request *array_req = NULL;
              
              /* We told modify_job not to call relay_to_mom so we need to contact the mom */
              if ((rc = copy_batchrequest(&array_req, preq, 0, i)) != PBSE_NONE)
                {
                return(rc);
                }
              
              preq->rq_refcount++;
              if (mom_relay == 0)
                {
                preq->rq_refcount++;
                }
              mom_relay++;
              
              /* The array_req is freed in relay_to_mom (failure)
               * or in issue_Drequest (success) */
              
              if ((rc = relay_to_mom(&pjob, array_req, NULL)))
                {
                snprintf(log_buf,sizeof(log_buf),
                  "Unable to relay information to mom for job '%s'\n",
                  pjob->ji_qs.ji_jobid);
                log_err(rc, __func__, log_buf);
                
                unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
                
                return(rc); /* unable to get to MOM */
                }
              else
                {
                unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
                post_modify_arrayreq(array_req);
                }
              }
            else
              unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
            }
          else
            pa->job_ids[i] = NULL;

          }
        }
      
      /* release mem */
      to_free = rn;
      rn = (array_request_node*)GET_NEXT(rn->request_tokens_link);
      free(to_free);
      }
    }

  if (mom_relay)
    {
    preq->rq_refcount--;
    if (preq->rq_refcount == 0)
      {
      free_br(preq);
      }
    return(PBSE_RELAYED_TO_MOM);
    }

  return(PBSE_NONE);
  } /* END modify_array_range() */
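
modify_array_range() keeps preq alive across the relays by reference counting: one reference for the batch as a whole, one more per in-flight relay, and the request is freed only when the count drains to zero. A minimal sketch of that bookkeeping with a stand-in request type:

#include <stdio.h>
#include <stdlib.h>

struct fake_request
  {
  int refcount;
  };

static void release_ref(

  struct fake_request *req)

  {
  req->refcount--;

  if (req->refcount == 0)
    {
    printf("freeing request\n");
    free(req);
    }
  }

int main(void)

  {
  struct fake_request *req = (struct fake_request *)calloc(1, sizeof(struct fake_request));
  int                  relays = 3;
  int                  i;

  if (req == NULL)
    return(1);

  req->refcount = 1; /* the batch as a whole holds one reference */

  for (i = 0; i < relays; i++)
    req->refcount++; /* one reference per in-flight relay */

  for (i = 0; i < relays; i++)
    release_ref(req); /* each relay completion drops its reference */

  release_ref(req);   /* batch finished: the final drop frees */
  return(0);
  }

The extra batch reference is what keeps preq valid even if every relay completes before the loop finishes.
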
void finish_moving_processing(

  job                  *pjob,
  struct batch_request *req,
  int                   status)

  {
  char         log_buf[LOCAL_LOG_BUF_SIZE];

  int          newstate;
  int          newsub;

  if (req->rq_type != PBS_BATCH_MoveJob)
    {
    sprintf(log_buf, "bad request type %d\n", req->rq_type);

    log_err(-1, __func__, log_buf);

    return;
    }

  if (pjob == NULL)
    return;

  switch (status)
    {
    case LOCUTION_SUCCESS:

      /* purge server's job structure */
      if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn)
        remove_stagein(&pjob);

      if (pjob != NULL)
        {
        if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_COPIED)
          remove_checkpoint(&pjob);
        }

      snprintf(log_buf, sizeof(log_buf), "%s", msg_movejob);
      snprintf(log_buf + strlen(log_buf), sizeof(log_buf) - strlen(log_buf), msg_manager,
        req->rq_ind.rq_move.rq_destin, req->rq_user, req->rq_host);

      if (pjob != NULL)
        svr_job_purge(pjob);
    
      reply_ack(req);

      break;
  
    default:

      status = PBSE_ROUTEREJ;

      if (pjob != NULL)
        {
        /* force re-eval of job state out of Transit */
        svr_evaljobstate(*pjob, newstate, newsub, 1);
        svr_setjobstate(pjob, newstate, newsub, FALSE);
   
        unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
        }

      req_reject(status, 0, req, NULL, NULL);

      break;
    } /* END switch (status) */
  
  } /* END finish_moving_processing() */
Example #22
/**
 * update_array_values()
 *
 * Updates the internal bookkeeping values for a job array.
 * @param pa - the array to update
 * @param old_state - the job's state before the event
 * @param event - code for the event that just happened
 * @param job_id - the id of the job the event happened on
 * @param job_atr_hold - the job's hold attribute value
 * @param job_exit_status - the job's exit status
 */
void update_array_values(

  job_array            *pa,        /* I */
  int                   old_state, /* I */
  enum ArrayEventsEnum  event,     /* I */
  char                 *job_id,
  long                  job_atr_hold,
  int                   job_exit_status)

  {
  long  moab_compatible;

  switch (event)
    {
    case aeQueue:

      /* NYI, nothing needs to be done for this yet */

      break;

    case aeRun:

      if (old_state != JOB_STATE_RUNNING)
        {
        pa->ai_qs.jobs_running++;
        pa->ai_qs.num_started++;
        }

      break;

    case aeTerminate:

      if (old_state == JOB_STATE_RUNNING)
        {
        if (pa->ai_qs.jobs_running > 0)
          pa->ai_qs.jobs_running--;
        }

      if (job_exit_status == 0)
        {
        pa->ai_qs.num_successful++;
        pa->ai_qs.jobs_done++;
        }
      else
        {
        pa->ai_qs.num_failed++;
        pa->ai_qs.jobs_done++;
        }

      array_save(pa);

      /* update slot limit hold if necessary */
      if (get_svr_attr_l(SRV_ATR_MoabArrayCompatible, &moab_compatible) != PBSE_NONE)
        moab_compatible = FALSE;

      if (moab_compatible != FALSE)
        {
        /* only need to update if the job wasn't previously held */
        if ((job_atr_hold & HOLD_l) == FALSE)
          {
          int  i;
          int  newstate;
          int  newsub;
          job *pj;

          /* find the first held job and release its hold */
          for (i = 0; i < pa->ai_qs.array_size; i++)
            {
            if (pa->job_ids[i] == NULL)
              continue;

            if (!strcmp(pa->job_ids[i], job_id))
              continue;

            if ((pj = svr_find_job(pa->job_ids[i], TRUE)) == NULL)
              {
              free(pa->job_ids[i]);
              pa->job_ids[i] = NULL;
              }
            else
              {
              if (pj->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l)
                {
                pj->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;
                
                if (pj->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
                  {
                  pj->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
                  }
                
                svr_evaljobstate(pj, &newstate, &newsub, 1);
                svr_setjobstate(pj, newstate, newsub, FALSE);
                job_save(pj, SAVEJOB_FULL, 0);
                unlock_ji_mutex(pj, __func__, "1", LOGLEVEL);
                
                break;
                }

              unlock_ji_mutex(pj, __func__, "2", LOGLEVEL);
              }
            }
          }
        }

      break;

    default:

      /* log error? */

      break;
    }

  set_array_depend_holds(pa);
  array_save(pa);

  } /* END update_array_values() */
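
The hold release above is plain bit manipulation: clear HOLD_l from the hold value, and if no hold bits remain, clear ATR_VFLAG_SET so the attribute reads as unset. A tiny sketch; the constants are illustrative values, not the ones in TORQUE's headers.

#include <stdio.h>

#define HOLD_l        0x08 /* illustrative value */
#define ATR_VFLAG_SET 0x01 /* illustrative value */

int main(void)

  {
  long hold  = HOLD_l;        /* job is currently slot-limit held */
  int  flags = ATR_VFLAG_SET;

  hold &= ~HOLD_l;            /* release the slot-limit hold */

  if (hold == 0)
    flags &= ~ATR_VFLAG_SET;  /* no hold bits left: attribute is unset */

  printf("hold=0x%lx flags=0x%x\n", hold, flags);
  return(0);
  }
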
int process_request(

  struct tcp_chan *chan) /* file descriptor (socket) to get request */

  {
  int                   rc = PBSE_NONE;
  struct batch_request *request = NULL;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  long                  acl_enable = FALSE;
  long                  state = SV_STATE_DOWN;

  time_t                time_now = time(NULL);
  int                   free_request = TRUE;
  char                  tmpLine[MAXLINE];
  char                 *auth_err = NULL;
  enum conn_type        conn_active;
  unsigned short        conn_socktype;
  unsigned short        conn_authen;
  unsigned long         conn_addr;
  int                   sfds = chan->sock;

  pthread_mutex_lock(svr_conn[sfds].cn_mutex);
  conn_active = svr_conn[sfds].cn_active;
  conn_socktype = svr_conn[sfds].cn_socktype;
  conn_authen = svr_conn[sfds].cn_authen;
  conn_addr = svr_conn[sfds].cn_addr;
  svr_conn[sfds].cn_lasttime = time_now;
  pthread_mutex_unlock(svr_conn[sfds].cn_mutex);

  if ((request = alloc_br(0)) == NULL)
    {
    snprintf(tmpLine, sizeof(tmpLine),
        "cannot allocate memory for request from %lu",
        conn_addr);
    req_reject(PBSE_MEM_MALLOC, 0, request, NULL, tmpLine);
    free_request = FALSE;
    rc = PBSE_SYSTEM;
    goto process_request_cleanup;
    }

  request->rq_conn = sfds;

  /*
   * Read in the request and decode it to the internal request structure.
   */
  if (conn_active == FromClientDIS || conn_active == ToServerDIS)
    {
#ifdef ENABLE_UNIX_SOCKETS

    if ((conn_socktype & PBS_SOCK_UNIX) &&
        (conn_authen != PBS_NET_CONN_AUTHENTICATED))
      {
      /* get_creds interestingly always returns 0 */
      get_creds(sfds, conn_credent[sfds].username, conn_credent[sfds].hostname);
      }

#endif /* END ENABLE_UNIX_SOCKETS */
    rc = dis_request_read(chan, request);
    }
  else
    {
    char out[80];

    snprintf(tmpLine, MAXLINE, "request on invalid type of connection: %d, sock type: %d, from address %s", 
                conn_active,conn_socktype, netaddr_long(conn_addr, out));
    log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_REQUEST,
      "process_req", tmpLine);
    snprintf(tmpLine, sizeof(tmpLine),
        "request on invalid type of connection (%d) from %s",
        conn_active,
        netaddr_long(conn_addr, out));
    req_reject(PBSE_BADHOST, 0, request, NULL, tmpLine);
    free_request = FALSE;
    rc = PBSE_BADHOST;
    goto process_request_cleanup;
    }

  if (rc == -1)
    {
    /* FAILURE */
    /* premature end of file */
    rc = PBSE_PREMATURE_EOF;
    goto process_request_cleanup;
    }

  if ((rc == PBSE_SYSTEM) || (rc == PBSE_INTERNAL) || (rc == PBSE_SOCKET_CLOSE))
    {
    /* FAILURE */
    /* read error, likely cannot send reply so just disconnect */
    /* ??? not sure about this ??? */
    goto process_request_cleanup;
    }

  if (rc > 0)
    {
    /* FAILURE */

    /*
     * request didn't decode, either garbage or unknown
     * request type, in either case, return reject-reply
     */

    req_reject(rc, 0, request, NULL, "cannot decode message");
    free_request = FALSE;
    goto process_request_cleanup;
    }

  if (get_connecthost(sfds, request->rq_host, PBS_MAXHOSTNAME) != 0)
    {
    sprintf(log_buf, "%s: %lu",
      pbse_to_txt(PBSE_BADHOST),
      conn_addr);

    log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_REQUEST, "", log_buf);

    snprintf(tmpLine, sizeof(tmpLine),
        "cannot determine hostname for connection from %lu",
        conn_addr);

    req_reject(PBSE_BADHOST, 0, request, NULL, tmpLine);
    free_request = FALSE;
    rc = PBSE_BADHOST;
    goto process_request_cleanup;
    }

  if (LOGLEVEL >= 1)
    {
    sprintf(log_buf,
      msg_request,
      reqtype_to_txt(request->rq_type),
      request->rq_user,
      request->rq_host,
      sfds);

    log_event(PBSEVENT_DEBUG2, PBS_EVENTCLASS_REQUEST, "", log_buf);
    }

  /* is the request from a host acceptable to the server */
  if (conn_socktype & PBS_SOCK_UNIX)
    {
    strcpy(request->rq_host, server_name);
    }

  get_svr_attr_l(SRV_ATR_acl_host_enable, &acl_enable);
  if (acl_enable)
    {
    /* acl enabled, check it; always allow myself and nodes */
    struct array_strings *pas = NULL;
    struct pbsnode       *isanode;

    get_svr_attr_arst(SRV_ATR_acl_hosts, &pas);
    isanode = PGetNodeFromAddr(conn_addr);

    if ((isanode == NULL) &&
        (strcmp(server_host, request->rq_host) != 0) &&
        (acl_check_my_array_string(pas, request->rq_host, ACL_Host) == 0))
      {
      char tmpLine[MAXLINE];
      snprintf(tmpLine, sizeof(tmpLine), "request not authorized from host %s",
               request->rq_host);

      req_reject(PBSE_BADHOST, 0, request, NULL, tmpLine);
      free_request = FALSE;
      rc = PBSE_BADHOST;
      goto process_request_cleanup;
      }

    if (isanode != NULL)
      unlock_node(isanode, "process_request", NULL, LOGLEVEL);
    }

  /*
   * determine source (user client or another server) of request.
   * set the permissions granted to the client
   */

  if (conn_authen == PBS_NET_CONN_FROM_PRIVIL)
    {
    /* request came from another server */

    request->rq_fromsvr = 1;

    request->rq_perm =
      ATR_DFLAG_USRD | ATR_DFLAG_USWR |
      ATR_DFLAG_OPRD | ATR_DFLAG_OPWR |
      ATR_DFLAG_MGRD | ATR_DFLAG_MGWR |
      ATR_DFLAG_SvWR;
    }
  else
    {
    /* request not from another server */
    conn_credent[sfds].timestamp = time_now;

    request->rq_fromsvr = 0;

    /*
     * Client must be authenticated by an Authenticate User Request, if not,
     * reject request and close connection.  -- The following is retained for
     * compat with old cmds -- The exception to this is of course the Connect
     * Request which cannot have been authenticated, because it contains the
     * needed ticket; so trap it here.  Of course, there is no prior
     * authentication on the Authenticate User request either, but it comes
     * over a reserved port and appears from another server, hence is
     * automatically granted authentication.
     *
     * The above is only true with inet sockets.  With unix domain sockets, the
     * user creds were read before the first dis_request_read call above.
     * We automatically granted authentication because we can trust the socket
     * creds.  Authorization is still granted in svr_get_privilege below
     */

    if (request->rq_type == PBS_BATCH_Connect)
      {
      req_connect(request);

      if (conn_socktype == PBS_SOCK_INET)
        {
        rc = PBSE_IVALREQ;
        req_reject(rc, 0, request, NULL, NULL);
        free_request = FALSE;
        goto process_request_cleanup;
        }

      }

    if (conn_socktype & PBS_SOCK_UNIX)
      {
      pthread_mutex_lock(svr_conn[sfds].cn_mutex);
      svr_conn[sfds].cn_authen = PBS_NET_CONN_AUTHENTICATED;
      pthread_mutex_unlock(svr_conn[sfds].cn_mutex);
      }

    if (ENABLE_TRUSTED_AUTH == TRUE )
      rc = PBSE_NONE;  /* bypass the authentication of the user--trust the client completely */
    else if (munge_on)
      {
      /* If munge_on is true we will validate the connection now */
      if (request->rq_type == PBS_BATCH_AltAuthenUser)
        {
        rc = req_altauthenuser(request);
        free_request = FALSE;
        goto process_request_cleanup;
        }
      else
        {
        rc = authenticate_user(request, &conn_credent[sfds], &auth_err);
        }
      }
    else if (conn_authen != PBS_NET_CONN_AUTHENTICATED)
      /* skip checking user if we did not get an authenticated credential */
      rc = PBSE_BADCRED;
    else
      rc = authenticate_user(request, &conn_credent[sfds], &auth_err);

    if (rc != 0)
      {
      req_reject(rc, 0, request, NULL, auth_err);
      if (auth_err != NULL)
        free(auth_err);
      free_request = FALSE;
      goto process_request_cleanup;
      }

    /*
     * pbs_mom and checkpoint restart scripts both need the authority to do
     * alters and releases on checkpointable jobs.  Allow manager permission
     * for root on the jobs execution node.
     */
     
    if (((request->rq_type == PBS_BATCH_ModifyJob) ||
        (request->rq_type == PBS_BATCH_ReleaseJob)) &&
        (strcmp(request->rq_user, PBS_DEFAULT_ADMIN) == 0))
      {
      job *pjob;
      char *dptr;
      int skip = FALSE;
      char short_host[PBS_MAXHOSTNAME+1];

      /* make short host name */

      strcpy(short_host, request->rq_host);
      if ((dptr = strchr(short_host, '.')) != NULL)
        {
        *dptr = '\0';
        }
      
      if ((pjob = svr_find_job(request->rq_ind.rq_modify.rq_objname, FALSE)) != (job *)0)
        {
        if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
          {

          if ((pjob->ji_wattr[JOB_ATR_checkpoint].at_flags & ATR_VFLAG_SET) &&
              ((csv_find_string(pjob->ji_wattr[JOB_ATR_checkpoint].at_val.at_str, "s") != NULL) ||
               (csv_find_string(pjob->ji_wattr[JOB_ATR_checkpoint].at_val.at_str, "c") != NULL) ||
               (csv_find_string(pjob->ji_wattr[JOB_ATR_checkpoint].at_val.at_str, "enabled") != NULL)) &&
              (strstr(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, short_host) != NULL))
            {
            request->rq_perm = svr_get_privilege(request->rq_user, server_host);
            skip = TRUE;
            }

          }
        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
        }
      
      if (!skip)
        {
        request->rq_perm = svr_get_privilege(request->rq_user, request->rq_host);
        }
      }
    else
      {
      request->rq_perm = svr_get_privilege(request->rq_user, request->rq_host);
      }
    }  /* END else (conn_authen == PBS_NET_CONN_FROM_PRIVIL) */

  /* if server shutting down, disallow new jobs and new running */
  get_svr_attr_l(SRV_ATR_State, &state);

  if (state > SV_STATE_RUN)
    {
    switch (request->rq_type)
      {
      case PBS_BATCH_AsyrunJob:
      case PBS_BATCH_JobCred:
      case PBS_BATCH_MoveJob:
      case PBS_BATCH_QueueJob:
      case PBS_BATCH_RunJob:
      case PBS_BATCH_StageIn:
      case PBS_BATCH_jobscript:

        req_reject(PBSE_SVRDOWN, 0, request, NULL, NULL);
        rc = PBSE_SVRDOWN;
        free_request = FALSE;
        goto process_request_cleanup;
        /*NOTREACHED*/

        break;
      }
    }

  /*
   * dispatch the request to the correct processing function.
   * The processing function must call reply_send() to free
   * the request structure.
   */

  rc = dispatch_request(sfds, request);

  return(rc);

process_request_cleanup:

  if (free_request == TRUE)
    free_br(request);

  return(rc);
  }  /* END process_request() */
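
process_request() funnels every failure path through one cleanup label, with free_request guarding the paths where req_reject() has already consumed the request. A compilable sketch of that single-exit pattern under stand-in types:

#include <stdlib.h>

struct fake_request
  {
  int type;
  };

static int handle_sketch(void)

  {
  int                  rc = 0;
  int                  free_request = 1;
  struct fake_request *request = (struct fake_request *)malloc(sizeof(struct fake_request));

  if (request == NULL)
    return(-1);

  if (0 /* pretend: the request failed to decode */)
    {
    /* assume a reject routine already consumed the request */
    free_request = 0;
    rc = -1;
    goto cleanup;
    }

  /* success: from here the dispatcher owns the request and frees it */
  return(rc);

cleanup:

  if (free_request)
    free(request);

  return(rc);
  }

int main(void)

  {
  return(handle_sketch());
  }
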
Example #24
void update_array_statuses(
    
  job_array *owned)

  {
  job_array      *pa;
  job            *pj;
  job            *pjob;
  int             i;
  int             iter = -1;
  unsigned int    running;
  unsigned int    queued;
  unsigned int    held;
  unsigned int    complete;
  char            log_buf[LOCAL_LOG_BUF_SIZE];

  while ((pa = next_array_check(&iter, owned)) != NULL)
    {
    running = 0;
    queued = 0;
    held = 0;
    complete = 0;
    
    for (i = 0; i < pa->ai_qs.array_size; i++)
      {
      if (pa->job_ids[i] != NULL)
        {
        if ((pj = svr_find_job(pa->job_ids[i], TRUE)) == NULL)
          {
          free(pa->job_ids[i]);
          pa->job_ids[i] = NULL;
          }
        else
          {
          if (pj->ji_qs.ji_state == JOB_STATE_RUNNING)
            {
            running++;
            }
          else if (pj->ji_qs.ji_state == JOB_STATE_QUEUED)
            {
            queued++;
            }
          else if (pj->ji_qs.ji_state == JOB_STATE_HELD)
            {
            held++;
            }
          else if (pj->ji_qs.ji_state == JOB_STATE_COMPLETE)
            {
            complete++;
            }
          
          unlock_ji_mutex(pj, __func__, "1", LOGLEVEL);
          }
        }
      }
     
    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "%s: unlocking ai_mutex", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
      }

    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
    
    if ((pjob = svr_find_job(pa->ai_qs.parent_id, TRUE)) != NULL)
      {
      if (running > 0)
        {
        svr_setjobstate(pjob, JOB_STATE_RUNNING, pjob->ji_qs.ji_substate, FALSE);
        }
      else if (held > 0 && queued == 0 && complete == 0)
        {
        svr_setjobstate(pjob, JOB_STATE_HELD, pjob->ji_qs.ji_substate, FALSE);
        }
      else if (complete > 0 && queued == 0 && held == 0)
        {
        svr_setjobstate(pjob, JOB_STATE_COMPLETE, pjob->ji_qs.ji_substate, FALSE);
        }
      else 
        {
        /* default to just calling the array queued */
        svr_setjobstate(pjob, JOB_STATE_QUEUED, pjob->ji_qs.ji_substate, FALSE);
        }

      unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
      }
      
    if (pa == owned)
      {
      lock_ai_mutex(pa, __func__, "1", LOGLEVEL);
      }
    }
  } /* END update_array_statuses() */
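
The roll-up above applies a fixed precedence: any running sub-job makes the array running, all-held and all-complete map to their own states, and everything else defaults to queued. A sketch of that decision order:

#include <stdio.h>

enum astate { A_QUEUED, A_RUNNING, A_HELD, A_COMPLETE };

static enum astate roll_up(

  unsigned int running,
  unsigned int queued,
  unsigned int held,
  unsigned int complete)

  {
  if (running > 0)
    return(A_RUNNING);

  if ((held > 0) && (queued == 0) && (complete == 0))
    return(A_HELD);

  if ((complete > 0) && (queued == 0) && (held == 0))
    return(A_COMPLETE);

  /* default to just calling the array queued */
  return(A_QUEUED);
  }

int main(void)

  {
  /* a mix of held, queued, and complete sub-jobs rolls up to queued */
  printf("state = %d\n", (int)roll_up(0, 2, 1, 3));
  return(0);
  }
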
void post_signal_req(

  batch_request *preq)

  {
  char                 *jobid;
  job                  *pjob;

  char                  log_buf[LOCAL_LOG_BUF_SIZE];

  /* request has been handled elsewhere */
  if (preq == NULL)
    return;

  preq->rq_conn = preq->rq_orgconn;  /* restore client socket */

  if (preq->rq_reply.brp_code)
    {
    log_event(
      PBSEVENT_DEBUG,
      PBS_EVENTCLASS_REQUEST,
      preq->rq_ind.rq_signal.rq_jid,
      pbse_to_txt(PBSE_MOMREJECT));

    errno = 0;

    req_reject(preq->rq_reply.brp_code, 0, preq, NULL, NULL);
    }
  else
    {
    if ((jobid = preq->rq_extra) == NULL)
      {
      log_err(ENOMEM, __func__, "Cannot allocate memory! FAILURE");
      return;
      }

    if ((pjob = svr_find_job(jobid, FALSE)) != NULL)
      {
      if (strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_SUSPEND) == 0)
        {
        if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_Suspend) == 0)
          {
          pjob->ji_qs.ji_svrflags |= JOB_SVFLG_Suspend;
          
          set_statechar(pjob);
          
          job_save(pjob, SAVEJOB_QUICK, 0);
          
          /* release resources allocated to suspended job - NORWAY */
          
          free_nodes(pjob);
          }
        }
      else if (strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_RESUME) == 0)
        {
        if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_Suspend)
          {
          /* re-allocate assigned node to resumed job - NORWAY */
          
          set_old_nodes(pjob);
          
          pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_Suspend;
          
          set_statechar(pjob);
          
          job_save(pjob, SAVEJOB_QUICK, 0);
          }
        }
    
      unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);
      }
    else
      {
      /* job is gone */
      snprintf(log_buf,sizeof(log_buf),
        "Cannot find job '%s', assuming success",
        jobid);
      log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, __func__, log_buf);
      }

    free(jobid);

    reply_ack(preq);
    }

  return;
  }  /* END post_signal_req() */
Example #26
int issue_signal(

  job        **pjob_ptr,
  const char  *signame, /* name of the signal to send */
  void       (*func)(struct batch_request *),
  void        *extra, /* extra parameter to be stored in sig request */
  char        *extend) /* Parameter to put in extended part of request */

  {
  int                   rc;
  job                  *pjob = *pjob_ptr;
  struct batch_request *newreq;
  char                  jobid[PBS_MAXSVRJOBID + 1];

  /* build up a Signal Job batch request */

  if ((newreq = alloc_br(PBS_BATCH_SignalJob)) == NULL)
    {
    /* FAILURE */

    return(PBSE_SYSTEM);
    }

  newreq->rq_extra = extra;
  newreq->rq_extend = extend;
  if (extend != NULL)
    {
    newreq->rq_extsz = strlen(extend);
    }

  strcpy(jobid, pjob->ji_qs.ji_jobid);
  strcpy(newreq->rq_ind.rq_signal.rq_jid, pjob->ji_qs.ji_jobid);

  snprintf(newreq->rq_ind.rq_signal.rq_signame, sizeof(newreq->rq_ind.rq_signal.rq_signame), "%s", signame);

  /* The newreq is freed in relay_to_mom (failure)
   * or in issue_Drequest (success) */
  rc = relay_to_mom(&pjob, newreq, NULL);

  if ((rc == PBSE_NONE) &&
      (pjob != NULL))
    {
    strcpy(jobid, pjob->ji_qs.ji_jobid);
    unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);
    func(newreq);

    *pjob_ptr = svr_find_job((char *)jobid, TRUE);
    }
  else if ((extend != NULL) && 
      (!strcmp(extend, RERUNFORCE)))
    {
    if (pjob == NULL)
      {
      *pjob_ptr = svr_find_job((char *)jobid, TRUE);
      pjob = *pjob_ptr;
      }
    /* The job state is normally set when the obit arrives. But since the 
       MOM is not responding we need to set the state here */

    if (pjob != NULL)
      {
      /* Rerunning job: if not checkpointed, clear "resources_used" and requeue the job */
      if ((pjob->ji_qs.ji_svrflags & (JOB_SVFLG_CHECKPOINT_FILE | JOB_SVFLG_CHECKPOINT_MIGRATEABLE)) == 0)
        {
        job_attr_def[JOB_ATR_resc_used].at_free(&pjob->ji_wattr[JOB_ATR_resc_used]);
        }
      else if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE)
        {
        /* non-migratable checkpoint (cray), leave there */
        /* and just requeue the job         */

        rel_resc(pjob);

        pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN;

        svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_QUEUED, FALSE);

        pjob->ji_momhandle = -1;

        unlock_ji_mutex(pjob, __func__, "8", LOGLEVEL);

        return(PBSE_SYSTEM);
        }

      rel_resc(pjob); /* free resc assigned to job */

      /* Now re-queue the job */
      pjob->ji_modified = 1; /* force full job save */

      pjob->ji_momhandle = -1;
      pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_StagedIn;

      svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_QUEUED, FALSE);
      unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);
      func(newreq);

      rc = PBSE_NONE;
      }
    else
      rc = PBSE_JOBNOTFOUND;
    }
  else
    {
    free_br(newreq);

    if (pjob == NULL)
      *pjob_ptr = NULL;
    }

  return(rc);
  }  /* END issue_signal() */
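
Note the ownership convention the comments spell out: once newreq is handed to relay_to_mom(), it is freed by the relay machinery on both success and failure, so the caller frees it only on the path where no relay was attempted. A minimal sketch of that convention with stand-in types:

#include <stdlib.h>

struct fake_request
  {
  int type;
  };

/* assume: the relay consumes the request on success AND on failure */
static int relay_sketch(

  struct fake_request *req)

  {
  free(req);
  return(0);
  }

int main(void)

  {
  struct fake_request *req = (struct fake_request *)malloc(sizeof(struct fake_request));

  if (req == NULL)
    return(1);

  if (0 /* pretend: we decide not to relay after all */)
    {
    free(req); /* only free on the never-relayed path */
    return(1);
    }

  return(relay_sketch(req)); /* ownership passes to the relay */
  }
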
Example #27
int req_stat_job(

  struct batch_request *preq)  /* ptr to the decoded request */

  {
  struct stat_cntl      cntl; /* see svrfunc.h  */
  char                 *name;
  job                  *pjob = NULL;
  pbs_queue            *pque = NULL;
  int                   rc = PBSE_NONE;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  bool                  condensed = false;

  enum TJobStatTypeEnum type = tjstNONE;

  /*
   * first, validate the name of the requested object, either
   * a job, a queue, or the whole server.
   */
  if (LOGLEVEL >= 7)
    {
    sprintf(log_buf, "note");
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
    }


  /* FORMAT:  name = { <JOBID> | <QUEUEID> | '' } */

  name = preq->rq_ind.rq_status.rq_id;

  if (preq->rq_extend != NULL)
    {
    /* evaluate pbs_job_stat() 'extension' field */

    if (!strncasecmp(preq->rq_extend, "truncated", strlen("truncated")))
      {
      /* truncate response by 'max_report' */

      type = tjstTruncatedServer;
      }
    else if (!strncasecmp(preq->rq_extend, "summarize_arrays", strlen("summarize_arrays")))
      {
      type = tjstSummarizeArraysServer;
      }

    if (preq->rq_extend[strlen(preq->rq_extend) - 1] == 'C')
      {
      condensed = true;
      }

    }    /* END if (preq->rq_extend != NULL) */

  if (isdigit((int)*name))
    {
    /* status a single job */

    if (is_array(name))
      {
      if (type != tjstSummarizeArraysServer)
        {
        type = tjstArray;
        }
      }
    else
      {
      type = tjstJob;

      if ((pjob = svr_find_job(name, FALSE)) == NULL)
        {
        rc = PBSE_UNKJOBID;
        }
      else
        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
      }
    }
  else if (isalpha(name[0]))
    {
    if (type == tjstNONE)
      type = tjstQueue;
    else if (type == tjstSummarizeArraysServer)
      type = tjstSummarizeArraysQueue;
    else
      type = tjstTruncatedQueue;

    /* if found, this mutex is released later */
    if ((pque = find_queuebyname(name)) == NULL)
      {
      rc = PBSE_UNKQUE;
      }
    }
  else if ((*name == '\0') || (*name == '@'))
    {
    /* status all jobs at server */

    if (type == tjstNONE)
      type = tjstServer;
    }
  else
    {
    rc = PBSE_IVALREQ;
    }

  if (rc != 0)
    {
    /* is invalid - an error */
    req_reject(rc, 0, preq, NULL, NULL);

    return(rc);
    }

  set_reply_type(&preq->rq_reply, BATCH_REPLY_CHOICE_Status);

  CLEAR_HEAD(preq->rq_reply.brp_un.brp_status);

  if ((type == tjstTruncatedQueue) ||
      (type == tjstTruncatedServer))
    {
    if (pque != NULL)
      {
      unlock_queue(pque, __func__, "", LOGLEVEL);
      pque = NULL;
      }
    }

  memset(&cntl, 0, sizeof(cntl));

  cntl.sc_type   = (int)type;
  cntl.sc_conn   = -1;
  cntl.sc_pque   = pque;
  cntl.sc_origrq = preq;
  cntl.sc_post   = req_stat_job_step2;
  cntl.sc_jobid[0] = '\0'; /* cause "start from beginning" */
  cntl.sc_condensed = condensed;

  req_stat_job_step2(&cntl); /* go to step 2, see if running is current */

  if (pque != NULL)
    unlock_queue(pque, "req_stat_job", (char *)"success", LOGLEVEL);

  return(PBSE_NONE);
  }  /* END req_stat_job() */
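
The extension handling above is a case-insensitive prefix match that selects the statting mode, plus a trailing 'C' that requests condensed output. A stand-alone sketch of that parse, free of the request structures:

#include <stdio.h>
#include <string.h>
#include <strings.h>

int main(void)

  {
  const char *ext = "summarize_arraysC"; /* sample extension string */
  const char *mode = "plain";
  int         condensed = 0;

  if (!strncasecmp(ext, "truncated", strlen("truncated")))
    mode = "truncated";
  else if (!strncasecmp(ext, "summarize_arrays", strlen("summarize_arrays")))
    mode = "summarize_arrays";

  /* a trailing 'C' asks for condensed output */
  if (ext[strlen(ext) - 1] == 'C')
    condensed = 1;

  printf("mode=%s condensed=%d\n", mode, condensed);
  return(0);
  }
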
Example #28
int req_holdarray(
    
  void *vp) /* I */

  {
  int                   i;
  struct batch_request *preq = (struct batch_request *)vp;
  char                 *pset;
  char                 *range_str;
  int                   rc;
  pbs_attribute         temphold;
  char                  owner[PBS_MAXUSER + 1];
  job_array            *pa;
  job                  *pjob;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];

  pa = get_array(preq->rq_ind.rq_hold.rq_orig.rq_objname);

  if (pa == NULL)
    {
    /* this shouldn't happen since we verify that this is a valid array
       just prior to calling this function */
    req_reject(PBSE_UNKARRAYID, 0, preq, NULL, "unable to find array");
    return(PBSE_NONE);
    }

  get_jobowner(pa->ai_qs.owner, owner);

  if (svr_authorize_req(preq, owner, pa->ai_qs.submit_host) == -1)
    {
    sprintf(log_buf, msg_permlog,
      preq->rq_type, "Array", preq->rq_ind.rq_delete.rq_objname, preq->rq_user, preq->rq_host);

    log_event(PBSEVENT_SECURITY, PBS_EVENTCLASS_JOB, preq->rq_ind.rq_delete.rq_objname, log_buf);

    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "%s: unlocking ai_mutex", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pa->ai_qs.parent_id, log_buf);
      }

    pthread_mutex_unlock(pa->ai_mutex);

    req_reject(PBSE_PERM, 0, preq, NULL, "operation not permitted");
    return(PBSE_NONE);
    }


  if ((rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &pset,
                     &temphold)) != 0)
    {
    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "%s: unlocking ai_mutex", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pa->ai_qs.parent_id, log_buf);
      }
    
    pthread_mutex_unlock(pa->ai_mutex);

    req_reject(rc, 0, preq, NULL, NULL);
    return(PBSE_NONE);
    }

  /* if other than HOLD_u is being set, must have privilege */

  if ((rc = chk_hold_priv(temphold.at_val.at_long, preq->rq_perm)) != 0)
    {
    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "%s: unlocking ai_mutex", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pa->ai_qs.parent_id, log_buf);
      }

    pthread_mutex_unlock(pa->ai_mutex);

    req_reject(rc, 0, preq, NULL, NULL);
    return(PBSE_NONE);
    }

  /* get the range of jobs to iterate over */
  range_str = preq->rq_extend;
  if ((range_str != NULL) &&
      (strstr(range_str,ARRAY_RANGE) != NULL))
    {
    if ((rc = hold_array_range(pa,range_str,&temphold)) != 0)
      {
      pthread_mutex_unlock(pa->ai_mutex);

      req_reject(rc,0,preq,NULL,
        "Error in specified array range");
      return(PBSE_NONE);
      }
    }
  else
    {
    /* do the entire array */
    for (i = 0;i < pa->ai_qs.array_size;i++)
      {
      if (pa->job_ids[i] == NULL)
        continue;

      if ((pjob = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
        {
        free(pa->job_ids[i]);
        pa->job_ids[i] = NULL;
        }
      else
        {
        hold_job(&temphold,pjob);
        if (LOGLEVEL >= 7)
          {
          sprintf(log_buf, "%s: unlocking ai_mutex", __func__);
          log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pa->ai_qs.parent_id, log_buf);
          }

        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
        }
      }
    }

  if (LOGLEVEL >= 7)
    {
    sprintf(log_buf, "%s: unlocking ai_mutex", __func__);
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pa->ai_qs.parent_id, log_buf);
    }
  pthread_mutex_unlock(pa->ai_mutex);

  reply_ack(preq);

  return(PBSE_NONE);
  } /* END req_holdarray() */
Example #29
void *queue_route(

    void *vp)

{
    pbs_queue *pque;
    job       *pjob = NULL;
    char      *queue_name;
    char       log_buf[LOCAL_LOG_BUF_SIZE];

    all_jobs_iterator   *iter = NULL;

    queue_name = (char *)vp;

    if (queue_name == NULL)
    {
        sprintf(log_buf, "NULL queue name");
        log_err(-1, __func__, log_buf);
        return(NULL);
    }

    while (1)
    {
        pthread_mutex_lock(reroute_job_mutex);
        /* Before we attempt to service this queue, make sure we can find it. */
        pque = find_queuebyname(queue_name);
        if (pque == NULL)
        {
            sprintf(log_buf, "Could not find queue %s", queue_name);
            log_err(-1, __func__, log_buf);
            free(queue_name);
            return(NULL);
        }

        mutex_mgr que_mutex(pque->qu_mutex, true);

        pque->qu_jobs->lock();
        iter = pque->qu_jobs->get_iterator();
        pque->qu_jobs->unlock();

        if (LOGLEVEL >= 7)
        {
            snprintf(log_buf, sizeof(log_buf), "routing any ready jobs in queue: %s", queue_name);
            log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_QUEUE, __func__, log_buf);
        }

        while ((pjob = next_job(pque->qu_jobs,iter)) != NULL)
        {
            /* Only try jobs whose req_commit has finished - this is to let
             * req_commit have the first crack at routing, always. */

            if (pjob->ji_commit_done == 0) /* when req_commit is done it will set ji_commit_done to 1 */
            {
                unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
                continue;
            }
            /* queue must be unlocked when calling reroute_job */
            que_mutex.unlock();
            reroute_job(pjob);
            unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
            /* need to relock queue when we go to call next_job */
            pque = find_queuebyname(queue_name);
            if (pque == NULL)
            {
                sprintf(log_buf, "Could not find queue %s", queue_name);
                log_err(-1, __func__, log_buf);
                free(queue_name);
                delete iter;
                return(NULL);
            }
            que_mutex.mark_as_locked();
        }

        /* we come out of the while loop with the queue locked.
           We don't want it locked while we sleep */
        que_mutex.unlock();
        pthread_mutex_unlock(reroute_job_mutex);
        delete iter;
        sleep(route_retry_interval);
    }

    /*NOTREACHED*/
    free(queue_name);
    return(NULL);
} /* END queue_route() */
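
queue_route()'s loop is careful never to sleep while holding reroute_job_mutex: the lock is taken at the top of each pass and dropped before the retry sleep. A minimal sketch of that rhythm, capped at three passes so it terminates; the real loop runs until the thread is torn down.

#include <pthread.h>
#include <unistd.h>

static pthread_mutex_t reroute_mutex_sketch = PTHREAD_MUTEX_INITIALIZER;

int main(void)

  {
  int pass;

  for (pass = 0; pass < 3; pass++)
    {
    pthread_mutex_lock(&reroute_mutex_sketch);

    /* ... find the queue and route any ready jobs ... */

    /* drop the lock before sleeping so other threads are not
     * starved for the whole retry interval */
    pthread_mutex_unlock(&reroute_mutex_sketch);

    sleep(1); /* stands in for route_retry_interval */
    }

  return(0);
  }
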
int get_parent_dest_queues(

  char       *queue_parent_name,
  char       *queue_dest_name,
  pbs_queue **parent,
  pbs_queue **dest,
  job       **pjob_ptr)

  {
  pbs_queue *pque_parent;
  pbs_queue *pque_dest;
  char       jobid[PBS_MAXSVRJOBID + 1];
  char       log_buf[LOCAL_LOG_BUF_SIZE + 1];
  job       *pjob = *pjob_ptr;
  int        index_parent;
  int        index_dest;
  int        rc = PBSE_NONE;

  strcpy(jobid, pjob->ji_qs.ji_jobid);

  if ((queue_parent_name != NULL) && (queue_dest_name != NULL))
    {
    if (!strcmp(queue_parent_name, queue_dest_name))
      {
      /* parent and destination are the same.
         Job is already in the destination queue. return */
      snprintf(log_buf, sizeof(log_buf), "parent and destination queues are the same: parent %s - dest %s. jobid: %s",
          queue_parent_name,
          queue_dest_name,
          jobid);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
      return(-1); 
      }
    }
  else
    return(-1);

  unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL);

  unlock_queue(*parent, __func__, (char *)NULL, 0);

  *parent = NULL;
  *dest   = NULL;

  pthread_mutex_lock(svr_queues.allques_mutex);

  index_parent = get_value_hash(svr_queues.ht, queue_parent_name);
  index_dest   = get_value_hash(svr_queues.ht, queue_dest_name);

  if ((index_parent < 0) ||
      (index_dest < 0))
    {
    rc = -1;
    }
  else
    {
    /* good path */
    pque_parent = svr_queues.ra->slots[index_parent].item;
    pque_dest   = svr_queues.ra->slots[index_dest].item;

    if ((pque_parent == NULL) ||
        (pque_dest == NULL))
      {
      rc = -1;
      }
    else
      {
      /* SUCCESS! */
      lock_queue(pque_parent, __func__, (char *)NULL, 0);
      lock_queue(pque_dest,   __func__, (char *)NULL, 0);
      *parent = pque_parent;
      *dest = pque_dest;

      rc = PBSE_NONE;
      }
    }

  pthread_mutex_unlock(svr_queues.allques_mutex);

  if ((*pjob_ptr = svr_find_job(jobid, TRUE)) == NULL)
    rc = -1;

  return(rc);
  } /* END get_parent_dest_queues() */
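
get_parent_dest_queues() takes both queue locks while holding allques_mutex, which serializes pair acquisition so two threads cannot grab the same two queues in opposite orders. A compilable sketch of that trick with illustrative names:

#include <pthread.h>

static pthread_mutex_t allques_mutex_sketch = PTHREAD_MUTEX_INITIALIZER;

struct fake_queue
  {
  pthread_mutex_t mtx;
  };

/* acquire a pair of queue locks only while the table mutex is held,
 * so no two threads ever interleave a pair acquisition */
static void lock_queue_pair(

  struct fake_queue *parent,
  struct fake_queue *dest)

  {
  pthread_mutex_lock(&allques_mutex_sketch);

  pthread_mutex_lock(&parent->mtx);
  pthread_mutex_lock(&dest->mtx);

  pthread_mutex_unlock(&allques_mutex_sketch);
  }

int main(void)

  {
  struct fake_queue q1 = { PTHREAD_MUTEX_INITIALIZER };
  struct fake_queue q2 = { PTHREAD_MUTEX_INITIALIZER };

  lock_queue_pair(&q1, &q2);

  pthread_mutex_unlock(&q1.mtx);
  pthread_mutex_unlock(&q2.mtx);
  return(0);
  }
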