Example #1
0
END_TEST

/* Checks that svr_find_job() returns NULL for degenerate job ids. */
START_TEST(svr_find_job_test)
  {
  /* a NULL job id must not yield a job */
  struct job* result = svr_find_job(NULL,0);
  fail_unless(result == NULL, "NULL job id input fail");

  /* an empty job id must not yield a job either */
  result = svr_find_job((char *)"",0);
  fail_unless(result == NULL, "empty job id input fail");
  }
Example #2
0
void *single_delete_work(

  void *vp)

  {
  int              rc = -1;
  batch_request   *preq = (batch_request *)vp;
  char            *jobid = preq->rq_ind.rq_delete.rq_objname;
  job             *pjob;
  char            *Msg = preq->rq_extend;

  // TRUE is the same for non-heterogeneous jobs as FALSE. For heterogeneous
  // jobs simply delete one to trigger the other being deleted as well.
  pjob = svr_find_job(jobid, TRUE);

  if (pjob == NULL)
    {
    req_reject(PBSE_JOBNOTFOUND, 0, preq, NULL, "job unexpectedly deleted");
    }
  else
    {
    /* mutex is freed below */
    if ((rc = forced_jobpurge(pjob, preq)) == PBSE_NONE)
      rc = execute_job_delete(pjob, Msg, preq);
 
    if ((rc == PBSE_NONE) ||
        (rc == PURGE_SUCCESS))
      reply_ack(preq);
    }

  return(NULL);
  } /* END single_delete_work() */
Example #3
0
/*
 * release_whole_array()
 *
 * Calls release_job() for every job still recorded in the array. Slots whose
 * job can no longer be found are freed and cleared.
 *
 * @param pa - the array whose jobs are released (job_ids slots may be cleared)
 * @param preq - the request that triggered the release
 * @return - PBSE_NONE on success, otherwise the first non-zero value
 *           returned by release_job()
 */
int release_whole_array(

  job_array            *pa,   /* I/0 */
  struct batch_request *preq) /* I */

  {
  for (int slot = 0; slot < pa->ai_qs.array_size; slot++)
    {
    if (pa->job_ids[slot] == NULL)
      continue;

    job *member = svr_find_job(pa->job_ids[slot], FALSE);

    if (member == NULL)
      {
      /* stale entry: the job is gone, so drop its id from the array */
      free(pa->job_ids[slot]);
      pa->job_ids[slot] = NULL;

      continue;
      }

    /* hand the job mutex to a manager scoped to this iteration */
    mutex_mgr member_lock(member->ji_mutex, true);

    int release_rc = release_job(preq, member, pa);

    if (release_rc != 0)
      return(release_rc);
    }

  /* SUCCESS */
  return(PBSE_NONE);
  } /* END release_whole_array */
Example #4
0
/*
 * chk_job_request()
 *
 * Looks up the job named by jobid and runs the permission check for the
 * request against it.
 *
 * @param jobid - id of the job the request refers to
 * @param preq - the request; rejected with PBSE_UNKJOBID if the job is unknown
 * @return - the job, or NULL if it could not be found or if
 *           chk_job_req_permissions() cleared the pointer
 */
job *chk_job_request(

  char                 *jobid,  /* I */
  struct batch_request *preq)   /* I */

  {
  job *found = svr_find_job(jobid, FALSE);

  if (found != NULL)
    {
    /* chk_job_req_permissions sets found to NULL if we aren't authorized */
    chk_job_req_permissions(&found,preq);

    return(found);
    }

  log_event(
    PBSEVENT_DEBUG,
    PBS_EVENTCLASS_JOB,
    jobid,
    pbse_to_txt(PBSE_UNKJOBID));

  req_reject(PBSE_UNKJOBID, 0, preq, NULL, "cannot locate job");

  return(NULL);
  }  /* END chk_job_request() */
Example #5
0
/*
 * lock_queue_with_job_held()
 *
 * Acquires the queue's mutex while the caller holds a job. A non-blocking
 * attempt is made first; if that fails the job is unlocked, the queue is
 * locked, and the job is looked up again by id.
 *
 * @param pque - the queue to lock (may be NULL)
 * @param pjob_ptr - address of the held job; set to NULL if the job could
 *                   not be found again after the queue was locked
 * @return - pque, or NULL if pque was NULL or the job disappeared (in which
 *           case the queue is unlocked again)
 */
pbs_queue *lock_queue_with_job_held(

  pbs_queue  *pque,
  job       **pjob_ptr)

  {
  job  *held = *pjob_ptr;
  char  held_id[PBS_MAXSVRJOBID + 1];

  if (pque == NULL)
    return(pque);

  if (LOGLEVEL >= 10 )
    log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_QUEUE, __func__, pque->qu_qs.qu_name);

  /* fast path: the queue mutex was free */
  if (pthread_mutex_trylock(pque->qu_mutex) == 0)
    return(pque);

  /* slow path: remember the id, let go of the job, then wait for the queue */
  strcpy(held_id, held->ji_qs.ji_jobid);
  unlock_ji_mutex(held, __func__, "1", LOGLEVEL);
  lock_queue(pque, __func__, NULL, LOGLEVEL);

  held = svr_find_job(held_id, TRUE);

  if (held != NULL)
    return(pque);

  /* the job vanished while we were waiting */
  unlock_queue(pque, __func__, NULL, 0);
  *pjob_ptr = NULL;

  return(NULL);
  } /* END lock_queue_with_job_held() */
Example #6
0
/*
 * lock_queue_with_job_held()
 *
 * Acquires the queue's mutex while the caller holds a job. A non-blocking
 * attempt is made first; if that fails the job is unlocked, the queue is
 * locked, and the job is looked up again by id.
 *
 * @param pque - the queue to lock (may be NULL)
 * @param pjob_ptr - address of the held job; set to NULL if the job could
 *                   not be found again after the queue was locked
 * @return - pque, or NULL if pque was NULL or the job disappeared (in which
 *           case the queue is unlocked again)
 */
pbs_queue *lock_queue_with_job_held(

  pbs_queue  *pque,
  job       **pjob_ptr)

  {
  /* +1 leaves room for the terminating NUL of a maximum-length job id */
  char       jobid[PBS_MAXSVRJOBID + 1];
  job       *pjob = *pjob_ptr;

  if (pque != NULL)
    {
    if (pthread_mutex_trylock(pque->qu_mutex))
      {
      /* trylock failed: release the job, block on the queue, re-find the job */
      strcpy(jobid, pjob->ji_qs.ji_jobid);
      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
      lock_queue(pque, __func__, NULL, LOGLEVEL);

      if ((pjob = svr_find_job(jobid, TRUE)) == NULL)
        {
        /* the job vanished while we were waiting for the queue */
        unlock_queue(pque, __func__, NULL, 0);
        pque = NULL;
        *pjob_ptr = NULL;
        }
      }
    }

  return(pque);
  } /* END lock_queue_with_job_held() */
Example #7
0
/* 
 * check_exiting_jobs()
 *
 * loops over the recorded exiting job information and retries
 * jobs that have been stale long enough.
 */
int check_exiting_jobs()

  {
  exiting_jobs_info_iterator  *iter = NULL;
  char                   *jobid;
  job                    *pjob;
  
  while ((jobid = get_next_retryable_jobid(&iter)) != NULL)
    {
    if ((pjob = svr_find_job(jobid, TRUE)) == NULL)
      {
      remove_from_exiting_list_by_jobid(jobid);
      free(jobid);
      }
    else
      {
      mutex_mgr pjob_mutex = mutex_mgr(pjob->ji_mutex, true);

      if (pjob->ji_qs.ji_state == JOB_STATE_COMPLETE)
        {
        remove_from_exiting_list_by_jobid(jobid);
        free(jobid);
        }
      else
        {
        pjob_mutex.unlock();

        /* jobid is freed in on_job_exit() */
        retry_job_exit(jobid);
        }
      }
    } /* END loop over exiting job information */

  return(PBSE_NONE);
  } /* END check_exiting_jobs() */
Example #8
0
int record_reservation(

  struct pbsnode *pnode,
  char           *rsv_id)

  {
  struct pbssubn *sub_node;
  job            *pjob;
  int             found_job = FALSE;
  
  for (sub_node = pnode->nd_psn; sub_node != NULL; sub_node = sub_node->next)
    {
    if (sub_node->jobs != NULL)
      {
      if ((pjob = svr_find_job(sub_node->jobs->jobid, TRUE)) != NULL)
        {
        pjob->ji_wattr[JOB_ATR_reservation_id].at_val.at_str = strdup(rsv_id);
        pjob->ji_wattr[JOB_ATR_reservation_id].at_flags = ATR_VFLAG_SET;

        create_alps_reservation(pjob);
        found_job = TRUE;

        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
        break;
        }
      }
    }

  if (found_job == FALSE)
    return(-1);

  return(PBSE_NONE);
  } /* END record_reservation() */
Example #9
0
/*
 * retry_job_exit()
 *
 * Counts another exit attempt for the job described by jeri. Once the
 * attempt count reaches MAX_EXITING_RETRY_ATTEMPTS the job (if it still
 * exists) is force-purged and the entry is removed; before that the retry is
 * logged and on_job_exit() is invoked with a copy of the job id.
 *
 * @param jeri - the retry bookkeeping entry for the job
 * @return - always PBSE_NONE
 */
int retry_job_exit(

  job_exiting_retry_info *jeri)

  {
  char  log_buf[LOCAL_LOG_BUF_SIZE];

  jeri->attempts++;

  if (jeri->attempts < MAX_EXITING_RETRY_ATTEMPTS)
    {
    snprintf(log_buf, sizeof(log_buf), "Retrying job exiting for job %s",
      jeri->jobid);
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);

    jeri->last_attempt = time(NULL);
    on_job_exit(NULL, strdup(jeri->jobid));

    return(PBSE_NONE);
    }

  /* job has been attempted the maximum number of times. Destroy the job */
  job *stuck = svr_find_job(jeri->jobid, TRUE);

  if (stuck != NULL)
    force_purge_work(stuck);

  remove_entry_from_exiting_list(jeri);

  return(PBSE_NONE);
  } /* END retry_job_exit() */
Example #10
0
int handle_single_delete(

  struct batch_request *preq,
  struct batch_request *preq_tmp,
  char                 *Msg)

  {
  char *jobid = preq->rq_ind.rq_delete.rq_objname;
  job  *pjob = svr_find_job(jobid, FALSE);

  if (pjob == NULL)
    {
    log_event(PBSEVENT_DEBUG,PBS_EVENTCLASS_JOB,jobid,pbse_to_txt(PBSE_UNKJOBID));
    
    req_reject(PBSE_UNKJOBID, 0, preq, NULL, "cannot locate job");
    }
  else
    {
    unlock_ji_mutex(pjob, __func__, NULL, 0);

    /* send the asynchronous reply if needed */
    if (preq_tmp != NULL)
      {
      reply_ack(preq_tmp);
      preq->rq_noreply = TRUE; /* set for no more replies */
      enqueue_threadpool_request(single_delete_work, preq);
      }
    else
      single_delete_work(preq);
    }

  return(PBSE_NONE);
  } /* END handle_single_delete() */
Example #11
0
/*
 * get_next_status_job()
 *
 * Returns the next job to report status for, choosing the source container
 * from cntl->sc_type.
 *
 * @param cntl - status control block; sc_type selects the job source
 * @param job_array_index - for tjstArray, the last index examined; advanced
 *                          to the index of the returned job, or past the end
 * @param pa - the job array, read only when sc_type is tjstArray
 * @param iter - iterator used for the non-array sources
 * @return - the next job, or NULL when the source is exhausted
 */
job *get_next_status_job(

  struct stat_cntl  *cntl,
  int               &job_array_index,
  job_array         *pa,
  all_jobs_iterator *iter)

  {
  if (cntl->sc_type == tjstQueue)
    return(next_job(cntl->sc_pque->qu_jobs,iter));

  if (cntl->sc_type == tjstSummarizeArraysQueue)
    return(next_job(cntl->sc_pque->qu_jobs_array_sum,iter));

  if (cntl->sc_type == tjstSummarizeArraysServer)
    return(next_job(&array_summary,iter));

  if (cntl->sc_type != tjstArray)
    return(next_job(&alljobs, iter));

  /* advance job_array_index until a findable job turns up or we hit the end */
  while (++job_array_index < pa->ai_qs.array_size)
    {
    char *member_id = pa->job_ids[job_array_index];

    if (member_id == NULL)
      continue;

    job *member = svr_find_job(member_id, FALSE);

    if (member != NULL)
      return(member);
    }

  return(NULL);
  } // END get_next_status_job()
Example #12
0
/*
 * send_sig_kill
 *
 * The SIGTERM has been sent and we've waited for the kill_delay so now send the SIGKILL.
 * The work task and its mutex are freed here, as is the job id string in
 * pwt->wt_parm1.
 *
 * @pre-cond: pwt must point to a valid task
 * @param pwt - the work task; wt_parm1 is the job id string (a NULL id is
 *              tolerated and simply ends the function)
 */
void send_sig_kill(
    
  struct work_task *pwt)

  {
  job                  *pjob;
  char                 *job_id = (char *)pwt->wt_parm1;
  static const char    *rerun = "rerun";

  free(pwt->wt_mutex);
  free(pwt);

  if (job_id == NULL)
    return;

  pjob = svr_find_job(job_id, FALSE);

  /* the id is only needed for the lookup */
  free(job_id);

  if (pjob == NULL)
    return;
  
  char *extra = strdup(rerun);

  /* issue_signal() takes the job by address and may reset it to NULL,
   * so pjob has to be re-checked before it is touched again */
  if ((issue_signal(&pjob, "SIGKILL", post_rerun, extra, NULL) == 0) &&
      (pjob != NULL))
    {
    pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;
    pjob->ji_qs.ji_svrflags = (pjob->ji_qs.ji_svrflags &
        ~(JOB_SVFLG_CHECKPOINT_FILE |JOB_SVFLG_CHECKPOINT_MIGRATEABLE |
          JOB_SVFLG_CHECKPOINT_COPIED)) | JOB_SVFLG_HASRUN;
    }

  if (pjob != NULL)
    unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL);
  } /* END send_sig_kill() */
Example #13
0
/*
 * post_rerun()
 *
 * Callback run with the reply to a rerun signal request. If the reply
 * carries a non-zero code, the rejection is logged and the job's state is
 * re-evaluated and set again.
 *
 * @param preq - the signal request carrying the reply (NULL is ignored)
 */
void post_rerun(

  batch_request *preq)

  {
  int   newstate;
  int   newsub;
  job  *pjob;

  char  log_buf[LOCAL_LOG_BUF_SIZE];

  if (preq == NULL)
    return;

  if (preq->rq_reply.brp_code != 0)
    {
    /* bounded write so an unexpectedly long job id cannot overrun log_buf */
    snprintf(log_buf, sizeof(log_buf), "rerun signal reject by mom: %s - %d", preq->rq_ind.rq_signal.rq_jid, preq->rq_reply.brp_code);
    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,__func__,log_buf);

    if ((pjob = svr_find_job(preq->rq_ind.rq_signal.rq_jid, FALSE)))
      {
      /* the manager takes over the job mutex for the rest of this scope */
      mutex_mgr job_mutex(pjob->ji_mutex, true);
      
      /* newstate/newsub are filled in by svr_evaljobstate (passed by reference) */
      svr_evaljobstate(*pjob, newstate, newsub, 1);
      svr_setjobstate(pjob, newstate, newsub, FALSE);
      }
    }

  return;
  }  /* END post_rerun() */
/*
 * release_whole_array()
 *
 * Calls release_job() for every job still recorded in the array, unlocking
 * each job afterwards. Slots whose job can no longer be found are freed and
 * cleared.
 *
 * @param pa - the array whose jobs are released (job_ids slots may be cleared)
 * @param preq - the request that triggered the release
 * @return - PBSE_NONE on success, otherwise the first non-zero value
 *           returned by release_job()
 */
int release_whole_array(

  job_array            *pa,   /* I/0 */
  struct batch_request *preq) /* I */

  {
  int slot = 0;

  while (slot < pa->ai_qs.array_size)
    {
    job *member = NULL;

    if (pa->job_ids[slot] == NULL)
      {
      slot++;
      continue;
      }

    member = svr_find_job(pa->job_ids[slot], FALSE);

    if (member == NULL)
      {
      /* stale entry: the job is gone, so drop its id from the array */
      free(pa->job_ids[slot]);
      pa->job_ids[slot] = NULL;
      }
    else
      {
      int release_rc = release_job(preq, member);

      if (release_rc != 0)
        {
        unlock_ji_mutex(member, __func__, (char *)"1", LOGLEVEL);

        return(release_rc);
        }

      unlock_ji_mutex(member, __func__, (char *)"2", LOGLEVEL);
      }

    slot++;
    }

  /* SUCCESS */
  return(PBSE_NONE);
  } /* END release_whole_array */
Example #15
0
void *single_delete_work(

  void *vp)

  {
  int              rc = -1;
  batch_request   *preq = (batch_request *)vp;
  char            *jobid = preq->rq_ind.rq_delete.rq_objname;
  job             *pjob;
  char            *Msg = preq->rq_extend;

  pjob = svr_find_job(jobid, FALSE);

  if (pjob == NULL)
    {
    req_reject(PBSE_JOBNOTFOUND, 0, preq, NULL, "job unexpectedly deleted");
    }
  else
    {
    /* mutex is freed below */
    if ((rc = forced_jobpurge(pjob, preq)) == PBSE_NONE)
      rc = execute_job_delete(pjob, Msg, preq);
 
    if ((rc == PBSE_NONE) ||
        (rc == PURGE_SUCCESS))
      reply_ack(preq);
    }

  return(NULL);
  } /* END single_delete_work() */
Example #16
0
int check_exiting_jobs()

  {
  int                     iter = -1;
  job_exiting_retry_info *jeri;
  job                    *pjob;
  time_t                  time_now = time(NULL);
    
  while ((jeri = (job_exiting_retry_info *)next_from_hash_map(exiting_jobs_info, &iter)) != NULL)
    {
    if (time_now - jeri->last_attempt > EXITING_RETRY_TIME)
      {
      if ((pjob = svr_find_job(jeri->jobid, TRUE)) == NULL)
        {
        remove_entry_from_exiting_list(jeri);
        }
      else
        {
        if (pjob->ji_qs.ji_state == JOB_STATE_COMPLETE)
          {
          remove_entry_from_exiting_list(jeri);
          unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);
          }
        else
          {
          unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);
          retry_job_exit(jeri);
          }
        }
      }
    
    }

  return(PBSE_NONE);
  } /* END check_exiting_jobs() */
int is_orphaned(

  char *rsv_id)

  {
  int               index;
  int               orphaned = FALSE;
  job              *pjob;
  alps_reservation *ar = NULL;

  pthread_mutex_lock(alps_reservations.rh_mutex);
  index = get_value_hash(alps_reservations.rh_ht, rsv_id);
  if (index != -1)
    ar = (alps_reservation *)alps_reservations.rh_alps_rsvs->slots[index].item;
  pthread_mutex_unlock(alps_reservations.rh_mutex);

  if (ar != NULL)
    {
    if ((pjob = svr_find_job(ar->job_id, TRUE)) != NULL)
      {
      if (pjob->ji_qs.ji_state == JOB_STATE_COMPLETE)
        orphaned = TRUE;

      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
      }
    else
      orphaned = TRUE;
    }
  else
    orphaned = TRUE;

  return(orphaned);
  } /* END is_orphaned() */
Example #18
0
/*
 * process_checkpoint_reply()
 *
 * Finishes a checkpoint request: restores the client connection on the
 * request, then either rejects it (job unknown) or writes a checkpoint
 * accounting record and acknowledges it.
 *
 * @param preq - the request to finish; NULL means it was handled previously
 */
void process_checkpoint_reply(

  batch_request *preq)

  {
  if (preq == NULL)
    return;

  preq->rq_conn = preq->rq_orgconn;  /* restore client socket */

  job *checkpointed = svr_find_job(preq->rq_ind.rq_manager.rq_objname, FALSE);

  if (checkpointed == NULL)
    {
    log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB,
      preq->rq_ind.rq_manager.rq_objname,
      msg_postmomnojob);
    req_reject(PBSE_UNKJOBID, 0, preq, NULL, msg_postmomnojob);

    return;
    }

  /* the manager holds the job mutex until this function returns */
  mutex_mgr checkpointed_lock = mutex_mgr(checkpointed->ji_mutex, true);

  /* note in the accounting file that a checkpoint was taken */
  account_record(PBS_ACCT_CHKPNT, checkpointed, "Checkpointed");
  reply_ack(preq);
  } /* END process_checkpoint_reply() */
Example #19
0
/*
 * job_delete_nanny()
 *
 * Work-task callback: if the job named in the task still exists, logs that
 * fact, sends it a SIGKILL through issue_signal() and calls
 * apply_job_delete_nanny() with a time 60 seconds from now (presumably to
 * schedule the next check - confirm).
 *
 * @param pwt - the work task; wt_parm1 is treated as the job id string.
 *              pwt->wt_parm1, pwt->wt_mutex and pwt itself are freed here.
 */
static void job_delete_nanny(

  struct work_task *pwt)

  {
  job                  *pjob;
  char                 *sigk = "SIGKILL";
  char                 *jobid;

  struct batch_request *newreq;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  time_t                time_now = time(NULL);
  long                  nanny = FALSE;

  /* short-circuit if nanny isn't enabled */
  /* NOTE(review): the branch below does the signalling work only when nanny
   * is FALSE, which is the opposite of what the comment above describes -
   * confirm whether the condition is inverted. */
  get_svr_attr_l(SRV_ATR_JobNanny, &nanny);
  if (!nanny)
    {
    jobid = (char *)pwt->wt_parm1;
    
    if (jobid != NULL)
      {
      pjob = svr_find_job(jobid, FALSE);
      
      if (pjob != NULL)
        {
        sprintf(log_buf, "exiting job '%s' still exists, sending a SIGKILL", pjob->ji_qs.ji_jobid);
        log_err(-1, "job nanny", log_buf);
        
        /* build up a Signal Job batch request */
        /* NOTE(review): newreq stays NULL if alloc_br() fails and is still
         * passed on below; it is handed to issue_signal() as its last
         * argument - confirm who owns and frees it. */
        if ((newreq = alloc_br(PBS_BATCH_SignalJob)) != NULL)
          {
          strcpy(newreq->rq_ind.rq_signal.rq_jid, pjob->ji_qs.ji_jobid);
          snprintf(newreq->rq_ind.rq_signal.rq_signame, sizeof(newreq->rq_ind.rq_signal.rq_signame), "%s", sigk);
          }
        
        issue_signal(&pjob, sigk, post_job_delete_nanny, newreq);
        
        /* pjob is passed by address above, so it is re-checked before use */
        if (pjob != NULL)
          {
          apply_job_delete_nanny(pjob, time_now + 60);
  
          unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
          }
        }
      }
    else
      {
      /* NOTE(review): this branch is reached when wt_parm1 is NULL, not on a
       * failed allocation in this function - the message may be misleading. */
      log_err(ENOMEM, __func__, "Cannot allocate memory");
      }
    }
  
  /* the task and everything it carries are released on every path */
  if (pwt->wt_parm1 != NULL)
    free(pwt->wt_parm1);

  free(pwt->wt_mutex);
  free(pwt);
  } /* END job_delete_nanny() */
Example #20
0
/*
 * post_job_delete_nanny()
 *
 * Callback run with the reply to the nanny's signal request. If the reply
 * code is PBSE_UNKJOBID the job's nodes and resources are released and the
 * job is purged; otherwise the job is simply unlocked. The request is freed
 * on every path.
 *
 * @param preq_sig - the signal request carrying the reply (NULL is ignored)
 */
void post_job_delete_nanny(

  batch_request *preq_sig)

  {
  int                   rc;
  job                  *pjob;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  long                  nanny = 0;

  if (preq_sig == NULL)    
    return;

  rc       = preq_sig->rq_reply.brp_code;

  get_svr_attr_l(SRV_ATR_JobNanny, &nanny);
  if (!nanny)
    {
    /* the admin disabled nanny within the last minute or so */
    free_br(preq_sig);

    return;
    }

  /* look the job up by the id stored in the signal request */
  pjob = svr_find_job(preq_sig->rq_ind.rq_signal.rq_jid, FALSE);

  if (pjob == NULL)
    {
    snprintf(log_buf, sizeof(log_buf), "%s",
      "job delete nanny: the job disappeared (this is a BUG!)");

    log_event(PBSEVENT_ERROR,PBS_EVENTCLASS_JOB,preq_sig->rq_ind.rq_signal.rq_jid,log_buf);
    }
  else if (rc == PBSE_UNKJOBID)
    {
    snprintf(log_buf, sizeof(log_buf), "%s",
      "job delete nanny returned, but does not exist on mom");

    log_event(PBSEVENT_ERROR,PBS_EVENTCLASS_JOB,preq_sig->rq_ind.rq_signal.rq_jid,log_buf);

    free_nodes(pjob);

    set_resc_assigned(pjob, DECR);
  
    free_br(preq_sig);

    /* the job is purged here, so it must not be unlocked afterwards */
    svr_job_purge(pjob);

    return;
    }
  else
    {
    /* only unlock a job we actually hold; there is none when the lookup failed */
    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
    }

  /* free task */
  free_br(preq_sig);

  return;
  } /* END post_job_delete_nanny() */
Example #21
0
/*
 * svr_find_job_by_id()
 *
 * Translates an internal job id to its name through job_mapper and looks
 * the job up by that name.
 *
 * @param internal_job_id - the internal id to translate
 * @return - whatever svr_find_job() returns for the mapped name
 */
job *svr_find_job_by_id(

  int internal_job_id)

  {
  const char *mapped_name;

  mapped_name = job_mapper.get_name(internal_job_id);

  job *located = svr_find_job(mapped_name, TRUE);

  return(located);
  }
Example #22
0
/*
 * modify_whole_array()
 *
 * Applies the attribute list to every job still recorded in the array by
 * calling modify_job() with a per-job duplicate of the request. Slots whose
 * job can no longer be found are freed and cleared.
 *
 * @param pa - the array whose jobs are modified
 * @param plist - the attribute list handed to modify_job()
 * @param preq - the originating request; duplicated for each job
 * @param checkpoint_req - passed through to modify_job()
 * @return - PBSE_JOB_RECYCLED if get_jobs_array() returns NULL after a
 *           modify; otherwise the most recent non-PBSE_NONE code returned
 *           by modify_job(), or PBSE_NONE if every call succeeded
 */
int modify_whole_array(

  job_array            *pa,             /* I/O */
  svrattrl             *plist,          /* I */
  struct batch_request *preq,           /* I */
  int                   checkpoint_req) /* I */

  {
  int   i;
  int   rc = PBSE_NONE;
  int   modify_job_rc = PBSE_NONE;
  job  *pjob;

  for (i = 0; i < pa->ai_qs.array_size; i++)
    {
    if (pa->job_ids[i] == NULL)
      continue;

    if ((pjob = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
      {
      /* stale entry: the job is gone, so drop its id from the array */
      free(pa->job_ids[i]);
      pa->job_ids[i] = NULL;
      }
    else
      {
      /* NO_MOM_RELAY will prevent modify_job from calling relay_to_mom */
      /* NOTE(review): array_req is dereferenced below without a NULL check -
       * confirm duplicate_request() cannot fail. */
      batch_request *array_req = duplicate_request(preq, i);
      mutex_mgr job_mutex(pjob->ji_mutex, true);
      /* the array mutex is dropped while the job is modified; pa is
       * refreshed from get_jobs_array() afterwards */
      pthread_mutex_unlock(pa->ai_mutex);
      array_req->rq_noreply = TRUE;
      rc = modify_job((void **)&pjob, plist, array_req, checkpoint_req, NO_MOM_RELAY);
      if (rc != PBSE_NONE)
        {
        /* remember the failure but keep going with the remaining jobs */
        modify_job_rc = rc;
        }
      pa = get_jobs_array(&pjob);
      
      if (pa == NULL)
        {
        /* with no job left, tell the manager not to act on the job mutex at scope exit */
        if (pjob == NULL)
          job_mutex.set_lock_on_exit(false);

        return(PBSE_JOB_RECYCLED);
        }

      if (pjob == NULL)
        {
        /* the job went away during the modify: clear its slot and move on */
        /* NOTE(review): the id string in this slot is not freed here, unlike
         * the not-found branch above - confirm ownership. */
        pa->job_ids[i] = NULL;
        job_mutex.set_lock_on_exit(false);
        continue;
        }
      }
    } /* END foreach job in array */

  return(modify_job_rc);
  } /* END modify_whole_array() */
Example #23
0
/*
 * post_modify_req()
 *
 * Finishes a modify request once the reply is available. Any reply code
 * other than 0 and PBSE_UNKJOBID is logged and the request rejected with it.
 * For PBSE_UNKJOBID the request is rejected only if the job cannot be found;
 * if the job is found, the situation is logged and the request acknowledged.
 *
 * @param preq - the modify request carrying the reply (NULL is ignored)
 */
void post_modify_req(

  batch_request *preq)

  {
  char  log_buf[LOCAL_LOG_BUF_SIZE];

  if (preq == NULL)
    return;

  preq->rq_conn = preq->rq_orgconn;  /* restore socket to client */

  if (preq->rq_reply.brp_code == PBSE_UNKJOBID)
    {
    job *known = svr_find_job(preq->rq_ind.rq_modify.rq_objname, FALSE);

    if (known == NULL)
      {
      req_reject(preq->rq_reply.brp_code, 0, preq, NULL, NULL);

      return;
      }

    /* keep the manager in its own scope so it ends before the ack is sent */
      {
      mutex_mgr known_lock(known->ji_mutex, true);

      if (LOGLEVEL >= 0)
        {
        sprintf(log_buf, "post_modify_req: PBSE_UNKJOBID for job %s in state %s-%s, dest = %s",
          known->ji_qs.ji_jobid,
          PJobState[known->ji_qs.ji_state],
          PJobSubState[known->ji_qs.ji_substate],
          known->ji_qs.ji_destin);

        log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,known->ji_qs.ji_jobid,log_buf);
        }
      }

    reply_ack(preq);

    return;
    }

  if (preq->rq_reply.brp_code)
    {
    /* any other failure code is reported back to the client as-is */
    sprintf(log_buf, msg_mombadmodify, preq->rq_reply.brp_code);

    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      preq->rq_ind.rq_modify.rq_objname,
      log_buf);

    req_reject(preq->rq_reply.brp_code, 0, preq, NULL, NULL);

    return;
    }

  reply_ack(preq);

  return;
  }  /* END post_modify_req() */
/*
 * record_reservation()
 *
 * @pre-cond: pnode and rsv_id must be valid pointers
 * @post-cond: the reservation will be recorded in pbs_server's tracking mechanism
 * and on the job which has the node reserved, or -1 is returned and the reservation
 * is not recorded.
 * @param - pnode the node which is reporting the reservation
 * @param - rsv_id the id of the reservation being reported
 * @return - PBSE_NONE if the reservation was successfully recorded, -1 otherwise
 */
int record_reservation(

  struct pbsnode *pnode,
  const char     *rsv_id)

  {
  job            *pjob;
  bool            found_job = false;
  char            jobid[PBS_MAXSVRJOBID + 1];

  for (unsigned int i = 0; i < pnode->nd_job_usages.size(); i++)
    {
    /* cray only allows one job per node, so any valid job will be the job that is 
     * reserving this node. */
    job_usage_info *jui = pnode->nd_job_usages[i];
    strcpy(jobid, jui->jobid);

    unlock_node(pnode, __func__, NULL, LOGLEVEL);

    if ((pjob = svr_find_job(jobid, TRUE)) != NULL)
      {
      mutex_mgr job_mutex(pjob->ji_mutex, true);
      pjob->ji_wattr[JOB_ATR_reservation_id].at_val.at_str = strdup(rsv_id);
      pjob->ji_wattr[JOB_ATR_reservation_id].at_flags = ATR_VFLAG_SET;

      /* add environment variable BATCH_PARTITION_ID */
      char buf[1024];
      snprintf(buf, sizeof(buf), "BATCH_PARTITION_ID=%s", rsv_id);
      pbs_attribute  tempattr;
      clear_attr(&tempattr, &job_attr_def[JOB_ATR_variables]);
      job_attr_def[JOB_ATR_variables].at_decode(&tempattr,
        NULL, NULL, buf, 0);

      job_attr_def[JOB_ATR_variables].at_set(
        &pjob->ji_wattr[JOB_ATR_variables], &tempattr, INCR);

      job_attr_def[JOB_ATR_variables].at_free(&tempattr);

      track_alps_reservation(pjob);
      found_job = true;

      job_mutex.unlock(); 
      lock_node(pnode, __func__, NULL, LOGLEVEL);
      break;
      }
    else
      lock_node(pnode, __func__, NULL, LOGLEVEL);
    }

  if (found_job == false)
    return(-1);

  return(PBSE_NONE);
  } /* END record_reservation() */
Example #25
0
void finish_move_process(

  char          *job_id,
  batch_request *preq,
  long           time,
  const char    *node_name,
  int            status,
  int            type,
  int            mom_err)

  {
  char  log_buf[LOCAL_LOG_BUF_SIZE+1];
  job  *pjob = svr_find_job(job_id, TRUE);

  if (pjob == NULL)
    {
    /* somehow the job has been deleted mid-runjob */
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
      "Job %s was deleted while servicing move request", job_id);

    if (preq != NULL)
      {
      if (mom_err != PBSE_NONE)
        req_reject(mom_err, 0, preq, node_name, log_buf);
      else
        req_reject(PBSE_JOBNOTFOUND, 0, preq, node_name, log_buf);
      }
    }
  else
    {
    mutex_mgr job_mutex(pjob->ji_mutex, true);

    switch (type)
      {
      case MOVE_TYPE_Move:

        finish_moving_processing(pjob, preq, status);
        break;
        
      case MOVE_TYPE_Route:

        finish_routing_processing(pjob, status);
        break;
        
      case MOVE_TYPE_Exec:

        job_mutex.unlock();
        finish_sendmom(job_id, preq, time, node_name, status, mom_err);
        
        break;
      } /* END switch (type) */
    }

  } /* END finish_move_process() */
Example #26
0
/*
 * remove_job_from_exiting_list()
 *
 * Removes the job from the exiting list. The job is unlocked before the
 * removal and looked up again by id afterwards.
 *
 * @param pjob - address of the job pointer; on return it holds whatever
 *               svr_find_job() found for the same id (possibly NULL)
 * @return - the result of remove_from_exiting_list_by_jobid()
 */
int remove_job_from_exiting_list(

  job **pjob)

  {
  job         *held = *pjob;
  std::string  saved_id(held->ji_qs.ji_jobid);
  int          removal_rc;

  /* the id was copied above, so the job can be released during the removal */
  unlock_ji_mutex(held,__func__, NULL, LOGLEVEL);

  removal_rc = remove_from_exiting_list_by_jobid(saved_id.c_str());

  *pjob = svr_find_job((char *)saved_id.c_str(),FALSE);

  return removal_rc;
  } /* END remove_job_from_exiting_list() */
Example #27
0
/**
 * poll _job_task
 *
 * The invocation of this routine is triggered from
 * the pbs_server main_loop code.  The check of
 * SRV_ATR_PollJobs appears to be redundant.
 */
void poll_job_task(

  struct work_task *ptask)

  {
  char      *job_id = (char *)ptask->wt_parm1;
  job       *pjob;
  time_t     time_now = time(NULL);
  long       poll_jobs = 0;
  int        job_state = -1;

  if (job_id != NULL)
    {
    pjob  = svr_find_job(job_id, FALSE);
    
    if (pjob != NULL)
      {
      mutex_mgr job_mutex(pjob->ji_mutex, true);

      job_state = pjob->ji_qs.ji_state;
      job_mutex.unlock();

      get_svr_attr_l(SRV_ATR_PollJobs, &poll_jobs);
      if ((poll_jobs) && (job_state == JOB_STATE_RUNNING))
        {
        /* we need to throttle the number of outstanding threads are
           doing job polling. This prevents a problem where pbs_server
           gets hung waiting on I/O from the mom */
        pthread_mutex_lock(poll_job_task_mutex);
        if (current_poll_job_tasks < max_poll_job_tasks)
          {
          current_poll_job_tasks++;
          pthread_mutex_unlock(poll_job_task_mutex);

          stat_mom_job(job_id);

          pthread_mutex_lock(poll_job_task_mutex);
          current_poll_job_tasks--;
          }
        pthread_mutex_unlock(poll_job_task_mutex);

        
        /* add another task */
        set_task(WORK_Timed, time_now + JobStatRate, poll_job_task, strdup(job_id), FALSE);
        }
      }
      
    free(job_id);
    }

  free(ptask->wt_mutex);
  free(ptask);
  }  /* END poll_job_task() */
Example #28
0
/*
 * delete_whole_array()
 *
 * Iterates over the array and attempts to delete every job in it. The array
 * mutex is released around each attempt_delete() call and re-acquired
 * afterwards.
 *
 * @param pa - the array to be deleted
 * @return - NO_JOBS_IN_ARRAY if no job in the array could be found,
 *           otherwise the number of jobs for which attempt_delete() returned
 *           FALSE (jobs at or past JOB_STATE_EXITING are passed over without
 *           being counted)
 */
int delete_whole_array(

  job_array *pa) /* I */

  {
  int found_count = 0;
  int skipped_count = 0;

  for (int slot = 0; slot < pa->ai_qs.array_size; slot++)
    {
    if (pa->job_ids[slot] == NULL)
      continue;

    job *member = svr_find_job(pa->job_ids[slot], FALSE);

    if (member == NULL)
      {
      /* stale entry: the job is gone, so drop its id from the array */
      free(pa->job_ids[slot]);
      pa->job_ids[slot] = NULL;

      continue;
      }

    found_count++;

    if (member->ji_qs.ji_state >= JOB_STATE_EXITING)
      {
      /* invalid state for request,  skip */
      unlock_ji_mutex(member, __func__, "1", LOGLEVEL);

      continue;
      }

    pthread_mutex_unlock(pa->ai_mutex);

    if (attempt_delete(member) == FALSE)
      {
      /* a successful delete takes care of the job mutex elsewhere;
       * when it fails, release it here */
      unlock_ji_mutex(member, __func__, "2", LOGLEVEL);
      skipped_count++;
      }

    pthread_mutex_lock(pa->ai_mutex);
    }

  if (found_count == 0)
    return(NO_JOBS_IN_ARRAY);

  return(skipped_count);
  } /* END delete_whole_array() */
Example #29
0
/*
 * get_next_retryable_jobid()
 *
 * Advances through exiting_jobs_info looking for an entry whose last attempt
 * is older than EXITING_RETRY_TIME. Entries that have reached
 * MAX_EXITING_RETRY_ATTEMPTS are removed and their job (if it still exists)
 * is force-purged; the first other stale entry has its attempt count and
 * timestamp updated and its job id returned.
 *
 * @param iter - in/out iterator over exiting_jobs_info; created on the first
 *               call when *iter is NULL
 * @return - a newly allocated copy of the job id to retry (the caller owns
 *           it), or NULL when no further entry qualifies
 */
char *get_next_retryable_jobid(

    exiting_jobs_info_iterator **iter)

  {
  job_exiting_retry_info *jeri;
  job                    *pjob;
  time_t                  time_now = time(NULL);
  char                    log_buf[LOCAL_LOG_BUF_SIZE];

  exiting_jobs_info.lock();
  if(*iter == NULL)
    {
    *iter = exiting_jobs_info.get_iterator();
    }

  while ((jeri = (*iter)->get_next_item()) != NULL)
    {
    if (time_now - jeri->last_attempt > EXITING_RETRY_TIME)
      {
      if (jeri->attempts >= MAX_EXITING_RETRY_ATTEMPTS)
        {
        /* keep our own copy of the id: jeri is freed just below and must
         * not be touched after that */
        std::string jid(jeri->jobid);
        exiting_jobs_info.remove(jeri->jobid);
        free(jeri);
        jeri = NULL;

        /* the list lock is dropped while the job is looked up and purged */
        exiting_jobs_info.unlock();
        if ((pjob = svr_find_job((char *)jid.c_str(), TRUE)) != NULL)
          {
          snprintf(log_buf, sizeof(log_buf), "Job %s has had its exiting re-tried %d times, purging.",
            jid.c_str(), MAX_EXITING_RETRY_ATTEMPTS);
          log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);

          force_purge_work(pjob);
          }
        exiting_jobs_info.lock();
        }
      else
        {
        jeri->attempts++;
        jeri->last_attempt = time_now;

        /* copy the id while the list is still locked so the entry cannot
         * change underneath us */
        char *jobid = strdup(jeri->jobid);
        exiting_jobs_info.unlock();

        return(jobid);
        }
      }
    }

  exiting_jobs_info.unlock();
  return(NULL);
  } /* END get_next_retryable_jobid() */
Example #30
0
/*
 * issue_signal()
 *
 * Builds a Signal Job batch request for the job and hands it to
 * relay_to_mom(). On success the job is unlocked, func is called with the
 * request, and the job is looked up again by id.
 *
 * @param pjob_ptr - address of the job to signal; on return it holds the
 *                   re-found job, or NULL if the job is gone
 * @param signame - name of the signal to send
 * @param func - callback invoked with the request after a successful relay
 * @param extra - stored in the request's rq_extra field
 * @return - PBSE_SYSTEM if the request could not be allocated, otherwise the
 *           value returned by relay_to_mom()
 */
int issue_signal(

  job  **pjob_ptr,
  char  *signame, /* name of the signal to send */
  void  (*func)(batch_request *),
  void  *extra) /* extra parameter to be stored in sig request */

  {
  int                   rc;
  job                  *pjob = *pjob_ptr;
  struct batch_request *newreq;
  char                  jobid[PBS_MAXSVRJOBID + 1];

  /* build up a Signal Job batch request */

  if ((newreq = alloc_br(PBS_BATCH_SignalJob)) == NULL)
    {
    /* FAILURE */

    return(PBSE_SYSTEM);
    }

  newreq->rq_extra = extra;

  strcpy(newreq->rq_ind.rq_signal.rq_jid, pjob->ji_qs.ji_jobid);

  snprintf(newreq->rq_ind.rq_signal.rq_signame, sizeof(newreq->rq_ind.rq_signal.rq_signame), "%s", signame);

  /* The newreq is freed in relay_to_mom (failure)
   * or in issue_Drequest (success) */
  /* NOTE(review): if the comment above is accurate, the func(newreq) call
   * and the free_br(newreq) call below both touch an already-freed request -
   * confirm which side actually owns newreq. */
  rc = relay_to_mom(&pjob, newreq, NULL);

  if ((rc == PBSE_NONE) &&
      (pjob != NULL))
    {
    /* remember the id so the job can be found again once func has run
     * without the job being held */
    strcpy(jobid, pjob->ji_qs.ji_jobid);
    unlock_ji_mutex(pjob, __func__, NULL, 0);
    func(newreq);

    *pjob_ptr = svr_find_job((char *)jobid, TRUE);
    }
  else
    {
    free_br(newreq);

    /* relay_to_mom takes the job by address; propagate a cleared pointer */
    if (pjob == NULL)
      *pjob_ptr = NULL;
    }

  return(rc);
  }  /* END issue_signal() */