Ejemplo n.º 1
0
int req_signaljob(

  void *vp)  /* I */

  {
  struct batch_request *preq = (struct batch_request *)vp;
  job                  *pjob;
  int                   rc;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  struct batch_request *dup_req = NULL;

  /* preq free'd in error cases */
  if ((pjob = chk_job_request(preq->rq_ind.rq_signal.rq_jid, preq)) == 0)
    {
    return(PBSE_NONE);
    }

  /* the job must be running */

  if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
    req_reject(PBSE_BADSTATE, 0, preq, NULL, NULL);

    unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL);
    return(PBSE_NONE);
    }

  /* Special pseudo signals for suspend and resume require op/mgr */

  if (!strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_RESUME) ||
      !strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_SUSPEND))
    {
    if ((preq->rq_perm & (ATR_DFLAG_OPRD | ATR_DFLAG_OPWR | ATR_DFLAG_MGRD | ATR_DFLAG_MGWR)) == 0)
      {
      /* for suspend/resume, must be mgr/op */
      req_reject(PBSE_PERM, 0, preq, NULL, NULL);
      
      unlock_ji_mutex(pjob, __func__, (char *)"2", LOGLEVEL);
      return(PBSE_NONE);
      }
  
    }

  /* save job ptr for post_signal_req() */
  preq->rq_extra = strdup(pjob->ji_qs.ji_jobid);

  /* FIXME: need a race-free check for available free subnodes before
   * resuming a suspended job */

#ifdef DONOTSUSPINTJOB
  /* interactive jobs don't resume correctly so don't allow a suspend */

  if (!strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_SUSPEND) &&
      (pjob->ji_wattr[JOB_ATR_interactive].at_flags & ATR_VFLAG_SET) &&
      (pjob->ji_wattr[JOB_ATR_interactive].at_val.at_long > 0))
    {
    req_reject(PBSE_JOBTYPE, 0, preq, NULL, NULL);

    unlock_ji_mutex(pjob, __func__, (char *)"3", LOGLEVEL);
    return(PBSE_NONE);
    }

#endif

  if (LOGLEVEL >= 6)
    {
    sprintf(log_buf, "relaying signal request to mom %lu", pjob->ji_qs.ji_un.ji_exect.ji_momaddr);

    log_record(PBSEVENT_SCHED,PBS_EVENTCLASS_REQUEST,"req_signaljob",log_buf);
    }

  /* send reply for asynchronous suspend */
  if (preq->rq_type == PBS_BATCH_AsySignalJob)
    {
    reply_ack(preq);
    preq->rq_noreply = TRUE;
    }

  /* pass the request on to MOM */

  if ((rc = copy_batchrequest(&dup_req, preq, 0, -1)) != 0)
    {
    req_reject(rc, 0, preq, NULL, "can not allocate memory");
    unlock_ji_mutex(pjob, __func__, (char *)"4", LOGLEVEL);
    }
  /* The dup_req is freed in relay_to_mom (failure)
   * or in issue_Drequest (success) */
  else 
    {
    rc = relay_to_mom(&pjob, dup_req, NULL);

    if (pjob != NULL)
      unlock_ji_mutex(pjob, __func__, (char *)"4", LOGLEVEL);

    if (rc != PBSE_NONE)
      {
      free_br(dup_req);
      req_reject(rc, 0, preq, NULL, NULL);  /* unable to get to MOM */
      }
    else
      {
      post_signal_req(dup_req);
      free_br(preq);
      }
    }

  /* If successful we ack after mom replies to us, we pick up in post_signal_req() */

  return(PBSE_NONE);
  }  /* END req_signaljob() */
Ejemplo n.º 2
0
int req_signaljob(

  batch_request *preq) /* I */

  {
  job           *pjob;
  int            rc;
  char           log_buf[LOCAL_LOG_BUF_SIZE];
  batch_request *dup_req = NULL;

  /* preq free'd in error cases */
  if ((pjob = chk_job_request(preq->rq_ind.rq_signal.rq_jid, preq)) == 0)
    {
    return(PBSE_NONE);
    }

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  /* the job must be running */

  if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
    req_reject(PBSE_BADSTATE, 0, preq, NULL, NULL);

    return(PBSE_NONE);
    }

  /* Special pseudo signals for suspend and resume require op/mgr */

  if (!strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_RESUME) ||
      !strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_SUSPEND))
    {
    if ((preq->rq_perm & (ATR_DFLAG_OPRD | ATR_DFLAG_OPWR | ATR_DFLAG_MGRD | ATR_DFLAG_MGWR)) == 0)
      {
      /* for suspend/resume, must be mgr/op */
      req_reject(PBSE_PERM, 0, preq, NULL, NULL);
      
      return(PBSE_NONE);
      }
  
    }

  /* save job ptr for post_signal_req() */
  preq->rq_extra = strdup(pjob->ji_qs.ji_jobid);

  /* FIXME: need a race-free check for available free subnodes before
   * resuming a suspended job */

#ifdef DONOTSUSPINTJOB
  /* interactive jobs don't resume correctly so don't allow a suspend */

  if (!strcmp(preq->rq_ind.rq_signal.rq_signame, SIG_SUSPEND) &&
      (pjob->ji_wattr[JOB_ATR_interactive].at_flags & ATR_VFLAG_SET) &&
      (pjob->ji_wattr[JOB_ATR_interactive].at_val.at_long > 0))
    {
    req_reject(PBSE_JOBTYPE, 0, preq, NULL, NULL);

    return(PBSE_NONE);
    }

#endif

  if (LOGLEVEL >= 6)
    {
    char ipstr[128];

    sprintf(log_buf, "relaying signal request to mom %s", netaddr_long(pjob->ji_qs.ji_un.ji_exect.ji_momaddr,ipstr));

    log_record(PBSEVENT_SCHED,PBS_EVENTCLASS_REQUEST,"req_signaljob",log_buf);
    }

  /* send reply for asynchronous suspend */
  if (preq->rq_type == PBS_BATCH_AsySignalJob)
    {
    /* reply_ack will free preq. We need to copy it before we call reply_ack */
    batch_request *new_preq;

    new_preq = duplicate_request(preq, -1);
    if (new_preq == NULL)
      {
      sprintf(log_buf, "failed to duplicate batch request");
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
      return(PBSE_MEM_MALLOC);
      }

    get_batch_request_id(new_preq);

    reply_ack(new_preq);
    preq->rq_noreply = TRUE;
    }

  /* pass the request on to MOM */
  if ((dup_req = duplicate_request(preq)) == NULL)
    {
    req_reject(PBSE_SYSTEM, 0, preq, NULL, "can not allocate memory");
    }
  /* The dup_req is freed in relay_to_mom (failure)
   * or in issue_Drequest (success) */
  else 
    {
    rc = relay_to_mom(&pjob, dup_req, NULL);

    if (pjob != NULL)
      job_mutex.unlock();
    else
      job_mutex.set_unlock_on_exit(false);

    if (rc != PBSE_NONE)
      {
      free_br(dup_req);
      req_reject(rc, 0, preq, NULL, NULL);  /* unable to get to MOM */
      }
    else
      {
      post_signal_req(dup_req);
      free_br(preq);
      }
    }

  /* If successful we ack after mom replies to us, we pick up in post_signal_req() */

  return(PBSE_NONE);
  }  /* END req_signaljob() */