Exemple #1
0
/*
 * process_gpu_request_reply
 *
 * Work-task callback run once a MOM has answered a GPU control request.
 * The connection identified by pwt->wt_event is disconnected, the request
 * stored in pwt->wt_parm1 is pointed back at its original connection, and
 * the outcome is relayed: a rejection carrying the reply code when it is
 * non-zero, a plain acknowledgement otherwise.
 */
static void process_gpu_request_reply(

  struct work_task *pwt)
  {
  char                 *id = "process_gpu_request_reply";
  struct batch_request *request;

  /* the exchange with the MOM is finished */
  svr_disconnect(pwt->wt_event);

  request = pwt->wt_parm1;

  /* every reply from here on goes to the originating client */
  request->rq_conn = request->rq_orgconn;

  if (request->rq_reply.brp_code == 0)
    {
    /* success: log the details at high verbosity, then acknowledge */
    if (LOGLEVEL >= 7)
      {
      sprintf(log_buffer,
        "GPU control request completed for node %s gpuid %s mode %d reset_perm %d reset_vol %d",
        request->rq_ind.rq_gpuctrl.rq_momnode,
        request->rq_ind.rq_gpuctrl.rq_gpuid,
        request->rq_ind.rq_gpuctrl.rq_gpumode,
        request->rq_ind.rq_gpuctrl.rq_reset_perm,
        request->rq_ind.rq_gpuctrl.rq_reset_vol);

      log_ext(-1, id, log_buffer, LOG_INFO);
      }

    reply_ack(request);

    return;
    }

  /* failure: log it and hand the same text back with the rejection */
  sprintf(log_buffer, "MOM failed on GPU request, rc = %d", request->rq_reply.brp_code);
  log_err(errno, id, log_buffer);

  req_reject(request->rq_reply.brp_code, 0, request, NULL, log_buffer);
  }
/*
 * req_jobcredential
 *
 * Answers a job-credential request: the request is acknowledged when
 * locate_new_job() finds a job for the request's connection, and rejected
 * with PBSE_IVALREQ when it does not.
 */
void req_jobcredential(

  struct batch_request *preq)  /* ptr to the decoded request   */

  {
  job *new_job = locate_new_job(preq->rq_conn, NULL);

  if (new_job != NULL)
    reply_ack(preq);
  else
    req_reject(PBSE_IVALREQ, 0, preq, NULL, NULL);
  }  /* END req_jobcredential() */
Exemple #3
0
/*
 * process_gpu_request_reply
 * called when a gpu change request was sent to MOM and the answer
 * is received.  Completes the gpu request.
 *
 * preq - the request being completed; NULL is accepted and ignored.
 *
 * The request's connection is restored to rq_orgconn before replying.
 * A non-zero reply code is logged and passed back through req_reject();
 * otherwise the request is acknowledged with reply_ack().
 */
void process_gpu_request_reply(

  batch_request *preq)

  {
  char log_buf[LOCAL_LOG_BUF_SIZE];

  if (preq == NULL)
    return;

  preq->rq_conn = preq->rq_orgconn;  /* restore client socket */

  if (preq->rq_reply.brp_code != 0)
    {
    snprintf(log_buf, sizeof(log_buf),
      "MOM failed on GPU request, rc = %d",
      preq->rq_reply.brp_code);
    log_err(errno, __func__, log_buf);

    req_reject(preq->rq_reply.brp_code, 0, preq, NULL, log_buf);
    }
  else
    {
    /* record that MOM changed gpu mode */
    if (LOGLEVEL >= 7)
      {
      /* node and gpu id strings come from the request, so the write is
       * bounded by the buffer size instead of trusting their length */
      snprintf(
        log_buf,
        sizeof(log_buf),
        "GPU control request completed for node %s gpuid %s mode %d reset_perm %d reset_vol %d",
        preq->rq_ind.rq_gpuctrl.rq_momnode,
        preq->rq_ind.rq_gpuctrl.rq_gpuid,
        preq->rq_ind.rq_gpuctrl.rq_gpumode,
        preq->rq_ind.rq_gpuctrl.rq_reset_perm,
        preq->rq_ind.rq_gpuctrl.rq_reset_vol);

      log_ext(-1, __func__, log_buf, LOG_INFO);
      }

    reply_ack(preq);
    }
  } /* END process_gpu_request_reply() */
/*
 * get_UID
 *
 * Extracts the user name that follows the "UID:" tag in munge_buf and
 * stores it in conn_credent[s].username.
 *
 * s         - index into conn_credent for the connection being processed
 * munge_buf - NUL-terminated text to search for the "UID:" tag
 * preq      - request to reject when the tag cannot be found
 *
 * Returns PBSE_NONE on success.  When no "UID:" tag is present the request
 * is rejected with PBSE_SYSTEM and -1 is returned.
 */
int get_UID(

  int                   s,
  char                 *munge_buf,
  struct batch_request *preq)

  {
  char *ptr;
  char  user_name[PBS_MAXUSER + 1];  /* +1 so a full-length name stays terminated */
  int   i = 0;

  ptr = strstr(munge_buf, "UID:");

  if (ptr == NULL)
    {
    req_reject(PBSE_SYSTEM, 0, preq, NULL, "could not read unmunge data user");
    return(-1);
    }

  /* step over the tag's colon and the spaces padding the value */
  ptr = strchr(ptr, ':');
  ptr++;

  while (*ptr == SPACE)
    {
    ptr++;
    }

  memset(user_name, 0, sizeof(user_name));

  /* copy up to the next whitespace; also stop at the end of the input so
   * the scan never runs past munge_buf, and leave room for the terminator */
  while ((*ptr != '\0') &&
         (*ptr != SPACE) &&
         (!isspace((unsigned char)*ptr)) &&
         (i < (int)sizeof(user_name) - 1))
    {
    user_name[i++] = *ptr;
    ptr++;
    }

  strncpy(conn_credent[s].username, user_name, sizeof(conn_credent[s].username) - 1);
  conn_credent[s].username[sizeof(conn_credent[s].username) - 1] = 0;

  return(PBSE_NONE);
  } /* END get_UID() */
/*
 * post_message_req
 *
 * Finishes a message-job request.  The request's connection is restored
 * to rq_orgconn, the reply code is logged against the job id, and the
 * client is answered: acknowledged when the reply code is zero, rejected
 * with that code otherwise.  A NULL preq is ignored.
 */
void post_message_req(

  batch_request *preq)

  {
  char log_buf[LOCAL_LOG_BUF_SIZE];

  /* nothing to do when the request has been handled previously */
  if (preq != NULL)
    {
    /* replies go back to the client's connection */
    preq->rq_conn = preq->rq_orgconn;

    sprintf(log_buf, msg_messagejob, preq->rq_reply.brp_code);
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, preq->rq_ind.rq_message.rq_jid, log_buf);

    if (preq->rq_reply.brp_code == 0)
      {
      reply_ack(preq);
      }
    else
      {
      req_reject(preq->rq_reply.brp_code, 0, preq, NULL, NULL);
      }
    }
  } /* END post_message_req() */
Exemple #6
0
/*
 * modify_job_work
 *
 * Looks up the job named in a modify request and passes it, together with
 * the request's attribute list, to modify_job().  When the job cannot be
 * found the request is rejected with PBSE_JOBNOTFOUND.
 *
 * The request's extend string selects the checkpoint action handed to
 * modify_job(): CHECKPOINTHOLD gives CHK_HOLD, CHECKPOINTCONT gives
 * CHK_CONT, anything else (or no extend string) leaves it FALSE.
 *
 * Always returns NULL.
 */
void *modify_job_work(

  batch_request *vp) /* I */

  {
  batch_request *request = (struct batch_request *)vp;
  job           *target = svr_find_job(request->rq_ind.rq_modify.rq_objname, FALSE);
  int            chkpt_action = FALSE;
  svrattrl      *attr_list;

  if (target == NULL)
    {
    req_reject(PBSE_JOBNOTFOUND, 0, request, NULL, "Job unexpectedly deleted");

    return(NULL);
    }

  /* NOTE(review): presumably manages release of the job mutex when this
   * function returns - confirm against mutex_mgr */
  mutex_mgr job_mutex(target->ji_mutex, true);

  /* pbs_mom uses the extend string to trigger copying of checkpoint files */
  if (request->rq_extend != NULL)
    {
    if (!strcmp(request->rq_extend,CHECKPOINTHOLD))
      chkpt_action = CHK_HOLD;
    else if (!strcmp(request->rq_extend,CHECKPOINTCONT))
      chkpt_action = CHK_CONT;
    }

  attr_list = (svrattrl *)GET_NEXT(request->rq_ind.rq_modify.rq_attr);

  /* modify_job answers the request and frees it */
  modify_job((void **)&target, attr_list, request, chkpt_action, 0);

  return(NULL);
  } /* END modify_job_work() */
Exemple #7
0
/*
 * process_checkpoint_reply
 *
 * Work-task callback run when a MOM has answered a checkpoint request.
 * The MOM connection is dropped, the pending request is fetched with
 * get_remove_batch_request(), and the work task and its mutex are freed.
 *
 * If the request is still pending it is pointed back at its original
 * connection and answered: rejected with PBSE_UNKJOBID when the job can
 * no longer be found, otherwise an accounting record is written and the
 * request is acknowledged.
 */
static void process_checkpoint_reply(

  struct work_task *pwt)

  {
  struct batch_request *request;
  job                  *chk_job;

  /* done talking to the MOM */
  svr_disconnect(pwt->wt_event);

  request = get_remove_batch_request(pwt->wt_parm1);

  /* the task is not used past this point */
  free(pwt->wt_mutex);
  free(pwt);

  /* the request was already handled elsewhere */
  if (request == NULL)
    return;

  /* replies go back to the client's connection */
  request->rq_conn = request->rq_orgconn;

  chk_job = svr_find_job(request->rq_ind.rq_manager.rq_objname, FALSE);

  if (chk_job != NULL)
    {
    /* note the checkpoint in the accounting file, then answer */
    account_record(PBS_ACCT_CHKPNT, chk_job, "Checkpointed");
    reply_ack(request);

    unlock_ji_mutex(chk_job, __func__, "1", LOGLEVEL);

    return;
    }

  /* the job is gone */
  log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB,
    request->rq_ind.rq_manager.rq_objname,
    msg_postmomnojob);

  req_reject(PBSE_UNKJOBID, 0, request, NULL, msg_postmomnojob);
  } /* END process_checkpoint_reply() */
Exemple #8
0
/**
 * req_shutdown - service a server shutdown request.
 *
 * The requester must hold at least one of the manager/operator read or
 * write permission bits, otherwise the request is rejected with PBSE_PERM.
 * The request is logged and saved in pshutdown_request, and shutdown_who
 * is set from the request's shutdown type masked with SHUT_WHO_MASK.
 *
 * When SHUT_WHO_SECDONLY is set, a FAILOVER_SecdShutdown is sent, the
 * request is acknowledged, and this server is NOT shut down.  When
 * SHUT_WHO_SCHED is set the scheduler is told to quit.  Otherwise
 * svr_shutdown() is called with the full shutdown type.
 */
void
req_shutdown(struct batch_request *preq)
{
	extern int shutdown_who;
	int        shut_type;

	if (!(preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_MGRD |
		ATR_DFLAG_OPRD | ATR_DFLAG_OPWR))) {
		req_reject(PBSE_PERM, 0, preq);
		return;
	}

	(void)sprintf(log_buffer, msg_shutdown_op, preq->rq_user, preq->rq_host);
	log_event(PBSEVENT_SYSTEM|PBSEVENT_ADMIN|PBSEVENT_DEBUG,
		PBS_EVENTCLASS_SERVER, LOG_NOTICE, msg_daemonname, log_buffer);

	/* kept so main() can reply once the shutdown has completed */
	pshutdown_request = preq;

	shut_type    = preq->rq_ind.rq_shutdown;
	shutdown_who = shut_type & SHUT_WHO_MASK;

	if (shutdown_who & SHUT_WHO_SECDONLY)
		(void)failover_send_shutdown(FAILOVER_SecdShutdown);

	/* ask the scheduler to quit */
	if (shutdown_who & SHUT_WHO_SCHED)
		(void)contact_sched(SCH_QUIT, NULL);

	/* a secondary-only shutdown leaves this server running */
	if (shutdown_who & SHUT_WHO_SECDONLY) {
		reply_ack(preq);
		return;
	}

	/* Moms are told to shutdown in pbsd_main.c after main loop */
	svr_shutdown(shut_type);
}
Exemple #9
0
void
req_stat_sched(struct batch_request *preq)
{
	svrattrl	   *pal;
	struct batch_reply *preply;
	int rc = 0;
	pbs_sched *psched;

	/* allocate a reply structure and a status sub-structure */

	preply = &preq->rq_reply;
	preply->brp_choice = BATCH_REPLY_CHOICE_Status;
	CLEAR_HEAD(preply->brp_un.brp_status);

	for (psched = (pbs_sched *) GET_NEXT(svr_allscheds);
			(psched != NULL);
			psched = (pbs_sched *) GET_NEXT(psched->sc_link)
		) {
		rc = status_sched(psched, preq, &preply->brp_un.brp_status);
		if (rc != 0) {
			break;
		}
	}

	if (!rc) {
		(void)reply_send(preq);
	} else {
		if (rc != PBSE_NOATTR)
			req_reject(rc, 0, preq);
		else {
			pal = (svrattrl *)GET_NEXT(preq->rq_ind.
				rq_status.rq_attr);
			reply_badattr(rc, bad, pal, preq);
		}
	}
}
Exemple #10
0
/*
 * handle_single_delete
 *
 * Starts deletion of the single job named in preq.
 *
 * preq     - the delete request
 * preq_tmp - when non-NULL, acknowledged immediately; preq is then marked
 *            rq_noreply and the delete is queued on async_pool
 * Msg      - not used by this function
 *
 * When the job cannot be found the request is rejected with PBSE_UNKJOBID.
 * Otherwise the job id is copied, the job is unlocked,
 * removeBeforeAnyDependencies() is called with the id, and
 * single_delete_work() runs either inline or through the thread pool.
 *
 * Always returns PBSE_NONE.
 */
int handle_single_delete(

  batch_request *preq,
  batch_request *preq_tmp,
  char          *Msg)

  {
  char *target_id = preq->rq_ind.rq_delete.rq_objname;
  job  *found = svr_find_job(target_id, FALSE);

  if (found == NULL)
    {
    log_event(PBSEVENT_DEBUG,PBS_EVENTCLASS_JOB,target_id,pbse_to_txt(PBSE_UNKJOBID));

    req_reject(PBSE_UNKJOBID, 0, preq, NULL, "cannot locate job");

    return(PBSE_NONE);
    }

  /* take a private copy of the id before the job is unlocked */
  std::string id_copy(found->ji_qs.ji_jobid);

  unlock_ji_mutex(found, __func__, NULL, LOGLEVEL);
  removeBeforeAnyDependencies(id_copy.c_str());

  if (preq_tmp == NULL)
    {
    /* synchronous delete */
    single_delete_work(preq);

    return(PBSE_NONE);
    }

  /* asynchronous delete: answer now and finish the work in the pool */
  reply_ack(preq_tmp);
  preq->rq_noreply = TRUE; /* set for no more replies */
  enqueue_threadpool_request(single_delete_work, preq, async_pool);

  return(PBSE_NONE);
  } /* END handle_single_delete() */
/*
 * req_connect
 *
 * Answers a connect request based on the cn_authen value recorded for the
 * request's connection: acknowledged when it is zero, rejected with
 * PBSE_BADCRED otherwise.  The value is read under the connection's mutex.
 */
void req_connect(

  struct batch_request *preq)

  {
  unsigned short authen_state;
  int            conn = preq->rq_conn;

  /* Called from one location inside a lock */
  pthread_mutex_lock(svr_conn[conn].cn_mutex);
  authen_state = svr_conn[conn].cn_authen;
  pthread_mutex_unlock(svr_conn[conn].cn_mutex);

  if (authen_state != 0)
    {
    req_reject(PBSE_BADCRED, 0, preq, NULL, "Connection not authorized");

    return;
    }

  reply_ack(preq);
  }  /* END req_connect() */
Exemple #12
0
/*
 * handle_single_delete
 *
 * Deletes the single job named in preq.
 *
 * preq     - the delete request
 * preq_tmp - when non-NULL, acknowledged immediately and preq is marked
 *            rq_noreply
 * Msg      - text passed through to execute_job_delete()
 *
 * When the job cannot be found the request is rejected with PBSE_UNKJOBID.
 * Otherwise forced_jobpurge() is tried first and, if it returns PBSE_NONE,
 * execute_job_delete() is run.  preq is acknowledged when the final result
 * is PBSE_NONE or PURGE_SUCCESS.
 *
 * Always returns PBSE_NONE.
 */
int handle_single_delete(

  struct batch_request *preq,
  struct batch_request *preq_tmp,
  char                 *Msg)

  {
  int   result;
  char *target_id = preq->rq_ind.rq_delete.rq_objname;
  job  *found = svr_find_job(target_id, FALSE);

  if (found == NULL)
    {
    log_event(PBSEVENT_DEBUG,PBS_EVENTCLASS_JOB,target_id,pbse_to_txt(PBSE_UNKJOBID));

    req_reject(PBSE_UNKJOBID, 0, preq, NULL, "cannot locate job");

    return(PBSE_NONE);
    }

  if (preq_tmp != NULL)
    {
    reply_ack(preq_tmp);
    preq->rq_noreply = TRUE; /* set for no more replies */
    }

  /* the job mutex is released by the calls below */
  result = forced_jobpurge(found, preq);

  if (result == PBSE_NONE)
    result = execute_job_delete(found, Msg, preq);

  if ((result == PBSE_NONE) || (result == PURGE_SUCCESS))
    reply_ack(preq);

  return(PBSE_NONE);
  } /* END handle_single_delete() */
Exemple #13
0
static void req_stat_job_step2(

  struct stat_cntl *cntl)  /* I/O (free'd on return) */

  {
  svrattrl              *pal;
  job                   *pjob = NULL;

  struct batch_request  *preq;
  struct batch_reply    *preply;
  int                    rc = 0;
  enum TJobStatTypeEnum  type;
  pbs_queue             *pque = NULL;
  int                    exec_only = 0;

  int                    bad = 0;
  long                   DTime;  /* delta time - only report full pbs_attribute list if J->MTime > DTime */
  static svrattrl       *dpal = NULL;
  int                    job_array_index = 0;
  job_array             *pa = NULL;
  char                   log_buf[LOCAL_LOG_BUF_SIZE];
  all_jobs_iterator      *iter;

  preq   = cntl->sc_origrq;
  type   = (enum TJobStatTypeEnum)cntl->sc_type;
  preply = &preq->rq_reply;

  /* See pbs_server_attributes(1B) for details on "poll_jobs" behaviour */

  if (dpal == NULL)
    {
    /* build 'delta' pbs_attribute list */

    svrattrl *tpal;

    tlist_head dalist;

    int aindex;

    int atrlist[] =
      {
      JOB_ATR_jobname,
      JOB_ATR_resc_used,
      JOB_ATR_LAST
      };

    CLEAR_LINK(dalist);

    for (aindex = 0;atrlist[aindex] != JOB_ATR_LAST;aindex++)
      {
      if ((tpal = attrlist_create("", "", 23)) == NULL)
        {
        return;
        }

      tpal->al_valln = atrlist[aindex];

      if (dpal == NULL)
        dpal = tpal;

      append_link(&dalist, &tpal->al_link, tpal);
      }
    }  /* END if (dpal == NULL) */

  if (type == tjstArray)
    {
    pa = get_array(preq->rq_ind.rq_status.rq_id);

    if (pa == NULL)
      {
      req_reject(PBSE_UNKARRAYID, 0, preq, NULL, "unable to find array");
      return;
      }
    }

  {
  all_jobs *ajptr = NULL;

  if (type == tjstQueue)
    ajptr = cntl->sc_pque->qu_jobs;

  else if (type == tjstSummarizeArraysQueue)
    ajptr = cntl->sc_pque->qu_jobs_array_sum;

  else if (type == tjstSummarizeArraysServer)
    ajptr = &array_summary;

  else
    ajptr = &alljobs;

  ajptr->lock();
  iter = ajptr->get_iterator();
  ajptr->unlock();
  }

  /*
   * now ready for part 3, building the status reply,
   * loop through again
   */

  if ((type == tjstSummarizeArraysQueue) || 
      (type == tjstSummarizeArraysServer))
    {
    /* No array can be owned for these options */
    update_array_statuses();
    }


  if (type == tjstJob)
    pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE);

  else if (type == tjstQueue)
    pjob = next_job(cntl->sc_pque->qu_jobs,iter);

  else if (type == tjstSummarizeArraysQueue)
    pjob = next_job(cntl->sc_pque->qu_jobs_array_sum,iter);

  else if (type == tjstSummarizeArraysServer)
    pjob = next_job(&array_summary,iter);

  else if (type == tjstArray)
    {
    job_array_index = -1;
    pjob = NULL;
    /* increment job_array_index until we find a non-null pointer or hit the end */
    while (++job_array_index < pa->ai_qs.array_size)
      {
      if (pa->job_ids[job_array_index] != NULL)
        {
        if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
          {
          break;
          }
        }
      }
    }
  else
    pjob = next_job(&alljobs,iter);

  DTime = 0;

  if (preq->rq_extend != NULL)
    {
    char *ptr;

    /* FORMAT:  { EXECQONLY | DELTA:<EPOCHTIME> } */

    if (strstr(preq->rq_extend, EXECQUEONLY))
      exec_only = 1;

    ptr = strstr(preq->rq_extend, "DELTA:");

    if (ptr != NULL)
      {
      ptr += strlen("delta:");

      DTime = strtol(ptr, NULL, 10);
      }
    }

  if ((type == tjstTruncatedServer) || 
      (type == tjstTruncatedQueue))
    {
    long sentJobCounter;
    long qjcounter;
    long qmaxreport;
    all_queues_iterator *iter = NULL;

    svr_queues.lock();
    iter = svr_queues.get_iterator();
    svr_queues.unlock();

    /* loop through all queues */
    while ((pque = next_queue(&svr_queues,iter)) != NULL)
      {
      qjcounter = 0;

      if ((exec_only == 1) &&
          (pque->qu_qs.qu_type != QTYPE_Execution))
        {
        /* ignore routing queues */
        unlock_queue(pque, __func__, "ignore queue", LOGLEVEL);
        continue;
        }

      if (((pque->qu_attr[QA_ATR_MaxReport].at_flags & ATR_VFLAG_SET) != 0) &&
          (pque->qu_attr[QA_ATR_MaxReport].at_val.at_long >= 0))
        {
        qmaxreport = pque->qu_attr[QA_ATR_MaxReport].at_val.at_long;
        }
      else
        {
        qmaxreport = TMAX_JOB;
        }

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buf,"giving scheduler up to %ld idle jobs in queue %s\n",
          qmaxreport,
          pque->qu_qs.qu_name);

        log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_QUEUE,pque->qu_qs.qu_name,log_buf);
        }

      sentJobCounter = 0;

      /* loop through jobs in queue */
      if (pjob != NULL)
        unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);

      all_jobs_iterator *jobiter = NULL;
      pque->qu_jobs->lock();
      jobiter = pque->qu_jobs->get_iterator();
      pque->qu_jobs->unlock();

      while ((pjob = next_job(pque->qu_jobs,jobiter)) != NULL)
        {
        if ((qjcounter >= qmaxreport) &&
            (pjob->ji_qs.ji_state == JOB_STATE_QUEUED))
          {
          /* max_report of queued jobs reached for queue */
          unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL);

          continue;
          }

        pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

        rc = status_job(
               pjob,
               preq,
               (pjob->ji_wattr[JOB_ATR_mtime].at_val.at_long >= DTime) ? pal : dpal,
               &preply->brp_un.brp_status,
               &bad);

        if ((rc != 0) && (rc != PBSE_PERM))
          {
          req_reject(rc, bad, preq, NULL, NULL);

          unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL);
          unlock_queue(pque, __func__, "perm", LOGLEVEL);

          delete iter;

          return;
          }

        sentJobCounter++;

        if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED)
          qjcounter++;

        unlock_ji_mutex(pjob, __func__, "8", LOGLEVEL);
        }    /* END foreach (pjob from pque) */

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buf,"sent scheduler %ld total jobs for queue %s\n",
          sentJobCounter,
          pque->qu_qs.qu_name);

        log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_QUEUE,pque->qu_qs.qu_name,log_buf);
        }
    
      unlock_queue(pque, __func__, "end while", LOGLEVEL);
      }      /* END for (pque) */

    reply_send_svr(preq);

    delete iter;

    return;
    } /* END if ((type == tjstTruncatedServer) || ...) */

  while (pjob != NULL)
    {
    /* go ahead and build the status reply for this job */

    if (exec_only)
      {
      if (cntl->sc_pque != NULL)
        {
        if (cntl->sc_pque->qu_qs.qu_type != QTYPE_Execution)
          goto nextjob;
        }
      else
        {
        if (pa != NULL)
          pthread_mutex_unlock(pa->ai_mutex);
        pque = get_jobs_queue(&pjob);
        if (pa != NULL)
          pthread_mutex_lock(pa->ai_mutex);

        if ((pjob == NULL) ||
            (pque == NULL))
          goto nextjob;
        
        mutex_mgr pque_mutex = mutex_mgr(pque->qu_mutex, true);
        if (pque->qu_qs.qu_type != QTYPE_Execution)
          {
          goto nextjob;
          }
        }
      }

    pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

    rc = status_job(
           pjob,
           preq,
           pal,
           &preply->brp_un.brp_status,
           &bad);

    if ((rc != 0) && 
        (rc != PBSE_PERM))
      {
      if (pa != NULL)
        {
        unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
        }

      unlock_ji_mutex(pjob, __func__, "9", LOGLEVEL);

      req_reject(rc, bad, preq, NULL, NULL);

      delete iter;

      return;
      }

    /* get next job */

nextjob:

    if (pjob != NULL)
      unlock_ji_mutex(pjob, __func__, "10", LOGLEVEL);

    if (type == tjstJob)
      break;

    if (type == tjstQueue)
      pjob = next_job(cntl->sc_pque->qu_jobs,iter);
    else if (type == tjstSummarizeArraysQueue)
      pjob = next_job(cntl->sc_pque->qu_jobs_array_sum,iter);
    else if (type == tjstSummarizeArraysServer)
      pjob = next_job(&array_summary,iter);
    else if (type == tjstArray)
      {
      pjob = NULL;
      /* increment job_array_index until we find a non-null pointer or hit the end */
      while (++job_array_index < pa->ai_qs.array_size)
        {
        if (pa->job_ids[job_array_index] != NULL)
          {
          if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
            {
            break;
            }
          }
        }
      }
    else
      pjob = next_job(&alljobs,iter);

    rc = 0;
    }  /* END while (pjob != NULL) */

  delete iter;

  if (pa != NULL)
    {
    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
    }
 
  reply_send_svr(preq);

  if (LOGLEVEL >= 7)
    {
    log_event(PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      "req_statjob",
      "Successfully returned the status of queued jobs\n");
    }

  return;
  }  /* END req_stat_job_step2() */
Exemple #14
0
/*
 * execute_job_delete
 *
 * Carries out a delete request against one job.
 *
 * pjob - the job to delete; released (or set aside by a helper) on every
 *        path before returning
 * Msg  - optional text from the request extension; appended to the mail
 *        text and passed to job_abt() where that path is taken
 * preq - the delete request
 *
 * Returns:
 *   PBSE_NONE    - the job was moved toward exiting/complete here; the
 *                  caller is expected to answer preq
 *   ROUTE_DELETE - the job is in a PRERUN/RERUN substate and the delete
 *                  was rescheduled through post_delete_route
 *   -1           - every other case: the permission check dropped the job,
 *                  the job is in transit, a running job was signalled (the
 *                  reply is normally sent when the MOM responds), or the
 *                  request was rejected here
 */
int execute_job_delete(

  job                  *pjob,            /* M */
  char                 *Msg,             /* I */
  struct batch_request *preq)            /* I */

  {
  struct work_task *pwtnew;

  int               rc;
  char             *sigt = "SIGTERM";

  int               has_mutex = TRUE;
  char              log_buf[LOCAL_LOG_BUF_SIZE];
  time_t            time_now = time(NULL);
  long              force_cancel = FALSE;
  long              array_compatible = FALSE;

  chk_job_req_permissions(&pjob,preq);

  if (pjob == NULL)
    {
    /* preq is rejected in chk_job_req_permissions here */
    return(-1);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /* see note in req_delete - not sure this is possible still,
     * but the deleted code is irrelevant now. I will leave this
     * part --dbeer */
    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

    return(-1);
    }

  if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN1 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN2 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN3 )
    {
    /* If JOB_SUBSTATE_PRERUN being sent to MOM, wait till she gets it going */
    /* retry in one second                            */
    /* If JOB_SUBSTATE_RERUN, RERUN1, RERUN2 or RERUN3 the
       job is being requeued. Wait until finished */

    /* remember which job is being waited on and since when, across calls */
    static time_t  cycle_check_when = 0;
    static char    cycle_check_jid[PBS_MAXSVRJOBID + 1];

    if (cycle_check_when != 0)
      {
      if (!strcmp(pjob->ji_qs.ji_jobid, cycle_check_jid) &&
          (time_now - cycle_check_when > 10))
        {
        /* state not updated after 10 seconds */

        /* did the mom ever get it? delete it anyways... */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;

        goto jump;
        }

      if (time_now - cycle_check_when > 20)
        {
        /* give up after 20 seconds */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;
        }
      }    /* END if (cycle_check_when != 0) */

    if (cycle_check_when == 0)
      {
      /* new PRERUN job located */

      cycle_check_when = time_now;
      strcpy(cycle_check_jid, pjob->ji_qs.ji_jobid);
      }

    sprintf(log_buf, "job cannot be deleted, state=PRERUN, requeuing delete request");

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    /* try the delete again in one second */
    pwtnew = set_task(WORK_Timed,time_now + 1,post_delete_route,preq,FALSE);

    unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);

    if (pwtnew == NULL)
      {
      req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

      return(-1);
      }
    else
      {
      return(ROUTE_DELETE);
      }
    }  /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */

jump:

  /*
   * Log delete and if requesting client is not job owner, send mail.
   */

  sprintf(log_buf, "requestor=%s@%s", preq->rq_user, preq->rq_host);


  /* NOTE:  should annotate accounting record with extend message (NYI) */
  account_record(PBS_ACCT_DEL, pjob, log_buf);

  sprintf(log_buf, msg_manager, msg_deletejob, preq->rq_user, preq->rq_host);

  log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

  /* NOTE:  should incorporate job delete message */

  if (Msg != NULL)
    {
    /* have text message in request extension, add it */

    /* Msg is supplied with the request, so the append is bounded by the
     * space left in log_buf; text that does not fit is truncated */
    size_t used = strlen(log_buf);

    snprintf(log_buf + used, sizeof(log_buf) - used, "\n%s", Msg);
    }

  if ((svr_chk_owner(preq, pjob) != 0) &&
      (pjob->ji_has_delete_nanny == FALSE))
    {
    /* only send email if owner did not delete job and job deleted
       has not been previously attempted */

    svr_mailowner(pjob, MAIL_DEL, MAIL_FORCE, log_buf);
    /*
     * If we sent mail and already sent the extra message
     * then reset message so we don't trigger a redundant email
     * in job_abt()
    */

    if (Msg != NULL)
      {
      Msg = NULL;
      }
    }

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, change restart comment if failed */

    change_restart_comment_if_needed(pjob);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * setup a nanny task to make sure the job is actually deleted (see the
     * comments at job_delete_nanny()).
     */

    if (pjob->ji_has_delete_nanny == TRUE)
      {
      unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

      req_reject(PBSE_IVALREQ, 0, preq, NULL, "job cancel in progress");

      return(-1);
      }

    apply_job_delete_nanny(pjob, time_now + 60);

    /*
     * Send signal request to MOM.  The server will automagically
     * pick up and "finish" off the client request when MOM replies.
     */
    get_batch_request_id(preq);

    if ((rc = issue_signal(&pjob, sigt, post_delete_mom1, strdup(preq->rq_id))))
      {
      /* cant send to MOM */

      req_reject(rc, 0, preq, NULL, NULL);
      }

    /* normally will ack reply when mom responds */
    if (pjob != NULL)
      {
      sprintf(log_buf, msg_delrunjobsig, sigt);
      log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

      unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
      }

    return(-1);
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* make a cleanup task if set */
  get_svr_attr_l(SRV_ATR_JobForceCancelTime, &force_cancel);
  if (force_cancel > 0)
    {
    char *dup_jobid = strdup(pjob->ji_qs.ji_jobid);

    set_task(WORK_Timed, time_now + force_cancel, ensure_deleted, dup_jobid, FALSE);
    }

  /* if configured, and this job didn't have a slot limit hold, free a job
   * held with the slot limit hold */
  get_svr_attr_l(SRV_ATR_MoabArrayCompatible, &array_compatible);
  if ((array_compatible != FALSE) &&
      ((pjob->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) == FALSE))
    {
    if ((pjob->ji_arraystruct != NULL) &&
        (pjob->ji_is_array_template == FALSE))
      {
      int        i;
      int        newstate;
      int        newsub;
      job       *tmp;
      job_array *pa = get_jobs_array(&pjob);

      if (pjob == NULL)
        return(-1);

      /* release the slot-limit hold on the first sibling that has one */
      for (i = 0; i < pa->ai_qs.array_size; i++)
        {
        if (pa->job_ids[i] == NULL)
          continue;

        if (!strcmp(pa->job_ids[i], pjob->ji_qs.ji_jobid))
          continue;

        if ((tmp = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
          {
          /* the sibling no longer exists; drop its id from the array */
          free(pa->job_ids[i]);
          pa->job_ids[i] = NULL;
          }
        else
          {
          if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l)
            {
            tmp->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;

            if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
              {
              tmp->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
              }

            svr_evaljobstate(tmp, &newstate, &newsub, 1);
            svr_setjobstate(tmp, newstate, newsub, FALSE);
            job_save(tmp, SAVEJOB_FULL, 0);

            unlock_ji_mutex(tmp, __func__, "5", LOGLEVEL);

            break;
            }

          unlock_ji_mutex(tmp, __func__, "6", LOGLEVEL);
          }
        }

      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "%s: unlocking ai_mutex", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }
      pthread_mutex_unlock(pa->ai_mutex);
      }
    } /* END MoabArrayCompatible check */

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, do end job processing */
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE);

    /* force new connection */
    pjob->ji_momhandle = -1;

    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "calling on_job_exit from %s", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      }

    set_task(WORK_Immed, 0, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in file, should remove them */

    remove_stagein(&pjob);

    if (pjob != NULL)
      job_abt(&pjob, Msg);

    /* nothing is left for this function to unlock on this path */
    has_mutex = FALSE;
    }
  else
    {
    /*
     * the job is not transitting (though it may have been) and
     * is not running, so put in into a complete state.
     */
    struct pbs_queue *pque;
    int  KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE, FALSE);

    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      pque->qu_numcompleted++;

      unlock_queue(pque, __func__, NULL, LOGLEVEL);

      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "calling on_job_exit from %s", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }

      /* the queue's keep_completed value is read, with the server's as
       * the alternative */
      pthread_mutex_lock(server.sv_attr_mutex);
      KeepSeconds = attr_ifelse_long(
                    &pque->qu_attr[QE_ATR_KeepCompleted],
                    &server.sv_attr[SRV_ATR_KeepCompleted],
                    0);
      pthread_mutex_unlock(server.sv_attr_mutex);
      }
    else
      KeepSeconds = 0;

    if (pjob != NULL)
      {
      set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
      }
    else
      has_mutex = FALSE;
    }  /* END else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0) */

  if (has_mutex == TRUE)
    unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL);

  return(PBSE_NONE);
  } /* END execute_job_delete() */
Exemple #15
0
/*
 * handle_delete_all
 *
 * Applies a delete request to every job returned by next_job(&alljobs, ...).
 *
 * preq     - the original delete request; answered once at the end
 * preq_tmp - when non-NULL, acknowledged immediately and preq is marked
 *            rq_noreply
 * Msg      - text passed through to execute_job_delete()
 *
 * Each job is processed with a duplicate of preq (marked rq_noreply) so the
 * original request stays available for the final reply.  Jobs whose delete
 * comes back as MOM_DELETE or ROUTE_DELETE are counted as failed; if any
 * failed, preq is rejected with PBSE_SYSTEM and a count message, otherwise
 * it is acknowledged.
 *
 * Always returns PBSE_NONE.
 */
int handle_delete_all(

  struct batch_request *preq,
  struct batch_request *preq_tmp,
  char                 *Msg)

  {
  /* don't use the actual request so we can reply about all of the jobs */
  /* NOTE(review): the result of duplicate_request() is dereferenced below
   * without a NULL check - confirm it cannot fail */
  struct batch_request *preq_dup = duplicate_request(preq);
  job                  *pjob;
  int                   iter = -1;
  int                   failed_deletes = 0;
  int                   total_jobs = 0;
  int                   rc = PBSE_NONE;
  char                  tmpLine[MAXLINE];

  preq_dup->rq_noreply = TRUE;

  if (preq_tmp != NULL)
    {
    reply_ack(preq_tmp);
    preq->rq_noreply = TRUE; /* set for no more replies */
    }

  while ((pjob = next_job(&alljobs, &iter)) != NULL)
    {
    /* a successful purge needs nothing further for this job */
    if ((rc = forced_jobpurge(pjob, preq_dup)) == PURGE_SUCCESS)
      {
      continue;
      }

    /* jobs at or past the exiting state are skipped */
    if (pjob->ji_qs.ji_state >= JOB_STATE_EXITING)
      {
      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

      continue;
      }

    total_jobs++;

    /* mutex is freed below */
    if (rc == PBSE_NONE)
      {
      if ((rc = execute_job_delete(pjob, Msg, preq_dup)) == PBSE_NONE)
        reply_ack(preq_dup);

      /* mark this as NULL because it has been freed */
      preq_dup = NULL;
      }

    if (rc != PURGE_SUCCESS)
      {
      /* duplicate the preq so we don't have a problem with double frees */
      /* NOTE(review): when rc from forced_jobpurge() was neither PBSE_NONE
       * nor PURGE_SUCCESS, preq_dup is still set here and is overwritten -
       * confirm the old duplicate was released by forced_jobpurge() */
      preq_dup = duplicate_request(preq);
      preq_dup->rq_noreply = TRUE;

      if ((rc == MOM_DELETE) ||
          (rc == ROUTE_DELETE))
        failed_deletes++;
      }
    }

  if (failed_deletes == 0)
    {
    reply_ack(preq);

    /* PURGE SUCCESS means this was qdel -p all. In this case no reply_*()
     * functions have been called */
    if (rc == PURGE_SUCCESS)
      {
      free_br(preq_dup);
      preq_dup = NULL;
      }
    }
  else
    {
    snprintf(tmpLine,sizeof(tmpLine),"Deletes failed for %d of %d jobs",
      failed_deletes,
      total_jobs);

    req_reject(PBSE_SYSTEM, 0, preq, NULL, tmpLine);
    }

  /* preq_dup happens at the end of the loop, so free the extra one if
   * it is there */
  if (preq_dup != NULL)
    free_br(preq_dup);

  return(PBSE_NONE);
  } /* END handle_delete_all() */
Exemple #16
0
/*
 * finalize_rerunjob
 * Finishes a rerun (requeue) request once the outcome of the earlier steps
 * is known: updates the job, writes the log/accounting records and answers
 * the client.
 *
 * @param preq - the client's rerun request; answered here with reply_ack()
 *               on success or req_reject() on failure
 * @param pjob - the job being rerun. Its mutex is handed to a mutex_mgr that
 *               is constructed as already locked; presumably the manager
 *               releases it when this function returns - TODO confirm
 * @param rc   - outcome so far:
 *                 -1           a completed job was requeued
 *                 0            the requeue request succeeded
 *                 PBSE_SYSTEM  reported to the client as PBSE_MEM_MALLOC
 *                 other        rejected as PBSE_MOMREJECT unless the request
 *                              extension starts with RERUNFORCE, in which
 *                              case the job is forcibly requeued
 * @return PBSE_BAD_PARAMETER if pjob is NULL, the rejection code if the
 *         request was rejected, otherwise rc unchanged
 */
int finalize_rerunjob(
    
  batch_request *preq,
  job           *pjob,
  int            rc)

  {
  int       Force;
  char      log_buf[LOCAL_LOG_BUF_SIZE];

  if (pjob == NULL)
    return(PBSE_BAD_PARAMETER);

  mutex_mgr pjob_mutex = mutex_mgr(pjob->ji_mutex, true);

  if (preq->rq_extend && !strncasecmp(preq->rq_extend, RERUNFORCE, strlen(RERUNFORCE)))
    Force = 1;
  else
    Force = 0;

  switch (rc)
    {

    case -1:

      /* completed job was requeued */

      /* clear out job completion time if there is one */
      break;

    case 0:

      /* requeue request successful */

      pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

      break;

    case PBSE_SYSTEM: /* This may not be accurate...*/
      rc = PBSE_MEM_MALLOC;
      snprintf(log_buf, sizeof(log_buf), "Can not allocate memory");
      req_reject(rc, 0, preq, NULL, log_buf);
      return rc;

    default:

      if (Force == 0)
        {
        rc = PBSE_MOMREJECT;
        snprintf(log_buf, sizeof(log_buf), "Rejected by mom");
        req_reject(rc, 0, preq, NULL, log_buf);
        return rc;
        }
      else
        {
        int           newstate;
        int           newsubst;
        unsigned int  dummy;
        char         *tmp;
        size_t        len;

        if ((cray_enabled == true) &&
            (pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str != NULL))
          tmp = parse_servername(pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str, &dummy);
        else
          tmp = parse_servername(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, &dummy);

        /* Cannot communicate with MOM, forcibly requeue job.
           This is a relatively disgusting thing to do */

        /* bounded write; never hand a NULL pointer to %s */
        snprintf(log_buf, sizeof(log_buf),
          "rerun req to %s failed (rc=%d), forcibly requeueing job",
          (tmp != NULL) ? tmp : "(unknown)", rc);

        free(tmp);

        log_event(
          PBSEVENT_ERROR | PBSEVENT_ADMIN | PBSEVENT_JOB,
          PBS_EVENTCLASS_JOB,
          pjob->ji_qs.ji_jobid,
          log_buf);

        log_err(-1, __func__, log_buf);

        /* append the mail-only suffix without running past the buffer */
        len = strlen(log_buf);
        snprintf(log_buf + len, sizeof(log_buf) - len, "%s",
          ", previous output files may be lost");

        svr_mailowner(pjob, MAIL_OTHER, MAIL_FORCE, log_buf);

        svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_RERUN3, FALSE);

        rel_resc(pjob); /* free resc assigned to job */

        pjob->ji_modified = 1;    /* force full job save */

        pjob->ji_momhandle = -1;
        pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_StagedIn;

        svr_evaljobstate(*pjob, newstate, newsubst, 0);
        svr_setjobstate(pjob, newstate, newsubst, FALSE);
        }

      break;
    }  /* END switch (rc) */

  /* drop the checkpoint flags and remember that the job has run */
  pjob->ji_qs.ji_svrflags = (pjob->ji_qs.ji_svrflags &
      ~(JOB_SVFLG_CHECKPOINT_FILE |JOB_SVFLG_CHECKPOINT_MIGRATEABLE |
        JOB_SVFLG_CHECKPOINT_COPIED)) | JOB_SVFLG_HASRUN;

  snprintf(log_buf, sizeof(log_buf), msg_manager, msg_jobrerun, preq->rq_user, preq->rq_host);
  log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

  reply_ack(preq);

  /* note in accounting file */
  account_record(PBS_ACCT_RERUN, pjob, NULL);

  return rc;
  }  /* END finalize_rerunjob() */
Exemple #17
0
/*
 * purge_completed_jobs
 * Handles a delete request whose extension starts with PURGECOMP. Every job
 * that is in JOB_SUBSTATE_COMPLETE, completed at or before the given time and
 * whose 'reported' attribute is set but still 0 gets 'reported' set to 1 and
 * is saved.
 *
 * The purge time is the decimal number that follows the PURGECOMP prefix in
 * preq->rq_extend; the caller must have verified that the prefix is present.
 *
 * The request is answered here: rejected with PBSE_PERM when the requester
 * has neither operator nor manager privilege, acknowledged otherwise.
 *
 * @param preq - the delete request (I)
 */
void purge_completed_jobs(

  struct batch_request *preq)  /* I */

  {
  job          *pjob;
  char         *time_str;
  time_t        purge_time = 0;
  int           iter;
  char          log_buf[LOCAL_LOG_BUF_SIZE];

  /* get the time to purge the jobs that completed before */
  time_str = preq->rq_extend;
  time_str += strlen(PURGECOMP);
  purge_time = strtol(time_str,NULL,10);
  
  /*
    * Clean unreported capability is only for operators and managers.
    * Check if request is authorized
  */

  if ((preq->rq_perm & (ATR_DFLAG_OPRD|ATR_DFLAG_OPWR|
                    ATR_DFLAG_MGRD|ATR_DFLAG_MGWR)) == 0)
    {
    req_reject(PBSE_PERM,0,preq,NULL,
      "must have operator or manager privilege to use -c parameter");
    return;
    }

  /* Log BEFORE acknowledging: other code in this file treats a request as
   * freed once reply_ack() has been called on it, so preq->rq_extend must not
   * be read afterwards. The extension is client supplied, hence the bounded
   * write. */
  if (LOGLEVEL >= 4)
    {
    snprintf(log_buf, sizeof(log_buf),
      "Received purge completed jobs command, purge time is %ld (%s)",
      (long)purge_time, preq->rq_extend);

    log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_REQUEST, __func__, log_buf);
    }

  /* answer the client now; preq is not used past this point */
  reply_ack(preq);

  iter = -1;

  while ((pjob = next_job(&alljobs,&iter)) != NULL) 
    {
    if ((pjob->ji_qs.ji_substate == JOB_SUBSTATE_COMPLETE) &&
        (pjob->ji_wattr[JOB_ATR_comp_time].at_val.at_long <= purge_time) &&
        ((pjob->ji_wattr[JOB_ATR_reported].at_flags & ATR_VFLAG_SET) != 0) &&
        (pjob->ji_wattr[JOB_ATR_reported].at_val.at_long == 0))
      {
      if (LOGLEVEL >= 4)
        {
        snprintf(log_buf, sizeof(log_buf),
          "Reported job is COMPLETED (%ld), setting reported to TRUE",
          pjob->ji_wattr[JOB_ATR_comp_time].at_val.at_long);
        
        log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);
        }
      
      pjob->ji_wattr[JOB_ATR_reported].at_val.at_long = 1;
      pjob->ji_wattr[JOB_ATR_reported].at_flags = ATR_VFLAG_SET | ATR_VFLAG_MODIFY;
          
      job_save(pjob, SAVEJOB_FULL, 0); 
      }

    /* every job handed out by next_job() is unlocked here */
    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
    }

  return;
  } /* END purge_completed_jobs() */
Exemple #18
0
/*
 * delay_and_send_sig_kill
 * Handles the reply to a signal request that was sent on behalf of a client
 * rerun request. The id of the client request is carried in
 * preq_sig->rq_extend and is looked up with get_remove_batch_request().
 *
 * - If MOM answered PBSE_UNKJOBID the job's nodes and resources are released,
 *   the job is purged and the client request is acknowledged.
 * - Any other non-zero reply code is passed on to the client via req_reject().
 * - On success the client request is acknowledged and send_sig_kill is
 *   scheduled (WORK_Timed) 'delay' seconds from now for the job id. The delay
 *   is the job's user_kill_delay attribute when set, otherwise the value
 *   attr_ifelse_long() picks from the queue / server KillDelay attributes.
 *
 * @param preq_sig - the signal request carrying MOM's reply code; nothing is
 *                   done when it is NULL or no client request is attached
 */
void delay_and_send_sig_kill(
    
  batch_request *preq_sig)

  {
  int                   delay = 0;
  job                  *pjob;

  pbs_queue            *pque;

  batch_request        *preq_clt = NULL;  /* original client request */
  int                   rc;
  time_t                time_now = time(NULL);
  char    log_buf[LOCAL_LOG_BUF_SIZE];
  char                 *jobid_copy;

  if (preq_sig == NULL)
    return;

  rc = preq_sig->rq_reply.brp_code;

  if (preq_sig->rq_extend != NULL)
    {
    preq_clt = get_remove_batch_request(preq_sig->rq_extend);
    }

  /* the client request has been handled another way, nothing left to do */
  if (preq_clt == NULL)
    return;

  if ((pjob = chk_job_request(preq_clt->rq_ind.rq_rerun, preq_clt)) == NULL)
    {
    /* job has gone away, chk_job_request() calls req_reject() on failure */
    return;
    }

  mutex_mgr pjob_mutex = mutex_mgr(pjob->ji_mutex, true);

  if (rc)
    {
    /* mom rejected request */

    if (rc == PBSE_UNKJOBID)
      {
      /* MOM claims no knowledge, so just purge it */
      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        "MOM rejected signal during rerun");

      /* removed the resources assigned to job */

      free_nodes(pjob);

      set_resc_assigned(pjob, DECR);

      unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

      /* The mutex was released by hand just above and the job is about to
       * be purged, so the manager must not release it a second time when it
       * goes out of scope. */
      pjob_mutex.set_unlock_on_exit(false);

      svr_job_purge(pjob);

      reply_ack(preq_clt);
      }
    else
      {
      pjob_mutex.unlock();
      req_reject(rc, 0, preq_clt, NULL, NULL);
      }

    return;
    }

  // Apply the user delay first so it takes precedence.
  if (pjob->ji_wattr[JOB_ATR_user_kill_delay].at_flags & ATR_VFLAG_SET)
    delay = pjob->ji_wattr[JOB_ATR_user_kill_delay].at_val.at_long;

  if ((pque = get_jobs_queue(&pjob)) != NULL)
    {
    mutex_mgr pque_mutex = mutex_mgr(pque->qu_mutex, true);
    mutex_mgr server_mutex = mutex_mgr(server.sv_attr_mutex, false);

    if (delay == 0)
      {
      delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay],
                             &server.sv_attr[SRV_ATR_KillDelay],
                             0);
      }
    }
  else
    {
    /* why is the pque null. Something went wrong */

    /* get_jobs_queue() takes the job pointer by address and other callers in
     * this file re-check it for NULL afterwards, so fall back to the id from
     * the client request rather than dereferencing a NULL job.
     * NOTE(review): if pjob can really come back NULL here, confirm whether
     * pjob_mutex should still release the mutex on exit. */
    const char *jobid = (pjob != NULL) ? pjob->ji_qs.ji_jobid
                                       : preq_clt->rq_ind.rq_rerun;

    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "jobid %s returned a null queue", jobid);
    req_reject(PBSE_UNKQUE, 0, preq_clt, NULL, log_buf);
    return;
    }

  /* copy the job id while the job is still locked; once the mutex is
   * released the job structure may change or go away */
  jobid_copy = strdup(pjob->ji_qs.ji_jobid);

  pjob_mutex.unlock();
  reply_ack(preq_clt);
  set_task(WORK_Timed, delay + time_now, send_sig_kill, jobid_copy, FALSE);
  } // END delay_and_send_sig_kill()
Exemple #19
0
/*
 * req_rerunjob
 * Handles a client request to rerun (requeue) a job.
 *
 * A job id of "all" is delegated to handle_requeue_all(). Otherwise the job
 * is looked up with chk_job_request() and processed according to its state:
 *   - JOB_STATE_EXITING or later: allowed; manager/operator privilege is
 *     waived when the job has a checkpoint name set
 *   - JOB_STATE_RUNNING: MOM is asked to signal the job (SIGTERM followed by a
 *     delayed kill when a kill delay applies, SIGKILL otherwise)
 *   - JOB_STATE_QUEUED: nothing to do, the request is acknowledged
 *   - anything else: rejected with PBSE_BADSTATE
 * The job must also be marked rerunable, otherwise PBSE_NORERUN is returned.
 *
 * Unless the function returns early, the final reply is produced by
 * finalize_rerunjob().
 *
 * @param preq - the rerun request
 * @return the result of handle_requeue_all() or finalize_rerunjob(), or the
 *         error code of whichever check failed (PBSE_SYSTEM when the job
 *         lookup fails)
 */
int req_rerunjob(
   
  batch_request *preq)

  {
  int     rc = PBSE_NONE;
  job    *pjob;

  int     MgrRequired = TRUE;
  char    log_buf[LOCAL_LOG_BUF_SIZE];

  /* check if requestor is admin, job owner, etc */
  if (!strcasecmp(preq->rq_ind.rq_rerun, "all"))
    {
    return(handle_requeue_all(preq));
    }
  
  if ((pjob = chk_job_request(preq->rq_ind.rq_rerun, preq)) == 0)
    {
    /* FAILURE */

    /* chk_job_request calls req_reject() */

    rc = PBSE_SYSTEM;
    return rc; /* This needs to fixed to return an accurate error */
    }

  /* constructed as already locked; presumably releases the job mutex on every
   * return path below unless told otherwise - TODO confirm */
  mutex_mgr pjob_mutex = mutex_mgr(pjob->ji_mutex, true);

  /* the job must be running or completed */

  if (pjob->ji_qs.ji_state >= JOB_STATE_EXITING)
    {
    if (pjob->ji_wattr[JOB_ATR_checkpoint_name].at_flags & ATR_VFLAG_SET)
      {
      /* allow end-users to rerun checkpointed jobs */

      MgrRequired = FALSE;
      }
    }
  else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* job is running */

    /* NO-OP */
    }
  else if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED)
    {
    // If we are already queued, then there is nothing to do.
    rc = PBSE_NONE;
    reply_ack(preq);
    return(rc);
    }
  else
    {
    /* FAILURE - job is in bad state */
    rc = PBSE_BADSTATE;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s is in a bad state",
        preq->rq_ind.rq_rerun);
    req_reject(rc, 0, preq, NULL, log_buf);
    return rc;
    }

  /* manager or operator write permission is needed unless waived above */
  if ((MgrRequired == TRUE) &&
      ((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0))
    {
    /* FAILURE */

    rc = PBSE_PERM;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
        "additional permissions required (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)");
    req_reject(rc, 0, preq, NULL, log_buf);
    return rc;
    }

  /* the job must be rerunnable */

  if (pjob->ji_wattr[JOB_ATR_rerunable].at_val.at_long == 0)
    {
    /* NOTE:  should force override this constraint? maybe (???) */
    /*          no, the user is saying that the job will break, and
                IEEE Std 1003.1 specifically says rerun is to be rejected
                if rerunable==FALSE -garrick */

    rc = PBSE_NORERUN;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "job %s not rerunnable",
        preq->rq_ind.rq_rerun);
    req_reject(rc, 0, preq, NULL, log_buf);
    return rc;
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* ask MOM to kill off the job if it is running */
    int                 delay = 0;
    pbs_queue          *pque;
  
    // Apply the user delay first so it takes precedence.
    if (pjob->ji_wattr[JOB_ATR_user_kill_delay].at_flags & ATR_VFLAG_SET)
      delay = pjob->ji_wattr[JOB_ATR_user_kill_delay].at_val.at_long;

    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      mutex_mgr pque_mutex = mutex_mgr(pque->qu_mutex, true);
      mutex_mgr server_mutex = mutex_mgr(server.sv_attr_mutex, false);

      /* no user delay: fall back to the queue / server KillDelay attribute */
      if (delay == 0)
        {
        delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay],
                               &server.sv_attr[SRV_ATR_KillDelay],
                               0);
        }
      }
    else
      {
      /* why is the pque null. Something went wrong */
      /* NOTE(review): get_jobs_queue() receives &pjob and may presumably
       * change it; pjob is dereferenced here without a NULL check - confirm */
      snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "jobid %s returned a null queue", pjob->ji_qs.ji_jobid);
      req_reject(PBSE_UNKQUE, 0, preq, NULL, log_buf);
      return(PBSE_UNKQUE);
      }
    
    pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

    if (delay != 0)
      {
      /* a kill delay applies: send SIGTERM now, delay_and_send_sig_kill is
       * the callback that handles MOM's reply */
      static const char *rerun = "rerun";
      char               *extra = strdup(rerun);

      get_batch_request_id(preq);
      /* If a qrerun -f is given requeue the job regardless of the outcome of issue_signal*/
      if ((preq->rq_extend) && 
          (!strncasecmp(preq->rq_extend, RERUNFORCE, strlen(RERUNFORCE))))
        {
        /* NOTE(review): 'extend' is not referenced anywhere in this branch */
        std::string extend = RERUNFORCE;
        /* work on a copy of the request; the copy is deleted before leaving
         * this branch */
        batch_request *dup = new batch_request(*preq);
        get_batch_request_id(dup);
        rc = issue_signal(&pjob, "SIGTERM", delay_and_send_sig_kill, extra, strdup(dup->rq_id.c_str()));

        if (rc == PBSE_NORELYMOM)
          {
          dup->rq_reply.brp_code = PBSE_NORELYMOM;
          pjob_mutex.unlock();
          post_rerun(dup);

          /* NOTE(review): the job is looked up again through
           * rq_ind.rq_signal.rq_jid while the rest of this function uses
           * rq_ind.rq_rerun - confirm both name the same job id */
          pjob = svr_find_job(preq->rq_ind.rq_signal.rq_jid, FALSE);
          if (pjob == NULL)
            {
            delete dup;
            return(PBSE_NONE);
            }

          /* presumably svr_find_job() returned the job locked, so the
           * manager is told it holds the mutex again - TODO confirm */
          pjob_mutex.set_lock_state(true);
          rc = PBSE_NONE;
          }

        delete dup;
        }
      else
        {
        /* not forced: the reply is sent later by the callback unless the
         * signal could not be issued at all */
        rc = issue_signal(&pjob, "SIGTERM", delay_and_send_sig_kill, extra, strdup(preq->rq_id.c_str()));
        if (rc != PBSE_NONE)
          {
          /* cant send to MOM */
          req_reject(rc, 0, preq, NULL, NULL);
          }

        return(rc);
        }
      }
    else
      {
      /* no kill delay: send SIGKILL directly, post_rerun is the callback */
      static const char *rerun = "rerun";
      char               *extra = strdup(rerun);

      /* If a qrerun -f is given requeue the job regardless of the outcome of issue_signal*/
      if (preq->rq_extend && !strncasecmp(preq->rq_extend, RERUNFORCE, strlen(RERUNFORCE)))
        {
        std::string extend = RERUNFORCE;
        rc = issue_signal(&pjob, "SIGKILL", post_rerun, extra, strdup(extend.c_str()));
        if (rc == PBSE_NORELYMOM)
          rc = PBSE_NONE;
        }
      else
        rc = issue_signal(&pjob, "SIGKILL", post_rerun, extra, NULL);
      }
    }
  else
    { 
    /* job is not running: move it straight back to queued, or to held when
     * a hold is set on it */
    if (pjob->ji_wattr[JOB_ATR_hold].at_val.at_long == HOLD_n)
      {
      svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_QUEUED, FALSE);
      }
    else
      {
      svr_setjobstate(pjob, JOB_STATE_HELD, JOB_SUBSTATE_HELD, FALSE);
      }

    /* reset some job attributes */
    
    pjob->ji_wattr[JOB_ATR_comp_time].at_flags &= ~ATR_VFLAG_SET;
    pjob->ji_wattr[JOB_ATR_reported].at_flags &= ~ATR_VFLAG_SET;

    set_statechar(pjob);

    /* -1 is passed on to finalize_rerunjob() as the outcome */
    rc = -1;
    }

  /* finalize_rerunjob will return with pjob->ji_mutex unlocked */
  pjob_mutex.set_unlock_on_exit(false);
  return finalize_rerunjob(preq,pjob,rc);
  }  /* END req_rerunjob() */
/*
 * req_movejob
 * Services a Move Job request: the job named in the request is moved to the
 * destination queue given in the request.
 *
 * The job has to be queued, held or waiting and must not already live in the
 * destination queue. The move itself is carried out by svr_movejob(), which
 * covers both local and network moves. Failures are reported to the client
 * with req_reject().
 *
 * @param req - the move request (I)
 * @return always PBSE_NONE
 */
int req_movejob(

  batch_request *req) /* I */

  {
  char       log_buf[LOCAL_LOG_BUF_SIZE];
  int        local_errno = 0;
  int        movable;
  int        move_result;
  size_t     used;
  job       *jobp = chk_job_request(req->rq_ind.rq_move.rq_jid, req);

  /* nothing to do when the job could not be located */
  if (jobp == NULL)
    return(PBSE_NONE);

  mutex_mgr job_mutex(jobp->ji_mutex, true);

  if (LOGLEVEL >= 7)
    {
    sprintf(log_buf, "%s", jobp->ji_qs.ji_jobid);
    LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
    }

  /* only queued, held or waiting jobs may be moved */
  movable = ((jobp->ji_qs.ji_state == JOB_STATE_QUEUED) ||
             (jobp->ji_qs.ji_state == JOB_STATE_HELD) ||
             (jobp->ji_qs.ji_state == JOB_STATE_WAITING));

  if (!movable)
    {
#ifndef NDEBUG
    sprintf(log_buf, "%s %d %s", pbse_to_txt(PBSE_BADSTATE), jobp->ji_qs.ji_state, __func__);

    log_event(PBSEVENT_DEBUG,PBS_EVENTCLASS_JOB,jobp->ji_qs.ji_jobid,log_buf);
#endif /* NDEBUG */

    req_reject(PBSE_BADSTATE, 0, req, NULL, NULL);

    return(PBSE_NONE);
    }

  /* source and destination queue are sometimes identical; there is nothing
     to move in that case */
  if (!strcmp(jobp->ji_qs.ji_queue, req->rq_ind.rq_move.rq_destin))
    {
    sprintf(log_buf, "Job %s already in queue %s", jobp->ji_qs.ji_jobid, jobp->ji_qs.ji_queue);

    if (LOGLEVEL >= 7)
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);

    req_reject(PBSE_JOB_ALREADY_IN_QUEUE, 0, req, NULL, log_buf);

    return(PBSE_NONE);
    }

  /* svr_movejob() does the real work for local and network moves alike */
  move_result = svr_movejob(jobp, req->rq_ind.rq_move.rq_destin, &local_errno, req);

  if (move_result == 0)
    {
    /* success: log who moved the job where, then acknowledge */
    snprintf(log_buf, sizeof(log_buf), "%s", msg_movejob);

    used = strlen(log_buf);

    snprintf(log_buf + used, sizeof(log_buf) - used, msg_manager,
      req->rq_ind.rq_move.rq_destin, req->rq_user, req->rq_host);

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,jobp->ji_qs.ji_jobid,log_buf);

    reply_ack(req);
    }
  else if ((move_result == -1) || (move_result == 1))
    {
    /* failure - a detailed response to the requestor is not implemented */
    req_reject(local_errno, 0, req, NULL, NULL);
    }

  /* a result of 2 means deferred: post_movejob() handles it when the child
     completes; no reply is sent here for it or for any other value */

  return(PBSE_NONE);
  }  /* END req_movejob() */
/*
 * req_orderjob
 * Services an Order Job request: swaps the queue rank of the two jobs named
 * in the request (rq_ind.rq_move.rq_jid and rq_ind.rq_move.rq_destin).
 *
 * Neither job may be running and both must have a queue. When the jobs are
 * in different queues, each job is checked against the other's queue with
 * svr_chkque(); the jobs then trade queue names and are dequeued and enqueued
 * again. When they share a queue they are swapped with swap_jobs(). Both jobs
 * are saved afterwards and the request is acknowledged.
 *
 * Failures are reported to the client with req_reject().
 *
 * @param vp - the order request (I)
 * @return always PBSE_NONE
 */
int req_orderjob(

  struct batch_request *vp) /* I */

  {
  job                  *pjob;
  job                  *pjob1;
  job                  *pjob2;
  int                   rank;
  int                   rc = 0;
  char                  tmpqn[PBS_MAXQUEUENAME+1];
  /* vp already has this type; req is simply an alias for it */
  struct batch_request *req = (struct batch_request *)vp;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  pbs_queue            *pque1;
  pbs_queue            *pque2;

  if ((pjob1 = chk_job_request(req->rq_ind.rq_move.rq_jid, req)) == NULL)
    {
    return(PBSE_NONE);
    }

  mutex_mgr job1_mutex(pjob1->ji_mutex, true);

  if ((pjob2 = chk_job_request(req->rq_ind.rq_move.rq_destin, req)) == NULL)
    {
    return(PBSE_NONE);
    }

  mutex_mgr job2_mutex(pjob2->ji_mutex, true);

  /* the assignments inside the condition leave pjob pointing at whichever
   * job turned out to be running, for the debug message below */
  if (((pjob = pjob1)->ji_qs.ji_state == JOB_STATE_RUNNING) ||
      ((pjob = pjob2)->ji_qs.ji_state == JOB_STATE_RUNNING))
    {
#ifndef NDEBUG
    sprintf(log_buf, "%s %d",
            pbse_to_txt(PBSE_BADSTATE),
            pjob->ji_qs.ji_state);

    strcat(log_buf, __func__);

    log_event(
      PBSEVENT_DEBUG,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buf);
#endif /* NDEBUG */

    req_reject(PBSE_BADSTATE, 0, req, NULL, NULL);

    return(PBSE_NONE);
    }
  else if ((pjob1->ji_qhdr == NULL) || (pjob2->ji_qhdr == NULL))
    {
    req_reject(PBSE_BADSTATE, 0, req, NULL, "One of the jobs does not have a queue");
    return(PBSE_NONE);
    }
  else if (pjob1->ji_qhdr != pjob2->ji_qhdr)
    {
    /* jobs are in different queues */
    /* each job must be acceptable to the other job's queue; ok stays FALSE
     * unless both svr_chkque() calls succeed */
    int ok = FALSE;

    if ((pque2 = get_jobs_queue(&pjob2)) == NULL)
      {
      rc = PBSE_BADSTATE;
      job2_mutex.set_lock_on_exit(false);
      }
    else
      {
      mutex_mgr pque2_mutex = mutex_mgr(pque2->qu_mutex, true);
      if ((rc = svr_chkque(pjob1, pque2, get_variable(pjob1, pbs_o_host), MOVE_TYPE_Order, NULL)) == PBSE_NONE)
        {
        /* release queue 2 before the queue of job 1 is fetched */
        pque2_mutex.unlock();

        if ((pque1 = get_jobs_queue(&pjob1)) == NULL)
          {
          rc = PBSE_BADSTATE;
          job1_mutex.set_lock_on_exit(false);
          }
        else if (pjob1 != NULL)
          {
          mutex_mgr pque1_mutex = mutex_mgr(pque1->qu_mutex, true);
          if ((rc = svr_chkque(pjob2, pque1, get_variable(pjob2, pbs_o_host), MOVE_TYPE_Order, NULL)) == PBSE_NONE)
            {
            ok = TRUE;
            }
          }
        }
      }

    if (ok == FALSE)
      {
      req_reject(rc, 0, req, NULL, NULL);

      return(PBSE_NONE);
      }
    }

  /* now swap the order of the two jobs in the queue lists */
  rank = pjob1->ji_wattr[JOB_ATR_qrank].at_val.at_long;

  pjob1->ji_wattr[JOB_ATR_qrank].at_val.at_long =
    pjob2->ji_wattr[JOB_ATR_qrank].at_val.at_long;

  pjob2->ji_wattr[JOB_ATR_qrank].at_val.at_long = rank;

  if (pjob1->ji_qhdr != pjob2->ji_qhdr)
    {
    /* different queues: trade queue names, then take both jobs out of their
     * queues and put them back in */
    strcpy(tmpqn, pjob1->ji_qs.ji_queue);
    strcpy(pjob1->ji_qs.ji_queue, pjob2->ji_qs.ji_queue);
    strcpy(pjob2->ji_qs.ji_queue, tmpqn);

    svr_dequejob(pjob1, FALSE);
    svr_dequejob(pjob2, FALSE);

    /* a job reported as recycled is no longer touched: the pointer is
     * cleared and its mutex manager is told not to act on exit */
    if (svr_enquejob(pjob1, FALSE, -1) == PBSE_JOB_RECYCLED)
      {
      pjob1 = NULL;
      job1_mutex.set_lock_on_exit(false);
      }

    if (svr_enquejob(pjob2, FALSE, -1) == PBSE_JOB_RECYCLED)
      {
      pjob2 = NULL;
      job2_mutex.set_lock_on_exit(false);
      }
    }
  else
    {
    /* same queue: swap the two entries in place */
    /* NOTE(review): pjob1 is passed by address to get_jobs_queue() and is
     * not re-checked for NULL before swap_jobs() - confirm this is safe */
    if ((pque1 = get_jobs_queue(&pjob1)) != NULL)
      {
      mutex_mgr pque1_mutex = mutex_mgr(pque1->qu_mutex, true);
      swap_jobs(pque1->qu_jobs,pjob1,pjob2);
      swap_jobs(NULL,pjob1,pjob2);
      }
    }

  /* need to update disk copy of both jobs to save new order */
  if (pjob1 != NULL)
    {
    job_save(pjob1, SAVEJOB_FULL, 0);
    }

  if (pjob2 != NULL)
    {
    job_save(pjob2, SAVEJOB_FULL, 0);
    }

  /* SUCCESS */
  reply_ack(req);

  return(PBSE_NONE);
  }  /* END req_orderjob() */
/*
 * req_deletearray
 * Services a delete request that names a job array.
 *
 * The request is acknowledged straight away when the array cannot be found,
 * and rejected with PBSE_PERM when the requester is not authorized for the
 * array's owner and submit host. If the request extension contains
 * ARRAY_RANGE only that range is deleted, otherwise the whole array is.
 * When the array still exists afterwards and some jobs were skipped, a timed
 * task (array_delete_wt, two seconds from time_now) is scheduled and the
 * reply is left to it; in every other case the request is acknowledged here.
 */
void req_deletearray(struct batch_request *preq)
  {
  char              owner[PBS_MAXUSER + 1];
  char             *range_spec;
  job_array        *array;
  int               skipped;
  int               ranged;

  array = get_array(preq->rq_ind.rq_delete.rq_objname);

  /* an unknown array is simply acknowledged */
  if (array == NULL)
    {
    reply_ack(preq);
    return;
    }

  /* check authorization */
  get_jobowner(array->ai_qs.owner, owner);

  if (svr_authorize_req(preq, owner, array->ai_qs.submit_host) == -1)
    {
    sprintf(log_buffer, msg_permlog,
            preq->rq_type,
            "Array",
            preq->rq_ind.rq_delete.rq_objname,
            preq->rq_user,
            preq->rq_host);

    log_event(
      PBSEVENT_SECURITY,
      PBS_EVENTCLASS_JOB,
      preq->rq_ind.rq_delete.rq_objname,
      log_buffer);

    req_reject(PBSE_PERM, 0, preq, NULL, "operation not permitted");
    return;
    }

  /* a range in the extension limits the delete to part of the array */
  range_spec = preq->rq_extend;
  ranged = ((range_spec != NULL) && (strstr(range_spec, ARRAY_RANGE) != NULL));

  if (!ranged)
    {
    skipped = delete_whole_array(array);
    }
  else
    {
    skipped = delete_array_range(array, range_spec);

    /* a negative count signals a range that could not be parsed */
    if (skipped < 0)
      {
      req_reject(PBSE_IVALREQ,0,preq,NULL,"Error in specified array range");
      return;
      }
    }

  /* If the array is still around and jobs were skipped (they must have been
     running or had JOB_SUBSTATE_TRANSIT), try again shortly. The reply is
     only left to the timed task when that task could be created. */
  if ((get_array(preq->rq_ind.rq_delete.rq_objname) != NULL) &&
      (skipped != 0))
    {
    if (set_task(WORK_Timed, time_now + 2, array_delete_wt, preq))
      return;
    }

  /* now that the whole array is deleted, we should mail the user if necessary */

  reply_ack(preq);

  return;
  }
/*
 * post_movejob
 * Work-task callback that completes a deferred Move Job request once the
 * child doing the move has finished.
 *
 * pwt->wt_parm1 holds the job the task was created for, pwt->wt_parm2 the
 * original request and pwt->wt_aux the child's wait status.
 *
 *  - child exited with status 0: stage-in / checkpoint files are removed when
 *    the matching server flags are set, the server's copy of the job is
 *    purged and the request is acknowledged
 *  - child exited with a non-zero status: the request is rejected with
 *    PBSE_ROUTEREJ
 *  - child did not exit normally: the request is rejected with PBSE_SYSTEM
 * On rejection the job state is re-evaluated so the job leaves Transit.
 *
 * The job can be gone by the time this runs; every access to it is therefore
 * guarded, and the request is still answered in that case.
 */
static void post_movejob(

  struct work_task *pwt)

  {
  char *id = "post_movejob";

  struct batch_request *req;
  int newstate;
  int newsub;
  int stat;
  int r;
  job *jobp;

  req  = (struct batch_request *)pwt->wt_parm2;

  stat = pwt->wt_aux;

  pbs_errno = PBSE_NONE;

  if (req->rq_type != PBS_BATCH_MoveJob)
    {
    sprintf(log_buffer, "bad request type %d\n",
            req->rq_type);

    log_err(-1, id, log_buffer);

    return;
    }

  jobp = find_job(req->rq_ind.rq_move.rq_jid);

  if ((jobp == NULL) || (jobp != (job *)pwt->wt_parm1))
    {
    /* logged only; processing continues so the client still gets a reply.
     * jobp may be NULL from here on. */
    sprintf(log_buffer, "job %s not found\n",
            req->rq_ind.rq_move.rq_jid);

    log_err(-1, id, log_buffer);
    }

  if (WIFEXITED(stat))
    {
    r = WEXITSTATUS(stat);

    if (r == 0)
      {
      /* purge server's job structure - only if there still is one */

      if (jobp != NULL)
        {
        if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn)
          remove_stagein(jobp);

        if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_COPIED)
          remove_checkpoint(jobp);

        /* NOTE(review): this message is composed but never passed to a
         * logging call - confirm whether a log_event() is missing */
        strcpy(log_buffer, msg_movejob);

        sprintf(log_buffer + strlen(log_buffer), msg_manager,
                req->rq_ind.rq_move.rq_destin,
                req->rq_user,
                req->rq_host);

        job_purge(jobp);
        }
      }
    else
      {
      r = PBSE_ROUTEREJ;
      }
    }
  else
    {
    r = PBSE_SYSTEM;

    sprintf(log_buffer, msg_badexit, stat);

    strcat(log_buffer, id);

    /* fall back to the job id from the request when the job is gone */
    log_event(
      PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      (jobp != NULL) ? jobp->ji_qs.ji_jobid : req->rq_ind.rq_move.rq_jid,
      log_buffer);
    }

  if (r)
    {
    if (jobp != NULL)
      {
      /* force re-eval of job state out of Transit */

      svr_evaljobstate(jobp, &newstate, &newsub, 1);
      svr_setjobstate(jobp, newstate, newsub);
      }

    req_reject(r, 0, req, NULL, NULL);
    }
  else
    {
    reply_ack(req);
    }

  return;
  }  /* END post_movejob() */
Exemple #24
0
/*
 * req_relnodesjob - service a request to release nodes from a job.
 *
 * Only a regular job or a single subjob of an array job is accepted, and it
 * has to be running (state and substate). Anything else is answered with an
 * error reply. The release itself is done by free_sister_vnodes(); if that
 * returns non-zero, the message it filled in is sent back with PBSE_SYSTEM.
 *
 * Nothing is done when preq or the job id in it is NULL.
 */
void
req_relnodesjob(struct batch_request *preq)
{
	char		 errbuf[LOG_BUF_SIZE];
	char		*job_id;
	char		*release_list;
	job		*target;
	int		 job_kind;	/* job type reported by chk_job_request() */
	int		 slot;
	int		 sub_state;

	if (preq == NULL)
		return;

	job_id = preq->rq_ind.rq_relnodes.rq_jid;
	if (job_id == NULL)
		return;

	/*
	 * chk_job_request() hands back the job itself for a singleton job,
	 * or the "parent" for an array job.
	 */
	target = chk_job_request(job_id, preq, &job_kind);
	if (target == NULL)
		return;

	/* whole arrays and ranges of subjobs are not supported */
	if ((job_kind != IS_ARRAY_NO) && (job_kind != IS_ARRAY_Single)) {
		reply_text(preq, PBSE_NOSUP,
			"not supported for Array Jobs or multiple sub-jobs");
		return;
	}

	if (job_kind == IS_ARRAY_Single) {
		/* locate the subjob in the parent's tracking table */
		slot = subjob_index_to_offset(target,
			get_index_from_jid(job_id));
		if (slot == -1) {
			req_reject(PBSE_UNKJOBID, 0, preq);
			return;
		}

		sub_state = get_subjob_state(target, slot);
		if (sub_state == -1) {
			req_reject(PBSE_IVALREQ, 0, preq);
			return;
		}
		if (sub_state != JOB_STATE_RUNNING) {
			req_reject(PBSE_BADSTATE, 0, preq);
			return;
		}

		/* from here on work with the subjob, not the parent */
		target = target->ji_ajtrk->tkm_tbl[slot].trk_psubjob;
		if (target == NULL) {
			req_reject(PBSE_UNKJOBID, 0, preq);
			return;
		}
		if (target->ji_qs.ji_substate != JOB_SUBSTATE_RUNNING) {
			req_reject(PBSE_BADSTATE, 0, preq);
			return;
		}
	} else {
		/* a regular job: state and substate must both be running */
		if (!((target->ji_qs.ji_state == JOB_STATE_RUNNING) &&
			(target->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING))) {
			req_reject(PBSE_BADSTATE, 0, preq);
			return;
		}
	}

	/* an empty node list is passed on as NULL */
	release_list = preq->rq_ind.rq_relnodes.rq_node_list;
	if ((release_list == NULL) || (*release_list == '\0'))
		release_list = NULL;

	if (free_sister_vnodes(target, release_list, errbuf, LOG_BUF_SIZE, preq) != 0)
		reply_text(preq, PBSE_SYSTEM, errbuf);
}
Exemple #25
0
int req_stat_node(

  struct batch_request *preq)

  {
  char                 *name;

  int                   rc   = PBSE_NONE;
  int                   type = 0;
  int                   bad  = 0;

  struct pbsnode       *pnode = NULL;
  struct batch_reply   *preply;
  struct prop props;
  svrattrl             *pal;

  /*
   * first, check that the server indeed has a list of nodes
   * and if it does, validate the name of the requested object--
   * either name is that of a specific node, or name[0] is null/@
   * meaning request is for all nodes in the server's jurisdiction
   */

  if (LOGLEVEL >= 6)
    {
    log_record( PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, "entered");
    }

  if (svr_totnodes <= 0)
    {
    rc = PBSE_NONODES;
    req_reject(rc, 0, preq, NULL, "node list is empty - check 'server_priv/nodes' file");

    return rc;
    }

  name = preq->rq_ind.rq_status.rq_id;

  if ((*name == '\0') || (*name == '@'))
    {
    type = 1;
    }
  else if ((*name == ':') && (*(name + 1) != '\0'))
    {
    if (!strcmp(name + 1, "ALL"))
      {
      type = 1;  /* psuedo-group for all nodes */
      }
    else
      {
      type = 2;
      props.name = name + 1;
      props.mark = 1;
      props.next = NULL;
      }
    }

  preply = &preq->rq_reply;

  preply->brp_choice = BATCH_REPLY_CHOICE_Status;

  CLEAR_HEAD(preply->brp_un.brp_status);

  if (type == 0)
    {
    /* get status of the named node */
    pnode = find_nodebyname(name);
    if (pnode == NULL)
      {
      rc = PBSE_UNKNODE;
      req_reject(rc, 0, preq, NULL, "cannot locate specified node");
      return(rc);
      }

    /* get the status on all of the numa nodes */
    if (pnode->nd_is_alps_reporter == TRUE)
      rc = get_alps_statuses(pnode, preq, &bad, &preply->brp_un.brp_status);
    else
      rc = get_numa_statuses(pnode, preq, &bad, &preply->brp_un.brp_status);

    unlock_node(pnode, __func__, "type == 0", LOGLEVEL);
    }
  else
    {
    /* get status of all or several nodes */
    all_nodes_iterator *iter = NULL;

    while ((pnode = next_host(&allnodes,&iter,NULL)) != NULL)
      {
      if ((type == 2) && 
          (!hasprop(pnode, &props)))
        {
        unlock_node(pnode, __func__, "type != 0, next_host", LOGLEVEL);
        continue;
        }

      /* get the status on all of the numa nodes */
      if (pnode->nd_is_alps_reporter == TRUE)
        rc = get_alps_statuses(pnode, preq, &bad, &preply->brp_un.brp_status);
      else
        rc = get_numa_statuses(pnode, preq, &bad, &preply->brp_un.brp_status);
      
      if (rc != PBSE_NONE)
        {
        unlock_node(pnode, __func__, "type != 0, rc != 0, get_numa_statuses", LOGLEVEL);
        break;
        }

      unlock_node(pnode, __func__, "type != 0, rc == 0, get_numa_statuses", LOGLEVEL);
      }

    if (iter != NULL)
      delete iter;
    }

  if (rc == PBSE_NONE)
    {
    /* SUCCESS */

    reply_send_svr(preq);
    }
  else
    {
    if (rc != PBSE_UNKNODEATR)
      {
      req_reject(rc, 0, preq, NULL, NULL);
      }
    else
      {
      pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

      reply_badattr(rc, bad, pal, preq);
      }
    }

  return(rc);
  }  /* END req_stat_node() */
Exemple #26
0
/*
 * req_deletejob
 * Entry point for a Delete Job request.
 *
 * The request extension, when present, selects one of these behaviours:
 *   - starts with PURGECOMP: handed to purge_completed_jobs(), which answers
 *     the request itself
 *   - starts with the asynchronous-delete string: a duplicate of the request
 *     is created so it can be acknowledged before MOM has been processed
 *   - starts with the delay or purge string: no extra handling here
 *   - anything else: treated as a text message for the job, which requires
 *     operator or manager privilege
 * The delete itself is carried out by handle_delete_all() when the object
 * name is "all" (case-insensitive), by handle_single_delete() otherwise.
 *
 * @param preq - the delete request (I)
 * @return always PBSE_NONE
 */
int req_deletejob(

  struct batch_request *preq)  /* I */

  {
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  char                 *Msg = NULL;
  struct batch_request *preq_tmp = NULL;
  char                 *ext = preq->rq_extend;
  int                   is_delay;
  int                   is_async;
  int                   is_purge;

  if (ext != NULL)
    {
    /* a purgecomplete coming from the scheduler is handled separately */
    if (strncmp(ext, PURGECOMP, strlen(PURGECOMP)) == 0)
      {
      /* purge_completed_jobs will respond with either an ack or reject */
      purge_completed_jobs(preq);

      return(PBSE_NONE);
      }

    is_delay = (strncmp(ext, deldelaystr, strlen(deldelaystr)) == 0);
    is_async = (strncmp(ext, delasyncstr, strlen(delasyncstr)) == 0);
    is_purge = (strncmp(ext, delpurgestr, strlen(delpurgestr)) == 0);

    if (!is_delay && !is_async && !is_purge)
      {
      /* none of the known keywords: the extension is a text message */
      Msg = ext;

      /* Message capability is only for operators and managers.
       * Check if request is authorized */
      if ((preq->rq_perm & (ATR_DFLAG_OPRD | ATR_DFLAG_OPWR |
                            ATR_DFLAG_MGRD | ATR_DFLAG_MGWR)) == 0)
        {
        req_reject(PBSE_PERM, 0, preq, NULL,
          "must have operator or manager privilege to use -m parameter");

        return(PBSE_NONE);
        }
      }
    else if (is_async)
      {
      /*
       * Asynchronous delete: respond with an ack now instead of after MOM
       * processing. Create a new batch request and fill it in. It will be
       * freed by reply_ack
       */
      snprintf(log_buf,sizeof(log_buf), "Deleting job asynchronously");
      log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,preq->rq_ind.rq_delete.rq_objname,log_buf);

      preq_tmp = duplicate_request(preq);
      }
    }

  /* "all" deletes every job, anything else names a single job */
  if (strcasecmp(preq->rq_ind.rq_delete.rq_objname,"all") != 0)
    handle_single_delete(preq, preq_tmp, Msg);
  else
    handle_delete_all(preq, preq_tmp, Msg);

  return(PBSE_NONE);
  }  /* END req_deletejob() */
Exemple #27
0
/*
 * req_stat_job
 *
 * Service a job status request.  The id in the request selects the scope:
 *
 *   leading digit          - a single job, or a job array when
 *                            is_array() says so
 *   leading letter         - all jobs in the named queue
 *   empty or leading '@'   - all jobs at the server
 *   anything else          - rejected with PBSE_IVALREQ
 *
 * The request extension may additionally ask for a "truncated" or
 * "summarize_arrays" style of report (compared case-insensitively).
 *
 * Returns PBSE_NONE once req_stat_job_step2() has been run, the rejection
 * code when the id does not validate (PBSE_UNKJOBID, PBSE_UNKQUE,
 * PBSE_IVALREQ), or PBSE_SYSTEM when the control structure cannot be
 * allocated.  On every error path the request is rejected before returning.
 */
int req_stat_job(

  struct batch_request *preq)  /* ptr to the decoded request */

  {
  struct stat_cntl     *cntl; /* see svrfunc.h  */
  char                 *name;
  job                  *pjob = NULL;
  pbs_queue            *pque = NULL;
  int                   rc = PBSE_NONE;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];

  enum TJobStatTypeEnum type = tjstNONE;

  if (LOGLEVEL >= 7)
    {
    sprintf(log_buf, "note");
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
    }

  /*
   * first, validate the name of the requested object, either
   * a job, a queue, or the whole server.
   */

  /* FORMAT:  name = { <JOBID> | <QUEUEID> | '' } */

  name = preq->rq_ind.rq_status.rq_id;

  if (preq->rq_extend != NULL)
    {
    /* evaluate pbs_job_stat() 'extension' field */

    if (!strncasecmp(preq->rq_extend, "truncated", strlen("truncated")))
      {
      /* truncate response by 'max_report' */

      type = tjstTruncatedServer;
      }
    else if (!strncasecmp(preq->rq_extend, "summarize_arrays", strlen("summarize_arrays")))
      {
      type = tjstSummarizeArraysServer;
      }

    }    /* END if (preq->rq_extend != NULL) */

  /*
   * The <ctype.h> classifiers require an argument representable as
   * unsigned char (or EOF).  name comes from the client, so convert
   * through unsigned char to keep bytes >= 0x80 from turning into
   * negative values where plain char is signed.
   */
  if (isdigit((unsigned char)*name))
    {
    /* status a single job */

    if (is_array(name))
      {
      /* keep a summarize request as is; otherwise report on the array */
      if (type != tjstSummarizeArraysServer)
        {
        type = tjstArray;
        }
      }
    else
      {
      type = tjstJob;

      /* only existence is checked here; the job is unlocked right away */
      if ((pjob = svr_find_job(name, FALSE)) == NULL)
        {
        rc = PBSE_UNKJOBID;
        }
      else
        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
      }
    }
  else if (isalpha((unsigned char)name[0]))
    {
    /* status jobs in a queue: map the server-wide type to its queue form */
    if (type == tjstNONE)
      type = tjstQueue;
    else if (type == tjstSummarizeArraysServer)
      type = tjstSummarizeArraysQueue;
    else
      type = tjstTruncatedQueue;

    /* if found, this mutex is released later */
    if ((pque = find_queuebyname(name)) == NULL)
      {
      rc = PBSE_UNKQUE;
      }
    }
  else if ((*name == '\0') || (*name == '@'))
    {
    /* status all jobs at server */

    if (type == tjstNONE)
      type = tjstServer;
    }
  else
    {
    rc = PBSE_IVALREQ;
    }

  if (rc != 0)
    {
    /* is invalid - an error */
    req_reject(rc, 0, preq, NULL, NULL);

    return(rc);
    }

  preq->rq_reply.brp_choice = BATCH_REPLY_CHOICE_Status;

  CLEAR_HEAD(preq->rq_reply.brp_un.brp_status);

  cntl = (struct stat_cntl *)calloc(1, sizeof(struct stat_cntl));

  if (cntl == NULL)
    {
    if (pque != NULL) 
      unlock_queue(pque, "req_stat_job", (char *)"no memory cntl", LOGLEVEL);
    req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

    return(PBSE_SYSTEM);
    }

  /* truncated reports do not carry the queue into step 2: release it now */
  if ((type == tjstTruncatedQueue) ||
      (type == tjstTruncatedServer))
    {
    if (pque != NULL)
      {
      unlock_queue(pque, __func__, "", LOGLEVEL);
      pque = NULL;
      }
    }

  cntl->sc_type   = (int)type;
  cntl->sc_conn   = -1;
  cntl->sc_pque   = pque;
  cntl->sc_origrq = preq;
  cntl->sc_post   = req_stat_job_step2;
  cntl->sc_jobid[0] = '\0'; /* cause "start from beginning" */

  req_stat_job_step2(cntl); /* go to step 2, see if running is current */

  if (pque != NULL)
    unlock_queue(pque, "req_stat_job", (char *)"success", LOGLEVEL);

  /* the control structure is owned by this function */
  free(cntl);
  return(PBSE_NONE);
  }  /* END req_stat_job() */
Exemple #28
0
/*
 * req_stat_svr
 *
 * Service a server status request.
 *
 * Refreshes the TotalJobs, JobsByState and NetCounter server attributes,
 * then builds a single brp_status entry named after server_name and fills
 * it from the server attribute array via status_attrib().
 *
 * Locking: server.sv_attr_mutex is held only while those attributes are
 * refreshed and is released before the reply is assembled.  No path after
 * that point may unlock it again.
 *
 * Returns PBSE_SYSTEM (after rejecting the request) when the status
 * structure cannot be allocated, PBSE_NONE otherwise.  When status_attrib()
 * reports a bad attribute, the reply is sent through reply_badattr() and
 * the function still returns PBSE_NONE.
 */
int req_stat_svr(

  struct batch_request *preq) /* ptr to the decoded request */

  {
  svrattrl             *pal;

  struct batch_reply   *preply;

  struct brp_status    *pstat;
  int                   bad = 0;
  char                  nc_buf[128];
  int                   numjobs;
  int                   netrates[3];

  memset(netrates, 0, sizeof(netrates));

  /* update count and state counts from sv_numjobs and sv_jobstates */
  lock_sv_qs_mutex(server.sv_qs_mutex, __func__);
  numjobs = server.sv_qs.sv_numjobs;
  unlock_sv_qs_mutex(server.sv_qs_mutex, __func__);
  
  pthread_mutex_lock(server.sv_attr_mutex);
  server.sv_attr[SRV_ATR_TotalJobs].at_val.at_long = numjobs;
  server.sv_attr[SRV_ATR_TotalJobs].at_flags |= ATR_VFLAG_SET;

  pthread_mutex_lock(server.sv_jobstates_mutex);

  update_state_ct(
    &server.sv_attr[SRV_ATR_JobsByState],
    server.sv_jobstates,
    server.sv_jobstbuf);
  
  pthread_mutex_unlock(server.sv_jobstates_mutex);

  /* publish the three network counters as one space-separated string */
  netcounter_get(netrates);
  snprintf(nc_buf, sizeof(nc_buf), "%d %d %d", netrates[0], netrates[1], netrates[2]);

  /* replace the previous NetCounter string with the fresh one */
  if (server.sv_attr[SRV_ATR_NetCounter].at_val.at_str != NULL)
    free(server.sv_attr[SRV_ATR_NetCounter].at_val.at_str);
  server.sv_attr[SRV_ATR_NetCounter].at_val.at_str = strdup(nc_buf);
  if (server.sv_attr[SRV_ATR_NetCounter].at_val.at_str != NULL)
    server.sv_attr[SRV_ATR_NetCounter].at_flags |= ATR_VFLAG_SET;
  pthread_mutex_unlock(server.sv_attr_mutex);

  /* allocate a reply structure and a status sub-structure */

  preply = &preq->rq_reply;
  preply->brp_choice = BATCH_REPLY_CHOICE_Status;

  CLEAR_HEAD(preply->brp_un.brp_status);

  pstat = (struct brp_status *)calloc(1, sizeof(struct brp_status));

  if (pstat == NULL)
    {
    reply_free(preply);

    /*
     * sv_attr_mutex was already released above; unlocking it again here
     * would unlock a mutex this thread does not hold.
     */
    req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

    return(PBSE_SYSTEM);
    }

  CLEAR_LINK(pstat->brp_stlink);

  strcpy(pstat->brp_objname, server_name);

  pstat->brp_objtype = MGR_OBJ_SERVER;

  CLEAR_HEAD(pstat->brp_attr);

  append_link(&preply->brp_un.brp_status, &pstat->brp_stlink, pstat);

  /* add attributes to the status reply */

  pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

  if (status_attrib(
        pal,
        svr_attr_def,
        server.sv_attr,
        SRV_ATR_LAST,
        preq->rq_perm,
        &pstat->brp_attr,
        &bad,
        1))    /* IsOwner == TRUE */
    {
    /* a requested attribute could not be reported; bad identifies it */
    reply_badattr(PBSE_NOATTR, bad, pal, preq);
    }
  else
    {
    reply_send_svr(preq);
    }

  return(PBSE_NONE);
  }  /* END req_stat_svr() */
Exemple #29
0
/*
 * post_delete_mom1
 *
 * Runs after the first signal request of a job delete has been answered
 * by MOM.
 *
 * pwt->wt_parm1 is the id used to look up (and remove) the signal batch
 * request.  That request's rq_extra holds the id of the original client
 * request, if it is still outstanding.  The work task and its mutex are
 * freed here.
 *
 * Outcomes:
 *   - signal request or client request no longer registered: nothing to do
 *   - job no longer exists:        client request rejected, PBSE_UNKJOBID
 *   - MOM answered PBSE_UNKJOBID:  resources released, job purged,
 *                                  client acked
 *   - MOM answered another error:  client request rejected with that code
 *   - success:                     client acked, post_delete_mom2 scheduled
 *                                  after the kill delay and the delete
 *                                  nanny pushed 60 seconds beyond that
 *
 * The kill delay comes from the deldelaystr extension of the client
 * request; when that yields 0, from the queue/server KillDelay
 * attributes, with 2 passed to attr_ifelse_long() as the final fallback.
 */
static void post_delete_mom1(

  struct work_task *pwt)

  {
  int                   delay = 0;
  int                   dellen = strlen(deldelaystr);
  job                  *pjob;

  pbs_queue            *pque;

  char                 *preq_clt_id;
  struct batch_request *preq_sig;         /* signal request to MOM */

  struct batch_request *preq_clt = NULL;  /* original client request */
  int                   rc;
  time_t                time_now = time(NULL);

  preq_sig = get_remove_batch_request((char *)pwt->wt_parm1);
  
  free(pwt->wt_mutex);
  free(pwt);

  if (preq_sig == NULL)
    return;

  /* keep what is needed from the signal request before releasing it */
  rc          = preq_sig->rq_reply.brp_code;
  preq_clt_id = preq_sig->rq_extra;

  free_br(preq_sig);

  if (preq_clt_id != NULL)
    {
    preq_clt = get_remove_batch_request(preq_clt_id);
    free(preq_clt_id);
    }

  /* the client request has been handled another way, nothing left to do */
  if (preq_clt == NULL)
    return;

  pjob = svr_find_job(preq_clt->rq_ind.rq_delete.rq_objname, FALSE);

  if (pjob == NULL)
    {
    /* job has gone away */
    req_reject(PBSE_UNKJOBID, 0, preq_clt, NULL, NULL);

    return;
    }

  if (rc)
    {
    /* mom rejected request */

    if (rc == PBSE_UNKJOBID)
      {
      /* MOM claims no knowledge, so just purge it */
      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        "MOM rejected signal during delete");

      /* removed the resources assigned to job */

      free_nodes(pjob);

      set_resc_assigned(pjob, DECR);

      svr_job_purge(pjob);

      reply_ack(preq_clt);
      }
    else
      {
      req_reject(rc, 0, preq_clt, NULL, NULL);

      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
      }

    return;
    }

  /* read the requested delay before the ack below gives up preq_clt */
  if (preq_clt->rq_extend)
    {
    if (strncmp(preq_clt->rq_extend, deldelaystr, dellen) == 0)
      {
      delay = atoi(preq_clt->rq_extend + dellen);
      }
    }

  reply_ack(preq_clt);  /* dont need it, reply now */

  /*
   * if no delay specified in original request, see if kill_delay
   * queue attribute is set.
   */
  if (delay == 0)
    {
    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      pthread_mutex_lock(server.sv_attr_mutex);
      delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay],
                             &server.sv_attr[SRV_ATR_KillDelay],
                             2);
      pthread_mutex_unlock(server.sv_attr_mutex);
      unlock_queue(pque, __func__, NULL, LOGLEVEL);
      }
    else if (pjob == NULL)
      {
      /*
       * get_jobs_queue() takes the job pointer by address and may clear
       * it.  With no job left there is nothing to schedule or unlock,
       * and continuing would dereference NULL below.  When the job is
       * still held, fall through with delay == 0 so phase two is
       * scheduled and the job is unlocked.
       */
      return;
      }
    }

  set_task(WORK_Timed, delay + time_now, post_delete_mom2, strdup(pjob->ji_qs.ji_jobid), FALSE);

  /*
   * Since the first signal has succeeded, let's reschedule the
   * nanny to be 1 minute after the second phase.
   */
  apply_job_delete_nanny(pjob, time_now + delay + 60);

  unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
  }  /* END post_delete_mom1() */
Exemple #30
0
/*
 * req_py_spawn - validate a py_spawn request and relay it to MOM.
 *
 * The request is accepted only for a regular job or a single subjob that
 * is in the running state and running substate, and only when
 * svr_chk_owner() returns 0 for the requestor.  Any other job type gets a
 * PBSE_NOSUP text reply.
 *
 * On success the request is handed to relay_to_mom() with
 * post_py_spawn_req as the reply handler; if the relay fails, the request
 * is rejected with the relay's return code.
 */
void
req_py_spawn(struct batch_request *preq)
{
	char	*jid = preq->rq_ind.rq_py_spawn.rq_jid;
	job	*target;
	int	job_type;
	int	relay_rc;

	/*
	 ** chk_job_request returns the job pointer for a singleton job
	 ** or the "parent" of an array job.
	 */
	target = chk_job_request(jid, preq, &job_type);
	if (target == NULL)
		return;

	/* only the job owner may spawn */
	if (svr_chk_owner(preq, target) != 0) {
		req_reject(PBSE_PERM, 0, preq);
		return;
	}

	/* whole arrays and ranges of subjobs are refused up front */
	if ((job_type != IS_ARRAY_NO) && (job_type != IS_ARRAY_Single)) {
		reply_text(preq, PBSE_NOSUP,
			"not supported for Array Jobs or multiple sub-jobs");
		return;
	}

	if (job_type == IS_ARRAY_NO) {
		/* a regular job must be running */
		if (!((target->ji_qs.ji_state == JOB_STATE_RUNNING) &&
			(target->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING))) {
			req_reject(PBSE_BADSTATE, 0, preq);
			return;
		}
	} else {
		/* a single subjob: locate it through the parent's tracking table */
		int	slot;
		int	subjob_state;

		slot = subjob_index_to_offset(target, get_index_from_jid(jid));
		if (slot == -1) {
			req_reject(PBSE_UNKJOBID, 0, preq);
			return;
		}

		subjob_state = get_subjob_state(target, slot);
		if (subjob_state == -1) {
			req_reject(PBSE_IVALREQ, 0, preq);
			return;
		}
		if (subjob_state != JOB_STATE_RUNNING) {
			req_reject(PBSE_BADSTATE, 0, preq);
			return;
		}

		/* from here on, work with the subjob itself */
		target = target->ji_ajtrk->tkm_tbl[slot].trk_psubjob;
		if (target == NULL) {
			req_reject(PBSE_UNKJOBID, 0, preq);
			return;
		}
		if (target->ji_qs.ji_substate != JOB_SUBSTATE_RUNNING) {
			req_reject(PBSE_BADSTATE, 0, preq);
			return;
		}
	}

	/*
	 ** Forward to MOM.  When the relay succeeds, post_py_spawn_req
	 ** handles the reply; when it fails, answer the client now.
	 */
	relay_rc = relay_to_mom(target, preq, post_py_spawn_req);
	if (relay_rc != 0)
		req_reject(relay_rc, 0, preq);	/* unable to get to MOM */
}