Beispiel #1
0
/*
 * queue_route - walk the job list of a queue and try to route every job
 * whose routing retry time is not later than time_now.
 *
 * pque - queue whose qu_jobs list is scanned
 *
 * A job is handed to job_abt() when job_route() reports PBSE_ROUTEREJ or
 * PBSE_ROUTEEXPD; any other result leaves the job untouched.
 */
void queue_route(

  pbs_queue *pque)

  {
  job *cur;
  job *following;
  int  route_rc;

  for (cur = (job *)GET_NEXT(pque->qu_jobs); cur != NULL; cur = following)
    {
    /* remember the successor before anything is done with the current job */
    following = (job *)GET_NEXT(cur->ji_jobque);

    /* retry time still lies in the future - leave this job alone */
    if (cur->ji_qs.ji_un.ji_routet.ji_rteretry > time_now)
      continue;

    route_rc = job_route(cur);

    if (route_rc == PBSE_ROUTEREJ)
      {
      job_abt(&cur, pbse_to_txt(PBSE_ROUTEREJ));
      }
    else if (route_rc == PBSE_ROUTEEXPD)
      {
      job_abt(&cur, msg_routexceed);
      }
    }
  }
Beispiel #2
0
/*
 * reroute_job - make one routing attempt for a job.
 *
 * pjob - the job to route
 *
 * The job is handed to job_abt() when job_route() reports PBSE_ROUTEREJ,
 * PBSE_ROUTEEXPD or PBSE_QUENOEN.
 *
 * Returns whatever job_route() returned.
 */
int reroute_job(

    job *pjob)

{
    char  log_buf[LOCAL_LOG_BUF_SIZE];
    int   route_rc;

    /* at high log levels record which job is being rerouted */
    if (LOGLEVEL >= 8)
    {
        sprintf(log_buf, "%s", pjob->ji_qs.ji_jobid);
        LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
    }

    route_rc = job_route(pjob);

    if (route_rc == PBSE_ROUTEREJ)
    {
        job_abt(&pjob, pbse_to_txt(PBSE_ROUTEREJ));
    }
    else if (route_rc == PBSE_ROUTEEXPD)
    {
        job_abt(&pjob, msg_routexceed);
    }
    else if (route_rc == PBSE_QUENOEN)
    {
        job_abt(&pjob, msg_err_noqueue);
    }

    return(route_rc);
} /* END reroute_job() */
/*
 * reroute_job - make one routing attempt for a job, but only when the job's
 * queue is a QTYPE_RoutePush queue.
 *
 * pjob - the job to route
 * pque - the queue the job is in; may be NULL, in which case nothing is routed
 *
 * Returns PBSE_NONE when no routing was attempted, otherwise the value
 * reported by job_route(). The job is handed to job_abt() on PBSE_ROUTEREJ,
 * PBSE_ROUTEEXPD and PBSE_QUENOEN.
 */
int reroute_job(

  job *pjob,
  pbs_queue *pque)

  {
  char  log_buf[LOCAL_LOG_BUF_SIZE];
  int   route_rc;

  if (LOGLEVEL >= 7)
    {
    sprintf(log_buf, "%s", pjob->ji_qs.ji_jobid);
    LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
    }

  /* nothing to do unless the job sits in a push-routing queue */
  if ((pque == NULL) ||
      (pque->qu_qs.qu_type != QTYPE_RoutePush))
    {
    return(PBSE_NONE);
    }

  route_rc = job_route(pjob);

  if (route_rc == PBSE_ROUTEREJ)
    {
    job_abt(&pjob, pbse_to_txt(PBSE_ROUTEREJ));
    }
  else if (route_rc == PBSE_ROUTEEXPD)
    {
    job_abt(&pjob, msg_routexceed);
    }
  else if (route_rc == PBSE_QUENOEN)
    {
    job_abt(&pjob, msg_err_noqueue);
    }

  return(route_rc);
  } /* END reroute_job() */
Beispiel #4
0
END_TEST

/* job_abt_test - job_abt() must return a non-zero value for NULL input */
START_TEST(job_abt_test)
  {
  int result = 0;
  struct job *null_job = NULL;

  /* case 1: no job handle at all */
  result = job_abt(NULL, NULL);
  fail_unless(result != 0, "NULL input check fail");

  /* case 2: a valid handle that refers to a NULL job */
  result = job_abt(&null_job, NULL);
  fail_unless(result != 0, "NULL input check fail");
  }
Beispiel #5
0
/*
 * set_array_job_ids - tie a job whose id contains '[' back to its job array.
 *
 * pjob    - handle to the job; the job is aborted through this handle when
 *           its array cannot be found, so any change job_abt() makes to the
 *           pointer is visible to the caller
 * log_buf - buffer that receives an error description on failure
 * buflen  - size of log_buf in bytes
 *
 * Returns PBSE_NONE on success (and always when built with PBS_MOM defined),
 * -1 when the job's array is missing.
 */
int set_array_job_ids(

  job  **pjob,       /* M */
  char  *log_buf,    /* error Buffer */
  size_t buflen)     /* error buffer length */

  {
  int rc = PBSE_NONE;
#ifndef PBS_MOM
  job *pj = *pjob;
  job_array *pa;
  char       parent_id[PBS_MAXSVRJOBID + 1];

  if (strchr(pj->ji_qs.ji_jobid, '[') != NULL)
    {
    /* job is part of an array.  We need to put a link back to the server
    job array struct for this array. We also have to link this job into
    the linked list of jobs belonging to the array. */

    array_get_parent_id(pj->ji_qs.ji_jobid, parent_id);
    pa = get_array(parent_id);
    if (pa == NULL)
      {
      /* abort via the caller's handle rather than a local copy, so the
         caller is not left with a pointer job_abt() may have cleared */
      job_abt(pjob, (char *)"Array job missing array struct, aborting job");
      snprintf(log_buf, buflen, "Array job missing array struct %s", __func__);
      return -1;
      }

    strcpy(pj->ji_arraystructid, parent_id);

    if (strcmp(parent_id, pj->ji_qs.ji_jobid) == 0)
      {
      /* the job id equals the array id: this is the array template */
      pj->ji_is_array_template = TRUE;
      }
    else
      {
      /* record this sub-job in the slot given by its array index attribute */
      pa->job_ids[(int)pj->ji_wattr[JOB_ATR_job_array_id].at_val.at_long] = strdup(pj->ji_qs.ji_jobid);
      pa->jobs_recovered++;

      /* This is a bit of a kluge, but for some reason if an array job was 
         on hold when the server went down the ji_wattr[JOB_ATR_hold].at_val.at_long
         value is 0 on recovery even though pj->ji_qs.ji_state is JOB_STATE_HELD and
         the substate is JOB_SUBSTATE_HELD
      */
      if ((pj->ji_qs.ji_state == JOB_STATE_HELD) &&
          (pj->ji_qs.ji_substate == JOB_SUBSTATE_HELD))
        {
        pj->ji_wattr[JOB_ATR_hold].at_val.at_long = HOLD_l;
        pj->ji_wattr[JOB_ATR_hold].at_flags = ATR_VFLAG_SET;
        }
      }

    /* pa is known to be non-NULL here (the NULL case returned above) */
    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
    }
#endif /* !PBS_MOM */
  return rc;
  }
Beispiel #6
0
static void close_quejob(

  int sfds)

  {
  job *pjob;
  job *npjob;

  pjob = (job *)GET_NEXT(svr_newjobs);

  while (pjob != NULL)
    {
    npjob = GET_NEXT(pjob->ji_alljobs);

    if (pjob->ji_qs.ji_un.ji_newt.ji_fromsock == sfds)
      {
      if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_TRANSICM)
        {

#ifndef PBS_MOM

        if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_HERE)
          {
          /*
           * the job was being created here for the first time
           * go ahead and enqueue it as QUEUED; otherwise, hold
           * it here as TRANSICM until we hear from the sending
           * server again to commit.
           */

          delete_link(&pjob->ji_alljobs);

          pjob->ji_qs.ji_state = JOB_STATE_QUEUED;
          pjob->ji_qs.ji_substate = JOB_SUBSTATE_QUEUED;

          if (svr_enquejob(pjob))
            job_abt(&pjob, msg_err_noqueue);
          }

#endif /* PBS_MOM */

        }
      else
        {
        /* else delete the job */

        delete_link(&pjob->ji_alljobs);

        job_purge(pjob);
        }

      break;
      }  /* END if (..) */

    pjob = npjob;
    }

  return;
  }  /* END close_quejob() */
Beispiel #7
0
/*
 * rerun_or_kill - decide what happens to a job: mark it for rerun, abort it,
 * or leave it running, and log the decision.
 *
 * pjob - the job being handled
 * text - text appended to the log message
 *
 * rerunable job                          -> SIGKILL issued, substate RERUN
 * not rerunable, state != SV_STATE_SHUTDEL -> logged, then job_abt()
 * not rerunable, state == SV_STATE_SHUTDEL -> job is left running
 */
static void rerun_or_kill(

  job  *pjob,  /* I (modified/freed) */
  char *text)  /* I */

  {
  long server_state = server.sv_attr[(int)SRV_ATR_State].at_val.at_long;
  int  can_rerun = (pjob->ji_wattr[(int)JOB_ATR_rerunable].at_val.at_long != 0);

  if (!can_rerun && (server_state != SV_STATE_SHUTDEL))
    {
    /* job not rerunable, immediate shutdown - kill it off */

    strcpy(log_buffer, msg_job_abort);
    strcat(log_buffer, text);

    /* the log record has to be written before job_abt() gets the job */

    log_event(
      PBSEVENT_SYSTEM | PBSEVENT_JOB | PBSEVENT_DEBUG,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);

    job_abt(&pjob, log_buffer);

    return;
    }

  if (can_rerun)
    {
    /* job is rerunable, mark it to be requeued */

    issue_signal(pjob, "SIGKILL", release_req, 0);

    pjob->ji_qs.ji_substate  = JOB_SUBSTATE_RERUN;

    strcpy(log_buffer, msg_init_queued);
    strcat(log_buffer, pjob->ji_qhdr->qu_qs.qu_name);
    strcat(log_buffer, text);
    }
  else
    {
    /* delayed shutdown, leave job running */

    strcpy(log_buffer, msg_leftrunning);
    strcat(log_buffer, text);
    }

  log_event(
    PBSEVENT_SYSTEM | PBSEVENT_JOB | PBSEVENT_DEBUG,
    PBS_EVENTCLASS_JOB,
    pjob->ji_qs.ji_jobid,
    log_buffer);
  }  /* END rerun_or_kill() */
Beispiel #8
0
int close_quejob_by_jobid(
    
  char *job_id)

  {
  int    rc = PBSE_NONE;
  job   *pjob = NULL;

  if (LOGLEVEL >= 10)
    {
    LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, job_id);
    }

  if ((pjob = svr_find_job(job_id, FALSE)) == NULL)
    {
    rc = PBSE_JOBNOTFOUND;
    return(rc);
    }

  mutex_mgr pjob_mutex = mutex_mgr(pjob->ji_mutex, true);
  if (pjob->ji_qs.ji_substate != JOB_SUBSTATE_TRANSICM)
    {
    remove_job(&newjobs,pjob);
    svr_job_purge(pjob);
    pjob = NULL;
    }
  else if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_HERE)
    {
    remove_job(&newjobs,pjob);
    pjob->ji_qs.ji_state = JOB_STATE_QUEUED;
    pjob->ji_qs.ji_substate = JOB_SUBSTATE_QUEUED;
    rc = svr_enquejob(pjob, FALSE, -1, false);

    if ((rc == PBSE_JOBNOTFOUND) ||
        (rc == PBSE_JOB_RECYCLED))
      {
      pjob = NULL;
      }
    else if (rc != PBSE_NONE)
      {
      job_abt(&pjob, msg_err_noqueue);
      pjob = NULL;
      }
    }

  if (pjob == NULL)
    pjob_mutex.set_lock_on_exit(false);

  return(rc);
  } /* close_quejob_by_jobid() */
Beispiel #9
0
/*
 * stat_update - process the reply to a job status request.
 *
 * preq - the request; its rq_reply member holds the reply being processed
 * cntl - status control block; sc_conn is reset to -1, sc_post (if set) is
 *        invoked, and the block is freed before returning
 *
 * Status reply : every job named in the reply that can still be found has
 *                its attributes updated and ji_momstat set to the current
 *                time; the job is saved when its session id changed (or
 *                always, when built with USESAVEDRESOURCES).
 * Text reply with PBSE_UNKJOBID whose message names the requested job:
 *                the job is set to QUEUED/ABORT, its resources are released
 *                and it is handed to job_abt().
 * Anything else: an error is logged.
 */
void stat_update(
    
  struct batch_request *preq,
  struct stat_cntl     *cntl)

  {
  job                  *pjob;
  struct batch_reply   *preply;
  struct brp_status    *pstatus;
  svrattrl             *sattrl;
  int                   oldsid;
  int                   bad = 0;
  time_t                time_now = time(NULL);
  char                 *msg_ptr = NULL;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];

  preply = &preq->rq_reply;

  /* brp_un carries the status list for Status replies and the text for Text
     replies (presumably a union keyed by brp_choice), so brp_txt.brp_str may
     only be inspected once brp_choice says this is a Text reply. msg_ptr is
     consumed by the Text branch below only. */
  if ((preply->brp_choice == BATCH_REPLY_CHOICE_Text) &&
      (preply->brp_un.brp_txt.brp_str != NULL))
    {
    msg_ptr = strstr(preply->brp_un.brp_txt.brp_str, PBS_MSG_EQUAL);
  
    /* step over the marker so msg_ptr points at what follows it */
    if (msg_ptr != NULL)
      msg_ptr += strlen(PBS_MSG_EQUAL);
    }

  if (preply->brp_choice == BATCH_REPLY_CHOICE_Status)
    {
    pstatus = (struct brp_status *)GET_NEXT(preply->brp_un.brp_status);

    while (pstatus != NULL)
      {
      if ((pjob = svr_find_job(pstatus->brp_objname, FALSE)) != NULL)
        {
        mutex_mgr job_mutex(pjob->ji_mutex, true);

        sattrl = (svrattrl *)GET_NEXT(pstatus->brp_attr);

        /* remember the session id so a change can be detected afterwards */
        oldsid = pjob->ji_wattr[JOB_ATR_session_id].at_val.at_long;

        modify_job_attr(
          pjob,
          sattrl,
          ATR_DFLAG_MGWR | ATR_DFLAG_SvWR,
          &bad);

        if (oldsid != pjob->ji_wattr[JOB_ATR_session_id].at_val.at_long)
          {
          /* first save since running job (or the sid has changed), */
          /* must save session id    */

          job_save(pjob, SAVEJOB_FULL, 0);
          }

#ifdef USESAVEDRESOURCES
        else
          {
          /* save so we can recover resources used */
          job_save(pjob, SAVEJOB_FULL, 0);
          }
#endif    /* USESAVEDRESOURCES */

        pjob->ji_momstat = time_now;
        }

      pstatus = (struct brp_status *)GET_NEXT(pstatus->brp_stlink);
      }  /* END while (pstatus != NULL) */
    }    /* END if (preply->brp_choice == BATCH_REPLY_CHOICE_Status) */
  else if ((preply->brp_choice == BATCH_REPLY_CHOICE_Text) &&
           (preply->brp_code == PBSE_UNKJOBID) &&
           (msg_ptr != NULL) &&
           (!strcmp(msg_ptr,  preq->rq_ind.rq_status.rq_id)))
    {
    /* we sent a stat request, but mom says it doesn't know anything about
       the job */
    if ((pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE)) != NULL)
      {
      /* job really isn't running any more - mom doesn't know anything about it
         this can happen if a diskless node reboots and the mom_priv/jobs
         directory is cleared, set its state to queued so job_abt doesn't
         think it is still running */
      mutex_mgr job_mutex(pjob->ji_mutex, true);
      
      snprintf(log_buf, sizeof(log_buf),
        "mother superior no longer recognizes %s as a valid job, aborting. Last reported time was %ld",
        preq->rq_ind.rq_status.rq_id, pjob->ji_last_reported_time);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
      
      svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_ABORT, FALSE);
      rel_resc(pjob);

      /* the mutex manager must not unlock on scope exit once the job has
         been given to job_abt() */
      job_mutex.set_unlock_on_exit(false);
      job_abt(&pjob, "Job does not exist on node");

      /* TODO, if the job is rerunnable we should set its state back to queued */
      }
    }
  else
    {
    snprintf(log_buf, sizeof(log_buf),
      "Poll job request failed for job %s", preq->rq_ind.rq_status.rq_id);
    log_err(preply->brp_code, __func__, log_buf);
    }
  
  cntl->sc_conn = -1;

  if (cntl->sc_post)
    cntl->sc_post(cntl); /* continue where we left off */

  /* If sc_post has a value it is:
   * req_stat_job_step2
   * if so, it expects cntl to be free'd after the call
   */
  free(cntl); /* a bit of a kludge but its saves an extra func */

  return;
  }  /* END stat_update() */
Beispiel #10
0
/**
 * attempt_delete()
 * deletes a job differently depending on the job's state
 *
 * @param j - the job being handled, passed as a void pointer; a NULL
 *            pointer is treated as an already deleted job
 * @return TRUE if the job was deleted (or j was NULL), FALSE if skipped
 */
int attempt_delete(

  void *j) /* I */

  {
  int        skipped = FALSE;
  int        release_mutex = TRUE;

  job       *pjob;
  time_t     time_now = time(NULL);
  char       log_buf[LOCAL_LOG_BUF_SIZE];

  /* job considered deleted if null */
  if (j == NULL)
    return(TRUE);

  pjob = (job *)j;

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /* I'm not sure if this is still possible since the thread
     * waits on the job to finish transmiting, but I'll leave
     * this part here --dbeer */
    skipped = TRUE;
    
    return(!skipped);
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT) */

  else if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN)
    {
    /* we'll wait for the mom to get this job, then delete it */
    /* NOTE(review): only the return value is affected here - control still
     * continues into the delete handling below; confirm that is intended */
    skipped = TRUE;
    }  /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */

  else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* set up nanny */
    
    if (pjob->ji_has_delete_nanny == FALSE)
      {
      apply_job_delete_nanny(pjob, time_now + 60);
      
      /* need to issue a signal to the mom, but we don't want to sent an ack to the
       * client when the mom replies */
      issue_signal(&pjob, "SIGTERM", post_delete, NULL);
      }

    /* issue_signal() was handed &pjob, so re-check the pointer */
    if (pjob != NULL)
      {
      if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
        {
        /* job has restart file at mom, change restart comment if failed */
        change_restart_comment_if_needed(pjob);
        }

      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
      }
    
    return(!skipped);
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, change restart comment if failed */    
    change_restart_comment_if_needed(pjob);
    
    /* job has restart file at mom, do end job processing */
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE);

    pjob->ji_momhandle = -1;
    
    /* force new connection */
    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "calling on_job_exit from %s", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      }

    set_task(WORK_Immed, 0, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in file, should remove them */
    
    remove_stagein(&pjob);
    
    if (pjob != NULL)
      job_abt(&pjob, NULL);

    /* the job mutex is not released by this function on this path */
    release_mutex = FALSE;
    }
  else
    {
    /*
     * the job is not transitting (though it may have been) and
     * is not running, so put in into a complete state.
     */
    struct pbs_queue *pque;
    int  KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE, FALSE);
    
    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      pque->qu_numcompleted++;

      unlock_queue(pque, __func__, NULL, LOGLEVEL);
      }
    
    if (pjob != NULL)
      {
      /* the queue attribute can only be consulted when a queue was found;
       * without one KeepSeconds keeps its initial value of 0 */
      if (pque != NULL)
        {
        pthread_mutex_lock(server.sv_attr_mutex);
        KeepSeconds = attr_ifelse_long(
          &pque->qu_attr[QE_ATR_KeepCompleted],
          &server.sv_attr[SRV_ATR_KeepCompleted],
          0);
        pthread_mutex_unlock(server.sv_attr_mutex);
        }
      
      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "calling on_job_exit from %s", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }
      
      /* schedule the exit processing once the keep-completed time is over */
      set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
      }
    else
      release_mutex = FALSE;
    }

  if (release_mutex == TRUE)
    unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);

  return(!skipped);
  } /* END attempt_delete() */
Beispiel #11
0
/*
 * array_delete_wt - work task that follows up on an array delete request.
 *
 * ptask - the work task; wt_parm1 identifies the batch request. The task's
 *         wt_mutex and the task itself are freed here.
 *
 * Every job id still recorded in the array is looked up. Ids whose job no
 * longer exists are cleared from the array. Jobs in substate PRERUN are
 * counted (and given end-of-job processing when they have a checkpoint
 * file); all other jobs are handed to job_abt(). If every remaining job was
 * in PRERUN the request is acknowledged, otherwise req_deletearray() is
 * called again.
 */
void array_delete_wt(
    
  struct work_task *ptask)

  {
  struct batch_request *preq;
  job_array            *pa;

  int                   i;

  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  int                   num_jobs = 0;
  int                   num_prerun = 0;
  job                  *pjob;

  preq = get_remove_batch_request((char *)ptask->wt_parm1);
  
  free(ptask->wt_mutex);
  free(ptask);

  if (preq == NULL)
    return;

  pa = get_array(preq->rq_ind.rq_delete.rq_objname);

  if (pa == NULL)
    {
    /* jobs must have exited already */
    reply_ack(preq);

    return;
    }

  for (i = 0; i < pa->ai_qs.array_size; i++)
    {
    if (pa->job_ids[i] == NULL)
      continue;
    
    if ((pjob = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
      {
      /* the job is gone: forget its id */
      free(pa->job_ids[i]);
      pa->job_ids[i] = NULL;
      }
    else
      {
      num_jobs++;
      
      if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN)
        {
        num_prerun++;
        /* mom still hasn't gotten job?? delete anyway */
        
        if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
          {
          /* job has restart file at mom, do end job processing */
          change_restart_comment_if_needed(pjob);
          
          svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE);
          
          pjob->ji_momhandle = -1;
          
          /* force new connection */
          if (LOGLEVEL >= 7)
            {
            sprintf(log_buf, "calling on_job_exit from %s", __func__);
            log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
            }
          set_task(WORK_Immed, 0, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
          }

        /* release the job mutex for every PRERUN job, not only for those
         * with a checkpoint file - nothing else on this path releases it */
        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
        } /* END if (ji_substate == JOB_SUBSTATE_PRERUN) */
      else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
        {
        /* job has staged-in file, should remove them */
        remove_stagein(&pjob);
        
        if (pjob != NULL)
          {
          /* job_abt() calls svr_job_purge which will try to lock the array again */
          pthread_mutex_unlock(pa->ai_mutex);
          job_abt(&pjob, NULL);
          pthread_mutex_lock(pa->ai_mutex);
          }
        }
      else
        {
        /* job_abt() calls svr_job_purge which will try to lock the array again */
        pthread_mutex_unlock(pa->ai_mutex);
        job_abt(&pjob, NULL);
        pthread_mutex_lock(pa->ai_mutex);
        }
      } /* END else (job was found) */
    } /* END for each job in array */
  
  pthread_mutex_unlock(pa->ai_mutex);
  if (LOGLEVEL >= 7)
    {
    sprintf(log_buf, "%s: unlocked ai_mutex", __func__);
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
    }
  
  /* all remaining jobs were PRERUN (or there were none): acknowledge now;
   * otherwise run the array delete again */
  if (num_jobs == num_prerun)
    {
    reply_ack(preq);
    }
  else
    {
    req_deletearray(preq);
    }

  } /* END array_delete_wt() */
Beispiel #12
0
/**
 * @brief
 * 		post_routejob - clean up action for child started in net_move/send_job
 *		   to "route" a job to another server
 * @par
 * 		If route was successfull, delete job.
 * @par
 * 		If route didn't work, mark destination not to be tried again for this
 * 		job and call route again.
 *
 * @param[in]	pwt	-	work task structure
 *
 * @return	none.
 */
static void
post_routejob(struct work_task *pwt)
{
	int	 newstate;
	int	 newsub;
	int	 r;
	int	 stat = pwt->wt_aux;
	job	*jobp = (job *)pwt->wt_parm2;

	if (jobp == NULL) {
		log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_JOB, LOG_INFO, "", "post_routejob failed, jobp NULL");
		return;
	}

	if (WIFEXITED(stat)) {
		r = WEXITSTATUS(stat);
	} else {
		r = SEND_JOB_FATAL;
		(void)sprintf(log_buffer, msg_badexit, stat);
		(void)strcat(log_buffer, __func__);
		log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, LOG_NOTICE,
			jobp->ji_qs.ji_jobid, log_buffer);
	}

	switch (r) {
		case SEND_JOB_OK:		/* normal return, job was routed */

			if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn)
				remove_stagein(jobp);
			/*
			 * If the server is configured to keep job history and the job
			 * is created here, do not purge the job structure but save
			 * it for history purpose. No need to check for sub-jobs as
			 * sub jobs can not be routed.
			 */
			if (svr_chk_history_conf())
				svr_setjob_histinfo(jobp, T_MOV_JOB);
			else
				job_purge(jobp); /* need to remove server job struct */
			return;
		case SEND_JOB_FATAL:		/* permanent rejection (or signal) */
			if (jobp->ji_qs.ji_substate == JOB_SUBSTATE_ABORT) {

				/* Job Delete in progress, just set to queued status */

				(void)svr_setjobstate(jobp, JOB_STATE_QUEUED,
					JOB_SUBSTATE_ABORT);
				return;
			}
			add_dest(jobp);		/* else mark destination as bad */
			/* fall through */
		default :	/* try routing again */
			/* force re-eval of job state out of Transit */
			svr_evaljobstate(jobp, &newstate, &newsub, 1);
			(void)svr_setjobstate(jobp, newstate, newsub);
			jobp->ji_retryok = 1;
			if ((r = job_route(jobp)) == PBSE_ROUTEREJ)
				(void)job_abt(jobp, msg_routebad);
			else if (r != 0)
				(void)job_abt(jobp, msg_routexceed);
			break;
	}
	return;
}
Beispiel #13
0
void stat_update(
    
  struct batch_request *preq,
  struct stat_cntl     *cntl)

  {
  job                  *pjob;
  struct batch_reply   *preply;
  struct brp_status    *pstatus;
  svrattrl             *sattrl;
  int                   oldsid;
  int                   bad = 0;
  time_t                time_now = time(NULL);

  preply = &preq->rq_reply;

  if (preply->brp_choice == BATCH_REPLY_CHOICE_Status)
    {
    pstatus = (struct brp_status *)GET_NEXT(preply->brp_un.brp_status);

    while (pstatus != NULL)
      {
      if ((pjob = svr_find_job(pstatus->brp_objname, FALSE)) != NULL)
        {
        sattrl = (svrattrl *)GET_NEXT(pstatus->brp_attr);

        oldsid = pjob->ji_wattr[JOB_ATR_session_id].at_val.at_long;

        modify_job_attr(
          pjob,
          sattrl,
          ATR_DFLAG_MGWR | ATR_DFLAG_SvWR,
          &bad);

        if (oldsid != pjob->ji_wattr[JOB_ATR_session_id].at_val.at_long)
          {
          /* first save since running job (or the sid has changed), */
          /* must save session id    */

          job_save(pjob, SAVEJOB_FULL, 0);
          }

#ifdef USESAVEDRESOURCES
        else
          {
          /* save so we can recover resources used */
          job_save(pjob, SAVEJOB_FULL, 0);
          }
#endif    /* USESAVEDRESOURCES */

        pjob->ji_momstat = time_now;

        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
        }

      pstatus = (struct brp_status *)GET_NEXT(pstatus->brp_stlink);
      }  /* END while (pstatus != NULL) */
    }    /* END if (preply->brp_choice == BATCH_REPLY_CHOICE_Status) */
  else
    {
    if (preply->brp_code == PBSE_UNKJOBID)
      {
      /* we sent a stat request, but mom says it doesn't know anything about
         the job */
      if ((pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE)) != NULL)
        {
        /* job really isn't running any more - mom doesn't know anything about it
           this can happen if a diskless node reboots and the mom_priv/jobs
           directory is cleared, set its state to queued so job_abt doesn't
           think it is still running */
        svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_ABORT, FALSE);
        rel_resc(pjob);
        job_abt(&pjob, "Job does not exist on node");

        /* TODO, if the job is rerunnable we should set its state back to queued */

        }
      }
    }
  cntl->sc_conn = -1;

  /* MUTSU - Unlock job here? */
  if (cntl->sc_post)
    cntl->sc_post(cntl); /* continue where we left off */

  /* If sc_post has a value it is:
   * req_stat_job_step2
   * if so, it expects cntl to be free'd after the call
   */
  free(cntl); /* a bit of a kludge but its saves an extra func */

  return;
  }  /* END stat_update() */
Beispiel #14
0
/**
 * @brief
 * 		post_doq - work-task handler run with the reply to a dependency
 *		register request (the request is taken from pwt->wt_parm1).
 * @par
 * 		Nothing but the final release_req() happens when the reply code is
 * 		zero. On rejection the event is logged and, if the child job still
 * 		exists, the child job is aborted - unless the parent was moved
 * 		(PBSE_JOB_MOVED) and the dependency request could be forwarded to the
 * 		server named after '@' in the parent's destination.
 *
 * @param[in]	pwt	-	work task structure
 *
 * @return	none.
 */
static void
post_doq(struct work_task *pwt)
{
	struct batch_request *preq = (struct batch_request *)pwt->wt_parm1;
	char *jobid = preq->rq_ind.rq_register.rq_child;
	char *msg;
	job  *pjob;
	job  *ppjob;
	struct depend_job pparent;
	int rc;

	if (preq->rq_reply.brp_code) {
		/* request was rejected */

		(void)strcpy(log_buffer, msg_regrej);
		(void)strcat(log_buffer, preq->rq_ind.rq_register.rq_parent);

		log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO,
			jobid, log_buffer);
		pjob = find_job(jobid);

		/* the error text is appended after logging; it is only used as
		 * the abort message further down */
		if ((msg = pbse_to_txt(preq->rq_reply.brp_code)) != NULL) {
			(void)strcat(log_buffer, " ");
			(void)strcat(log_buffer, msg);
		}
		if (pjob) {
			/* the job is aborted unless forwarding the request succeeds */
			int abort_job = 1;

			if (preq->rq_reply.brp_code == PBSE_JOB_MOVED) {
				/* Creating a separate log buffer because if we end up aborting the submitted job
				 * we don't want to change what goes into accounting log via job_abt
				 */
				char log_msg[LOG_BUF_SIZE];
				snprintf(log_msg, sizeof(log_msg), "%s, %s", msg_job_moved,
					"sending dependency request to remote server");
				log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, jobid, log_msg);
				ppjob = find_job(preq->rq_ind.rq_register.rq_parent);
				if (ppjob && (ppjob->ji_qs.ji_state == JOB_STATE_MOVED) && (ppjob->ji_qs.ji_substate == JOB_SUBSTATE_MOVED)) {
					/* job destination should be <remote queue>@<remote server> */
					/* Ideally if a job is moved, destination can not be empty; */
					/* a destination without '@' leaves abort_job set */
					char *destin = strchr(ppjob->ji_qs.ji_destin, (int)'@');
					if (destin != NULL) {
						/* snprintf always NUL-terminates, unlike strncpy
						 * when the source fills the whole buffer */
						snprintf(pparent.dc_child, sizeof(pparent.dc_child), "%s", ppjob->ji_qs.ji_jobid);
						snprintf(pparent.dc_svr, sizeof(pparent.dc_svr), "%s", destin + 1);
						rc = send_depend_req(pjob, &pparent, preq->rq_ind.rq_register.rq_dependtype,
							JOB_DEPEND_OP_REGISTER,
							SYNC_SCHED_HINT_NULL, post_doq);
						if (rc) {
							snprintf(log_msg, sizeof(log_msg), "%s",
								"Failed to send dependency request to remote server, aborting job");
							log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_ERR, jobid, log_msg);
						} else {
							abort_job = 0;
						}
					}
				}
			}

			if (abort_job) {
				check_block(pjob, log_buffer);
				job_abt(pjob, log_buffer);
			}
		}
	}

	release_req(pwt);
}
Beispiel #15
0
/*
 * finish_routing_processing - act on the outcome of an attempt to route a
 * job to another server.
 *
 * pjob   - the job that was routed; nothing is done when NULL
 * status - LOCUTION_SUCCESS, LOCUTION_FAIL, or any other value
 *
 * LOCUTION_SUCCESS: staged-in files and a copied checkpoint are removed and
 *                   the local job is purged.
 * LOCUTION_FAIL   : if a delete is in progress (substate ABORT) the job is
 *                   set to QUEUED/ABORT, the owner is mailed and the job is
 *                   unlocked; otherwise the destination is marked bad and
 *                   routing is retried as below.
 * anything else   : the owner is mailed, the job state is re-evaluated and
 *                   job_route() is called again; the job is handed to
 *                   job_abt() if that fails, or unlocked if it succeeds.
 */
void finish_routing_processing(

  job *pjob,
  int  status)

  {
  int state_after;
  int substate_after;

  if (pjob == NULL)
    return;

  if (LOGLEVEL >= 10)
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, pjob->ji_qs.ji_jobid);

  if (status == LOCUTION_SUCCESS)
    {
    /* normal return, job was routed */

    if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn)
      remove_stagein(&pjob);

    /* each helper is handed &pjob, so the pointer is re-checked after it */
    if ((pjob != NULL) &&
        (pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_COPIED))
      {
      remove_checkpoint(&pjob);
      }

    if (pjob != NULL)
      svr_job_purge(pjob); /* need to remove server job struct */

    return;
    }

  if (status == LOCUTION_FAIL)
    {
    /* permanent rejection (or signal) */

    if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_ABORT)
      {
      /* job delete in progress, just set to queued status */
      svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_ABORT, FALSE);

      svr_mailowner(pjob, 'a', TRUE, "Couldn't route job to remote server");

      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

      return;
      }

    add_dest(pjob);  /* else mark destination as bad */
    }

  /* try routing again */

  svr_mailowner(pjob, 'a', TRUE, "Couldn't route job to remote server");

  /* force re-eval of job state out of Transit */

  svr_evaljobstate(*pjob, state_after, substate_after, 1);
  svr_setjobstate(pjob, state_after, substate_after, FALSE);

  status = job_route(pjob);

  if (status == PBSE_ROUTEREJ)
    {
    job_abt(&pjob, pbse_to_txt(PBSE_ROUTEREJ));
    }
  else if (status != 0)
    {
    job_abt(&pjob, msg_routexceed);
    }
  else
    {
    unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
    }
  } /* END finish_routing_processing() */
Beispiel #16
0
/*
 * delete_inactive_job - delete a job that is not running.
 *
 * pjob_ptr - handle to the job; *pjob_ptr is set to NULL when the job
 *            pointer was cleared while it was being processed
 * Msg      - message passed on to job_abt() for jobs with staged-in files
 *
 * checkpoint file at mom : job is set to EXITING and on_job_exit_task is
 *                          scheduled immediately
 * staged-in files        : the files are removed and the job is aborted
 * otherwise              : job is set to COMPLETE and on_job_exit_task is
 *                          scheduled after the keep-completed time (0 when
 *                          the job's queue cannot be found)
 *
 * Returns PBSE_BAD_PARAMETER when pjob_ptr or the job it refers to is NULL,
 * PBSE_NONE otherwise.
 */
int delete_inactive_job(

  job        **pjob_ptr,
  const char  *Msg)

  {
  job  *pjob;
  char  log_buf[LOCAL_LOG_BUF_SIZE];

  /* reject a missing handle as well as a handle that refers to no job;
     the job is dereferenced unconditionally below */
  if ((pjob_ptr == NULL) ||
      (*pjob_ptr == NULL))
    return(PBSE_BAD_PARAMETER);

  pjob = *pjob_ptr;
  
  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, do end job processing */
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE);

    /* force new connection */
    pjob->ji_momhandle = -1;

    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "calling on_job_exit from %s", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      }

    set_task(WORK_Immed, 0, on_job_exit_task, strdup(pjob->ji_qs.ji_jobid), FALSE);
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in file, should remove them */
    remove_stagein(&pjob);

    if (pjob != NULL)
      job_abt(&pjob, Msg);
    }
  else
    {
    /*
     * the job is not transitting (though it may have been) and
     * is not running, so put in into a complete state.
     */
    struct pbs_queue *pque;
    int               KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE, FALSE);

    /* KeepSeconds stays 0 when the job's queue cannot be found */
    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      unlock_queue(pque, __func__, NULL, LOGLEVEL);
      
      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "calling on_job_exit from %s", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }
    
      pthread_mutex_lock(server.sv_attr_mutex);
      KeepSeconds = attr_ifelse_long(
                    &pque->qu_attr[QE_ATR_KeepCompleted],
                    &server.sv_attr[SRV_ATR_KeepCompleted],
                    0);
      pthread_mutex_unlock(server.sv_attr_mutex);
      }

    /* get_jobs_queue() was handed &pjob, so re-check the pointer */
    if (pjob != NULL)
      set_task(WORK_Timed, time(NULL) + KeepSeconds, on_job_exit_task, strdup(pjob->ji_qs.ji_jobid), FALSE);
    }  /* END else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0) */

  /* let the caller see that the job pointer was cleared along the way */
  if (pjob == NULL)
    *pjob_ptr = NULL;

  return(PBSE_NONE);
  } /* END delete_inactive_job() */
Beispiel #17
0
/*
 * rerun_or_kill - decide what happens to a job: mark it for rerun, abort it,
 * or leave it running, and log the decision.
 *
 * pjob_ptr - handle to the job; *pjob_ptr is set to NULL when the job
 *            pointer was cleared while the job was being marked for rerun
 * text     - text appended to the log message
 *
 * rerunable job                            -> SIGKILL issued, substate RERUN
 * not rerunable, state != SV_STATE_SHUTDEL -> logged, then job_abt()
 * not rerunable, state == SV_STATE_SHUTDEL -> job is left running
 */
void rerun_or_kill(

  job  **pjob_ptr, /* I (modified/freed) */
  char  *text)     /* I */

  {
  long       server_state = SV_STATE_DOWN;
  char       log_buf[LOCAL_LOG_BUF_SIZE];
  pbs_queue *pque;
  job       *pjob = *pjob_ptr;

  /* never hand an unset buffer to log_event() */
  log_buf[0] = '\0';

  get_svr_attr_l(SRV_ATR_State, &server_state);
  if (pjob->ji_wattr[JOB_ATR_rerunable].at_val.at_long)
    {
    /* job is rerunable, mark it to be requeued */

    issue_signal(&pjob, "SIGKILL", free_br, NULL);

    if (pjob != NULL)
      {
      pjob->ji_qs.ji_substate  = JOB_SUBSTATE_RERUN;
      if ((pque = get_jobs_queue(&pjob)) != NULL)
        {
        snprintf(log_buf, sizeof(log_buf), "%s%s%s", msg_init_queued, pque->qu_qs.qu_name, text);

        unlock_queue(pque, __func__, NULL, LOGLEVEL);
        }
      else
        {
        /* no queue could be obtained: log the message without a queue name */
        snprintf(log_buf, sizeof(log_buf), "%s%s", msg_init_queued, text);
        }
      }
    }
  else if (server_state != SV_STATE_SHUTDEL)
    {
    /* job not rerunable, immediate shutdown - kill it off */
    snprintf(log_buf, sizeof(log_buf), "%s%s", msg_job_abort, text);

    /* need to record log message before purging job */

    log_event(
      PBSEVENT_SYSTEM | PBSEVENT_JOB | PBSEVENT_DEBUG,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buf);

    job_abt(pjob_ptr, log_buf);

    return;
    }
  else
    {
    /* delayed shutdown, leave job running */
    snprintf(log_buf, sizeof(log_buf), "%s%s", msg_leftrunning, text);
    }

  if (pjob != NULL)
    {
    log_event(
      PBSEVENT_SYSTEM | PBSEVENT_JOB | PBSEVENT_DEBUG,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buf);
    }
  else
    {
    /* issue_signal() / get_jobs_queue() cleared the local pointer: make
       sure the caller does not keep using the old one */
    *pjob_ptr = NULL;
    }

  return;
  }  /* END rerun_or_kill() */
Beispiel #18
0
/*
 * post_routejob() - work task handler run after the routing child finished.
 *
 * pwt->wt_aux carries the child's wait status and pwt->wt_parm1 the job.
 * The child's exit code selects the action:
 *   0     - job was routed: remove staged-in/checkpoint files if flagged
 *           and purge the local job structure
 *   1     - permanent rejection: if a delete is in progress only reset the
 *           state, otherwise mark the destination bad and route again
 *   other - (including abnormal termination, mapped to 2) route again
 */
static void post_routejob(

  struct work_task *pwt)

  {
  job  *jobp = (job *)pwt->wt_parm1;
  int   stat = pwt->wt_aux;
  char *id = "post_routejob";
  int   newstate;
  int   newsub;
  int   r;

  if (!WIFEXITED(stat))
    {
    /* child did not exit normally, treat it as a retryable failure */

    r = 2;

    sprintf(log_buffer, msg_badexit,
            stat);

    strcat(log_buffer, id);

    log_event(
      PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      jobp->ji_qs.ji_jobid,
      log_buffer);
    }
  else
    {
    r = WEXITSTATUS(stat);
    }

  if (r == 0)
    {
    /* normal return, job was routed */

    if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn)
      remove_stagein(jobp);

    if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_COPIED)
      remove_checkpoint(jobp);

    /* need to remove server job struct */

    job_purge(jobp);

    return;
    }

  if (r == 1)
    {
    /* permanent rejection (or signal) */

    if (jobp->ji_qs.ji_substate == JOB_SUBSTATE_ABORT)
      {
      /* job delete in progress, just set to queued status */

      svr_setjobstate(jobp, JOB_STATE_QUEUED, JOB_SUBSTATE_ABORT);

      return;
      }

    /* else mark destination as bad, then try routing again below */

    add_dest(jobp);
    }

  /* try routing again: force re-eval of job state out of Transit */

  svr_evaljobstate(jobp, &newstate, &newsub, 1);
  svr_setjobstate(jobp, newstate, newsub);

  r = job_route(jobp);

  if (r == PBSE_ROUTEREJ)
    job_abt(&jobp, pbse_to_txt(PBSE_ROUTEREJ));
  else if (r != 0)
    job_abt(&jobp, msg_routexceed);

  return;
  }  /* END post_routejob() */
Beispiel #19
0
/*
 * job_recov() - recover a job from its save file.
 *
 * The file is looked up as path_jobs + filename.  The function reads the
 * quick-save area (ji_qs), upgrades it through job_qs_upgrade() when its
 * version differs from PBS_QS_VERSION, verifies that the file name starts
 * with the job's recorded file prefix, and then recovers the working
 * attributes.  For PBS_MOM builds the tm sockets are recovered as well; for
 * server builds a job whose id contains '[' is linked back to its array.
 * On success the job is saved again with SAVEJOB_FULL.
 *
 * @param filename  (I) name of the job save file, relative to path_jobs
 * @return the recovered job, or NULL on any failure (the job structure is
 *         released and the file descriptor closed before returning)
 */
job *job_recov(

  char *filename) /* I */   /* pathname to job save file */

  {
  int  fds;
  job  *pj;
  char *pn;
  char  namebuf[MAXPATHLEN];
  char  log_buf[LOCAL_LOG_BUF_SIZE];

#ifndef PBS_MOM
  char       parent_id[PBS_MAXSVRJOBID + 1];
  job_array *pa;
#endif

  pj = job_alloc(); /* allocate & initialize job structure space */

  if (pj == NULL)
    {
    /* FAILURE - cannot alloc memory */

    return(NULL);
    }

  snprintf(namebuf, MAXPATHLEN, "%s%s", path_jobs, filename); /* job directory path, filename */

  fds = open(namebuf, O_RDONLY, 0);

  if (fds < 0)
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "unable to open %s", namebuf);

    log_err(errno, __func__, log_buf);

#ifndef PBS_MOM
    /* the job is unlocked before its mutex is freed on every server-side
     * failure path, so job_alloc() presumably returns it locked */
    unlock_ji_mutex(pj, __func__, "1", LOGLEVEL);
    free(pj->ji_mutex);
#endif

    free((char *)pj);

    /* FAILURE - cannot open job file */

    return(NULL);
    }

  /* read in job quick save sub-structure */

  /* NOTE(review): a short read is only treated as fatal when the version
   * field still equals PBS_QS_VERSION; a short read with another version is
   * passed on to the upgrade step below - presumably because older
   * quick-save areas are smaller.  Confirm against job_qs_upgrade(). */

  if (read_ac_socket(fds, (char *)&pj->ji_qs, sizeof(pj->ji_qs)) != sizeof(pj->ji_qs) &&
      pj->ji_qs.qs_version == PBS_QS_VERSION)
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Unable to read %s", namebuf);

    log_err(errno, __func__, log_buf);

#ifndef PBS_MOM
    unlock_ji_mutex(pj, __func__, "2", LOGLEVEL);
    free(pj->ji_mutex);
#endif

    free((char *)pj);

    close(fds);

    return(NULL);
    }

  /* is ji_qs the version we expect? */

  if (pj->ji_qs.qs_version != PBS_QS_VERSION)
    {
    /* ji_qs is older version */
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
      "%s appears to be from an old version. Attempting to convert.\n",
      namebuf);

    log_err(-1, __func__, log_buf);

    if (job_qs_upgrade(pj, fds, namebuf, pj->ji_qs.qs_version) != 0)
      {
      snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "unable to upgrade %s\n", namebuf);

      log_err(-1, __func__, log_buf);

#ifndef PBS_MOM
      unlock_ji_mutex(pj, __func__, "3", LOGLEVEL);
      free(pj->ji_mutex);
#endif

      free((char *)pj);

      close(fds);

      return(NULL);
      }

    }  /* END if (pj->ji_qs.qs_version != PBS_QS_VERSION) */

  /* Does file name match the internal name? */
  /* This detects ghost files */

  /* strrchr() returns NULL when the path has no '/'; fall back to the whole
   * name instead of doing arithmetic on a NULL pointer */

  pn = strrchr(namebuf, (int)'/');

  if (pn != NULL)
    pn++;
  else
    pn = namebuf;

  if (strncmp(pn, pj->ji_qs.ji_fileprefix, strlen(pj->ji_qs.ji_fileprefix)) != 0)
    {
    /* mismatch, discard job */

    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Job Id %s does not match file name for %s",
      pj->ji_qs.ji_jobid,
      namebuf);

    log_err(-1, __func__, log_buf);

#ifndef PBS_MOM
    unlock_ji_mutex(pj, __func__, "4", LOGLEVEL);
    free(pj->ji_mutex);
#endif

    free((char *)pj);

    close(fds);

    return(NULL);
    }

  /* read in working attributes */

  if (recov_attr(
        fds,
        pj,
        job_attr_def,
        pj->ji_wattr,
        JOB_ATR_LAST,
        JOB_ATR_UNKN,
        TRUE) != 0) 
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "unable to recover %s (file is likely corrupted)", namebuf);

    log_err(-1, __func__, log_buf);

    /* attributes may have been partially recovered at this point, so the
     * job is released through the job free routines rather than free() */

#ifndef PBS_MOM
    unlock_ji_mutex(pj, __func__, "5", LOGLEVEL);
    job_free(pj, FALSE);
#else
    mom_job_free(pj);
#endif


    close(fds);

    return(NULL);
    }

#ifndef PBS_MOM
  /* Comment out the mother superior tracking. Will be debugged later 
  if (pj->ji_wattr[JOB_ATR_exec_host].at_val.at_str != NULL)
    {*/
    /* add job to the mother superior list for it's node */
/*    char *ms = strdup(pj->ji_wattr[JOB_ATR_exec_host].at_val.at_str);
    char *end = strchr(ms, '/');

    if (end != NULL)
      *end = '\0';

    if ((end = strchr(ms, '+')) != NULL)
      *end = '\0';

    add_to_ms_list(ms, pj);

    free(ms);
    }*/
#endif

#ifdef PBS_MOM
  /* read in tm sockets and ips */

  /* failure here is only logged as a warning; recovery continues */

  if (recov_tmsock(fds, pj) != 0)
    {
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
        "warning: tmsockets not recovered from %s (written by an older pbs_mom?)",
        namebuf);

    log_err(-1, __func__, log_buf);
    }

#else /* not PBS_MOM */

  if (strchr(pj->ji_qs.ji_jobid, '[') != NULL)
    {
    /* job is part of an array.  We need to put a link back to the server
    job array struct for this array. We also have to link this job into
    the linked list of jobs belonging to the array. */

    array_get_parent_id(pj->ji_qs.ji_jobid, parent_id);
    pa = get_array(parent_id);
    if (pa == NULL)
      {   
      job_abt(&pj, (char *)"Array job missing array struct, aborting job");
      close(fds);
      return NULL;
      }

    strcpy(pj->ji_arraystructid, parent_id);

    if (strcmp(parent_id, pj->ji_qs.ji_jobid) == 0)
      {
      /* the job id equals the array id: this is the array template */
      pj->ji_is_array_template = TRUE;
      }
    else
      {
      /* NOTE(review): the index comes from the recovered job file and is
       * not range-checked against the size of pa->job_ids here */
      pa->job_ids[(int)pj->ji_wattr[JOB_ATR_job_array_id].at_val.at_long] = strdup(pj->ji_qs.ji_jobid);
      pa->jobs_recovered++;

      /* This is a bit of a kluge, but for some reason if an array job was 
         on hold when the server went down the ji_wattr[JOB_ATR_hold].at_val.at_long
         value is 0 on recovery even though pj->ji_qs.ji_state is JOB_STATE_HELD and
         the substate is JOB_SUBSTATE_HELD
      */
      if ((pj->ji_qs.ji_state == JOB_STATE_HELD) &&
          (pj->ji_qs.ji_substate == JOB_SUBSTATE_HELD))
        {
        pj->ji_wattr[JOB_ATR_hold].at_val.at_long = HOLD_l;
        pj->ji_wattr[JOB_ATR_hold].at_flags = ATR_VFLAG_SET;
        }
      }

    if (pa != NULL)
      {
      unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
      }
    }

#endif

  close(fds);

  pj->ji_commit_done = 1;

  /* all done recovering the job */

  job_save(pj, SAVEJOB_FULL, 0);

  return(pj);
  }  /* END job_recov() */
Beispiel #20
0
/*
 * job_recov() - recover a job from its save file.
 *
 * The file is looked up as path_jobs + filename.  The function reads the
 * quick-save area (ji_qs), upgrades it through job_qs_upgrade() when its
 * version differs from PBS_QS_VERSION, verifies that the file name starts
 * with the job's recorded file prefix, and then recovers the working
 * attributes.  For PBS_MOM builds the tm sockets are recovered as well; for
 * server builds a job with JOB_ATR_job_array_request set is linked back to
 * its array.  The job is written back to disk only when it was upgraded.
 *
 * @param filename  (I) name of the job save file, relative to path_jobs
 * @return the recovered job, or NULL on any failure
 */
job *job_recov(

  char *filename) /* I */   /* pathname to job save file */

  {
  int  fds;
  int  pathlen;
  job *pj;
  char *pn;
  char  namebuf[MAXPATHLEN];
  int    qs_upgrade;
#ifndef PBS_MOM
  char   parent_id[PBS_MAXSVRJOBID + 1];
  job_array *pa;
#endif

  qs_upgrade = FALSE;

  pj = job_alloc(); /* allocate & initialize job structure space */

  if (pj == NULL)
    {
    /* FAILURE - cannot alloc memory */

    return(NULL);
    }

  /* build job directory path + file name without overrunning namebuf */

  pathlen = snprintf(namebuf, sizeof(namebuf), "%s%s", path_jobs, filename);

  if ((pathlen < 0) || ((size_t)pathlen >= sizeof(namebuf)))
    {
    /* FAILURE - a truncated name would refer to a different file */

    log_err(-1, "job_recov", "job file path is too long");

    free((char *)pj);

    return(NULL);
    }

  fds = open(namebuf, O_RDONLY, 0);

  if (fds < 0)
    {
    sprintf(log_buffer, "unable to open %s",
            namebuf);

    log_err(errno, "job_recov", log_buffer);

    free((char *)pj);

    /* FAILURE - cannot open job file */

    return(NULL);
    }

  /* read in job quick save sub-structure */

  /* NOTE(review): a short read is only treated as fatal when the version
   * field still equals PBS_QS_VERSION; a short read with another version is
   * passed on to the upgrade step below - presumably because older
   * quick-save areas are smaller.  Confirm against job_qs_upgrade(). */

  if (read(fds, (char *)&pj->ji_qs, quicksize) != (ssize_t)quicksize &&
      pj->ji_qs.qs_version == PBS_QS_VERSION)
    {
    sprintf(log_buffer, "Unable to read %s",
            namebuf);

    log_err(errno, "job_recov", log_buffer);

    free((char *)pj);

    close(fds);

    return(NULL);
    }

  /* is ji_qs the version we expect? */

  if (pj->ji_qs.qs_version != PBS_QS_VERSION)
    {
    /* ji_qs is older version */
    sprintf(log_buffer,
            "%s appears to be from an old version. Attempting to convert.\n",
            namebuf);
    log_err(-1, "job_recov", log_buffer);

    if (job_qs_upgrade(pj, fds, namebuf, pj->ji_qs.qs_version) != 0)
      {
      sprintf(log_buffer, "unable to upgrade %s\n", namebuf);

      log_err(-1, "job_recov", log_buffer);

      free((char *)pj);

      close(fds);

      return(NULL);
      }

    /* remember to write the converted job back once recovery succeeded */
    qs_upgrade = TRUE;
    }  /* END if (pj->ji_qs.qs_version != PBS_QS_VERSION) */

  /* Does file name match the internal name? */
  /* This detects ghost files */

  /* strrchr() returns NULL when the path has no '/'; fall back to the whole
   * name instead of doing arithmetic on a NULL pointer */

  pn = strrchr(namebuf, (int)'/');

  if (pn != NULL)
    pn++;
  else
    pn = namebuf;

  if (strncmp(pn, pj->ji_qs.ji_fileprefix, strlen(pj->ji_qs.ji_fileprefix)) != 0)
    {
    /* mismatch, discard job */

    sprintf(log_buffer, "Job Id %s does not match file name for %s",
            pj->ji_qs.ji_jobid,
            namebuf);

    log_err(-1, "job_recov", log_buffer);

    free((char *)pj);

    close(fds);

    return(NULL);
    }

  /* read in working attributes */

  if (recov_attr(
        fds,
        pj,
        job_attr_def,
        pj->ji_wattr,
        (int)JOB_ATR_LAST,
        (int)JOB_ATR_UNKN,
        TRUE) != 0) 
    {
    sprintf(log_buffer, "unable to recover %s (file is likely corrupted)",
            namebuf);

    log_err(-1, "job_recov", log_buffer);

    /* attributes may have been partially recovered at this point, so the
     * job is released through job_free() rather than free() */

    job_free(pj);

    close(fds);

    return(NULL);
    }

#ifdef PBS_MOM
  /* read in tm sockets and ips */

  /* failure here is only logged as a warning; recovery continues */

  if (recov_tmsock(fds, pj) != 0)
    {
    sprintf(log_buffer, "warning: tmsockets not recovered from %s (written by an older pbs_mom?)",
            namebuf);

    log_err(-1, "job_recov", log_buffer);
    }

#else /* PBS_MOM */

  if (pj->ji_wattr[(int)JOB_ATR_job_array_request].at_flags & ATR_VFLAG_SET)
    {
    /* job is part of an array.  We need to put a link back to the server
    job array struct for this array. We also have to link this job into
    the linked list of jobs belonging to the array. */

    array_get_parent_id(pj->ji_qs.ji_jobid, parent_id);
    pa = get_array(parent_id);

    if (strcmp(parent_id, pj->ji_qs.ji_jobid) == 0)
      {
      /* the job id equals the array id: this is the array template.
       * NOTE(review): pa is stored even when get_array() returned NULL */
      pj->ji_is_array_template = TRUE;
      pj->ji_arraystruct = pa;
      }
    else
      {
      /* XXX should we move this up after pa = get_array... */
      if (pa == NULL)
        {   
        job_abt(&pj, "Array job missing array struct, aborting job");
        close(fds);
        return NULL;
        }
      else
        {
        /* NOTE(review): the index comes from the recovered job file and is
         * not range-checked against the size of pa->jobs here */
        pa->jobs[(int)pj->ji_wattr[JOB_ATR_job_array_id].at_val.at_long] = (void *)pj;
        pj->ji_arraystruct = pa;
        pa->jobs_recovered++;
        }
      }
    }

#endif

  close(fds);

  /* all done recovering the job */

  if (qs_upgrade == TRUE)
    {
    job_save(pj, SAVEJOB_FULL);
    }

  return(pj);
  }  /* END job_recov() */
Beispiel #21
0
void req_deletejob(

  struct batch_request *preq)  /* I */

  {
  job              *pjob;

  struct work_task *pwtold;

  struct work_task *pwtnew;
  struct work_task *pwtcheck;

  int               rc;
  char             *sigt = "SIGTERM";

  char             *Msg = NULL;

  /* check if we are getting a purgecomplete from scheduler */
  if ((preq->rq_extend != NULL) && 
        !strncmp(preq->rq_extend,PURGECOMP,strlen(PURGECOMP)))
    {

    /*
     * purge_completed_jobs will respond with either an ack or reject
     */
    purge_completed_jobs(preq);

    return;
    }

  /* The way this is implemented, if the user enters the command "qdel -p <jobid>",
   * they can then delete jobs other than their own since the authorization
   * checks are made below in chk_job_request. This should probably be fixed.
   */

  if (forced_jobpurge(preq) != 0)
    {
    return;
    }

  /* NOTE:  should support rq_objname={<JOBID>|ALL|<name:<JOBNAME>} */

  /* NYI */

  pjob = chk_job_request(preq->rq_ind.rq_delete.rq_objname, preq);

  if (pjob == NULL)
    {
    /* NOTE:  chk_job_request() will issue req_reject() */

    return;
    }

  if (preq->rq_extend != NULL)
    {
    if (strncmp(preq->rq_extend, deldelaystr, strlen(deldelaystr)) &&
        strncmp(preq->rq_extend, delasyncstr, strlen(delasyncstr)) &&
        strncmp(preq->rq_extend, delpurgestr, strlen(delpurgestr)))
      {
      /* have text message in request extension, add it */

      Msg = preq->rq_extend;

      /*
       * Message capability is only for operators and managers.
       * Check if request is authorized
      */

      if ((preq->rq_perm & (ATR_DFLAG_OPRD | ATR_DFLAG_OPWR |
                            ATR_DFLAG_MGRD | ATR_DFLAG_MGWR)) == 0)
        {
        req_reject(PBSE_PERM, 0, preq, NULL,
                   "must have operator or manager privilege to use -m parameter");
        return;
        }
      }
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /*
     * Find pid of router from existing work task entry,
     * then establish another work task on same child.
     * Next, signal the router and wait for its completion;
     */

    pwtold = (struct work_task *)GET_NEXT(pjob->ji_svrtask);

    while (pwtold != NULL)
      {
      if ((pwtold->wt_type == WORK_Deferred_Child) ||
          (pwtold->wt_type == WORK_Deferred_Cmp))
        {
        pwtnew = set_task(
                   pwtold->wt_type,
                   pwtold->wt_event,
                   post_delete_route,
                   preq);

        if (pwtnew != NULL)
          {
          /*
           * reset type in case the SIGCHLD came
           * in during the set_task;  it makes
           * sure that next_task() will find the
           * new entry.
           */

          pwtnew->wt_type = pwtold->wt_type;
          pwtnew->wt_aux = pwtold->wt_aux;

          kill((pid_t)pwtold->wt_event, SIGTERM);

          pjob->ji_qs.ji_substate = JOB_SUBSTATE_ABORT;

          return; /* all done for now */
          }
        else
          {
          req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

          return;
          }
        }

      pwtold = (struct work_task *)GET_NEXT(pwtold->wt_linkobj);
      }

    /* should never get here ...  */

    log_err(-1, "req_delete", "Did not find work task for router");

    req_reject(PBSE_INTERNAL, 0, preq, NULL, NULL);

    return;
    }

  if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN1 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN2 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN3 )
    {
    /* If JOB_SUBSTATE_PRERUN being sent to MOM, wait till she gets it going */
    /* retry in one second                            */
    /* If JOB_SUBSTATE_RERUN, RERUN1, RERUN2 or RERUN3 the
       job is being requeued. Wait until finished */

    static time_t  cycle_check_when = 0;
    static char    cycle_check_jid[PBS_MAXSVRJOBID + 1];

    if (cycle_check_when != 0)
      {
      if (!strcmp(pjob->ji_qs.ji_jobid, cycle_check_jid) &&
          (time_now - cycle_check_when > 10))
        {
        /* state not updated after 10 seconds */

        /* did the mom ever get it? delete it anyways... */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;

        goto jump;
        }

      if (time_now - cycle_check_when > 20)
        {
        /* give up after 20 seconds */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;
        }
      }    /* END if (cycle_check_when != 0) */

    if (cycle_check_when == 0)
      {
      /* new PRERUN job located */

      cycle_check_when = time_now;
      strcpy(cycle_check_jid, pjob->ji_qs.ji_jobid);
      }

    sprintf(log_buffer, "job cannot be deleted, state=PRERUN, requeuing delete request");

    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);

    pwtnew = set_task(
               WORK_Timed,
               time_now + 1,
               post_delete_route,
               preq);

    if (pwtnew == 0)
      req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

    return;
    }  /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */

jump:

  /*
   * Log delete and if requesting client is not job owner, send mail.
   */

  sprintf(log_buffer, "requestor=%s@%s",
          preq->rq_user,
          preq->rq_host);


  /* NOTE:  should annotate accounting record with extend message (NYI) */

  account_record(PBS_ACCT_DEL, pjob, log_buffer);

  sprintf(log_buffer, msg_manager,
          msg_deletejob,
          preq->rq_user,
          preq->rq_host);

  log_event(
    PBSEVENT_JOB,
    PBS_EVENTCLASS_JOB,
    pjob->ji_qs.ji_jobid,
    log_buffer);

  /* NOTE:  should incorporate job delete message */

  if (Msg != NULL)
    {
    /* have text message in request extension, add it */

    strcat(log_buffer, "\n");
    strcat(log_buffer, Msg);
    }

  if ((svr_chk_owner(preq, pjob) != 0) &&
      !has_job_delete_nanny(pjob))
    {
    /* only send email if owner did not delete job and job deleted
       has not been previously attempted */

    svr_mailowner(pjob, MAIL_DEL, MAIL_FORCE, log_buffer);
    /*
     * If we sent mail and already sent the extra message
     * then reset message so we don't trigger a redundant email
     * in job_abt()
    */

    if (Msg != NULL)
      {
      Msg = NULL;
      }
    }

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, change restart comment if failed */

    change_restart_comment_if_needed(pjob);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * setup a nanny task to make sure the job is actually deleted (see the
     * comments at job_delete_nanny()).
     */

    if (has_job_delete_nanny(pjob))
      {
      req_reject(PBSE_IVALREQ, 0, preq, NULL, "job cancel in progress");

      return;
      }

    apply_job_delete_nanny(pjob, time_now + 60);

    /* check if we are getting a asynchronous delete */

    if ((preq->rq_extend != NULL) &&
          !strncmp(preq->rq_extend,DELASYNC,strlen(DELASYNC)))
      {
      struct batch_request *preq_tmp = NULL;
      /*
       * Respond with an ack now instead of after MOM processing
       * Create a new batch request and fill it in. It will be freed by reply_ack
       */

      snprintf(log_buffer,sizeof(log_buffer), "Deleting job asynchronously");
      log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buffer);

      preq_tmp = alloc_br(PBS_BATCH_DeleteJob);
      preq_tmp->rq_perm = preq->rq_perm;
      preq_tmp->rq_ind.rq_manager.rq_cmd = preq->rq_ind.rq_manager.rq_cmd;
      preq_tmp->rq_ind.rq_manager.rq_objtype = preq->rq_ind.rq_manager.rq_objtype;
      preq_tmp->rq_fromsvr = preq->rq_fromsvr;
      preq_tmp->rq_extsz = preq->rq_extsz;
      preq_tmp->rq_conn = preq->rq_conn;
      memcpy(preq_tmp->rq_ind.rq_manager.rq_objname,
          preq->rq_ind.rq_manager.rq_objname, PBS_MAXSVRJOBID + 1);
      memcpy(preq_tmp->rq_user, preq->rq_user, PBS_MAXUSER + 1);
      memcpy(preq_tmp->rq_host, preq->rq_host, PBS_MAXHOSTNAME + 1);

      reply_ack(preq_tmp);
      preq->rq_noreply = TRUE; /* set for no more replies */
      }
  
    /* make a cleanup task if set */
    if ((server.sv_attr[SRV_ATR_JobForceCancelTime].at_flags & ATR_VFLAG_SET) &&
        (server.sv_attr[SRV_ATR_JobForceCancelTime].at_val.at_long > 0))
      {
      pwtcheck = set_task(
        WORK_Timed,
        time_now + server.sv_attr[SRV_ATR_JobForceCancelTime].at_val.at_long,
        ensure_deleted,
        preq);
    
      if (pwtcheck != NULL)
        append_link(&pjob->ji_svrtask, &pwtcheck->wt_linkobj, pwtcheck);
      }

    /*
     * Send signal request to MOM.  The server will automagically
     * pick up and "finish" off the client request when MOM replies.
     */

    if ((rc = issue_signal(pjob, sigt, post_delete_mom1, preq)))
      {
      /* cant send to MOM */

      req_reject(rc, 0, preq, NULL, NULL);
      }

    /* normally will ack reply when mom responds */

    sprintf(log_buffer, msg_delrunjobsig,
            sigt);

    LOG_EVENT(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);

    return;
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* make a cleanup task if set */
  if ((server.sv_attr[SRV_ATR_JobForceCancelTime].at_flags & ATR_VFLAG_SET) &&
      (server.sv_attr[SRV_ATR_JobForceCancelTime].at_val.at_long > 0))
    {
    pwtcheck = set_task(
        WORK_Timed,
        time_now + server.sv_attr[SRV_ATR_JobForceCancelTime].at_val.at_long,
        ensure_deleted,
        preq);
    
    if (pwtcheck != NULL)
      append_link(&pjob->ji_svrtask, &pwtcheck->wt_linkobj, pwtcheck);
    }

  /* if configured, and this job didn't have a slot limit hold, free a job
   * held with the slot limit hold */
  if ((server.sv_attr[SRV_ATR_MoabArrayCompatible].at_val.at_long != FALSE) &&
      ((pjob->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) == FALSE))
    {
    if ((pjob->ji_arraystruct != NULL) &&
        (pjob->ji_is_array_template == FALSE))
      {
      int        i;
      int        newstate;
      int        newsub;
      job       *tmp;
      job_array *pa = pjob->ji_arraystruct;

      for (i = 0; i < pa->ai_qs.array_size; i++)
        {
        if (pa->jobs[i] == NULL)
          continue;

        tmp = (job *)pa->jobs[i];

        if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l)
          {
          tmp->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;
              
          if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
            {
            tmp->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
            }
          
          svr_evaljobstate(tmp, &newstate, &newsub, 1);
          svr_setjobstate(tmp, newstate, newsub);
          job_save(tmp, SAVEJOB_FULL, 0);

          break;
          }
        }
      }
    } /* END MoabArrayCompatible check */

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, do end job processing */
    
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING);

    pjob->ji_momhandle = -1;

    /* force new connection */

    pwtnew = set_task(WORK_Immed, 0, on_job_exit, (void *)pjob);

    if (pwtnew)
      {
      append_link(&pjob->ji_svrtask, &pwtnew->wt_linkobj, pwtnew);
      }
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in file, should remove them */

    remove_stagein(pjob);

    job_abt(&pjob, Msg);
    }
  else
    {
    /*
     * the job is not transitting (though it may have been) and
     * is not running, so put in into a complete state.
     */

    struct work_task *ptask;
    struct pbs_queue *pque;
    int  KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE);

    if ((pque = pjob->ji_qhdr) && (pque != NULL))
      {
      pque->qu_numcompleted++;
      }

    KeepSeconds = attr_ifelse_long(
                    &pque->qu_attr[QE_ATR_KeepCompleted],
                    &server.sv_attr[SRV_ATR_KeepCompleted],
                    0);
    ptask = set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit, pjob);

    if (ptask != NULL)
      {
      append_link(&pjob->ji_svrtask, &ptask->wt_linkobj, ptask);
      }
    }  /* END else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0) */

  reply_ack(preq);

  return;
  }  /* END req_deletejob() */
/*
 * array_delete_wt() - work task handler that follows up on a job array
 * delete request (passed in ptask->wt_parm1).
 *
 * When the array no longer exists the request is acknowledged.  Otherwise
 * the id of the array and the time it was first seen are remembered in
 * static storage; once the same array has been seen for more than 10
 * seconds, every member still in JOB_SUBSTATE_PRERUN is removed regardless.
 * If all remaining members were in PRERUN the request is acknowledged, in
 * every other case it is passed to req_deletearray() again.
 */
void array_delete_wt(struct work_task *ptask)
  {
  /* array id currently being watched and when watching started */
  static char *last_id = NULL;
  static int   last_check = 0;

  struct batch_request *preq = ptask->wt_parm1;
  struct work_task     *pwtnew;
  job_array            *pa;
  int                   i;

  pa = get_array(preq->rq_ind.rq_delete.rq_objname);

  if (pa == NULL)
    {
    /* jobs must have exited already */
    reply_ack(preq);
    last_check = 0;
    free(last_id);
    last_id = NULL;
    return;
    }

  if ((last_id == NULL) ||
      (strcmp(last_id, preq->rq_ind.rq_delete.rq_objname) != 0))
    {
    /* first sighting of this array: (re)start the clock.  free() is a
     * no-op when nothing was being watched before */
    last_check = time_now;
    free(last_id);
    last_id = strdup(preq->rq_ind.rq_delete.rq_objname);
    }
  else if (time_now - last_check > 10)
    {
    int  members = 0;
    int  in_prerun = 0;
    job *pjob;

    for (i = 0; i < pa->ai_qs.array_size; i++)
      {
      pjob = (job *)pa->jobs[i];

      if (pjob == NULL)
        continue;

      members++;

      if (pjob->ji_qs.ji_substate != JOB_SUBSTATE_PRERUN)
        continue;

      /* mom still hasn't gotten job?? delete anyway */
      in_prerun++;

      if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
        {
        /* job has restart file at mom, do end job processing */

        change_restart_comment_if_needed(pjob);

        svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING);

        /* force new connection */
        pjob->ji_momhandle = -1;

        pwtnew = set_task(WORK_Immed, 0, on_job_exit, (void *)pjob);

        if (pwtnew)
          {
          append_link(&pjob->ji_svrtask, &pwtnew->wt_linkobj, pwtnew);
          }
        }
      else
        {
        /* job has staged-in file, should remove them */
        if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
          remove_stagein(pjob);

        job_abt(&pjob, NULL);
        }
      }

    if (members == in_prerun)
      {
      /* nothing but PRERUN members was left, the request is finished */
      reply_ack(preq);
      free(last_id);
      last_id = NULL;
      return;
      }
    }

  req_deletearray(preq);
  }
/**
 * attempt_delete()
 * deletes a job differently depending on the job's state
 *
 *  - NULL job: considered already deleted
 *  - job in transit: the routing child is sent SIGTERM, the job is skipped
 *  - job in PRERUN: skipped, the mom has to receive the job first
 *  - running job: a delete nanny is applied and SIGTERM is issued to the mom
 *    (unless a nanny is already present)
 *  - job with a checkpoint file: end-of-job processing is scheduled
 *  - job with staged-in files: the files are removed and the job is aborted
 *  - any other job: moved to the COMPLETE state with an on_job_exit task
 *
 * @return TRUE if the job was deleted, FALSE if skipped
 * @param j - a pointer to the job being handled (a job *, passed as void *)
 */
int attempt_delete(

  void *j) /* I */

  {
  int skipped = FALSE;
  struct work_task *pwtold;
  struct work_task *pwtnew;
  job *pjob;

  /* job considered deleted if null */
  if (j == NULL)
    return(TRUE);

  pjob = (job *)j;

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /*
     * Find pid of router from existing work task entry,
     * then establish another work task on same child.
     * Next, signal the router and wait for its completion;
     */
    
    pwtold = (struct work_task *)GET_NEXT(pjob->ji_svrtask);
    
    while (pwtold != NULL)
      {
      if ((pwtold->wt_type == WORK_Deferred_Child) ||
          (pwtold->wt_type == WORK_Deferred_Cmp))
        {
        kill((pid_t)pwtold->wt_event, SIGTERM);
        
        pjob->ji_qs.ji_substate = JOB_SUBSTATE_ABORT;
        }
      
      pwtold = (struct work_task *)GET_NEXT(pwtold->wt_linkobj);
      }

    skipped = TRUE;
    
    return(!skipped);
    }  /* END if (pjob->ji_qs.ji_state == JOB_SUBSTATE_TRANSIT) */

  else if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN)
    {
    /* we'll wait for the mom to get this job, then delete it */
    skipped = TRUE;

    /* return right away: falling through would complete or abort the job
     * now even though it is reported to the caller as skipped */
    return(!skipped);
    }  /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */

  else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* set up nanny */
    
    if (!has_job_delete_nanny(pjob))
      {
      apply_job_delete_nanny(pjob, time_now + 60);
      
      /* need to issue a signal to the mom, but we don't want to sent an ack to the
       * client when the mom replies */
      issue_signal(pjob, "SIGTERM", post_delete, NULL);
      }

    if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
      {
      /* job has restart file at mom, change restart comment if failed */
      change_restart_comment_if_needed(pjob);
      }
    
    return(!skipped);
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, change restart comment if failed */    
    change_restart_comment_if_needed(pjob);
    
    /* job has restart file at mom, do end job processing */
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING);

    pjob->ji_momhandle = -1;
    
    /* force new connection */
    pwtnew = set_task(WORK_Immed, 0, on_job_exit, (void *)pjob);
    
    if (pwtnew)
      {
      append_link(&pjob->ji_svrtask, &pwtnew->wt_linkobj, pwtnew);
      }
   
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in file, should remove them */
    
    remove_stagein(pjob);
    
    job_abt(&pjob, NULL);
    }
  else
    {
    /*
     * the job is not transitting (though it may have been) and
     * is not running, so put in into a complete state.
     */

    struct work_task *ptask;
    struct pbs_queue *pque;
    int  KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE);
    
    pque = pjob->ji_qhdr;

    if (pque != NULL)
      {
      pque->qu_numcompleted++;

      KeepSeconds = attr_ifelse_long(
          &pque->qu_attr[(int)QE_ATR_KeepCompleted],
          &server.sv_attr[(int)SRV_ATR_KeepCompleted],
          0);
      }
    else if (server.sv_attr[(int)SRV_ATR_KeepCompleted].at_flags & ATR_VFLAG_SET)
      {
      /* no queue header to consult: use the server-wide setting only,
       * rather than dereferencing a NULL queue pointer */
      KeepSeconds = (int)server.sv_attr[(int)SRV_ATR_KeepCompleted].at_val.at_long;
      }

    ptask = set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit, pjob);
    
    if (ptask != NULL)
      {
      append_link(&pjob->ji_svrtask, &ptask->wt_linkobj, ptask);
      }
    }

  return(!skipped);
  } /* END attempt_delete() */
Beispiel #24
0
/**
 * @brief
 *	req_register - service a "register dependent job" batch request.
 *
 *	The request names a "parent" job (rq_parent, which must be known to
 *	this server), a "child" job (rq_child), a dependency type and an
 *	operation (rq_op).  Depending on the operation the parent's depend
 *	attribute is updated (register / release / unregister), the parent is
 *	aborted (delete), or the request is rejected.
 *
 *	On success the job is saved when it is marked modified and the request
 *	is acknowledged with reply_ack(); on failure it is rejected with
 *	req_reject() and the job's modified flag is cleared.
 *
 * @param[in,out] preq - the batch request; rq_child is truncated in place at
 *			 an '@' and rq_svr (and, for BEFORE* registrations,
 *			 rq_user) are overwritten.
 *
 * @return	void
 */
void
req_register(struct batch_request *preq)
{
	int		   made;
	attribute	  *pattr;
	struct depend	  *pdep;
	struct depend_job *pdj;
	job		  *pjob;
	char		  *ps;
	struct work_task  *ptask;
	int		   rc = 0;
	int		   revtype;
	int		   type;
	int		   savetype = SAVEJOB_FULL;

	/*  make sure request is from a server */

	if (!preq->rq_fromsvr) {
#ifdef NAS /* localmod 109 */
		snprintf(log_buffer, sizeof(log_buffer),
			"Dependency request not from server");
		log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_INFO,
			preq->rq_ind.rq_register.rq_parent, log_buffer);
#endif /* localmod 109 */
		req_reject(PBSE_IVALREQ, 0, preq);
		return;
	}

	/* find the "parent" job specified in the request */

	if ((pjob = find_job(preq->rq_ind.rq_register.rq_parent)) == NULL) {

		/*
		 * job not found... if the server is still initializing, the
		 * job may simply not have been recovered yet; that is not an
		 * error, so just acknowledge.
		 */

		if (server.sv_attr[(int)SRV_ATR_State].at_val.at_long != SV_STATE_INIT) {
			log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_INFO,
				preq->rq_ind.rq_register.rq_parent,
				msg_unkjobid);
			req_reject(PBSE_UNKJOBID, 0, preq);
		} else {
			reply_ack(preq);
		}
		return;
	}

	pattr = &pjob->ji_wattr[(int)JOB_ATR_depend];
	type = preq->rq_ind.rq_register.rq_dependtype;
	/* assume a change; cleared again below if the request fails */
	pjob->ji_modified = 1;

	/* more of the server:port fix kludge */

	/* "child@server": split at '@'; otherwise use the requesting host */
	ps = strchr(preq->rq_ind.rq_register.rq_child, (int)'@');
	if (ps != NULL) {
		(void)strcpy(preq->rq_ind.rq_register.rq_svr, ps+1);
		*ps = '\0';
	} else {
		(void)strcpy(preq->rq_ind.rq_register.rq_svr, preq->rq_host);
	}

	if (pjob->ji_qs.ji_state == JOB_STATE_MOVED) {
		snprintf(log_buffer, sizeof(log_buffer), "Parent %s%s", msg_movejob, pjob->ji_qs.ji_destin);
		log_event(PBSEVENT_DEBUG|PBSEVENT_SYSTEM|PBSEVENT_ERROR,
			PBS_EVENTCLASS_REQUEST, LOG_INFO,
			preq->rq_ind.rq_register.rq_child, log_buffer);
		req_reject(PBSE_JOB_MOVED, 0, preq);
		return;
	}
	switch (preq->rq_ind.rq_register.rq_op) {

			/*
			 * Register a dependency
			 */

		case JOB_DEPEND_OP_REGISTER:
			switch (type) {

				case JOB_DEPEND_TYPE_AFTERSTART:
					if (pjob->ji_qs.ji_substate >= JOB_SUBSTATE_RUNNING) {
						/* job already running, setup task to send	*/
						/* release back to child and continue with	*/
						/* registration process 			*/
						ptask = set_task(WORK_Immed, 0, post_run_depend,
							(void *)pjob);
						if (ptask)
							append_link(&pjob->ji_svrtask,
								&ptask->wt_linkobj, ptask);
					}
					/* fall through to complete registration */
				case JOB_DEPEND_TYPE_AFTERANY:
				case JOB_DEPEND_TYPE_AFTEROK:
				case JOB_DEPEND_TYPE_AFTERNOTOK:
					rc = register_dep(pattr, preq, type, &made);
					break;

				case JOB_DEPEND_TYPE_BEFORESTART:
				case JOB_DEPEND_TYPE_BEFOREANY:
				case JOB_DEPEND_TYPE_BEFOREOK:
				case JOB_DEPEND_TYPE_BEFORENOTOK:

					/*
					 * Check job owner for permission, use the real
					 * job owner, not the sending server's name.
					 */

					(void)strcpy(preq->rq_user,
						preq->rq_ind.rq_register.rq_owner);
					if (svr_chk_owner(preq, pjob)) {
						rc = PBSE_PERM;		/* not same user */
					} else {
						/* ok owner, see if job has "on" */
						pdep = find_depend(JOB_DEPEND_TYPE_ON, pattr);
						if (pdep == NULL) {
							/* no "on", see if child already registered */
							/* map the BEFORE* type to its AFTER* counterpart */
							revtype = type ^ (JOB_DEPEND_TYPE_BEFORESTART -
								JOB_DEPEND_TYPE_AFTERSTART);
							pdep = find_depend(revtype, pattr);
							if (pdep == NULL) {
								/* no "on" and no prior - return error */
								rc = PBSE_BADDEPEND;
							} else {
								pdj = find_dependjob(pdep,
									preq->rq_ind.rq_register.rq_child);
								if (pdj) {
									/* has prior register, update it */
									(void)strcpy(pdj->dc_svr,
										preq->rq_ind.rq_register.rq_svr);
								}
							}
						} else if ((rc=register_dep(pattr, preq, type, &made)) == 0) {
							if (made) {	/* first time registered */
								/* one fewer expected; drop "on" at zero */
								if (--pdep->dp_numexp <= 0)
									del_depend(pdep);
							}
						}
					}
					break;

				default:
#ifdef NAS /* localmod 109 */
					/* this is the unknown *type* branch, so report the type */
					snprintf(log_buffer, sizeof(log_buffer),
						"Unknown dep. type: %d", type);
					log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_INFO,
						preq->rq_ind.rq_register.rq_parent, log_buffer);
#endif /* localmod 109 */
					rc = PBSE_IVALREQ;
					break;
			}
			break;

			/*
			 * Release a dependency so job might run
			 */

		case JOB_DEPEND_OP_RELEASE:
			switch (type) {

				case JOB_DEPEND_TYPE_BEFORESTART:
				case JOB_DEPEND_TYPE_BEFOREANY:
				case JOB_DEPEND_TYPE_BEFOREOK:
				case JOB_DEPEND_TYPE_BEFORENOTOK:

					/* predecessor sent release-reduce "on", */
					/* see if this job can now run 		 */

					type ^= (JOB_DEPEND_TYPE_BEFORESTART -
						JOB_DEPEND_TYPE_AFTERSTART);
					if ((pdep = find_depend(type, pattr)) != NULL) {
						pdj = find_dependjob(pdep,
							preq->rq_ind.rq_register.rq_child);
						if (pdj) {
							del_depend_job(pdj);
							pattr->at_flags |= ATR_VFLAG_MODIFY | ATR_VFLAG_MODCACHE;
							savetype = SAVEJOB_FULLFORCE;
							(void)snprintf(log_buffer, sizeof(log_buffer),
								msg_registerrel,
								preq->rq_ind.rq_register.rq_child);
							log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
								LOG_INFO,
								pjob->ji_qs.ji_jobid, log_buffer);

							if (GET_NEXT(pdep->dp_jobs) == 0) {
								/* no more dependencies of this type */
								del_depend(pdep);
								set_depend_hold(pjob, pattr);
							}
							break;
						}
#ifdef NAS /* localmod 109 */
						snprintf(log_buffer, sizeof(log_buffer),
							"Dep.rls. job not found: %d/%s", type, preq->rq_ind.rq_register.rq_child);
					} else {
						snprintf(log_buffer, sizeof(log_buffer),
							"Dep.rls. type not found: %d", type);
#endif /* localmod 109 */
					}
#ifdef NAS /* localmod 109 */
					log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_INFO,
						preq->rq_ind.rq_register.rq_parent, log_buffer);
#endif /* localmod 109 */
					/* reached only when no matching dependency was released */
					rc = PBSE_IVALREQ;
					break;

			}

			break;

		case JOB_DEPEND_OP_READY:
			rc = PBSE_NOSYNCMSTR;
			break;

		case JOB_DEPEND_OP_DELETE:
			(void)snprintf(log_buffer, sizeof(log_buffer), msg_registerdel,
				preq->rq_ind.rq_register.rq_child);
			log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO,
				pjob->ji_qs.ji_jobid, log_buffer);
			job_abt(pjob, log_buffer);
			/*
			 * NOTE(review): job_abt() is expected to be able to purge
			 * (free) the job - confirm against its definition.  Look
			 * the job up again so the code after the switch never
			 * works on a stale pointer; pjob is NULL if it is gone.
			 */
			pjob = find_job(preq->rq_ind.rq_register.rq_parent);
			break;

		case JOB_DEPEND_OP_UNREG:
			unregister_dep(pattr, preq);
			set_depend_hold(pjob, pattr);
			break;


		default:
			snprintf(log_buffer, sizeof(log_buffer), msg_illregister,
				preq->rq_ind.rq_register.rq_parent);
			log_event(PBSEVENT_DEBUG|PBSEVENT_SYSTEM|PBSEVENT_ERROR,
				PBS_EVENTCLASS_REQUEST, LOG_INFO,
				preq->rq_host, log_buffer);
			rc = PBSE_IVALREQ;
			break;
	}

	if (rc) {
		/* rc is only set on paths that leave pjob untouched */
		pjob->ji_modified = 0;
		req_reject(rc, 0, preq);
	} else {
		/* pjob is NULL only when the DELETE operation removed the job */
		if (pjob != NULL) {
			/* If this is an array job, forcibly save it to ensure
			 * dependencies are recorded.
			 */
			if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_ArrayJob)
				savetype = SAVEJOB_FULLFORCE;
			if (pjob->ji_modified)
				(void)job_save(pjob, savetype);
		}
		reply_ack(preq);
	}
	return;
}
Beispiel #25
0
/*
 * execute_job_delete() - carry out a delete request against one job.
 *
 * In order, the function:
 *   - lets chk_job_req_permissions() vet the request (it may NULL pjob),
 *   - refuses jobs in JOB_STATE_TRANSIT,
 *   - for jobs in PRERUN or one of the RERUN substates, re-queues the delete
 *     request one second later via post_delete_route, unless the same job
 *     has been stuck for more than 10 seconds, in which case the delete
 *     proceeds anyway,
 *   - writes the accounting record and log entry and, when svr_chk_owner()
 *     returns non-zero and no delete nanny is set on the job, mails the
 *     owner,
 *   - for running jobs, arms the delete nanny and sends sigt to the MOM,
 *   - otherwise optionally schedules ensure_deleted, releases one HOLD_l
 *     held sibling of an array job, and finally moves the job to EXITING,
 *     aborts it (staged-in files) or marks it COMPLETE.
 *
 * NOTE(review): every exit path either calls unlock_ji_mutex() on pjob or no
 * longer has a job pointer, so the caller presumably enters with the job
 * mutex held - confirm against the callers.
 *
 * @param pjob  (M) job to delete
 * @param Msg   (I) optional text from the request extension, may be NULL
 * @param preq  (I) the delete request
 *
 * @return PBSE_NONE when the non-running delete path completed,
 *         ROUTE_DELETE when the request was re-queued for a later retry,
 *         -1 on every other path (rejected, signal sent to MOM, job gone).
 */
int execute_job_delete(

  job                  *pjob,            /* M */
  char                 *Msg,             /* I */
  struct batch_request *preq)            /* I */

  {
  struct work_task *pwtnew;

  int               rc;
  char             *sigt = "SIGTERM";

  int               has_mutex = TRUE;
  char              log_buf[LOCAL_LOG_BUF_SIZE];
  time_t            time_now = time(NULL);
  long              force_cancel = FALSE;
  long              array_compatible = FALSE;

  chk_job_req_permissions(&pjob,preq);

  if (pjob == NULL)
    {
    /* preq is rejected in chk_job_req_permissions here */
    return(-1);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /* see note in req_delete - not sure this is possible still,
     * but the deleted code is irrelevant now. I will leave this
     * part --dbeer */
    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

    return(-1);
    }

  if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN1 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN2 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN3 )
    {
    /* If JOB_SUBSTATE_PRERUN being sent to MOM, wait till she gets it going */
    /* retry in one second                            */
    /* If JOB_SUBSTATE_RERUN, RERUN1, RERUN2 or RERUN3 the
       job is being requeued. Wait until finished */

    /* remembers, across calls, which job is being waited on and since when */
    static time_t  cycle_check_when = 0;
    static char    cycle_check_jid[PBS_MAXSVRJOBID + 1];

    if (cycle_check_when != 0)
      {
      if (!strcmp(pjob->ji_qs.ji_jobid, cycle_check_jid) &&
          (time_now - cycle_check_when > 10))
        {
        /* state not updated after 10 seconds */

        /* did the mom ever get it? delete it anyways... */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;

        goto jump;
        }

      if (time_now - cycle_check_when > 20)
        {
        /* give up after 20 seconds */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;
        }
      }    /* END if (cycle_check_when != 0) */

    if (cycle_check_when == 0)
      {
      /* new PRERUN job located */

      cycle_check_when = time_now;
      snprintf(cycle_check_jid, sizeof(cycle_check_jid), "%s", pjob->ji_qs.ji_jobid);
      }

    snprintf(log_buf, sizeof(log_buf),
      "job cannot be deleted, state=PRERUN, requeuing delete request");

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    pwtnew = set_task(WORK_Timed,time_now + 1,post_delete_route,preq,FALSE);
    
    unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);

    if (pwtnew == NULL)
      {
      req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

      return(-1);
      }
    else
      {
      return(ROUTE_DELETE);
      }
    }  /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */

jump:

  /*
   * Log delete and if requesting client is not job owner, send mail.
   */

  /* rq_user / rq_host come from the request, so keep the write bounded */
  snprintf(log_buf, sizeof(log_buf), "requestor=%s@%s", preq->rq_user, preq->rq_host);


  /* NOTE:  should annotate accounting record with extend message (NYI) */
  account_record(PBS_ACCT_DEL, pjob, log_buf);

  snprintf(log_buf, sizeof(log_buf), msg_manager, msg_deletejob, preq->rq_user, preq->rq_host);

  log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

  /* NOTE:  should incorporate job delete message */

  if (Msg != NULL)
    {
    /* have text message in request extension, add it; the text is supplied
     * with the request, so append it bounded by the space left in log_buf
     * (the snprintf above guarantees used < sizeof(log_buf)) */
    size_t used = strlen(log_buf);

    snprintf(log_buf + used, sizeof(log_buf) - used, "\n%s", Msg);
    }

  if ((svr_chk_owner(preq, pjob) != 0) &&
      (pjob->ji_has_delete_nanny == FALSE))
    {
    /* only send email if owner did not delete job and job deleted
       has not been previously attempted */

    svr_mailowner(pjob, MAIL_DEL, MAIL_FORCE, log_buf);
    /*
     * If we sent mail and already sent the extra message
     * then reset message so we don't trigger a redundant email
     * in job_abt()
    */

    Msg = NULL;
    }

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, change restart comment if failed */

    change_restart_comment_if_needed(pjob);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * setup a nanny task to make sure the job is actually deleted (see the
     * comments at job_delete_nanny()).
     */

    if (pjob->ji_has_delete_nanny == TRUE)
      {
      unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

      req_reject(PBSE_IVALREQ, 0, preq, NULL, "job cancel in progress");

      return(-1);
      }

    apply_job_delete_nanny(pjob, time_now + 60);

    /*
     * Send signal request to MOM.  The server will automagically
     * pick up and "finish" off the client request when MOM replies.
     */
    get_batch_request_id(preq);

    if ((rc = issue_signal(&pjob, sigt, post_delete_mom1, strdup(preq->rq_id))))
      {
      /* cant send to MOM */

      req_reject(rc, 0, preq, NULL, NULL);
      }

    /* normally will ack reply when mom responds */
    /* issue_signal() takes &pjob, so the pointer may come back NULL */
    if (pjob != NULL)
      {
      snprintf(log_buf, sizeof(log_buf), msg_delrunjobsig, sigt);
      log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);
  
      unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
      }

    return(-1);
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* make a cleanup task if set */
  get_svr_attr_l(SRV_ATR_JobForceCancelTime, &force_cancel);
  if (force_cancel > 0)
    {
    char *dup_jobid = strdup(pjob->ji_qs.ji_jobid);
 
    set_task(WORK_Timed, time_now + force_cancel, ensure_deleted, dup_jobid, FALSE);    
    }

  /* if configured, and this job didn't have a slot limit hold, free a job
   * held with the slot limit hold */
  get_svr_attr_l(SRV_ATR_MoabArrayCompatible, &array_compatible);
  if ((array_compatible != FALSE) &&
      ((pjob->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) == FALSE))
    {
    if ((pjob->ji_arraystruct != NULL) &&
        (pjob->ji_is_array_template == FALSE))
      {
      int        i;
      int        newstate;
      int        newsub;
      job       *tmp;
      job_array *pa = get_jobs_array(&pjob);

      if (pjob == NULL)
        return(-1);

      /* NOTE(review): get_jobs_array() is assumed to be able to return NULL
       * while leaving pjob valid - guard so pa is never dereferenced then */
      if (pa != NULL)
        {
        for (i = 0; i < pa->ai_qs.array_size; i++)
          {
          if (pa->job_ids[i] == NULL)
            continue;

          /* skip the job that is being deleted */
          if (!strcmp(pa->job_ids[i], pjob->ji_qs.ji_jobid))
            continue;

          if ((tmp = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
            {
            /* sibling no longer exists, forget its id */
            free(pa->job_ids[i]);
            pa->job_ids[i] = NULL;
            }
          else
            {
            if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l)
              {
              tmp->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;
              
              if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
                {
                tmp->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
                }
              
              svr_evaljobstate(tmp, &newstate, &newsub, 1);
              svr_setjobstate(tmp, newstate, newsub, FALSE);
              job_save(tmp, SAVEJOB_FULL, 0);

              unlock_ji_mutex(tmp, __func__, "5", LOGLEVEL);
              
              /* only one held sibling is released per deleted job */
              break;
              }

            unlock_ji_mutex(tmp, __func__, "6", LOGLEVEL);
            }
          }

        if (LOGLEVEL >= 7)
          {
          snprintf(log_buf, sizeof(log_buf), "%s: unlocking ai_mutex", __func__);
          log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
          }
        pthread_mutex_unlock(pa->ai_mutex);
        }
      }
    } /* END MoabArrayCompatible check */

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, do end job processing */
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE);

    /* force new connection */
    pjob->ji_momhandle = -1;

    if (LOGLEVEL >= 7)
      {
      snprintf(log_buf, sizeof(log_buf), "calling on_job_exit from %s", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      }

    set_task(WORK_Immed, 0, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in file, should remove them */

    remove_stagein(&pjob);

    if (pjob != NULL)
      job_abt(&pjob, Msg);

    /* nothing left to unlock on this path */
    has_mutex = FALSE;
    }
  else
    {
    /*
     * the job is not transitting (though it may have been) and
     * is not running, so put in into a complete state.
     */
    struct pbs_queue *pque;
    int  KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE, FALSE);

    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      pque->qu_numcompleted++;

      unlock_queue(pque, __func__, NULL, LOGLEVEL);
      
      if (LOGLEVEL >= 7)
        {
        snprintf(log_buf, sizeof(log_buf), "calling on_job_exit from %s", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }
    
      /* NOTE(review): pque->qu_attr is read below after unlock_queue();
       * confirm that this is safe with respect to the queue's locking */
      pthread_mutex_lock(server.sv_attr_mutex);
      KeepSeconds = attr_ifelse_long(
                    &pque->qu_attr[QE_ATR_KeepCompleted],
                    &server.sv_attr[SRV_ATR_KeepCompleted],
                    0);
      pthread_mutex_unlock(server.sv_attr_mutex);
      }
    else
      KeepSeconds = 0;

    /* get_jobs_queue() takes &pjob, so the pointer may come back NULL */
    if (pjob != NULL)
      {
      set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
      }
    else
      has_mutex = FALSE;
    }  /* END else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0) */

  if (has_mutex == TRUE)
    unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL);

  return(PBSE_NONE);
  } /* END execute_job_delete() */
Beispiel #26
0
static void stat_update(

  struct work_task *pwt)

  {

  struct stat_cntl     *cntl;
  job                  *pjob;

  struct batch_request *preq;

  struct batch_reply   *preply;

  struct brp_status    *pstatus;
  svrattrl        *sattrl;
  int    oldsid;

  preq = pwt->wt_parm1;
  preply = &preq->rq_reply;
  cntl = preq->rq_extra;

  if (preply->brp_choice == BATCH_REPLY_CHOICE_Status)
    {
    pstatus = (struct brp_status *)GET_NEXT(preply->brp_un.brp_status);

    while (pstatus != NULL)
      {
      if ((pjob = find_job(pstatus->brp_objname)))
        {
        sattrl = (svrattrl *)GET_NEXT(pstatus->brp_attr);

        oldsid = pjob->ji_wattr[(int)JOB_ATR_session_id].at_val.at_long;

        modify_job_attr(
          pjob,
          sattrl,
          ATR_DFLAG_MGWR | ATR_DFLAG_SvWR,
          &bad);

        if (oldsid != pjob->ji_wattr[(int)JOB_ATR_session_id].at_val.at_long)
          {
          /* first save since running job (or the sid has changed), */
          /* must save session id    */

          job_save(pjob, SAVEJOB_FULL);

          svr_mailowner(pjob, MAIL_BEGIN, MAIL_NORMAL, NULL);
          }

#ifdef USESAVEDRESOURCES
        else
          {
          /* save so we can recover resources used */
          job_save(pjob, SAVEJOB_FULL);
          }
#endif    /* USESAVEDRESOURCES */


        pjob->ji_momstat = time_now;
        }

      pstatus = (struct brp_status *)GET_NEXT(pstatus->brp_stlink);
      }  /* END while (pstatus != NULL) */
    }    /* END if (preply->brp_choice == BATCH_REPLY_CHOICE_Status) */
  else
    {
    if (preply->brp_code == PBSE_UNKJOBID)
      {
      /* we sent a stat request, but mom says it doesn't know anything about
         the job */
      if ((pjob = find_job(preq->rq_ind.rq_status.rq_id)))
        {
        /* job really isn't running any more - mom doesn't know anything about it
           this can happen if a diskless node reboots and the mom_priv/jobs
           directory is cleared, set its state to queued so job_abt doesn't
           think it is still running */
        svr_setjobstate(pjob, JOB_STATE_QUEUED, JOB_SUBSTATE_ABORT);
        rel_resc(pjob);
        job_abt(&pjob, "Job does not exist on node");

        /* TODO, if the job is rerunnable we should set its state back to queued */

        }
      }
    }

  release_req(pwt);

  cntl->sc_conn = -1;

  if (cntl->sc_post)
    cntl->sc_post(cntl); /* continue where we left off */
  else
    free(cntl); /* a bit of a kludge but its saves an extra func */

  return;
  }  /* END stat_update() */