Example #1
int queue_cpy_del(struct queue_head *from, struct queue_head *to)
{
    lock_queue(from);
    lock_queue(to);

    to->head = from->head;
    to->tail = from->tail;
    from->head = (queue_body_t *) QUEUE_HEAD_NULL;

    from->tail = (queue_body_t *) QUEUE_TAIL_NULL;
    to->curLength = from->curLength;
    from->curLength = 0;

    unlock_queue(to);
    unlock_queue(from);
    return 0;
}
Example #2
void gpu_worker(void *arg)
{
	hs_worker *worker_arg = (hs_worker *) arg;

	bind_to_cpu(worker_arg);

	init_cuda(worker_arg->device_id);

//	printf("GPU I am id:%d\n", worker_arg->worker_id);

	pthread_mutex_lock(&worker_arg->mutex);
	worker_arg->initialized = 1;
	pthread_cond_signal(&worker_arg->ready);
	pthread_mutex_unlock(&worker_arg->mutex);

	_task_t task;
	while (is_running())
	{
		lock_queue(worker_arg->task_queue);

		task = pop_task(worker_arg->task_queue);

		if (task == NULL)
		{
			if (is_running())
				sleep_worker(worker_arg);

			unlock_queue(worker_arg->task_queue);
			continue;
		}

		unlock_queue(worker_arg->task_queue);

		if ((task->task->arch_type & worker_arg->arch) != worker_arg->arch)
		{
			push_task(worker_arg->task_queue, task);
			continue;
		}

		execute_task(worker_arg, task);
	}

	deinit_cuda();

	pthread_exit((void*) 0);
}
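The queue_cpy_del() and gpu_worker() snippets above rely on a single-argument lock_queue()/unlock_queue() pair whose definition is not part of the excerpt. As a minimal sketch only, assuming each queue structure embeds a pthread mutex (the field name "lock" below is hypothetical), the wrappers could look like this:

#include <pthread.h>

/* Hypothetical layout: the real struct queue_head in the excerpt also holds
 * head, tail and curLength; only the mutex matters for the wrappers. */
struct queue_head {
    pthread_mutex_t lock;   /* serializes access to the queue contents */
};

/* Minimal wrappers matching the one-argument calls used above. */
static inline void lock_queue(struct queue_head *q)
{
    pthread_mutex_lock(&q->lock);
}

static inline void unlock_queue(struct queue_head *q)
{
    pthread_mutex_unlock(&q->lock);
}

Note that queue_cpy_del() takes both locks before touching either queue; callers would need a consistent lock order (always locking 'from' before 'to', or some global ordering) to avoid a deadlock if two copies ever run in opposite directions.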
Example #3
static work_q_item_t *
create_work_item(void)
{
	work_q_item_t *item;

	/* Try to reuse item from free item queue */
	lock_queue(&g_free_q);

	if ((item = (work_q_item_t *)sq_remfirst(&(g_free_q.q)))) {
		g_free_q.size--;
	}

	unlock_queue(&g_free_q);

	/* If there weren't any free items then allocate memory for new ones */
	if (item == NULL) {
		item = (work_q_item_t *)malloc(k_work_item_allocation_chunk_size * sizeof(work_q_item_t));

		if (item) {
			item->first = 1;
			lock_queue(&g_free_q);

			for (size_t i = 1; i < k_work_item_allocation_chunk_size; i++) {
				(item + i)->first = 0;
				sq_addfirst(&(item + i)->link, &(g_free_q.q));
			}

			/* Update the queue size and potentially the maximum queue size */
			g_free_q.size += k_work_item_allocation_chunk_size - 1;

			if (g_free_q.size > g_free_q.max_size) {
				g_free_q.max_size = g_free_q.size;
			}

			unlock_queue(&g_free_q);
		}
	}

	/* If we got one then lock the item */
	if (item) {
		px4_sem_init(&item->wait_sem, 1, 0);        /* Caller will wait on this... initially locked */
	}

	/* return the item pointer, or NULL if all failed */
	return item;
}
Example #4
void save_queues()

  {
  struct pbs_queue *pque;
  int               iter = -1;

  while ((pque = next_queue(&svr_queues, &iter)) != NULL)
    {
    que_save(pque);
    unlock_queue(pque, __func__, NULL, 0);
    }
  } /* END save_queues() */
Example #5
pbs_queue *find_queuebyname(

  char *quename) /* I */

  {
  char      *pc;
  pbs_queue *pque = NULL;
  char       qname[PBS_MAXDEST + 1];
  char       log_buf[LOCAL_LOG_BUF_SIZE+1];
  int        i;

  snprintf(qname, sizeof(qname), "%s", quename);

  if (LOGLEVEL >= 7)
    {
    sprintf(log_buf, "%s", quename);
    LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
    }

  pc = strchr(qname, (int)'@'); /* strip off server (fragment) */

  if (pc != NULL)
    *pc = '\0';

  lock_allques_mutex(&svr_queues, __func__, NULL, LOGLEVEL);

  i = get_value_hash(svr_queues.ht,qname);

  if (i >= 0)
    {
    pque = svr_queues.ra->slots[i].item;
    }
  
  if (pque != NULL)
    lock_queue(pque, __func__, NULL, LOGLEVEL);

  unlock_allques_mutex(&svr_queues, __func__, NULL, LOGLEVEL);
  
  if (pque != NULL)
    {
    if (pque->q_being_recycled != FALSE)
      {
      unlock_queue(pque, __func__, "recycled queue", LOGLEVEL);
      pque = NULL;
      }
    }

  if (pc != NULL)
    *pc = '@'; /* restore '@' server portion */

  return(pque);
  }  /* END find_queuebyname() */
Example #6
static inline void
destroy_work_item(work_q_item_t *item)
{
	sem_destroy(&item->wait_sem); /* Destroy the item lock */
	/* Return the item to the free item queue for later reuse */
	lock_queue(&g_free_q);
	sq_addfirst(&item->link, &(g_free_q.q));

	/* Update the queue size and potentially the maximum queue size */
	if (++g_free_q.size > g_free_q.max_size)
		g_free_q.max_size = g_free_q.size;

	unlock_queue(&g_free_q);
}
Example #7
static inline work_q_item_t *
dequeue_work_item(void)
{
	work_q_item_t *work;

	/* retrieve the 1st item on the work queue */
	lock_queue(&g_work_q);

	if ((work = (work_q_item_t *)sq_remfirst(&g_work_q.q)))
		g_work_q.size--;

	unlock_queue(&g_work_q);
	return work;
}
Example #8
pbs_queue *find_queuebyname(

  char *quename) /* I */

  {
  char  *pc;
  pbs_queue *pque = NULL;
  char   qname[PBS_MAXDEST + 1];
  int    i;

  snprintf(qname, sizeof(qname), "%s", quename);

  pc = strchr(qname, (int)'@'); /* strip off server (fragment) */

  if (pc != NULL)
    *pc = '\0';

  pthread_mutex_lock(svr_queues.allques_mutex);

  i = get_value_hash(svr_queues.ht,qname);

  if (i >= 0)
    {
    pque = svr_queues.ra->slots[i].item;
    }
  
  if (pque != NULL)
    lock_queue(pque, __func__, NULL, LOGLEVEL);

  pthread_mutex_unlock(svr_queues.allques_mutex);
  
  if (pque != NULL)
    {
    if (pque->q_being_recycled != FALSE)
      {
      unlock_queue(pque, __func__, "recycled queue", LOGLEVEL);
      pque = NULL;
      }
    }

  if (pc != NULL)
    *pc = '@'; /* restore '@' server portion */

  return(pque);
  }  /* END find_queuebyname() */
Example #9
void que_free(

  pbs_queue *pq,
  int        sv_qs_mutex_held)

  {
  int            i;
  pbs_attribute *pattr;
  attribute_def *pdef;

  /* remove any calloc working pbs_attribute space */
  for (i = 0;i < QA_ATR_LAST;i++)
    {
    pdef  = &que_attr_def[i];
    pattr = &pq->qu_attr[i];

    pdef->at_free(pattr);

    /* remove any acl lists associated with the queue */

    if (pdef->at_type == ATR_TYPE_ACL)
      {
      pattr->at_flags |= ATR_VFLAG_MODIFY;

      save_acl(pattr, pdef, pdef->at_name, pq->qu_qs.qu_name);
      }
    }

  /* now free the main structure */
  if (sv_qs_mutex_held == FALSE)
    lock_sv_qs_mutex(server.sv_qs_mutex, __func__);
  server.sv_qs.sv_numque--;
  if (sv_qs_mutex_held == FALSE)
    unlock_sv_qs_mutex(server.sv_qs_mutex, __func__);

  free_user_info_holder(pq->qu_uih);

  remove_queue(&svr_queues, pq);
  pq->q_being_recycled = TRUE;
  insert_into_queue_recycler(pq);
  unlock_queue(pq, "que_free", NULL, LOGLEVEL);

  return;
  }  /* END que_free() */
Example #10
pbs_queue *lock_queue_with_job_held(

  pbs_queue  *pque,
  job       **pjob_ptr)

  {
  char       jobid[PBS_MAXSVRJOBID + 1];
  job       *pjob = *pjob_ptr;
  char       log_buf[LOCAL_LOG_BUF_SIZE];

  if (pque != NULL)
    {
    if (pthread_mutex_trylock(pque->qu_mutex))
      {
      /* if fail */
      strcpy(jobid, pjob->ji_qs.ji_jobid);
      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
      lock_queue(pque, __func__, NULL, LOGLEVEL);

      if ((pjob = svr_find_job(jobid, TRUE)) == NULL)
        {
        unlock_queue(pque, __func__, NULL, LOGLEVEL);
        pque = NULL;
        *pjob_ptr = NULL;
        }
      }
    else
      {
      if (LOGLEVEL >= 10)
        {
        snprintf(log_buf, sizeof(log_buf), "try lock succeeded for queue %s on job %s", pque->qu_qs.qu_name, pjob->ji_qs.ji_jobid);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
        }
      }
    }

  return(pque);
  } /* END lock_queue_with_job_held() */
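lock_queue_with_job_held() above avoids a lock-order deadlock: it try-locks the queue mutex while the job mutex is held and, on contention, drops the job mutex, takes the queue mutex, and then re-looks up the job, which may have disappeared in the meantime (the svr_find_job() call returning NULL is that revalidation). A stripped-down sketch of the same trylock-and-revalidate pattern with plain pthreads follows; every name in it is illustrative and not taken from the original code:

#include <pthread.h>
#include <stdbool.h>

/* Illustrative only: 'held' is a mutex the caller already owns, 'wanted' is the
 * one it needs next.  Returns true with both mutexes held; returns false, with
 * both mutexes released, if the object guarded by 'held' vanished while 'held'
 * was temporarily dropped. */
static bool lock_second_mutex(pthread_mutex_t *held,
                              pthread_mutex_t *wanted,
                              bool (*still_valid)(void *arg), void *arg)
{
    if (pthread_mutex_trylock(wanted) == 0)
        return true;                      /* fast path: no contention */

    /* Slow path: release 'held', then take both in the fixed order
     * wanted -> held, as lock_queue_with_job_held() does above. */
    pthread_mutex_unlock(held);
    pthread_mutex_lock(wanted);
    pthread_mutex_lock(held);

    /* The guarded object may have been deleted while 'held' was unlocked. */
    if (!still_valid(arg)) {
        pthread_mutex_unlock(held);
        pthread_mutex_unlock(wanted);
        return false;
    }

    return true;
}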
Example #11
/**
 * attempt_delete()
 * deletes a job differently depending on the job's state
 *
 * @return TRUE if the job was deleted, FALSE if skipped
 * @param j - a pointer to the job being handled
 */
int attempt_delete(

  void *j) /* I */

  {
  int        skipped = FALSE;
  int        release_mutex = TRUE;

  job       *pjob;
  time_t     time_now = time(NULL);
  char       log_buf[LOCAL_LOG_BUF_SIZE];

  /* job considered deleted if null */
  if (j == NULL)
    return(TRUE);

  pjob = (job *)j;

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /* I'm not sure if this is still possible since the thread
     * waits on the job to finish transmitting, but I'll leave
     * this part here --dbeer */
    skipped = TRUE;
    
    return(!skipped);
    }  /* END if (pjob->ji_qs.ji_state == JOB_SUBSTATE_TRANSIT) */

  else if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN)
    {
    /* we'll wait for the mom to get this job, then delete it */
    skipped = TRUE;
    }  /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */

  else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /* set up nanny */
    
    if (pjob->ji_has_delete_nanny == FALSE)
      {
      apply_job_delete_nanny(pjob, time_now + 60);
      
      /* need to issue a signal to the mom, but we don't want to send an ack to the
       * client when the mom replies */
      issue_signal(&pjob, "SIGTERM", post_delete, NULL);
      }

    if (pjob != NULL)
      {
      if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
        {
        /* job has restart file at mom, change restart comment if failed */
        change_restart_comment_if_needed(pjob);
        }

      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
      }
    
    return(!skipped);
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, change restart comment if failed */    
    change_restart_comment_if_needed(pjob);
    
    /* job has restart file at mom, do end job processing */
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE);

    pjob->ji_momhandle = -1;
    
    /* force new connection */
    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "calling on_job_exit from %s", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      }

    set_task(WORK_Immed, 0, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in files, should remove them */
    
    remove_stagein(&pjob);
    
    if (pjob != NULL)
      job_abt(&pjob, NULL);

    release_mutex = FALSE;
    }
  else
    {
    /*
     * the job is not transiting (though it may have been) and
     * is not running, so put it into a complete state.
     */
    struct pbs_queue *pque;
    int  KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE, FALSE);
    
    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      pque->qu_numcompleted++;

      unlock_queue(pque, __func__, NULL, LOGLEVEL);
      }
    
    if (pjob != NULL)
      {
      pthread_mutex_lock(server.sv_attr_mutex);
      KeepSeconds = attr_ifelse_long(
        &pque->qu_attr[QE_ATR_KeepCompleted],
        &server.sv_attr[SRV_ATR_KeepCompleted],
        0);
      pthread_mutex_unlock(server.sv_attr_mutex);
      
      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "calling on_job_exit from %s", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }
      
      set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
      }
    else
      release_mutex = FALSE;
    }

  if (release_mutex == TRUE)
    unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);

  return(!skipped);
  } /* END attempt_delete() */
Example #12
int modify_job_attr(

  job      *pjob,  /* I (modified) */
  svrattrl *plist, /* I */
  int       perm,
  int      *bad)   /* O */

  {
  int            allow_unkn = -1;
  long           i;
  pbs_attribute  newattr[JOB_ATR_LAST];
  pbs_attribute *pattr;
  int            rc;
  char           log_buf[LOCAL_LOG_BUF_SIZE];
  pbs_queue     *pque;

  if ((pque = get_jobs_queue(&pjob)) != NULL)
    {
    if (pque->qu_qs.qu_type != QTYPE_Execution)
      allow_unkn = JOB_ATR_UNKN;

    unlock_queue(pque, __func__, NULL, LOGLEVEL);
    }
  else if (pjob->ji_parent_job != NULL)
    {
    allow_unkn = JOB_ATR_UNKN;
    }
  else
    {
    log_err(PBSE_JOBNOTFOUND, __func__, "Job lost while acquiring queue 5");
    return(PBSE_JOBNOTFOUND);
    }

  pattr = pjob->ji_wattr;

  /* call attr_atomic_set to decode and set a copy of the attributes */

  rc = attr_atomic_set(
         plist,        /* I */
         pattr,        /* I */
         newattr,      /* O */
         job_attr_def, /* I */
         JOB_ATR_LAST,
         allow_unkn,   /* I */
         perm,         /* I */
         bad);         /* O */

  /* if resource limits are being changed ... */

  if ((rc == 0) &&
      (newattr[JOB_ATR_resource].at_flags & ATR_VFLAG_SET))
    {
    if ((perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0)
      {
      /* If job is running, only manager/operator can raise limits */

      if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
        {
        long lim = TRUE;
        int comp_resc_lt;
       
        get_svr_attr_l(SRV_ATR_QCQLimits, &lim);
        comp_resc_lt = comp_resc2(&pjob->ji_wattr[JOB_ATR_resource],
                                      &newattr[JOB_ATR_resource],
                                      lim,
                                      NULL,
                                      LESS);

        if (comp_resc_lt != 0)
          {
          rc = PBSE_PERM;
          }
        }

      /* Also check against queue and system limits */

      if (rc == 0)
        {
        if ((pque = get_jobs_queue(&pjob)) != NULL)
          {
          rc = chk_resc_limits( &newattr[JOB_ATR_resource], pque, NULL);
          unlock_queue(pque, __func__, NULL, LOGLEVEL);
          }
        else if (pjob == NULL)
          {
          log_err(PBSE_JOBNOTFOUND, __func__, "Job lost while acquiring queue 6");
          return(PBSE_JOBNOTFOUND);
          }
        else
          rc = PBSE_QUENOTAVAILABLE;
        }
      }
    }    /* END if ((rc == 0) && ...) */

  /* special check on permissions for hold */

  if ((rc == 0) &&
      (newattr[JOB_ATR_hold].at_flags & ATR_VFLAG_MODIFY))
    {
    i = newattr[JOB_ATR_hold].at_val.at_long ^
        (pattr + JOB_ATR_hold)->at_val.at_long;

    rc = chk_hold_priv(i, perm);
    }

  if (rc == 0)
    {
    for (i = 0;i < JOB_ATR_LAST;i++)
      {
      if (newattr[i].at_flags & ATR_VFLAG_MODIFY)
        {
        if (job_attr_def[i].at_action)
          {
          rc = job_attr_def[i].at_action(
                 &newattr[i],
                 pjob,
                 ATR_ACTION_ALTER);

          if (rc)
            break;
          }
        }
      }    /* END for (i) */

    if ((rc == 0) &&
        ((newattr[JOB_ATR_userlst].at_flags & ATR_VFLAG_MODIFY) ||
         (newattr[JOB_ATR_grouplst].at_flags & ATR_VFLAG_MODIFY)))
      {
      /* need to reset execution uid and gid */

      rc = set_jobexid(pjob, newattr, NULL);
      }

    if ((rc == 0) &&
        (newattr[JOB_ATR_outpath].at_flags & ATR_VFLAG_MODIFY))
      {
      /* need to recheck if JOB_ATR_outpath is a special case of host only */

      if (newattr[JOB_ATR_outpath].at_val.at_str[strlen(newattr[JOB_ATR_outpath].at_val.at_str) - 1] == ':')
        {
        dynamic_string *ds = get_dynamic_string(-1, NULL);
        newattr[JOB_ATR_outpath].at_val.at_str = prefix_std_file(pjob, ds, (int)'o');

        /* don't call free_dynamic_string() */
        free(ds);
        }
      /*
       * if the output path was specified and ends with a '/'
       * then append the standard file name
       */
      else if (newattr[JOB_ATR_outpath].at_val.at_str[strlen(newattr[JOB_ATR_outpath].at_val.at_str) - 1] == '/')
        {
        dynamic_string *ds = get_dynamic_string(-1, NULL);
        newattr[JOB_ATR_outpath].at_val.at_str[strlen(newattr[JOB_ATR_outpath].at_val.at_str) - 1] = '\0';
        
        replace_attr_string(&newattr[JOB_ATR_outpath],
          (add_std_filename(pjob, newattr[JOB_ATR_outpath].at_val.at_str, (int)'o', ds)));

        /* don't call free_dynamic_string() because we still want to use the allocated string */
        free(ds);
        }
      }

    if ((rc == 0) &&
        (newattr[JOB_ATR_errpath].at_flags & ATR_VFLAG_MODIFY))
      {
      /* need to recheck if JOB_ATR_errpath is a special case of host only */

      if (newattr[JOB_ATR_errpath].at_val.at_str[strlen(newattr[JOB_ATR_errpath].at_val.at_str) - 1] == ':')
        {
        dynamic_string *ds = get_dynamic_string(-1, NULL);
        newattr[JOB_ATR_errpath].at_val.at_str = prefix_std_file(pjob, ds, (int)'e');

        /* don't call free_dynamic_string() */
        free(ds);
        }
      /*
       * if the error path was specified and ends with a '/'
       * then append the standard file name
       */
      else if (newattr[JOB_ATR_errpath].at_val.at_str[strlen(newattr[JOB_ATR_errpath].at_val.at_str) - 1] == '/')
        {
        dynamic_string *ds = get_dynamic_string(-1, NULL);
        newattr[JOB_ATR_errpath].at_val.at_str[strlen(newattr[JOB_ATR_errpath].at_val.at_str) - 1] = '\0';
        
        replace_attr_string(&newattr[JOB_ATR_errpath],
          (add_std_filename(pjob, newattr[JOB_ATR_errpath].at_val.at_str,(int)'e', ds)));

        /* don't call free_dynamic_string() */
        free(ds);
        }
      }

    }  /* END if (rc == 0) */

  if (rc != 0)
    {
    for (i = 0;i < JOB_ATR_LAST;i++)
      job_attr_def[i].at_free(newattr + i);

    /* FAILURE */

    return(rc);
    }  /* END if (rc != 0) */

  /* OK, now copy the new values into the job attribute array */

  for (i = 0;i < JOB_ATR_LAST;i++)
    {
    if (newattr[i].at_flags & ATR_VFLAG_MODIFY)
      {
      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "attr %s modified", job_attr_def[i].at_name);

        log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);
        }

      job_attr_def[i].at_free(pattr + i);

      if ((newattr[i].at_type == ATR_TYPE_LIST) ||
          (newattr[i].at_type == ATR_TYPE_RESC))
        {
        list_move(
          &newattr[i].at_val.at_list,
          &(pattr + i)->at_val.at_list);
        }
      else
        {
        *(pattr + i) = newattr[i];
        }

      (pattr + i)->at_flags = newattr[i].at_flags;
      }
    }    /* END for (i) */

  /* note, the newattr[] attributes are on the stack, they go away automatically */

  pjob->ji_modified = 1;

  return(0);
  }  /* END modify_job_attr() */
Example #13
static void req_stat_job_step2(

  struct stat_cntl *cntl)  /* I/O (free'd on return) */

  {
  svrattrl              *pal;
  job                   *pjob = NULL;

  struct batch_request  *preq;
  struct batch_reply    *preply;
  int                    rc = 0;
  enum TJobStatTypeEnum  type;
  pbs_queue             *pque = NULL;
  int                    exec_only = 0;

  int                    bad = 0;
  long                   DTime;  /* delta time - only report full pbs_attribute list if J->MTime > DTime */
  static svrattrl       *dpal = NULL;
  int                    job_array_index = 0;
  job_array             *pa = NULL;
  char                   log_buf[LOCAL_LOG_BUF_SIZE];
  int                    iter;
  time_t                 time_now = time(NULL);
  long                   poll_jobs = 0;
  char                   job_id[PBS_MAXSVRJOBID+1];
  int                    job_substate = -1;
  time_t                 job_momstattime = -1;

  preq   = cntl->sc_origrq;
  type   = (enum TJobStatTypeEnum)cntl->sc_type;
  preply = &preq->rq_reply;

  /* See pbs_server_attributes(1B) for details on "poll_jobs" behaviour */

  if (dpal == NULL)
    {
    /* build 'delta' pbs_attribute list */

    svrattrl *tpal;

    tlist_head dalist;

    int aindex;

    int atrlist[] =
      {
      JOB_ATR_jobname,
      JOB_ATR_resc_used,
      JOB_ATR_LAST
      };

    CLEAR_LINK(dalist);

    for (aindex = 0;atrlist[aindex] != JOB_ATR_LAST;aindex++)
      {
      if ((tpal = attrlist_create("", "", 23)) == NULL)
        {
        return;
        }

      tpal->al_valln = atrlist[aindex];

      if (dpal == NULL)
        dpal = tpal;

      append_link(&dalist, &tpal->al_link, tpal);
      }
    }  /* END if (dpal == NULL) */

  if (type == tjstArray)
    {
    pa = get_array(preq->rq_ind.rq_status.rq_id);

    if (pa == NULL)
      {
      req_reject(PBSE_UNKARRAYID, 0, preq, NULL, "unable to find array");
      return;
      }
    }

  iter = -1;

  get_svr_attr_l(SRV_ATR_PollJobs, &poll_jobs);
  if (!poll_jobs)
    {
    /* polljobs not set - indicates we may need to obtain fresh data from
       MOM */

    if (cntl->sc_jobid[0] == '\0')
      pjob = NULL;
    else
      pjob = svr_find_job(cntl->sc_jobid, FALSE);

    while (1)
      {
      if (pjob == NULL)
        {
        /* start from the first job */

        if (type == tjstJob)
          {
          pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE);
          }
        else if (type == tjstQueue)
          {
          pjob = next_job(cntl->sc_pque->qu_jobs,&iter);
          }
        else if (type == tjstArray)
          {
          job_array_index = 0;
          /* increment job_array_index until we find a non-null pointer or hit the end */
          while (job_array_index < pa->ai_qs.array_size)
            {
            if (pa->job_ids[job_array_index] != NULL)
              {
              if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
                {
                unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
                break;
                }
              }

            job_array_index++;
            }
          }
        else
          {
          pjob = next_job(&alljobs,&iter);
          }

        }    /* END if (pjob == NULL) */
      else
        {
        strcpy(job_id, pjob->ji_qs.ji_jobid);
        unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

        if (type == tjstJob)
          break;

        if (type == tjstQueue)
          pjob = next_job(cntl->sc_pque->qu_jobs,&iter);
        else if (type == tjstArray)
          {
          pjob = NULL;
          /* increment job_array_index until we find a non-null pointer or hit the end */
          while (++job_array_index < pa->ai_qs.array_size)
            {
            if (pa->job_ids[job_array_index] != NULL)
              {
              if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
                {
                unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
                break;
                }
              }
            }
          }
        else
          pjob = next_job(&alljobs,&iter);
          
        }

      if (pjob == NULL)
        break;

      strcpy(job_id, pjob->ji_qs.ji_jobid);
      job_substate = pjob->ji_qs.ji_substate;
      job_momstattime = pjob->ji_momstat;
      strcpy(cntl->sc_jobid, job_id);
      unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
      pjob = NULL;

      /* PBS_RESTAT_JOB defaults to 30 seconds */
      if ((job_substate == JOB_SUBSTATE_RUNNING) &&
          ((time_now - job_momstattime) > JobStatRate))
        {
        /* go to MOM for status */
        if ((rc = stat_to_mom(job_id, cntl)) == PBSE_MEM_MALLOC)
          break;

        if (rc != 0)
          {
          pjob = svr_find_job(job_id, FALSE);

          rc = 0;

          continue;
          }
        
        if (pa != NULL)
          unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

        return; /* will pick up after mom replies */
        }
      }    /* END while(1) */

    if (rc != 0)
      {
      if (pa != NULL)
        unlock_ai_mutex(pa, __func__, "2", LOGLEVEL);

      reply_free(preply);

      req_reject(rc, 0, preq, NULL, "cannot get update from mom");

      return;
      }
    }    /* END if (!server.sv_attr[SRV_ATR_PollJobs].at_val.at_long) */

  /*
   * now ready for part 3, building the status reply,
   * loop through again
   */

  if ((type == tjstSummarizeArraysQueue) || 
      (type == tjstSummarizeArraysServer))
    {
    /* No array can be owned for these options */
    update_array_statuses();
    }

  if (type == tjstJob)
    pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE);

  else if (type == tjstQueue)
    pjob = next_job(cntl->sc_pque->qu_jobs,&iter);

  else if (type == tjstSummarizeArraysQueue)
    pjob = next_job(cntl->sc_pque->qu_jobs_array_sum,&iter);

  else if (type == tjstSummarizeArraysServer)
    pjob = next_job(&array_summary,&iter);

  else if (type == tjstArray)
    {
    job_array_index = -1;
    pjob = NULL;
    /* increment job_array_index until we find a non-null pointer or hit the end */
    while (++job_array_index < pa->ai_qs.array_size)
      {
      if (pa->job_ids[job_array_index] != NULL)
        {
        if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
          {
          break;
          }
        }
      }
    }
  else
    pjob = next_job(&alljobs,&iter);

  DTime = 0;

  if (preq->rq_extend != NULL)
    {
    char *ptr;

    /* FORMAT:  { EXECQONLY | DELTA:<EPOCHTIME> } */

    if (strstr(preq->rq_extend, EXECQUEONLY))
      exec_only = 1;

    ptr = strstr(preq->rq_extend, "DELTA:");

    if (ptr != NULL)
      {
      ptr += strlen("delta:");

      DTime = strtol(ptr, NULL, 10);
      }
    }


  if ((type == tjstTruncatedServer) || 
      (type == tjstTruncatedQueue))
    {
    long sentJobCounter;
    long qjcounter;
    long qmaxreport;
    int  iter = -1;

    /* loop through all queues */
    while ((pque = next_queue(&svr_queues,&iter)) != NULL)
      {
      qjcounter = 0;

      if ((exec_only == 1) &&
          (pque->qu_qs.qu_type != QTYPE_Execution))
        {
        /* ignore routing queues */
        unlock_queue(pque, __func__, "ignore queue", LOGLEVEL);
        continue;
        }

      if (((pque->qu_attr[QA_ATR_MaxReport].at_flags & ATR_VFLAG_SET) != 0) &&
          (pque->qu_attr[QA_ATR_MaxReport].at_val.at_long >= 0))
        {
        qmaxreport = pque->qu_attr[QA_ATR_MaxReport].at_val.at_long;
        }
      else
        {
        qmaxreport = TMAX_JOB;
        }

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buf,"giving scheduler up to %ld idle jobs in queue %s\n",
          qmaxreport,
          pque->qu_qs.qu_name);

        log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_QUEUE,pque->qu_qs.qu_name,log_buf);
        }

      sentJobCounter = 0;

      /* loop through jobs in queue */
      if (pjob != NULL)
        unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);

      iter = -1;

      while ((pjob = next_job(pque->qu_jobs,&iter)) != NULL)
        {
        if ((qjcounter >= qmaxreport) &&
            (pjob->ji_qs.ji_state == JOB_STATE_QUEUED))
          {
          /* max_report of queued jobs reached for queue */
          unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL);

          continue;
          }

        pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

        rc = status_job(
               pjob,
               preq,
               (pjob->ji_wattr[JOB_ATR_mtime].at_val.at_long >= DTime) ? pal : dpal,
               &preply->brp_un.brp_status,
               &bad);

        if ((rc != 0) && (rc != PBSE_PERM))
          {
          req_reject(rc, bad, preq, NULL, NULL);

          if (pa != NULL)
            {
            unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
            }
          unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL);
          unlock_queue(pque, __func__, "perm", LOGLEVEL);
          return;
          }

        sentJobCounter++;

        if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED)
          qjcounter++;

        unlock_ji_mutex(pjob, __func__, "8", LOGLEVEL);
        }    /* END foreach (pjob from pque) */

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buf,"sent scheduler %ld total jobs for queue %s\n",
          sentJobCounter,
          pque->qu_qs.qu_name);

        log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_QUEUE,pque->qu_qs.qu_name,log_buf);
        }
    
      unlock_queue(pque, __func__, "end while", LOGLEVEL);
      }      /* END for (pque) */
      
    if (pa != NULL)
      unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

    reply_send_svr(preq);

    return;
    }        /* END if ((type == tjstTruncatedServer) || ...) */

  while (pjob != NULL)
    {
    /* go ahead and build the status reply for this job */

    if (exec_only)
      {
      if (cntl->sc_pque != NULL)
        {
        if (cntl->sc_pque->qu_qs.qu_type != QTYPE_Execution)
          goto nextjob;
        }
      else
        {
        if (pa != NULL)
          pthread_mutex_unlock(pa->ai_mutex);
        pque = get_jobs_queue(&pjob);
        if (pa != NULL)
          pthread_mutex_lock(pa->ai_mutex);

        if ((pjob == NULL) ||
            (pque == NULL))
          goto nextjob;
        
        if (pque->qu_qs.qu_type != QTYPE_Execution)
          {
          unlock_queue(pque, __func__, "not exec", LOGLEVEL);
        
          goto nextjob;
          }

        unlock_queue(pque, __func__, "exec", LOGLEVEL);
        }
      }

    pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

    rc = status_job(
           pjob,
           preq,
           pal,
           &preply->brp_un.brp_status,
           &bad);

    if ((rc != 0) && 
        (rc != PBSE_PERM))
      {
      if (pa != NULL)
        {
        unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
        }
      unlock_ji_mutex(pjob, __func__, "9", LOGLEVEL);

      req_reject(rc, bad, preq, NULL, NULL);

      return;
      }

    /* get next job */

nextjob:

    if (pjob != NULL)
      unlock_ji_mutex(pjob, __func__, "10", LOGLEVEL);

    if (type == tjstJob)
      break;

    if (type == tjstQueue)
      pjob = next_job(cntl->sc_pque->qu_jobs,&iter);
    else if (type == tjstSummarizeArraysQueue)
      pjob = next_job(cntl->sc_pque->qu_jobs_array_sum,&iter);
    else if (type == tjstSummarizeArraysServer)
      pjob = next_job(&array_summary,&iter);
    else if (type == tjstArray)
      {
      pjob = NULL;
      /* increment job_array_index until we find a non-null pointer or hit the end */
      while (++job_array_index < pa->ai_qs.array_size)
        {
        if (pa->job_ids[job_array_index] != NULL)
          {
          if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
            {
            break;
            }
          }
        }
      }
    else
      pjob = next_job(&alljobs,&iter);

    rc = 0;
    }  /* END while (pjob != NULL) */

  if (pa != NULL)
    {
    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
    }
 
  reply_send_svr(preq);

  if (LOGLEVEL >= 7)
    {
    log_event(PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      "req_statjob",
      "Successfully returned the status of queued jobs\n");
    }

  return;
  }  /* END req_stat_job_step2() */
Example #14
int req_stat_que(
    struct batch_request *preq)
  {
  char                 *name;
  pbs_queue            *pque = NULL;

  struct batch_reply   *preply;
  int                   rc   = 0;
  int                   type = 0;
  char log_buf[LOCAL_LOG_BUF_SIZE+1];

  /*
   * first, validate the name of the requested object, either
   * a queue, or null for all queues
   */

  name = preq->rq_ind.rq_status.rq_id;

  if ((*name == '\0') || (*name == '@'))
    {
    type = 1;
    }
  else
    {
    pque = find_queuebyname(name);

    if (pque == NULL)
      {
      rc = PBSE_UNKQUE;
      snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "can not locate queue %s", name);
      req_reject(rc, 0, preq, NULL, log_buf);
      return rc;
      }
    }

  preply = &preq->rq_reply;

  preply->brp_choice = BATCH_REPLY_CHOICE_Status;

  CLEAR_HEAD(preply->brp_un.brp_status);

  if (type == 0)
    {
    /* get status of the named queue */

    rc = status_que(pque, preq, &preply->brp_un.brp_status);
    unlock_queue(pque, "req_stat_que", "type == 0", LOGLEVEL);
    }
  else
    {
    /* pque == NULL before next_queue */
    int iter = -1;

    /* get status of all queues */
    while ((pque = next_queue(&svr_queues,&iter)) != NULL)
      {
      rc = status_que(pque, preq, &preply->brp_un.brp_status);

      if (rc != 0)
        {
        if (rc != PBSE_PERM)
          {
          unlock_queue(pque, "req_stat_que", "break", LOGLEVEL);
          break;
          }

        rc = 0;
        }

      unlock_queue(pque, "req_stat_que", "end while", LOGLEVEL);
      }
    }

  if (rc != PBSE_NONE)
    {
    reply_free(preply);

    req_reject(PBSE_NOATTR, rc, preq, NULL, "status_queue failed");
    }
  else
    {
    reply_send_svr(preq);
    }

  return rc;
  }  /* END req_stat_que() */
Example #15
int execute_job_delete(

  job                  *pjob,            /* M */
  char                 *Msg,             /* I */
  struct batch_request *preq)            /* I */

  {
  struct work_task *pwtnew;

  int               rc;
  char             *sigt = "SIGTERM";

  int               has_mutex = TRUE;
  char              log_buf[LOCAL_LOG_BUF_SIZE];
  time_t            time_now = time(NULL);
  long              force_cancel = FALSE;
  long              array_compatible = FALSE;

  chk_job_req_permissions(&pjob,preq);

  if (pjob == NULL)
    {
    /* preq is rejected in chk_job_req_permissions here */
    return(-1);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /* see note in req_delete - not sure this is possible still,
     * but the deleted code is irrelevant now. I will leave this
     * part --dbeer */
    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

    return(-1);
    }

  if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN1 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN2 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN3 )
    {
    /* If JOB_SUBSTATE_PRERUN is being sent to MOM, wait till she gets it going */
    /* retry in one second                            */
    /* If JOB_SUBSTATE_RERUN, RERUN1, RERUN2 or RERUN3 the
       job is being requeued. Wait until finished */

    static time_t  cycle_check_when = 0;
    static char    cycle_check_jid[PBS_MAXSVRJOBID + 1];

    if (cycle_check_when != 0)
      {
      if (!strcmp(pjob->ji_qs.ji_jobid, cycle_check_jid) &&
          (time_now - cycle_check_when > 10))
        {
        /* state not updated after 10 seconds */

        /* did the mom ever get it? delete it anyways... */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;

        goto jump;
        }

      if (time_now - cycle_check_when > 20)
        {
        /* give up after 20 seconds */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;
        }
      }    /* END if (cycle_check_when != 0) */

    if (cycle_check_when == 0)
      {
      /* new PRERUN job located */

      cycle_check_when = time_now;
      strcpy(cycle_check_jid, pjob->ji_qs.ji_jobid);
      }

    sprintf(log_buf, "job cannot be deleted, state=PRERUN, requeuing delete request");

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    pwtnew = set_task(WORK_Timed,time_now + 1,post_delete_route,preq,FALSE);
    
    unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);

    if (pwtnew == NULL)
      {
      req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

      return(-1);
      }
    else
      {
      return(ROUTE_DELETE);
      }
    }  /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */

jump:

  /*
   * Log delete and if requesting client is not job owner, send mail.
   */

  sprintf(log_buf, "requestor=%s@%s", preq->rq_user, preq->rq_host);


  /* NOTE:  should annotate accounting record with extend message (NYI) */
  account_record(PBS_ACCT_DEL, pjob, log_buf);

  sprintf(log_buf, msg_manager, msg_deletejob, preq->rq_user, preq->rq_host);

  log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

  /* NOTE:  should incorporate job delete message */

  if (Msg != NULL)
    {
    /* have text message in request extension, add it */
    strcat(log_buf, "\n");
    strcat(log_buf, Msg);
    }

  if ((svr_chk_owner(preq, pjob) != 0) &&
      (pjob->ji_has_delete_nanny == FALSE))
    {
    /* only send email if the owner did not delete the job and a job delete
       has not been previously attempted */

    svr_mailowner(pjob, MAIL_DEL, MAIL_FORCE, log_buf);
    /*
     * If we sent mail and already sent the extra message
     * then reset message so we don't trigger a redundant email
     * in job_abt()
    */

    if (Msg != NULL)
      {
      Msg = NULL;
      }
    }

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, change restart comment if failed */

    change_restart_comment_if_needed(pjob);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * setup a nanny task to make sure the job is actually deleted (see the
     * comments at job_delete_nanny()).
     */

    if (pjob->ji_has_delete_nanny == TRUE)
      {
      unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

      req_reject(PBSE_IVALREQ, 0, preq, NULL, "job cancel in progress");

      return(-1);
      }

    apply_job_delete_nanny(pjob, time_now + 60);

    /*
     * Send signal request to MOM.  The server will automagically
     * pick up and "finish" off the client request when MOM replies.
     */
    get_batch_request_id(preq);

    if ((rc = issue_signal(&pjob, sigt, post_delete_mom1, strdup(preq->rq_id))))
      {
      /* can't send to MOM */

      req_reject(rc, 0, preq, NULL, NULL);
      }

    /* normally will ack reply when mom responds */
    if (pjob != NULL)
      {
      sprintf(log_buf, msg_delrunjobsig, sigt);
      log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);
  
      unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
      }

    return(-1);
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* make a cleanup task if set */
  get_svr_attr_l(SRV_ATR_JobForceCancelTime, &force_cancel);
  if (force_cancel > 0)
    {
    char *dup_jobid = strdup(pjob->ji_qs.ji_jobid);
 
    set_task(WORK_Timed, time_now + force_cancel, ensure_deleted, dup_jobid, FALSE);    
    }

  /* if configured, and this job didn't have a slot limit hold, free a job
   * held with the slot limit hold */
  get_svr_attr_l(SRV_ATR_MoabArrayCompatible, &array_compatible);
  if ((array_compatible != FALSE) &&
      ((pjob->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) == FALSE))
    {
    if ((pjob->ji_arraystruct != NULL) &&
        (pjob->ji_is_array_template == FALSE))
      {
      int        i;
      int        newstate;
      int        newsub;
      job       *tmp;
      job_array *pa = get_jobs_array(&pjob);

      if (pjob == NULL)
        return(-1);

      for (i = 0; i < pa->ai_qs.array_size; i++)
        {
        if (pa->job_ids[i] == NULL)
          continue;

        if (!strcmp(pa->job_ids[i], pjob->ji_qs.ji_jobid))
          continue;

        if ((tmp = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
          {
          free(pa->job_ids[i]);
          pa->job_ids[i] = NULL;
          }
        else
          {
          if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l)
            {
            tmp->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;
            
            if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
              {
              tmp->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
              }
            
            svr_evaljobstate(tmp, &newstate, &newsub, 1);
            svr_setjobstate(tmp, newstate, newsub, FALSE);
            job_save(tmp, SAVEJOB_FULL, 0);

            unlock_ji_mutex(tmp, __func__, "5", LOGLEVEL);
            
            break;
            }

          unlock_ji_mutex(tmp, __func__, "6", LOGLEVEL);
          }
        }

      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "%s: unlocking ai_mutex", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }
      pthread_mutex_unlock(pa->ai_mutex);
      }
    } /* END MoabArrayCompatible check */

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, do end job processing */
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE);

    /* force new connection */
    pjob->ji_momhandle = -1;

    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "calling on_job_exit from %s", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      }

    set_task(WORK_Immed, 0, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in files, should remove them */

    remove_stagein(&pjob);

    if (pjob != NULL)
      job_abt(&pjob, Msg);

    has_mutex = FALSE;
    }
  else
    {
    /*
     * the job is not transiting (though it may have been) and
     * is not running, so put it into a complete state.
     */
    struct pbs_queue *pque;
    int  KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE, FALSE);

    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      pque->qu_numcompleted++;

      unlock_queue(pque, __func__, NULL, LOGLEVEL);
      
      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "calling on_job_exit from %s", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }
    
      pthread_mutex_lock(server.sv_attr_mutex);
      KeepSeconds = attr_ifelse_long(
                    &pque->qu_attr[QE_ATR_KeepCompleted],
                    &server.sv_attr[SRV_ATR_KeepCompleted],
                    0);
      pthread_mutex_unlock(server.sv_attr_mutex);
      }
    else
      KeepSeconds = 0;

    if (pjob != NULL)
      {
      set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
      }
    else
      has_mutex = FALSE;
    }  /* END else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0) */

  if (has_mutex == TRUE)
    unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL);

  return(PBSE_NONE);
  } /* END execute_job_delete() */
Example #16
int get_parent_dest_queues(

  char       *queue_parent_name,
  char       *queue_dest_name,
  pbs_queue **parent,
  pbs_queue **dest,
  job       **pjob_ptr)

  {
  pbs_queue *pque_parent;
  pbs_queue *pque_dest;
  char       jobid[PBS_MAXSVRJOBID + 1];
  char       log_buf[LOCAL_LOG_BUF_SIZE + 1];
  job       *pjob = *pjob_ptr;
  int        index_parent;
  int        index_dest;
  int        rc = PBSE_NONE;

  if (LOGLEVEL >= 7)
    {
    sprintf(log_buf, "%s", pjob->ji_qs.ji_jobid);
    LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
    }

  strcpy(jobid, pjob->ji_qs.ji_jobid);
  unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

  unlock_queue(*parent, __func__, NULL, 0);

  *parent = NULL;
  *dest   = NULL;

  lock_allques_mutex(&svr_queues, __func__, NULL, LOGLEVEL);

  index_parent = get_value_hash(svr_queues.ht, queue_parent_name);
  index_dest   = get_value_hash(svr_queues.ht, queue_dest_name);

  if ((index_parent < 0) ||
      (index_dest < 0))
    {
    rc = -1;
    }
  else
    {
    /* good path */
    pque_parent = svr_queues.ra->slots[index_parent].item;
    pque_dest   = svr_queues.ra->slots[index_dest].item;

    if ((pque_parent == NULL) ||
        (pque_dest == NULL))
      {
      rc = -1;
      }
    else
      {
      /* SUCCESS! */
      lock_queue(pque_parent, __func__, NULL, 0);
      lock_queue(pque_dest,   __func__, NULL, 0);
      *parent = pque_parent;
      *dest = pque_dest;

      rc = PBSE_NONE;
      }
    }

  unlock_allques_mutex(&svr_queues, __func__, NULL, LOGLEVEL);

  if ((*pjob_ptr = svr_find_job(jobid, TRUE)) == NULL)
    rc = -1;

  return(rc);
  } /* END get_parent_dest_queues() */
Example #17
int get_parent_dest_queues(

  char       *queue_parent_name,
  char       *queue_dest_name,
  pbs_queue **parent,
  pbs_queue **dest,
  job       **pjob_ptr)

  {
  pbs_queue *pque_parent;
  pbs_queue *pque_dest;
  char       jobid[PBS_MAXSVRJOBID + 1];
  job       *pjob = *pjob_ptr;
  int        index_parent;
  int        index_dest;
  int        rc = PBSE_NONE;

  strcpy(jobid, pjob->ji_qs.ji_jobid);
  unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

  unlock_queue(*parent, __func__, NULL, 0);

  *parent = NULL;
  *dest   = NULL;

  pthread_mutex_lock(svr_queues.allques_mutex);

  index_parent = get_value_hash(svr_queues.ht, queue_parent_name);
  index_dest   = get_value_hash(svr_queues.ht, queue_dest_name);

  if ((index_parent < 0) ||
      (index_dest < 0))
    {
    rc = -1;
    }
  else
    {
    /* good path */
    pque_parent = svr_queues.ra->slots[index_parent].item;
    pque_dest   = svr_queues.ra->slots[index_dest].item;

    if ((pque_parent == NULL) ||
        (pque_dest == NULL))
      {
      rc = -1;
      }
    else
      {
      /* SUCCESS! */
      lock_queue(pque_parent, __func__, NULL, 0);
      lock_queue(pque_dest,   __func__, NULL, 0);
      *parent = pque_parent;
      *dest = pque_dest;

      rc = PBSE_NONE;
      }
    }

  pthread_mutex_unlock(svr_queues.allques_mutex);

  if ((*pjob_ptr = svr_find_job(jobid, TRUE)) == NULL)
    rc = -1;

  return(rc);
  } /* END get_parent_dest_queues() */
Example #18
int delete_inactive_job(

  job        **pjob_ptr,
  const char  *Msg)

  {
  job  *pjob;
  char  log_buf[LOCAL_LOG_BUF_SIZE];

  if (pjob_ptr == NULL)
    return(PBSE_BAD_PARAMETER);

  pjob = *pjob_ptr;
  
  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, do end job processing */
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE);

    /* force new connection */
    pjob->ji_momhandle = -1;

    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "calling on_job_exit from %s", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      }

    set_task(WORK_Immed, 0, on_job_exit_task, strdup(pjob->ji_qs.ji_jobid), FALSE);
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in files, should remove them */
    remove_stagein(&pjob);

    if (pjob != NULL)
      job_abt(&pjob, Msg);
    }
  else
    {
    /*
     * the job is not transiting (though it may have been) and
     * is not running, so put it into a complete state.
     */
    struct pbs_queue *pque;
    int               KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE, FALSE);

    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      unlock_queue(pque, __func__, NULL, LOGLEVEL);
      
      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "calling on_job_exit from %s", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }
    
      pthread_mutex_lock(server.sv_attr_mutex);
      KeepSeconds = attr_ifelse_long(
                    &pque->qu_attr[QE_ATR_KeepCompleted],
                    &server.sv_attr[SRV_ATR_KeepCompleted],
                    0);
      pthread_mutex_unlock(server.sv_attr_mutex);
      }
    else
      KeepSeconds = 0;

    if (pjob != NULL)
      set_task(WORK_Timed, time(NULL) + KeepSeconds, on_job_exit_task, strdup(pjob->ji_qs.ji_jobid), FALSE);
    }  /* END else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0) */

  if (pjob == NULL)
    *pjob_ptr = NULL;

  return(PBSE_NONE);
  } /* END delete_inactive_job() */
Example #19
int req_stat_job(

  struct batch_request *preq)  /* ptr to the decoded request */

  {
  struct stat_cntl     *cntl; /* see svrfunc.h  */
  char                 *name;
  job                  *pjob = NULL;
  pbs_queue            *pque = NULL;
  int                   rc = PBSE_NONE;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];

  enum TJobStatTypeEnum type = tjstNONE;

  /*
   * first, validate the name of the requested object, either
   * a job, a queue, or the whole server.
   */
  if (LOGLEVEL >= 7)
    {
    sprintf(log_buf, "note");
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
    }


  /* FORMAT:  name = { <JOBID> | <QUEUEID> | '' } */

  name = preq->rq_ind.rq_status.rq_id;

  if (preq->rq_extend != NULL)
    {
    /* evaluate pbs_job_stat() 'extension' field */

    if (!strncasecmp(preq->rq_extend, "truncated", strlen("truncated")))
      {
      /* truncate response by 'max_report' */

      type = tjstTruncatedServer;
      }
    else if (!strncasecmp(preq->rq_extend, "summarize_arrays", strlen("summarize_arrays")))
      {
      type = tjstSummarizeArraysServer;
      }

    }    /* END if (preq->rq_extend != NULL) */

  if (isdigit((int)*name))
    {
    /* status a single job */

    if (is_array(name))
      {
      if (type != tjstSummarizeArraysServer)
        {
        type = tjstArray;
        }
      }
    else
      {
      type = tjstJob;

      if ((pjob = svr_find_job(name, FALSE)) == NULL)
        {
        rc = PBSE_UNKJOBID;
        }
      else
        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
      }
    }
  else if (isalpha(name[0]))
    {
    if (type == tjstNONE)
      type = tjstQueue;
    else if (type == tjstSummarizeArraysServer)
      type = tjstSummarizeArraysQueue;
    else
      type = tjstTruncatedQueue;

    /* if found, this mutex is released later */
    if ((pque = find_queuebyname(name)) == NULL)
      {
      rc = PBSE_UNKQUE;
      }
    }
  else if ((*name == '\0') || (*name == '@'))
    {
    /* status all jobs at server */

    if (type == tjstNONE)
      type = tjstServer;
    }
  else
    {
    rc = PBSE_IVALREQ;
    }

  if (rc != 0)
    {
    /* is invalid - an error */
    req_reject(rc, 0, preq, NULL, NULL);

    return(rc);
    }

  preq->rq_reply.brp_choice = BATCH_REPLY_CHOICE_Status;

  CLEAR_HEAD(preq->rq_reply.brp_un.brp_status);

  cntl = (struct stat_cntl *)calloc(1, sizeof(struct stat_cntl));

  if (cntl == NULL)
    {
    if (pque != NULL) 
      unlock_queue(pque, "req_stat_job", (char *)"no memory cntl", LOGLEVEL);
    req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

    return(PBSE_SYSTEM);
    }

  if ((type == tjstTruncatedQueue) ||
      (type == tjstTruncatedServer))
    {
    if (pque != NULL)
      {
      unlock_queue(pque, __func__, "", LOGLEVEL);
      pque = NULL;
      }
    }

  cntl->sc_type   = (int)type;
  cntl->sc_conn   = -1;
  cntl->sc_pque   = pque;
  cntl->sc_origrq = preq;
  cntl->sc_post   = req_stat_job_step2;
  cntl->sc_jobid[0] = '\0'; /* cause "start from beginning" */

  req_stat_job_step2(cntl); /* go to step 2, see if running is current */

  if (pque != NULL)
    unlock_queue(pque, "req_stat_job", (char *)"success", LOGLEVEL);

  free(cntl);
  return(PBSE_NONE);
  }  /* END req_stat_job() */
Example #20
void rerun_or_kill(

  job  **pjob_ptr, /* I (modified/freed) */
  char  *text)     /* I */

  {
  long       server_state = SV_STATE_DOWN;
  char       log_buf[LOCAL_LOG_BUF_SIZE];
  pbs_queue *pque;
  job       *pjob = *pjob_ptr;

  get_svr_attr_l(SRV_ATR_State, &server_state);
  if (pjob->ji_wattr[JOB_ATR_rerunable].at_val.at_long)
    {
    /* job is rerunable, mark it to be requeued */

    issue_signal(&pjob, "SIGKILL", free_br, NULL);

    if (pjob != NULL)
      {
      pjob->ji_qs.ji_substate  = JOB_SUBSTATE_RERUN;
      if ((pque = get_jobs_queue(&pjob)) != NULL)
        {
        snprintf(log_buf, sizeof(log_buf), "%s%s%s", msg_init_queued, pque->qu_qs.qu_name, text);

        unlock_queue(pque, __func__, NULL, LOGLEVEL);
        }
      }
    }
  else if (server_state != SV_STATE_SHUTDEL)
    {
    /* job not rerunable, immediate shutdown - kill it off */
    snprintf(log_buf, sizeof(log_buf), "%s%s", msg_job_abort, text);

    /* need to record log message before purging job */

    log_event(
      PBSEVENT_SYSTEM | PBSEVENT_JOB | PBSEVENT_DEBUG,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buf);

    job_abt(pjob_ptr, log_buf);

    return;
    }
  else
    {
    /* delayed shutdown, leave job running */
    snprintf(log_buf, sizeof(log_buf), "%s%s", msg_leftrunning, text);
    }

  if (pjob != NULL)
    {
    log_event(
      PBSEVENT_SYSTEM | PBSEVENT_JOB | PBSEVENT_DEBUG,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buf);
    }

  return;
  }  /* END rerun_or_kill() */
Example #21
static void req_stat_job_step2(

  struct stat_cntl *cntl)  /* I/O (free'd on return) */

  {
  svrattrl              *pal;
  job                   *pjob = NULL;

  struct batch_request  *preq;
  struct batch_reply    *preply;
  int                    rc = 0;
  enum TJobStatTypeEnum  type;
  pbs_queue             *pque = NULL;
  int                    exec_only = 0;

  int                    bad = 0;
  long                   DTime;  /* delta time - only report full pbs_attribute list if J->MTime > DTime */
  static svrattrl       *dpal = NULL;
  int                    job_array_index = 0;
  job_array             *pa = NULL;
  char                   log_buf[LOCAL_LOG_BUF_SIZE];
  all_jobs_iterator      *iter;

  preq   = cntl->sc_origrq;
  type   = (enum TJobStatTypeEnum)cntl->sc_type;
  preply = &preq->rq_reply;

  /* See pbs_server_attributes(1B) for details on "poll_jobs" behaviour */

  if (dpal == NULL)
    {
    /* build 'delta' pbs_attribute list */

    svrattrl *tpal;

    tlist_head dalist;

    int aindex;

    int atrlist[] =
      {
      JOB_ATR_jobname,
      JOB_ATR_resc_used,
      JOB_ATR_LAST
      };

    CLEAR_LINK(dalist);

    for (aindex = 0;atrlist[aindex] != JOB_ATR_LAST;aindex++)
      {
      if ((tpal = attrlist_create("", "", 23)) == NULL)
        {
        return;
        }

      tpal->al_valln = atrlist[aindex];

      if (dpal == NULL)
        dpal = tpal;

      append_link(&dalist, &tpal->al_link, tpal);
      }
    }  /* END if (dpal == NULL) */

  if (type == tjstArray)
    {
    pa = get_array(preq->rq_ind.rq_status.rq_id);

    if (pa == NULL)
      {
      req_reject(PBSE_UNKARRAYID, 0, preq, NULL, "unable to find array");
      return;
      }
    }

  {
  all_jobs *ajptr = NULL;

  if (type == tjstQueue)
    ajptr = cntl->sc_pque->qu_jobs;

  else if (type == tjstSummarizeArraysQueue)
    ajptr = cntl->sc_pque->qu_jobs_array_sum;

  else if (type == tjstSummarizeArraysServer)
    ajptr = &array_summary;

  else
    ajptr = &alljobs;

  ajptr->lock();
  iter = ajptr->get_iterator();
  ajptr->unlock();
  }

  /*
   * now ready for part 3, building the status reply,
   * loop through again
   */

  if ((type == tjstSummarizeArraysQueue) || 
      (type == tjstSummarizeArraysServer))
    {
    /* No array can be owned for these options */
    update_array_statuses();
    }


  if (type == tjstJob)
    pjob = svr_find_job(preq->rq_ind.rq_status.rq_id, FALSE);

  else if (type == tjstQueue)
    pjob = next_job(cntl->sc_pque->qu_jobs,iter);

  else if (type == tjstSummarizeArraysQueue)
    pjob = next_job(cntl->sc_pque->qu_jobs_array_sum,iter);

  else if (type == tjstSummarizeArraysServer)
    pjob = next_job(&array_summary,iter);

  else if (type == tjstArray)
    {
    job_array_index = -1;
    pjob = NULL;
    /* increment job_array_index until we find a non-null pointer or hit the end */
    while (++job_array_index < pa->ai_qs.array_size)
      {
      if (pa->job_ids[job_array_index] != NULL)
        {
        if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
          {
          break;
          }
        }
      }
    }
  else
    pjob = next_job(&alljobs,iter);

  DTime = 0;

  if (preq->rq_extend != NULL)
    {
    char *ptr;

    /* FORMAT:  { EXECQONLY | DELTA:<EPOCHTIME> } */

    if (strstr(preq->rq_extend, EXECQUEONLY))
      exec_only = 1;

    ptr = strstr(preq->rq_extend, "DELTA:");

    if (ptr != NULL)
      {
      ptr += strlen("delta:");

      DTime = strtol(ptr, NULL, 10);
      }
    }

  if ((type == tjstTruncatedServer) || 
      (type == tjstTruncatedQueue))
    {
    long sentJobCounter;
    long qjcounter;
    long qmaxreport;
    all_queues_iterator *iter = NULL;

    svr_queues.lock();
    iter = svr_queues.get_iterator();
    svr_queues.unlock();

    /* loop through all queues */
    while ((pque = next_queue(&svr_queues,iter)) != NULL)
      {
      qjcounter = 0;

      if ((exec_only == 1) &&
          (pque->qu_qs.qu_type != QTYPE_Execution))
        {
        /* ignore routing queues */
        unlock_queue(pque, __func__, "ignore queue", LOGLEVEL);
        continue;
        }

      if (((pque->qu_attr[QA_ATR_MaxReport].at_flags & ATR_VFLAG_SET) != 0) &&
          (pque->qu_attr[QA_ATR_MaxReport].at_val.at_long >= 0))
        {
        qmaxreport = pque->qu_attr[QA_ATR_MaxReport].at_val.at_long;
        }
      else
        {
        qmaxreport = TMAX_JOB;
        }

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buf,"giving scheduler up to %ld idle jobs in queue %s\n",
          qmaxreport,
          pque->qu_qs.qu_name);

        log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_QUEUE,pque->qu_qs.qu_name,log_buf);
        }

      sentJobCounter = 0;

      /* loop through jobs in queue */
      if (pjob != NULL)
        unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);

      all_jobs_iterator *jobiter = NULL;
      pque->qu_jobs->lock();
      jobiter = pque->qu_jobs->get_iterator();
      pque->qu_jobs->unlock();

      while ((pjob = next_job(pque->qu_jobs,jobiter)) != NULL)
        {
        if ((qjcounter >= qmaxreport) &&
            (pjob->ji_qs.ji_state == JOB_STATE_QUEUED))
          {
          /* max_report of queued jobs reached for queue */
          unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL);

          continue;
          }

        pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

        rc = status_job(
               pjob,
               preq,
               (pjob->ji_wattr[JOB_ATR_mtime].at_val.at_long >= DTime) ? pal : dpal,
               &preply->brp_un.brp_status,
               &bad);

        if ((rc != 0) && (rc != PBSE_PERM))
          {
          req_reject(rc, bad, preq, NULL, NULL);

          unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL);
          unlock_queue(pque, __func__, "perm", LOGLEVEL);

          delete iter;

          return;
          }

        sentJobCounter++;

        if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED)
          qjcounter++;

        unlock_ji_mutex(pjob, __func__, "8", LOGLEVEL);
        }    /* END foreach (pjob from pque) */

      if (LOGLEVEL >= 5)
        {
        sprintf(log_buf,"sent scheduler %ld total jobs for queue %s\n",
          sentJobCounter,
          pque->qu_qs.qu_name);

        log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_QUEUE,pque->qu_qs.qu_name,log_buf);
        }
    
      unlock_queue(pque, __func__, "end while", LOGLEVEL);
      }      /* END while (pque != NULL) */

    reply_send_svr(preq);

    delete iter;

    return;
    } /* END if ((type == tjstTruncatedServer) || ...) */

  while (pjob != NULL)
    {
    /* go ahead and build the status reply for this job */

    if (exec_only)
      {
      if (cntl->sc_pque != NULL)
        {
        if (cntl->sc_pque->qu_qs.qu_type != QTYPE_Execution)
          goto nextjob;
        }
      else
        {
        if (pa != NULL)
          pthread_mutex_unlock(pa->ai_mutex);
        pque = get_jobs_queue(&pjob);
        if (pa != NULL)
          pthread_mutex_lock(pa->ai_mutex);

        if ((pjob == NULL) ||
            (pque == NULL))
          goto nextjob;
        
        mutex_mgr pque_mutex = mutex_mgr(pque->qu_mutex, true);
        if (pque->qu_qs.qu_type != QTYPE_Execution)
          {
          goto nextjob;
          }
        }
      }

    pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_status.rq_attr);

    rc = status_job(
           pjob,
           preq,
           pal,
           &preply->brp_un.brp_status,
           &bad);

    if ((rc != 0) && 
        (rc != PBSE_PERM))
      {
      if (pa != NULL)
        {
        unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
        }

      unlock_ji_mutex(pjob, __func__, "9", LOGLEVEL);

      req_reject(rc, bad, preq, NULL, NULL);

      delete iter;

      return;
      }

    /* get next job */

nextjob:

    if (pjob != NULL)
      unlock_ji_mutex(pjob, __func__, "10", LOGLEVEL);

    if (type == tjstJob)
      break;

    if (type == tjstQueue)
      pjob = next_job(cntl->sc_pque->qu_jobs,iter);
    else if (type == tjstSummarizeArraysQueue)
      pjob = next_job(cntl->sc_pque->qu_jobs_array_sum,iter);
    else if (type == tjstSummarizeArraysServer)
      pjob = next_job(&array_summary,iter);
    else if (type == tjstArray)
      {
      pjob = NULL;
      /* increment job_array_index until we find a non-null pointer or hit the end */
      while (++job_array_index < pa->ai_qs.array_size)
        {
        if (pa->job_ids[job_array_index] != NULL)
          {
          if ((pjob = svr_find_job(pa->job_ids[job_array_index], FALSE)) != NULL)
            {
            break;
            }
          }
        }
      }
    else
      pjob = next_job(&alljobs,iter);

    rc = 0;
    }  /* END while (pjob != NULL) */

  delete iter;

  if (pa != NULL)
    {
    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
    }
 
  reply_send_svr(preq);

  if (LOGLEVEL >= 7)
    {
    log_event(PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_JOB,
      "req_statjob",
      "Successfully returned the status of queued jobs\n");
    }

  return;
  }  /* END req_stat_job_step2() */
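
/*
 * acct_job - append a job's accounting fields (user, group, account, job name,
 * queue, timestamps, owner, exec_host and the Resource_List attributes) to the
 * dynamic string ds.  Returns PBSE_NONE on success or a PBSE_* error code.
 */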
int acct_job(

  job            *pjob, /* I */
  dynamic_string *ds)   /* O */

  {
  int         rc;
  long        cray_enabled = FALSE;
  int         resc_access_perm = READ_ONLY;
  char        local_buf[MAXLINE*4];
  pbs_queue  *pque;

  tlist_head  attrlist;
  svrattrl   *pal;

  if (pjob == NULL)
    {
    return(PBSE_NONE);
    }

  CLEAR_HEAD(attrlist);

  /* user */

  /* acct_job is only called from account_jobstr and account_jobend. BufSize should be
   * PBS_ACCT_MAX_RCD + 1 in size. */
  sprintf(local_buf, "user=%s ",
    pjob->ji_wattr[JOB_ATR_euser].at_val.at_str);
  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* group */
  sprintf(local_buf, "group=%s ",
    pjob->ji_wattr[JOB_ATR_egroup].at_val.at_str);
  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* account */
  if (pjob->ji_wattr[JOB_ATR_account].at_flags & ATR_VFLAG_SET)
    {
    sprintf(local_buf, "account=%s ",
      pjob->ji_wattr[JOB_ATR_account].at_val.at_str);
    if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
      return(rc);
    }

  /* job name */
  sprintf(local_buf, "jobname=%s ",
    pjob->ji_wattr[JOB_ATR_jobname].at_val.at_str);
  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  if ((pque = get_jobs_queue(&pjob)) != NULL)
    {
    /* queue name */
    sprintf(local_buf, "queue=%s ", pque->qu_qs.qu_name);
    unlock_queue(pque, __func__, NULL, LOGLEVEL);

    if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
      return(rc);
    }
  else if (pjob == NULL)
    {
    log_err(PBSE_JOBNOTFOUND, __func__, "Job lost while acquiring queue 1");
    return(PBSE_JOBNOTFOUND);
    }

  /* create time */
  sprintf(local_buf, "ctime=%ld ",
    pjob->ji_wattr[JOB_ATR_ctime].at_val.at_long);
  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* queued time */
  sprintf(local_buf, "qtime=%ld ",
    pjob->ji_wattr[JOB_ATR_qtime].at_val.at_long);
  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* eligible time, how long ready to run */
  sprintf(local_buf, "etime=%ld ",
    pjob->ji_wattr[JOB_ATR_etime].at_val.at_long);
  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* execution start time */
  sprintf(local_buf, "start=%ld ",
    (long)pjob->ji_qs.ji_stime);
  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* owner */
  sprintf(local_buf, "owner=%s ",
    pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str);
  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);
 
  /* For large clusters these strings can get long, so append them through the
     dynamic string rather than the fixed-size local buffer */
  /* execution host name */
  if (pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str != NULL)
    {
    append_dynamic_string(ds, "exec_host=");
    append_dynamic_string(ds, pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str);
    if ((rc = append_dynamic_string(ds, " ")) != PBSE_NONE)
      return(rc);
    }

  get_svr_attr_l(SRV_ATR_CrayEnabled, &cray_enabled);
  if ((cray_enabled == TRUE) &&
      (pjob->ji_wattr[JOB_ATR_login_node_id].at_flags & ATR_VFLAG_SET))
    {
    append_dynamic_string(ds, "login_node=");
    append_dynamic_string(ds, pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str);
    if ((rc = append_dynamic_string(ds, " ")) != PBSE_NONE)
      return(rc);
    }

  /* now encode the job's resource_list pbs_attribute */
  job_attr_def[JOB_ATR_resource].at_encode(
    &pjob->ji_wattr[JOB_ATR_resource],
    &attrlist,
    job_attr_def[JOB_ATR_resource].at_name,
    NULL,
    ATR_ENCODE_CLIENT,
    resc_access_perm);

  while ((pal = GET_NEXT(attrlist)) != NULL)
    {
    /* resource names and values can use a lot of buffer space, so append them
     * via the dynamic string rather than a fixed buffer */
    append_dynamic_string(ds, pal->al_name);

    if (pal->al_resc != NULL)
      {
      append_dynamic_string(ds, ".");
      append_dynamic_string(ds, pal->al_resc);
      }

    append_dynamic_string(ds, "=");
    append_dynamic_string(ds, pal->al_value);
    if ((rc = append_dynamic_string(ds, " ")) != PBSE_NONE)
      return(rc);

    delete_link(&pal->al_link);
    free(pal);
    }  /* END while (pal != NULL) */

#ifdef ATTR_X_ACCT

  /* x attributes */
  if (pjob->ji_wattr[JOB_SITE_ATR_x].at_flags & ATR_VFLAG_SET)
    {
    sprintf(local_buf, "x=%s ",
            pjob->ji_wattr[JOB_SITE_ATR_x].at_val.at_str);
    if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
      return(rc);
    }

#endif

  /* SUCCESS */

  return(PBSE_NONE);
  }  /* END acct_job() */
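
/*
 * Illustrative caller sketch for acct_job() (not taken from the source).  It
 * assumes the TORQUE dynamic_string helpers get_dynamic_string() and
 * free_dynamic_string() and the str member of dynamic_string; the real callers
 * are account_jobstr() and account_jobend().
 */
static int example_build_acct_record(job *pjob, char **out)
  {
  int             rc;
  dynamic_string *ds = get_dynamic_string(-1, NULL);

  if (ds == NULL)
    return(PBSE_SYSTEM);

  if ((rc = acct_job(pjob, ds)) == PBSE_NONE)
    *out = strdup(ds->str);   /* "user=... group=... queue=... ..." */

  free_dynamic_string(ds);

  return(rc);
  }  /* END example_build_acct_record() (illustrative) */
/*
 * set_resc_assigned - add (INCR) or subtract (DECR) the job's Resource_List
 * values to/from the resources_assigned totals kept on the server and on the
 * job's execution queue.  Routing queues are rejected, and the
 * JOB_SVFLG_RescAssn flag prevents the same job from being counted twice.
 */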
void set_resc_assigned(

  job *pjob,         /* I */
  enum batch_op op)  /* INCR or DECR */

  {
  resource      *jobrsc;
  resource      *pr;
  pbs_attribute *queru;
  resource_def  *rscdef;
  pbs_attribute *sysru;
  pbs_queue     *pque;
  char           log_buf[LOCAL_LOG_BUF_SIZE];

  if (pjob == NULL)
    return;

  if ((pque = get_jobs_queue(&pjob)) != NULL)
    {
    if (pque->qu_qs.qu_type == QTYPE_Execution)
      {
      if (op == DECR)
        {
        /* if freeing completed job resources, ignore constraint (???) */
        /* NO-OP */
        }
      }
    else
      {
      snprintf(log_buf,sizeof(log_buf),
        "job %s isn't in an execution queue, can't modify resources\njob is in queue %s",
        pjob->ji_qs.ji_jobid,
        pque->qu_qs.qu_name);
      log_err(-1, __func__, log_buf);
    
      unlock_queue(pque, __func__, NULL, LOGLEVEL);
      return;
      }
  
    if (op == INCR)
      {
      if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_RescAssn)
        {
        unlock_queue(pque, __func__, NULL, LOGLEVEL);
        return;  /* already added in */
        }
      
      pjob->ji_qs.ji_svrflags |= JOB_SVFLG_RescAssn;
      }
    else if (op == DECR)
      {
      if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_RescAssn) == 0)
        {
        unlock_queue(pque, __func__, NULL, LOGLEVEL);
        return;  /* not currently included */
        }
      
      pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_RescAssn;
      }
    else
      {
      unlock_queue(pque, __func__, NULL, LOGLEVEL);
      return;   /* invalid op */
      }
    
    sysru = &server.sv_attr[SRV_ATR_resource_assn];

    queru = &pque->qu_attr[QE_ATR_ResourceAssn];
    jobrsc = (resource *)GET_NEXT(pjob->ji_wattr[JOB_ATR_resource].at_val.at_list);

    while (jobrsc != NULL)
      {
      rscdef = jobrsc->rs_defin;

      /* if resource usage is to be tracked */

      if ((rscdef->rs_flags & ATR_DFLAG_RASSN) &&
          (jobrsc->rs_value.at_flags & ATR_VFLAG_SET))
        {
        /* update system pbs_attribute of resources assigned */

        pr = find_resc_entry(sysru, rscdef);

        if (pr == NULL)
          {
          pr = add_resource_entry(sysru, rscdef);

          if (pr == NULL)
            {
            unlock_queue(pque, __func__, "sysru", LOGLEVEL);
            return;
            }
          }

        rscdef->rs_set(&pr->rs_value, &jobrsc->rs_value, op);

        /* update queue pbs_attribute of resources assigned */

        pr = find_resc_entry(queru, rscdef);

        if (pr == NULL)
          {
          pr = add_resource_entry(queru, rscdef);

          if (pr == NULL)
            {
            unlock_queue(pque, __func__, "queru", LOGLEVEL);
            return;
            }
          }

        rscdef->rs_set(&pr->rs_value, &jobrsc->rs_value, op);
        }

      jobrsc = (resource *)GET_NEXT(jobrsc->rs_link);
      }  /* END while (jobrsc != NULL) */

    unlock_queue(pque, __func__, "success", LOGLEVEL);
    }
  else if (pjob == NULL)
    {
    log_err(PBSE_JOBNOTFOUND, __func__, "Job lost while acquiring queue 9");
    }

  return;
  }  /* END set_resc_assigned() */
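
/*
 * queue_route - thread entry point that walks the jobs in the named routing
 * queue and retries routing for any job whose last route attempt is older than
 * ROUTE_RETRY_TIME.  The queue_name string passed in through vp is freed here.
 */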
void *queue_route(

  void *vp)

  {
  pbs_queue *pque;
  job       *pjob = NULL;
  char      *queue_name;
  char      log_buf[LOCAL_LOG_BUF_SIZE];

  int       iter = -1;
  time_t    time_now = time(NULL);

  queue_name = (char *)vp;

  if (queue_name == NULL)
    {
    sprintf(log_buf, "NULL queue name");
    log_err(-1, __func__, log_buf);
    return(NULL);
    }

  if (LOGLEVEL >= 7)
    {
    snprintf(log_buf, sizeof(log_buf), "queue name: %s", queue_name);
    log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_QUEUE, __func__, log_buf);
    }
  
  pthread_mutex_lock(reroute_job_mutex);

  pque = find_queuebyname(queue_name);
  if (pque == NULL)
    {
    sprintf(log_buf, "Could not find queue %s", queue_name);
    log_err(-1, __func__, log_buf);
    free(queue_name);
    pthread_mutex_unlock(reroute_job_mutex);
    return(NULL);
    }

  while ((pjob = next_job(pque->qu_jobs,&iter)) != NULL)
    {
    /* the second condition says we only want to try if routing
     * has been tried at least once - this is to let req_commit have the
     * first crack at routing always */
    unlock_queue(pque, __func__, (char *)NULL, 0);
    if ((pjob->ji_qs.ji_un.ji_routet.ji_rteretry <= time_now - ROUTE_RETRY_TIME) &&
        (pjob->ji_qs.ji_un.ji_routet.ji_rteretry != 0))
      {
      reroute_job(pjob, pque);
      unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL);
      }
    else
      unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL);
    }

  free(queue_name);
  unlock_queue(pque, __func__, (char *)NULL, 0);
  pthread_mutex_unlock(reroute_job_mutex);
  return(NULL);
  } /* END queue_route() */
Exemple #25
0
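
/*
 * post_delete_mom1 - work task run when MOM replies to the first delete signal.
 * If MOM rejected the signal the job is purged (PBSE_UNKJOBID) or the client
 * request is rejected; otherwise the client is acked and the second delete
 * phase (post_delete_mom2) is scheduled after the applicable kill_delay.
 */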
static void post_delete_mom1(

  struct work_task *pwt)

  {
  int                   delay = 0;
  int                   dellen = strlen(deldelaystr);
  job                  *pjob;

  pbs_queue            *pque;

  char                 *preq_clt_id;
  struct batch_request *preq_sig;         /* signal request to MOM */

  struct batch_request *preq_clt = NULL;  /* original client request */
  int                   rc;
  time_t                time_now = time(NULL);

  preq_sig = get_remove_batch_request((char *)pwt->wt_parm1);
  
  free(pwt->wt_mutex);
  free(pwt);

  if (preq_sig == NULL)
    return;

  rc          = preq_sig->rq_reply.brp_code;
  preq_clt_id = preq_sig->rq_extra;

  free_br(preq_sig);

  if (preq_clt_id != NULL)
    {
    preq_clt = get_remove_batch_request(preq_clt_id);
    free(preq_clt_id);
    }

  /* the client request has been handled another way, nothing left to do */
  if (preq_clt == NULL)
    return;

  pjob = svr_find_job(preq_clt->rq_ind.rq_delete.rq_objname, FALSE);

  if (pjob == NULL)
    {
    /* job has gone away */
    req_reject(PBSE_UNKJOBID, 0, preq_clt, NULL, NULL);

    return;
    }

  if (rc)
    {
    /* mom rejected request */

    if (rc == PBSE_UNKJOBID)
      {
      /* MOM claims no knowledge, so just purge it */
      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        "MOM rejected signal during delete");

      /* remove the resources assigned to the job */

      free_nodes(pjob);

      set_resc_assigned(pjob, DECR);

      svr_job_purge(pjob);

      reply_ack(preq_clt);
      }
    else
      {
      req_reject(rc, 0, preq_clt, NULL, NULL);

      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
      }

    return;
    }

  if (preq_clt->rq_extend)
    {
    if (strncmp(preq_clt->rq_extend, deldelaystr, dellen) == 0)
      {
      delay = atoi(preq_clt->rq_extend + dellen);
      }
    }

  reply_ack(preq_clt);  /* don't need it, reply now */

  /*
   * if no delay specified in original request, see if kill_delay
   * queue attribute is set.
   */
  if (delay == 0)
    {
    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      pthread_mutex_lock(server.sv_attr_mutex);
      delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay],
                             &server.sv_attr[SRV_ATR_KillDelay],
                             2);
      pthread_mutex_unlock(server.sv_attr_mutex);
      unlock_queue(pque, __func__, NULL, LOGLEVEL);
      }
    else if (pjob == NULL)
      return;
    }

  set_task(WORK_Timed, delay + time_now, post_delete_mom2, strdup(pjob->ji_qs.ji_jobid), FALSE);

  /*
   * Since the first signal has succeeded, let's reschedule the
   * nanny to be 1 minute after the second phase.
   */
  apply_job_delete_nanny(pjob, time_now + delay + 60);

  unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
  }  /* END post_delete_mom1() */
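
/*
 * get_parent_dest_queues - look up and lock both the job's current (parent)
 * queue and the destination queue by name.  The job mutex and the caller's
 * parent queue lock are dropped while the queue table is searched; on success
 * (PBSE_NONE) *parent and *dest are returned locked and the job has been
 * re-acquired, otherwise -1 is returned (and *pjob_ptr may be NULL).
 */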
int get_parent_dest_queues(

  char       *queue_parent_name,
  char       *queue_dest_name,
  pbs_queue **parent,
  pbs_queue **dest,
  job       **pjob_ptr)

  {
  pbs_queue *pque_parent;
  pbs_queue *pque_dest;
  char       jobid[PBS_MAXSVRJOBID + 1];
  char       log_buf[LOCAL_LOG_BUF_SIZE + 1];
  job       *pjob = *pjob_ptr;
  int        index_parent;
  int        index_dest;
  int        rc = PBSE_NONE;

  strcpy(jobid, pjob->ji_qs.ji_jobid);

  if ((queue_parent_name != NULL) && (queue_dest_name != NULL))
    {
    if (!strcmp(queue_parent_name, queue_dest_name))
      {
      /* parent and destination are the same.
         Job is already in the destination queue; return */
      snprintf(log_buf, sizeof(log_buf), "parent and destination queues are the same: parent %s - dest %s. jobid: %s",
          queue_parent_name,
          queue_dest_name,
          jobid);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
      return(-1); 
      }
    }
  else
    return(-1);

  unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL);

  unlock_queue(*parent, __func__, (char *)NULL, 0);

  *parent = NULL;
  *dest   = NULL;

  pthread_mutex_lock(svr_queues.allques_mutex);

  index_parent = get_value_hash(svr_queues.ht, queue_parent_name);
  index_dest   = get_value_hash(svr_queues.ht, queue_dest_name);

  if ((index_parent < 0) ||
      (index_dest < 0))
    {
    rc = -1;
    }
  else
    {
    /* good path */
    pque_parent = svr_queues.ra->slots[index_parent].item;
    pque_dest   = svr_queues.ra->slots[index_dest].item;

    if ((pque_parent == NULL) ||
        (pque_dest == NULL))
      {
      rc = -1;
      }
    else
      {
      /* SUCCESS! */
      lock_queue(pque_parent, __func__, (char *)NULL, 0);
      lock_queue(pque_dest,   __func__, (char *)NULL, 0);
      *parent = pque_parent;
      *dest = pque_dest;

      rc = PBSE_NONE;
      }
    }

  pthread_mutex_unlock(svr_queues.allques_mutex);

  if ((*pjob_ptr = svr_find_job(jobid, TRUE)) == NULL)
    rc = -1;

  return(rc);
  } /* END get_parent_dest_queues() */