Example #1

START_TEST(test_one)
  {
  int rc;
  initialize_all_tasks_array(&task_list_event);
  initialize_task_recycler();

  if (task_list_timed == NULL)
    task_list_timed = new std::list<timed_task>();

  rc = initialize_threadpool(&request_pool, 5, 50, 60);
  fail_unless(rc == PBSE_NONE, "initialize_threadpool failed: rc = %d", rc);

  struct work_task *pWorkTask = set_task(WORK_Timed,357,check_nodes,NULL,0);
  fail_unless(pWorkTask != NULL);
  struct work_task *pWorkTask2 = set_task(WORK_Timed,356,check_nodes,NULL,0);
  fail_unless(pWorkTask2 != NULL);
  struct work_task *pWorkTask3 = set_task(WORK_Timed,358,check_nodes,NULL,0);
  fail_unless(pWorkTask3 != NULL);

  rc = dispatch_task(pWorkTask);
  fail_unless(rc == PBSE_NONE, "dispatch_task failed: rc = %d", rc);
  delete_task(pWorkTask);

  int iter = -1;
  struct work_task *pRecycled = next_task_from_recycler(&tr.tasks,&iter);
  fprintf(stderr,"%p %p\n",(void *)pWorkTask,(void *)pRecycled);
  fail_unless(pRecycled == pWorkTask);
  fail_unless(task_is_in_threadpool(pWorkTask2));
  }
END_TEST
Example #2
acl_int64 aio_timer_callback::set_task(unsigned int id, acl_int64 delay)
{
	aio_timer_task* task = NULL;
	std::list<aio_timer_task*>::iterator it = tasks_.begin();
	for (; it != tasks_.end(); ++it)
	{
		if ((*it)->id == id)
		{
			task = (*it);
			tasks_.erase(it);
			length_--;
			break;
		}
	}

	if (task == NULL)
	{
		task = NEW aio_timer_task();
		task->delay = delay;
		task->id = id;
	}
	else
		task->delay = delay;

	return set_task(task);
}
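The lookup-erase-update-reinsert sequence above is the usual "upsert" for keyed timer lists: reuse the node when the id already exists, otherwise allocate a fresh one. A minimal standalone sketch of the same pattern (the type and function names here are illustrative, not acl's):

#include <cstddef>
#include <cstdint>
#include <list>

struct timer_task {
	unsigned int id;
	int64_t      delay;
};

// Reset the delay of task `id`, allocating the task if it is not present.
// The caller reinserts the returned node at the proper deadline position.
static timer_task* upsert_task(std::list<timer_task*>& tasks,
	unsigned int id, int64_t delay)
{
	timer_task* task = NULL;

	// linear scan: timer lists are short and kept ordered by deadline
	for (std::list<timer_task*>::iterator it = tasks.begin();
		it != tasks.end(); ++it)
	{
		if ((*it)->id == id)
		{
			task = *it;
			tasks.erase(it);   // unlink so the node can be re-slotted
			break;
		}
	}

	if (task == NULL)
		task = new timer_task();

	task->id = id;
	task->delay = delay;
	return task;
}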
Example #3
void chkpt_xfr_hold(

  batch_request *preq,
  job           *pjob)

  {
  char   log_buf[LOCAL_LOG_BUF_SIZE];

  if ((preq == NULL) ||
      (preq->rq_extra == NULL) ||
      (pjob == NULL))
    return;

  if (LOGLEVEL >= 7)
    {
    sprintf(log_buf,
      "BLCR copy completed (state is %s-%s)",
      PJobState[pjob->ji_qs.ji_state],
      PJobSubState[pjob->ji_qs.ji_substate]);

    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
    }
  
  free_br(preq);

  set_task(WORK_Immed, 0, mom_cleanup_checkpoint_hold, strdup(pjob->ji_qs.ji_jobid), FALSE);

  return;
  }  /* END chkpt_xfr_hold() */
Example #4
/**
 * @brief
 * 		cnvrt_delete - delete the reservation if it has had no reservation job for 10 minutes
 *
 * @param[in,out]	ptask	-	work task structure which contains the reservation
 */
void
cnvrt_delete(struct work_task *ptask)
{
	int flag = FALSE;
	resc_resv *ptmp, *presv;
	struct work_task *wt;

	ptmp = (resc_resv *)ptask->wt_parm1;
	presv = (resc_resv *)GET_NEXT(svr_allresvs);
	if (presv == NULL || ptmp == NULL) return;

	while (presv) {
		if ((presv->ri_wattr[(int)RESV_ATR_convert].at_val.at_str != NULL) &&
			(ptmp->ri_wattr[(int)RESV_ATR_convert].at_val.at_str != NULL)) {
			if (strcmp(presv->ri_wattr[(int)RESV_ATR_convert].at_val.at_str,
				ptmp->ri_wattr[(int)RESV_ATR_convert].at_val.at_str) == 0) {
				flag = TRUE;
				break;
			}
		}
		presv = (resc_resv *)GET_NEXT(presv->ri_allresvs);
	}

	if (presv == NULL && flag == FALSE) return;

	if (flag == TRUE  &&  ptmp->ri_qp->qu_numjobs == 0) {
		gen_future_deleteResv(ptmp, 10);
		return;
	}

	wt = set_task(WORK_Timed, (time_now + 600), cnvrt_delete, ptmp);
	append_link(&presv->ri_svrtask, &wt->wt_linkobj, wt);
}
Example #5
void chkpt_xfr_hold(

  struct work_task *ptask)

  {
  job       *pjob;
  struct work_task *ptasknew;

  struct batch_request *preq;

  preq = (struct batch_request *)ptask->wt_parm1;
  pjob = (job *)preq->rq_extra;

  if (LOGLEVEL >= 7)
    {
    sprintf(log_buffer,
      "BLCR copy completed (state is %s-%s)",
      PJobState[pjob->ji_qs.ji_state],
      PJobSubState[pjob->ji_qs.ji_substate]);
    LOG_EVENT(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);
    }
  
  release_req(ptask);

  ptasknew = set_task(WORK_Immed, 0, mom_cleanup_checkpoint_hold, (void*)pjob);

  return;
  }  /* END chkpt_xfr_hold() */
Example #6
/**
 *	@brief
 *		add a task to the mom's deferred command list
 *		of commands issued to the server
 *
 *		Used only in case of TPP
 *
 * @param[in] stream - stream on which the command is being sent
 * @param[in] minfo  - The mominfo_t pointer for the mom
 * @param[in] func   - Callback func invoked when the mom responds
 * @param[in] msgid  - Unique string identifying this command among others
 * @param[in] parm1  - First parameter to the work task to be set
 * @param[in] parm2  - Second parameter to the work task to be set
 *
 * @return Work task structure that was allocated and added to the mom's deferred cmd list
 * @retval NULL  - Failure
 * @retval !NULL - Success
 *
 */
struct work_task *
add_mom_deferred_list(int stream, mominfo_t *minfo, void (*func)(), char *msgid, void *parm1, void *parm2)
{
	struct work_task *ptask = NULL;

	/* WORK_Deferred_cmd is very similar to WORK_Deferred_reply.
	 * However in case of WORK_Deferred_reply, the wt_parm1 is assumed to
	 * contain a batch_request structure. In cases where there is no
	 * batch_request structure associated, we use the WORK_Deferred_cmd
	 * event type to differentiate it in process_DreplyRPP.
	 */
	ptask = set_task(WORK_Deferred_cmd, (long) stream, func, parm1);
	if (ptask == NULL) {
		log_err(errno, __func__, "could not set_task");
		return NULL;
	}
	ptask->wt_aux2 = 1; /* set to rpp */
	ptask->wt_parm2 = parm2;
	ptask->wt_event2 = msgid;

	/* remove this task from the event list, as we will be adding to deferred list anyway
	 * and there is no child process whose exit needs to be reaped
	 */
	delete_link(&ptask->wt_linkall);

	/* append to the mom's deferred command list */
	append_link(&(((mom_svrinfo_t *) (minfo->mi_data))->msr_deferred_cmds), &ptask->wt_linkobj2, ptask);
	return ptask;
}
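Note how delete_link/append_link move the already-allocated task between intrusive lists instead of copying it. A toy sketch of that node-moving discipline, assuming a circular doubly-linked list (illustrative stand-ins, not PBS's real tlist implementation, whose append_link also carries an object pointer):

// Toy intrusive link shaped like the pattern above.
struct link_node {
	link_node* prev;
	link_node* next;
};

static void init_link(link_node* n)
{
	n->prev = n->next = n;       // self-linked == "on no list"
}

static void delete_link(link_node* n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
	n->prev = n->next = n;       // safe to delete_link() again later
}

static void append_link(link_node* head, link_node* n)
{
	delete_link(n);              // moving a node never duplicates it
	n->prev = head->prev;
	n->next = head;
	head->prev->next = n;
	head->prev = n;
}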
Example #7
acl_int64 event_timer::set_task(unsigned int id, acl_int64 delay)
{
	if (delay < 0)
	{
		logger_error("invalid task, id: %u, delay: %lld", id, delay);
		return -1;
	}

	event_task* task = NULL;
	std::list<event_task*>::iterator it = tasks_.begin();
	for (; it != tasks_.end(); ++it)
	{
		if ((*it)->id == id)
		{
			task = (*it);
			tasks_.erase(it);
			length_--;
			break;
		}
	}

	if (task == NULL)
	{
		task = NEW event_task();
		task->delay = delay;
		task->id = id;
	}
	else
		task->delay = delay;

	return set_task(task);
}
Example #8
int main(void)
{	
	W5100_RESET_DDR |= (1 << W5100_RESET);
	W5100_RESET_PORT |= (1 << W5100_RESET);
	
	_delay_ms(100);
	w5100.hard_reset();
	_delay_ms(100);
	
	Spi.init();
	UART.init();
	
	_delay_ms(100);
	w5100.soft_reset();
	_delay_ms(100);
	
	w5100.set_mac(DEV_MAC_5, DEV_MAC_4, DEV_MAC_3, DEV_MAC_2, DEV_MAC_1, DEV_MAC_0);
	w5100.set_ip(DEV_IP_3, DEV_IP_2, DEV_IP_1, DEV_IP_0);
	w5100.set_mask(NET_MASK_3, NET_MASK_2, NET_MASK_1, NET_MASK_0);
	w5100.set_gateway(NET_GW_3, NET_GW_2, NET_GW_1, NET_GW_0);
	w5100.set_other_options();
	
	UART.write_str("w5100 settings success\n\r");
	
	socket0.open_tcp(LOCAL_SERVER_PORT);
	
	socket0.listen();
	
	init_dispatcher();
	run_dispatcher();
	
	device_init();
	
	set_task(task_socket0_poll, 0);
	set_task(task_socket1_poll, 0);
	set_timer_task(task_heartbeat, 0, HEARTBEAT_INTERVAL_MS);
	set_timer_task(task_outcoming_requests_queue_service, 0, REQUESTS_QUEUE_SERVICE_INTERVAL_MS);
	sprintf(large_txt_buf, "start s0 and s1 polling\r\n");
	UART.write_str(large_txt_buf);
	
	while(1)
	{
		task_manager();
	}
	
}
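The main() above follows the classic embedded pattern: initialize peripherals, seed the ready queue with set_task, then spin in task_manager. A minimal sketch of a run-to-completion scheduler of that shape, assuming a fixed-size ring of function pointers (the names and the single-argument signature are illustrative):

#include <stdint.h>

#define READY_QUEUE_SIZE 8

typedef void (*task_fn)(void);

static task_fn ready_queue[READY_QUEUE_SIZE];
static volatile uint8_t rq_head, rq_tail;

/* enqueue a task for the main loop; on AVR this would need an
 * interrupt-disable section if called from ISRs (omitted here) */
static void set_task_sketch(task_fn fn)
{
	uint8_t next = (uint8_t)((rq_tail + 1u) % READY_QUEUE_SIZE);
	if (next != rq_head) {        /* drop the task if the queue is full */
		ready_queue[rq_tail] = fn;
		rq_tail = next;
	}
}

/* run every queued task to completion; called from the main loop */
static void task_manager_sketch(void)
{
	while (rq_head != rq_tail) {
		task_fn fn = ready_queue[rq_head];
		rq_head = (uint8_t)((rq_head + 1u) % READY_QUEUE_SIZE);
		fn();
	}
}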
Example #9
int main(void){

    init_devices();
    //
    //  start at least one task here
    //
    set_task(7);    //task7 runs
    set_task(6);    //task6 runs
    //      main loop 
    while(1){
        if (tick_flag){
            tick_flag = 0;
            task_dispatch();              // well.... 
        }
    }
    return 0;
}
Example #10
int process_task(unsigned char *taskdata)
{
	int task,del;

	task=*(unsigned int*)(taskdata);

	switch(task) {
		case 0:		xlog("task test"); del=1; break;
		case 1:		del=set_task((void*)(taskdata),(int (*)(void*,struct character *))set_clan_rank); break;
		case 2:		del=set_task((void*)(taskdata),(int (*)(void*,struct character *))fire_from_clan); break;
		case 5:		del=set_task((void*)(taskdata),(int (*)(void*,struct character *))set_flags); break;

		default:	elog("deleting unknown task %d",task); del=1; break;
	}

	return del;
}
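The switch in process_task is a hand-written dispatch table keyed by the task id decoded from the buffer. When the ids are small and dense, the same mapping fits in an array of handlers; a self-contained sketch with stub handlers (everything here is illustrative):

#include <stdio.h>

struct character;                 /* opaque, as in the example above */
typedef int (*task_handler)(void* data, struct character* ch);

/* stub handlers standing in for the real ones */
static int set_clan_rank_stub(void* d, struct character* c)  { (void)d; (void)c; return 1; }
static int fire_from_clan_stub(void* d, struct character* c) { (void)d; (void)c; return 1; }
static int set_flags_stub(void* d, struct character* c)      { (void)d; (void)c; return 1; }

/* index == task id; a NULL slot means "unknown task, delete it" */
static const task_handler task_handlers[] = {
	NULL, set_clan_rank_stub, fire_from_clan_stub, NULL, NULL, set_flags_stub,
};

static int dispatch_task_id(unsigned task, void* taskdata, struct character* ch)
{
	if (task < sizeof(task_handlers) / sizeof(task_handlers[0]) &&
			task_handlers[task] != NULL)
		return task_handlers[task](taskdata, ch);

	fprintf(stderr, "deleting unknown task %u\n", task);
	return 1;                     /* matches the switch's del=1 fallback */
}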
Example #11
acl_int64 event_timer::trigger(void)
{
	// sanity check
	if (tasks_.empty())
		return TIMER_EMPTY;

	acl_assert(length_ > 0);

	set_time();

	std::list<event_task*> tasks;

	// pull the timer tasks whose deadline has arrived out of the timer
	for (std::list<event_task*>::iterator it = tasks_.begin();
		it != tasks_.end();)
	{
		if ((*it)->when > present_)
			break;

		tasks.push_back(*it);
		it = tasks_.erase(it);
		length_--;
	}

	if (tasks.empty())
	{
		acl_assert(!tasks_.empty());

		event_task* first = tasks_.front();
		acl_int64 delay = first->when - present_;
		return delay < 0 ? 0 : delay;
	}

	for (std::list<event_task*>::iterator it = tasks.begin();
		it != tasks.end(); ++it)
	{
		set_task(*it);
		// call the subclass virtual function to trigger the timer task
		timer_callback((*it)->id);
	}

	tasks.clear();

	// the subclass may have removed all timer tasks inside timer_callback
	if (tasks_.empty())
		return TIMER_EMPTY;

	event_task* first = tasks_.front();
	acl_int64 delay = first->when - present_;

	if (delay < 0)
		return 0;
	else if (delay > first->delay)  /* xxx */
		return first->delay;
	else
		return delay;
}
Example #12
acl_int64 event_timer::trigger(void)
{
	// sanity check
	if (tasks_.empty())
		return TIMER_EMPTY;

	acl_assert(length_ > 0);

	set_time();

	std::list<event_task*>::iterator it, next;
	std::list<event_task*> tasks;
	event_task* task;

	// pull the timer tasks whose deadline has arrived out of the timer
	for (it = tasks_.begin(); it != tasks_.end(); it = next)
	{
		if ((*it)->when > present_)
			break;
		next = it;
		++next;
		task = *it;
		tasks_.erase(it);
		length_--;
		tasks.push_back(task);
	}

	// the expired tasks may already have been removed by the user
	if (tasks.empty())
	{
		acl_assert(!tasks_.empty());

		event_task* first = tasks_.front();
		acl_int64 delay = first->when - present_;
		return delay < 0 ? 0 : delay;
	}

	for (it = tasks.begin(); it != tasks.end(); ++it)
	{
		set_task(*it);
		// call the subclass virtual function to trigger the timer task
		timer_callback((*it)->id);
	}

	tasks.clear();

	// the subclass may have removed all timer tasks inside timer_callback
	if (tasks_.empty())
		return TIMER_EMPTY;

	event_task* first = tasks_.front();
	acl_int64 delay = first->when - present_;

	return delay < 0 ? 0 : delay;
}
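Examples #11 and #12 are the same trigger() written two ways: #12 remembers `next` before erasing, while #11 leans on std::list::erase returning the iterator that follows the erased element. The erase-return idiom in isolation:

#include <list>

// Move every element whose deadline is <= `now` from `pending` to `expired`.
static void drain_expired(std::list<long>& pending,
	std::list<long>& expired, long now)
{
	for (std::list<long>::iterator it = pending.begin();
		it != pending.end();)
	{
		if (*it > now)
			break;              // list is kept sorted, as in trigger()
		expired.push_back(*it);
		it = pending.erase(it); // erase() hands back the next iterator
	}
}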
Example #13
int que_to_local_svr(struct batch_request *preq)                     /* I */
{
    preq->rq_fromsvr = 1;
    preq->rq_perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR | ATR_DFLAG_SvWR;

    if (preq->rq_id == NULL)
        get_batch_request_id(preq);

    set_task(WORK_Immed, 0, reissue_to_svr, preq->rq_id, TRUE);
    return(PBSE_NONE);
}  /* END que_to_local_svr() */
Example #14
/**
 * poll_job_task
 *
 * The invocation of this routine is triggered from
 * the pbs_server main_loop code.  The check of
 * SRV_ATR_PollJobs appears to be redundant.
 */
void poll_job_task(

  struct work_task *ptask)

  {
  char      *job_id = (char *)ptask->wt_parm1;
  job       *pjob;
  time_t     time_now = time(NULL);
  long       poll_jobs = 0;
  int        job_state = -1;

  if (job_id != NULL)
    {
    pjob  = svr_find_job(job_id, FALSE);
    
    if (pjob != NULL)
      {
      mutex_mgr job_mutex(pjob->ji_mutex, true);

      job_state = pjob->ji_qs.ji_state;
      job_mutex.unlock();

      get_svr_attr_l(SRV_ATR_PollJobs, &poll_jobs);
      if ((poll_jobs) && (job_state == JOB_STATE_RUNNING))
        {
        /* we need to throttle the number of outstanding threads
           doing job polling. This prevents a problem where pbs_server
           gets hung waiting on I/O from the mom */
        pthread_mutex_lock(poll_job_task_mutex);
        if (current_poll_job_tasks < max_poll_job_tasks)
          {
          current_poll_job_tasks++;
          pthread_mutex_unlock(poll_job_task_mutex);

          stat_mom_job(job_id);

          pthread_mutex_lock(poll_job_task_mutex);
          current_poll_job_tasks--;
          }
        pthread_mutex_unlock(poll_job_task_mutex);

        
        /* add another task */
        set_task(WORK_Timed, time_now + JobStatRate, poll_job_task, strdup(job_id), FALSE);
        }
      }
      
    free(job_id);
    }

  free(ptask->wt_mutex);
  free(ptask);
  }  /* END poll_job_task() */
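The current_poll_job_tasks counter above acts as an admission gate: at most max_poll_job_tasks callers run stat_mom_job at once, and a caller finding the gate full skips the poll instead of blocking. The same pattern in isolation (a sketch, not TORQUE code):

#include <pthread.h>

static pthread_mutex_t gate_mutex = PTHREAD_MUTEX_INITIALIZER;
static int             gate_current = 0;
static const int       gate_max = 16;

/* Run fn() only if fewer than gate_max callers are already inside.
 * Returns 1 if fn ran, 0 if the gate was full and the call was skipped. */
static int run_gated(void (*fn)(void))
  {
  pthread_mutex_lock(&gate_mutex);

  if (gate_current >= gate_max)
    {
    pthread_mutex_unlock(&gate_mutex);
    return(0);
    }

  gate_current++;
  pthread_mutex_unlock(&gate_mutex);

  fn();  /* the slow part runs outside the lock */

  pthread_mutex_lock(&gate_mutex);
  gate_current--;
  pthread_mutex_unlock(&gate_mutex);

  return(1);
  }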
Example #15
int
issue_to_svr(char *servern, struct batch_request *preq, void (*replyfunc)(struct work_task *))
{
	int	  do_retry = 0;
	int	  handle;
	pbs_net_t svraddr;
	char	 *svrname;
	unsigned int  port = pbs_server_port_dis;
	struct work_task *pwt;
	extern int pbs_failover_active;
	extern char primary_host[];
	extern char server_host[];

	(void)strcpy(preq->rq_host, servern);
	preq->rq_fromsvr = 1;
	preq->rq_perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR | ATR_DFLAG_SvWR;
	svrname = parse_servername(servern, &port);

	if ((pbs_failover_active != 0) && (svrname != NULL)) {
		/* we are the active secondary server in a failover config    */
		/* if the message is going to the primary,then redirect to me */
		size_t len;

		len = strlen(svrname);
		if (strncasecmp(svrname, primary_host, len) == 0) {
			if ((primary_host[(int)len] == '\0') ||
				(primary_host[(int)len] == '.'))
				svrname = server_host;
		}
	}
	svraddr = get_hostaddr(svrname);
	if (svraddr == (pbs_net_t)0) {
		if (pbs_errno == PBS_NET_RC_RETRY)
			/* Non fatal error - retry */
			do_retry = 1;
	} else {
		handle = svr_connect(svraddr, port, process_Dreply, ToServerDIS, PROT_TCP);
		if (handle >= 0)
			return (issue_Drequest(handle, preq, replyfunc, 0, 0));
		else if (handle == PBS_NET_RC_RETRY)
			do_retry = 1;
	}

	/* if we reached here it didn't go; do we retry? */

	if (do_retry) {
		pwt = set_task(WORK_Timed, (long)(time_now+(2*PBS_NET_RETRY_TIME)),
			reissue_to_svr, (void *)preq);
		pwt->wt_parm2 = (void *)replyfunc;
		return (0);
	} else
		return (-1);
}
Example #16
void kernel_main()
{

	int i = 0xa0000;
	for( ; i <= 0xaffff; ++i ){
		asm_write_mem( i, 1 );
	}

	// Delete boot sector and temporary data.
	for( i = 0x1000; i <= 0x9fc00; ++i ){
		asm_write_mem( i, 0 );
	}
	
	init_palette();
	i = 0xa0000;
	for( ; i <= 0xaffff; ++i ){
		asm_write_mem( i, 0 );
	}
	
	asm_disable_intr();
	InitGDT();
	InitIDT();
	InitPIC();
	asm_enable_intr();


	set_task( 1, task_a, stack[ 0 ] + 2048 );
	set_task( 2, task_b, stack[ 1 ] + 2048 );


	switch_task_2( TASK_INFO_ADDR, TASK_INFO_ADDR + sizeof( TaskInfo ) );

	for(;;){
		print_str( 0, 20, "fin" );
		asm_halt();
	}

	asm_halt();
}
Example #17
void track_save(

  struct work_task *pwt)  /* unused */

  {
  int        fd;
  char      *myid = "save_track";
  time_t     time_now = time(NULL);
  work_task *wt;

  /* set task for next round trip */

  if (pwt)    /* set up another work task for next time period */
    {
    free(pwt->wt_mutex);
    free(pwt);

    wt = set_task(WORK_Timed, (long)time_now + PBS_SAVE_TRACK_TM, track_save, NULL, FALSE);

    if (wt == NULL)
      log_err(errno, myid, "Unable to set task for save");
    }

  if (server.sv_trackmodifed == 0)
    return;   /* nothing to do this time */

  fd = open(path_track, O_WRONLY, 0);

  if (fd < 0)
    {
    log_err(errno, myid, "Unable to open tracking file");
    return;
    }

  if (write(fd, (char *)server.sv_track, server.sv_tracksize * sizeof(struct tracking)) !=
      (ssize_t)(server.sv_tracksize * sizeof(struct tracking)))
    {
    log_err(errno, myid, "failed to write to track file");
    }

  if (close(fd) < 0)
    {
    log_err(errno, myid, "failed to close track file after saving");

    return;
    }

  server.sv_trackmodifed = 0;

  return;
  }
Example #18
struct work_task *apply_job_delete_nanny(

  struct job *pjob,
  int         delay)  /* I */

  {
  struct work_task *pwtnew;
  enum work_type tasktype;

  /* short-circuit if nanny isn't enabled */

  if (!server.sv_attr[SRV_ATR_JobNanny].at_val.at_long)
    {
    remove_job_delete_nanny(pjob); /* in case it was recently disabled */

    return(NULL);
    }

  if (delay == 0)
    {
    tasktype = WORK_Immed;
    }
  else if (delay > 0)
    {
    tasktype = WORK_Timed;
    }
  else
    {
    log_err(-1, "apply_job_delete_nanny", "negative delay requested for nanny");

    return(NULL);
    }

  /* first, surgically remove any existing nanny tasks */

  remove_job_delete_nanny(pjob);

  /* second, add a nanny task at the requested time */

  pwtnew = set_task(tasktype, delay, job_delete_nanny, (void *)pjob);

  if (pwtnew)
    {
    /* insure that work task will be removed if job goes away */

    append_link(&pjob->ji_svrtask, &pwtnew->wt_linkobj, pwtnew);
    }

  return(pwtnew);
  } /* END apply_job_delete_nanny() */
Example #19
GList *get_values(int num_tasks){

	int i;
	int period, wcet;
	GList *list = NULL;

	for(i = 0; i < num_tasks; i++){
		printf("Digite o Period e o Wcet da Tarefa %d:\t", i+1);
		scanf("%d %d", &period, &wcet );
		list = g_list_append(list, set_task(period, wcet, 0));
	}

	return list;

}
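get_values relies on a set_task(period, wcet, deadline) constructor that heap-allocates one record per task for the GList. A plausible sketch of that constructor and of releasing the list afterwards; the task_t layout is an assumption, not taken from the original:

#include <glib.h>
#include <stdlib.h>

typedef struct {
	int period;
	int wcet;
	int deadline;
} task_t;

/* assumed shape of set_task(): allocate and fill one task record */
static task_t *set_task_sketch(int period, int wcet, int deadline)
{
	task_t *t = (task_t *)malloc(sizeof(task_t));
	if (t == NULL)
		return NULL;
	t->period   = period;
	t->wcet     = wcet;
	t->deadline = deadline;
	return t;
}

/* usage: the caller owns the list and every payload in it
 *
 *   GList *list = NULL;
 *   list = g_list_append(list, set_task_sketch(10, 2, 0));
 *   ...
 *   g_list_free_full(list, free);
 */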
Example #20
/**
 * poll_job_task
 *
 * The invocation of this routine is triggered from
 * the pbs_server main_loop code.
 */
void poll_job_task(

  struct work_task *ptask)

  {
  char      *job_id = (char *)ptask->wt_parm1;
  job       *pjob;
  time_t     time_now = time(NULL);
  long       poll_jobs = 0;
  long       job_stat_rate;

  free(ptask->wt_mutex);
  free(ptask);

  if (job_id != NULL)
    {
    pjob  = svr_find_job(job_id, FALSE);
    
    if (pjob != NULL)
      {
      mutex_mgr job_mutex(pjob->ji_mutex, true);
      int       job_state = -1;

      job_state = pjob->ji_qs.ji_state;

      // only do things for running jobs
      if (job_state == JOB_STATE_RUNNING)
        {
        job_mutex.unlock();

        get_svr_attr_l(SRV_ATR_JobStatRate, &job_stat_rate);

        if (time(NULL) - pjob->ji_last_reported_time > job_stat_rate)
          {
          get_svr_attr_l(SRV_ATR_PollJobs, &poll_jobs);
          if (poll_jobs)
            stat_mom_job(job_id);
          }

        /* add another task */
        set_task(WORK_Timed, time_now + (job_stat_rate / 3), poll_job_task, strdup(job_id), FALSE);
        }
      }
      
    free(job_id);
    }
  }  /* END poll_job_task() */
Example #21
void queue_a_retry_task(

    batch_request *preq,                           /* I */
    void         (*replyfunc)(struct work_task *)) /* I */

{
    /* create a new batch_request because preq is going to be freed when issue_to_svr returns success */
    batch_request    *new_preq = duplicate_request(preq, -1);
    struct work_task *pwt;

    get_batch_request_id(new_preq);

    pwt = set_task(WORK_Timed, (time(NULL) + PBS_NET_RETRY_TIME), reissue_to_svr, new_preq->rq_id, TRUE);

    pwt->wt_parmfunc = replyfunc;

    pthread_mutex_unlock(pwt->wt_mutex);
} /* END queue_a_retry_task() */
Example #22
struct psi_process *
psi_arch_process(const pid_t pid)
{
    struct kinfo_proc p;
    struct psi_process *proci;

    if (get_kinfo_proc(pid, &p) == -1) {
        return NULL;
    }

    proci = psi_calloc(sizeof(struct psi_process));
    if (proci == NULL) {
        return NULL;
    }

    if (set_exe(proci, &p) == -1) goto cleanup;
    if (set_cwd(proci, &p) == -1) goto cleanup;
    if (set_kp_proc(proci, &p) == -1) goto cleanup;
    if (set_kp_eproc(proci, &p) == -1) goto cleanup;
    if (set_task(proci, &p) == -1) goto cleanup;

    if (proci->utime_status == PSI_STATUS_PRIVS ||
                proci->stime_status == PSI_STATUS_PRIVS)
        proci->cputime_status = PSI_STATUS_PRIVS;
    else {
        proci->cputime = calc_cputime(proci->utime, proci->stime);
        proci->cputime_status = PSI_STATUS_OK;
    }

    if (proci->command_status == PSI_STATUS_PRIVS) {
        /* Ensure Process.command always has a value, as per our
         * contract with the user.
         */
        proci->command = psi_strdup("");
        proci->command_status = PSI_STATUS_OK;
    }
    
    return proci;

  cleanup:
    psi_free_process(proci);
    return NULL;
}
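psi_arch_process is a textbook instance of the C goto-cleanup idiom: allocate once, run a chain of fallible setters, and route every failure through a single free path. Its skeleton, with illustrative names:

#include <stdlib.h>

struct resource { int a; int b; };

static int init_a(struct resource *r) { r->a = 1; return 0; }
static int init_b(struct resource *r) { r->b = 2; return 0; }

static struct resource *
make_resource(void)
{
    struct resource *r = (struct resource *)calloc(1, sizeof(*r));
    if (r == NULL) {
        return NULL;
    }

    /* every fallible step funnels into the same exit path */
    if (init_a(r) == -1) goto cleanup;
    if (init_b(r) == -1) goto cleanup;

    return r;

  cleanup:
    free(r);
    return NULL;
}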
Example #23
// Runs in the timer interrupt
inline uint8_t timer_manager(void) {
	uint8_t index;

	for (index = 0; index < MAIN_TIMER_QUEUE_SIZE; index++) {
		if (TimerQueue[index].task_ptr == NULL)
			continue;

		if (TimerQueue[index].task_time_elapsed > 0) {
			TimerQueue[index].task_time_elapsed--;
		} else {
			set_task(TimerQueue[index].task_ptr);

			if (TimerQueue[index].flags & FLAG_PERSISTENT)
				TimerQueue[index].task_time_elapsed = TimerQueue[index].task_time;
			else
				TimerQueue[index].task_ptr = NULL;
		}
	}

	return E_OK;
}
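timer_manager expects a producer, something like set_timer_task, to have filled TimerQueue with a task pointer, a countdown, a reload value, and flags. A sketch of that producer under the same assumed layout (the struct and flag names follow the example; the rest is illustrative):

#include <stddef.h>
#include <stdint.h>

#define MAIN_TIMER_QUEUE_SIZE 8
#define FLAG_PERSISTENT 0x01
#define E_OK 0
#define E_QUEUE_FULL 1

typedef void (*task_ptr_t)(void);

struct timer_entry {
	task_ptr_t task_ptr;
	uint16_t   task_time;          /* reload value, in timer ticks */
	uint16_t   task_time_elapsed;  /* countdown decremented in the ISR */
	uint8_t    flags;
};

static struct timer_entry TimerQueue[MAIN_TIMER_QUEUE_SIZE];

/* arm `task` to fire after `ticks`; FLAG_PERSISTENT makes it periodic */
static uint8_t set_timer_task_sketch(task_ptr_t task, uint16_t ticks, uint8_t flags)
{
	uint8_t index;

	for (index = 0; index < MAIN_TIMER_QUEUE_SIZE; index++) {
		if (TimerQueue[index].task_ptr == NULL) {
			TimerQueue[index].task_time = ticks;
			TimerQueue[index].task_time_elapsed = ticks;
			TimerQueue[index].flags = flags;
			TimerQueue[index].task_ptr = task;  /* publish last: the ISR keys on task_ptr */
			return E_OK;
		}
	}

	return E_QUEUE_FULL;
}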
Example #24
void chkpt_xfr_hold(

  struct work_task *ptask)

  {
  job                  *pjob;

  struct batch_request *preq;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];

  preq = get_remove_batch_request(ptask->wt_parm1);

  free(ptask->wt_mutex);
  free(ptask);

  if ((preq == NULL) ||
      (preq->rq_extra == NULL))
    return;

  if ((pjob = svr_find_job(preq->rq_extra, FALSE)) == NULL)
    return;

  if (LOGLEVEL >= 7)
    {
    sprintf(log_buf,
      "BLCR copy completed (state is %s-%s)",
      PJobState[pjob->ji_qs.ji_state],
      PJobSubState[pjob->ji_qs.ji_substate]);

    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
    }
  
  free_br(preq);

  set_task(WORK_Immed, 0, mom_cleanup_checkpoint_hold, strdup(pjob->ji_qs.ji_jobid), FALSE);

  unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

  return;
  }  /* END chkpt_xfr_hold() */
Example #25
int apply_job_delete_nanny(

  struct job *pjob,
  int         delay)  /* I */

  {
  enum work_type    tasktype;
  long              nanny = FALSE;

  /* short-circuit if nanny isn't enabled or we have a delete nanny */
  get_svr_attr_l(SRV_ATR_JobNanny, &nanny);
  if ((nanny == FALSE) ||
      (pjob->ji_has_delete_nanny == TRUE))
    {
    return(PBSE_NONE);
    }

  if (delay == 0)
    {
    tasktype = WORK_Immed;
    }
  else if (delay > 0)
    {
    tasktype = WORK_Timed;
    }
  else
    {
    log_err(-1, __func__, "negative delay requested for nanny");

    return(-1);
    }

  pjob->ji_has_delete_nanny = TRUE;

  /* add a nanny task at the requested time */
  set_task(tasktype, delay, job_delete_nanny, strdup(pjob->ji_qs.ji_jobid), FALSE);

  return(PBSE_NONE);
  } /* END apply_job_delete_nanny() */
Example #26
static void post_delete_mom1(

  struct work_task *pwt)

  {
  int                   delay = 0;
  int                   dellen = strlen(deldelaystr);
  job                  *pjob;

  pbs_queue            *pque;

  char                 *preq_clt_id;
  struct batch_request *preq_sig;         /* signal request to MOM */

  struct batch_request *preq_clt = NULL;  /* original client request */
  int                   rc;
  time_t                time_now = time(NULL);

  preq_sig = get_remove_batch_request((char *)pwt->wt_parm1);
  
  free(pwt->wt_mutex);
  free(pwt);

  if (preq_sig == NULL)
    return;

  rc          = preq_sig->rq_reply.brp_code;
  preq_clt_id = preq_sig->rq_extra;

  free_br(preq_sig);

  if (preq_clt_id != NULL)
    {
    preq_clt = get_remove_batch_request(preq_clt_id);
    free(preq_clt_id);
    }

  /* the client request has been handled another way, nothing left to do */
  if (preq_clt == NULL)
    return;

  pjob = svr_find_job(preq_clt->rq_ind.rq_delete.rq_objname, FALSE);

  if (pjob == NULL)
    {
    /* job has gone away */
    req_reject(PBSE_UNKJOBID, 0, preq_clt, NULL, NULL);

    return;
    }

  if (rc)
    {
    /* mom rejected request */

    if (rc == PBSE_UNKJOBID)
      {
      /* MOM claims no knowledge, so just purge it */
      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        "MOM rejected signal during delete");

      /* remove the resources assigned to the job */

      free_nodes(pjob);

      set_resc_assigned(pjob, DECR);

      svr_job_purge(pjob);

      reply_ack(preq_clt);
      }
    else
      {
      req_reject(rc, 0, preq_clt, NULL, NULL);

      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
      }

    return;
    }

  if (preq_clt->rq_extend)
    {
    if (strncmp(preq_clt->rq_extend, deldelaystr, dellen) == 0)
      {
      delay = atoi(preq_clt->rq_extend + dellen);
      }
    }

  reply_ack(preq_clt);  /* don't need it, reply now */

  /*
   * if no delay specified in original request, see if kill_delay
   * queue attribute is set.
   */
  if (delay == 0)
    {
    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      pthread_mutex_lock(server.sv_attr_mutex);
      delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay],
                             &server.sv_attr[SRV_ATR_KillDelay],
                             2);
      pthread_mutex_unlock(server.sv_attr_mutex);
      unlock_queue(pque, __func__, NULL, LOGLEVEL);
      }
    else if (pjob != NULL)
      return;
    }

  set_task(WORK_Timed, delay + time_now, post_delete_mom2, strdup(pjob->ji_qs.ji_jobid), FALSE);

  /*
   * Since the first signal has succeeded, let's reschedule the
   * nanny to be 1 minute after the second phase.
   */
  apply_job_delete_nanny(pjob, time_now + delay + 60);

  unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
  }  /* END post_delete_mom1() */
Example #27
int execute_job_delete(

  job                  *pjob,            /* M */
  char                 *Msg,             /* I */
  struct batch_request *preq)            /* I */

  {
  struct work_task *pwtnew;

  int               rc;
  char             *sigt = "SIGTERM";

  int               has_mutex = TRUE;
  char              log_buf[LOCAL_LOG_BUF_SIZE];
  time_t            time_now = time(NULL);
  long              force_cancel = FALSE;
  long              array_compatible = FALSE;

  chk_job_req_permissions(&pjob,preq);

  if (pjob == NULL)
    {
    /* preq is rejected in chk_job_req_permissions here */
    return(-1);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /* see note in req_delete - not sure this is possible still,
     * but the deleted code is irrelevant now. I will leave this
     * part --dbeer */
    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

    return(-1);
    }

  if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN1 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN2 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN3 )
    {
    /* If JOB_SUBSTATE_PRERUN being sent to MOM, wait till she gets it going */
    /* retry in one second                            */
    /* If JOB_SUBSTATE_RERUN, RERUN1, RERUN2 or RERUN3 the
       job is being requeued. Wait until finished */

    static time_t  cycle_check_when = 0;
    static char    cycle_check_jid[PBS_MAXSVRJOBID + 1];

    if (cycle_check_when != 0)
      {
      if (!strcmp(pjob->ji_qs.ji_jobid, cycle_check_jid) &&
          (time_now - cycle_check_when > 10))
        {
        /* state not updated after 10 seconds */

        /* did the mom ever get it? delete it anyways... */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;

        goto jump;
        }

      if (time_now - cycle_check_when > 20)
        {
        /* give up after 20 seconds */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;
        }
      }    /* END if (cycle_check_when != 0) */

    if (cycle_check_when == 0)
      {
      /* new PRERUN job located */

      cycle_check_when = time_now;
      strcpy(cycle_check_jid, pjob->ji_qs.ji_jobid);
      }

    sprintf(log_buf, "job cannot be deleted, state=PRERUN, requeuing delete request");

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    pwtnew = set_task(WORK_Timed,time_now + 1,post_delete_route,preq,FALSE);
    
    unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);

    if (pwtnew == NULL)
      {
      req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

      return(-1);
      }
    else
      {
      return(ROUTE_DELETE);
      }
    }  /* END if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) */

jump:

  /*
   * Log delete and if requesting client is not job owner, send mail.
   */

  sprintf(log_buf, "requestor=%s@%s", preq->rq_user, preq->rq_host);


  /* NOTE:  should annotate accounting record with extend message (NYI) */
  account_record(PBS_ACCT_DEL, pjob, log_buf);

  sprintf(log_buf, msg_manager, msg_deletejob, preq->rq_user, preq->rq_host);

  log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

  /* NOTE:  should incorporate job delete message */

  if (Msg != NULL)
    {
    /* have text message in request extension, add it */
    strcat(log_buf, "\n");
    strcat(log_buf, Msg);
    }

  if ((svr_chk_owner(preq, pjob) != 0) &&
      (pjob->ji_has_delete_nanny == FALSE))
    {
    /* only send email if the owner did not delete the job and a job
       delete has not previously been attempted */

    svr_mailowner(pjob, MAIL_DEL, MAIL_FORCE, log_buf);
    /*
     * If we sent mail and already sent the extra message
     * then reset message so we don't trigger a redundant email
     * in job_abt()
    */

    if (Msg != NULL)
      {
      Msg = NULL;
      }
    }

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, change restart comment if failed */

    change_restart_comment_if_needed(pjob);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * setup a nanny task to make sure the job is actually deleted (see the
     * comments at job_delete_nanny()).
     */

    if (pjob->ji_has_delete_nanny == TRUE)
      {
      unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

      req_reject(PBSE_IVALREQ, 0, preq, NULL, "job cancel in progress");

      return(-1);
      }

    apply_job_delete_nanny(pjob, time_now + 60);

    /*
     * Send signal request to MOM.  The server will automagically
     * pick up and "finish" off the client request when MOM replies.
     */
    get_batch_request_id(preq);

    if ((rc = issue_signal(&pjob, sigt, post_delete_mom1, strdup(preq->rq_id))))
      {
      /* can't send to MOM */

      req_reject(rc, 0, preq, NULL, NULL);
      }

    /* normally will ack reply when mom responds */
    if (pjob != NULL)
      {
      sprintf(log_buf, msg_delrunjobsig, sigt);
      log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);
  
      unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL);
      }

    return(-1);
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* make a cleanup task if set */
  get_svr_attr_l(SRV_ATR_JobForceCancelTime, &force_cancel);
  if (force_cancel > 0)
    {
    char *dup_jobid = strdup(pjob->ji_qs.ji_jobid);
 
    set_task(WORK_Timed, time_now + force_cancel, ensure_deleted, dup_jobid, FALSE);    
    }

  /* if configured, and this job didn't have a slot limit hold, free a job
   * held with the slot limit hold */
  get_svr_attr_l(SRV_ATR_MoabArrayCompatible, &array_compatible);
  if ((array_compatible != FALSE) &&
      ((pjob->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) == FALSE))
    {
    if ((pjob->ji_arraystruct != NULL) &&
        (pjob->ji_is_array_template == FALSE))
      {
      int        i;
      int        newstate;
      int        newsub;
      job       *tmp;
      job_array *pa = get_jobs_array(&pjob);

      if (pjob == NULL)
        return(-1);

      for (i = 0; i < pa->ai_qs.array_size; i++)
        {
        if (pa->job_ids[i] == NULL)
          continue;

        if (!strcmp(pa->job_ids[i], pjob->ji_qs.ji_jobid))
          continue;

        if ((tmp = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
          {
          free(pa->job_ids[i]);
          pa->job_ids[i] = NULL;
          }
        else
          {
          if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l)
            {
            tmp->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;
            
            if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
              {
              tmp->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
              }
            
            svr_evaljobstate(tmp, &newstate, &newsub, 1);
            svr_setjobstate(tmp, newstate, newsub, FALSE);
            job_save(tmp, SAVEJOB_FULL, 0);

            unlock_ji_mutex(tmp, __func__, "5", LOGLEVEL);
            
            break;
            }

          unlock_ji_mutex(tmp, __func__, "6", LOGLEVEL);
          }
        }

      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "%s: unlocking ai_mutex", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }
      pthread_mutex_unlock(pa->ai_mutex);
      }
    } /* END MoabArrayCompatible check */

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, do end job processing */
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE);

    /* force new connection */
    pjob->ji_momhandle = -1;

    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "calling on_job_exit from %s", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      }

    set_task(WORK_Immed, 0, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in file, should remove them */

    remove_stagein(&pjob);

    if (pjob != NULL)
      job_abt(&pjob, Msg);

    has_mutex = FALSE;
    }
  else
    {
    /*
     * the job is not transitting (though it may have been) and
     * is not running, so put in into a complete state.
     */
    struct pbs_queue *pque;
    int  KeepSeconds = 0;

    svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE, FALSE);

    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      pque->qu_numcompleted++;

      unlock_queue(pque, __func__, NULL, LOGLEVEL);
      
      if (LOGLEVEL >= 7)
        {
        sprintf(log_buf, "calling on_job_exit from %s", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }
    
      pthread_mutex_lock(server.sv_attr_mutex);
      KeepSeconds = attr_ifelse_long(
                    &pque->qu_attr[QE_ATR_KeepCompleted],
                    &server.sv_attr[SRV_ATR_KeepCompleted],
                    0);
      pthread_mutex_unlock(server.sv_attr_mutex);
      }
    else
      KeepSeconds = 0;

    if (pjob != NULL)
      {
      set_task(WORK_Timed, time_now + KeepSeconds, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);
      }
    else
      has_mutex = FALSE;
    }  /* END else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0) */

  if (has_mutex == TRUE)
    unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL);

  return(PBSE_NONE);
  } /* END execute_job_delete() */
Example #28
void delay_and_send_sig_kill(
    
  batch_request *preq_sig)

  {
  int                   delay = 0;
  job                  *pjob;

  pbs_queue            *pque;

  batch_request        *preq_clt = NULL;  /* original client request */
  int                   rc;
  time_t                time_now = time(NULL);
  char    log_buf[LOCAL_LOG_BUF_SIZE];

  if (preq_sig == NULL)
    return;

  rc = preq_sig->rq_reply.brp_code;

  if (preq_sig->rq_extend != NULL)
    {
    preq_clt = get_remove_batch_request(preq_sig->rq_extend);
    }

  /* the client request has been handled another way, nothing left to do */
  if (preq_clt == NULL)
    return;

  if ((pjob = chk_job_request(preq_clt->rq_ind.rq_rerun, preq_clt)) == NULL)
    {
    /* job has gone away, chk_job_request() calls req_reject() on failure */
    return;
    }

  mutex_mgr pjob_mutex = mutex_mgr(pjob->ji_mutex, true);

  if (rc)
    {
    /* mom rejected request */

    if (rc == PBSE_UNKJOBID)
      {
      /* MOM claims no knowledge, so just purge it */
      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        "MOM rejected signal during rerun");

      /* remove the resources assigned to the job */

      free_nodes(pjob);

      set_resc_assigned(pjob, DECR);

      unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);

      svr_job_purge(pjob);

      reply_ack(preq_clt);
      }
    else
      {
      pjob_mutex.unlock();
      req_reject(rc, 0, preq_clt, NULL, NULL);
      }

    return;
    }

  // Apply the user delay first so it takes precedence.
  if (pjob->ji_wattr[JOB_ATR_user_kill_delay].at_flags & ATR_VFLAG_SET)
    delay = pjob->ji_wattr[JOB_ATR_user_kill_delay].at_val.at_long;

  if ((pque = get_jobs_queue(&pjob)) != NULL)
    {
    mutex_mgr pque_mutex = mutex_mgr(pque->qu_mutex, true);
    mutex_mgr server_mutex = mutex_mgr(server.sv_attr_mutex, false);

    if (delay == 0)
      {
      delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay],
                             &server.sv_attr[SRV_ATR_KillDelay],
                             0);
      }
    }
  else
    {
    /* why is pque NULL? Something went wrong */
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "jobid %s returned a null queue", pjob->ji_qs.ji_jobid);
    req_reject(PBSE_UNKQUE, 0, preq_clt, NULL, log_buf);
    return;
    }

  pjob_mutex.unlock();
  reply_ack(preq_clt);
  set_task(WORK_Timed, delay + time_now, send_sig_kill, strdup(pjob->ji_qs.ji_jobid), FALSE);
  } // END delay_and_send_sig_kill()
Example #29
int send_job(

  job       *jobp,
  pbs_net_t  hostaddr, /* host address, host byte order */
  int        port, /* service port, host byte order */
  int        move_type, /* move, route, or execute */
  void (*post_func)(struct work_task *),     /* after move */
  void      *data)  /* ptr to optional batch_request to be put */
                    /* in the work task structure */

  {
  tlist_head  attrl;
  enum conn_type cntype = ToServerDIS;
  int    con;
  char  *destin = jobp->ji_qs.ji_destin;
  int    encode_type;
  int    i;
  int    NumRetries;

  char  *id = "send_job";

  attribute *pattr;

  pid_t  pid;

  struct attropl *pqjatr;      /* list (single) of attropl for quejob */
  char  *safail = "sigaction failed\n";
  char  *spfail = "sigprocmask failed\n";
  char   script_name[MAXPATHLEN + 1];
  sigset_t  child_set, all_set;

  struct  sigaction child_action;

  struct work_task *ptask;

  mbool_t        Timeout = FALSE;

  char          *pc;

  sigemptyset(&child_set);
  sigaddset(&child_set, SIGCHLD);
  sigfillset(&all_set);

  /* block SIGCHLD until work task is established */

  if (sigprocmask(SIG_BLOCK, &child_set, NULL) == -1)
    {
    log_err(errno,id,spfail);

    pbs_errno = PBSE_SYSTEM;

    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      jobp->ji_qs.ji_jobid,
      "cannot set signal mask");

    return(ROUTE_PERM_FAILURE);
    }

  if (LOGLEVEL >= 6)
    {
    sprintf(log_buffer,"about to send job - type=%d",
      move_type);
 
    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      jobp->ji_qs.ji_jobid,
      "forking in send_job");
    }

  pid = fork();

  if (pid == -1)
    {
    /* error on fork */

    log_err(errno, id, "fork failed\n");

    if (sigprocmask(SIG_UNBLOCK, &child_set, NULL) == -1)
      log_err(errno, id, spfail);

    pbs_errno = PBSE_SYSTEM;

    return(ROUTE_PERM_FAILURE);
    }

  if (pid != 0)
    {
    /* The parent (main server) */

    /* create task to monitor job startup */

    /* CRI:   need way to report to scheduler job is starting, not started */

    ptask = set_task(WORK_Deferred_Child, pid, post_func, jobp);

    if (ptask == NULL)
      {
      log_err(errno, id, msg_err_malloc);

      return(ROUTE_PERM_FAILURE);
      }

    ptask->wt_parm2 = data;

    append_link(
      &((job *)jobp)->ji_svrtask,
      &ptask->wt_linkobj,
      ptask);

    /* now can unblock SIGCHLD */

    if (sigprocmask(SIG_UNBLOCK, &child_set, NULL) == -1)
      log_err(errno, id, spfail);

    if (LOGLEVEL >= 1)
      {
      extern long   DispatchTime[];
      extern job   *DispatchJob[];
      extern char  *DispatchNode[];

      extern time_t time_now;

      struct pbsnode *NP;

      /* record job dispatch time */

      int jindex;

      for (jindex = 0;jindex < 20;jindex++)
        {
        if (DispatchJob[jindex] == NULL)
          {
          DispatchTime[jindex] = time_now;

          DispatchJob[jindex] = jobp;

          if ((NP = PGetNodeFromAddr(hostaddr)) != NULL)
            DispatchNode[jindex] = NP->nd_name;
          else
            DispatchNode[jindex] = NULL;

          break;
          }
        }
      }

    /* SUCCESS */

    return(ROUTE_DEFERRED);
    }  /* END if (pid != 0) */

  /*
   * the child process
   *
   * set up signal catcher for error return
   */

  rpp_terminate();

  child_action.sa_handler = net_move_die;

  sigfillset(&child_action.sa_mask);

  child_action.sa_flags = 0;

  if (sigaction(SIGHUP, &child_action, NULL))
    log_err(errno, id, safail);

  if (sigaction(SIGINT, &child_action, NULL))
    log_err(errno, id, safail);

  if (sigaction(SIGQUIT, &child_action, NULL))
    log_err(errno, id, safail);

  /* signal handling is set, now unblock */

  if (sigprocmask(SIG_UNBLOCK, &child_set, NULL) == -1)
    log_err(errno, id, spfail);

  /* encode job attributes to be moved */

  CLEAR_HEAD(attrl);

  /* select attributes/resources to send based on move type */

  if (move_type == MOVE_TYPE_Exec)
    {
    /* moving job to MOM - ie job start */

    resc_access_perm = ATR_DFLAG_MOM;
    encode_type = ATR_ENCODE_MOM;
    cntype = ToServerDIS;
    }
  else
    {
    /* moving job to alternate server? */

    resc_access_perm =
      ATR_DFLAG_USWR |
      ATR_DFLAG_OPWR |
      ATR_DFLAG_MGWR |
      ATR_DFLAG_SvRD;

    encode_type = ATR_ENCODE_SVR;

    /* clear default resource settings */

    svr_dequejob(jobp);
    }

  pattr = jobp->ji_wattr;

  for (i = 0;i < JOB_ATR_LAST;i++)
    {
    if (((job_attr_def + i)->at_flags & resc_access_perm) ||
      ((strncmp((job_attr_def + i)->at_name,"session_id",10) == 0) &&
      (jobp->ji_wattr[JOB_ATR_checkpoint_name].at_flags & ATR_VFLAG_SET)))
      {
      (job_attr_def + i)->at_encode(
        pattr + i,
        &attrl,
        (job_attr_def + i)->at_name,
        NULL,
        encode_type);
      }
    }    /* END for (i) */

  attrl_fixlink(&attrl);

  /* put together the job script file name */

  strcpy(script_name, path_jobs);

  if (jobp->ji_wattr[JOB_ATR_job_array_request].at_flags & ATR_VFLAG_SET)
    {
    strcat(script_name, jobp->ji_arraystruct->ai_qs.fileprefix);
    }
  else
    {
    strcat(script_name, jobp->ji_qs.ji_fileprefix);
    }

  strcat(script_name, JOB_SCRIPT_SUFFIX);


  pbs_errno = 0;
  con = -1;

  for (NumRetries = 0;NumRetries < RETRY;NumRetries++)
    {
    int rc;

    /* connect to receiving server with retries */

    if (NumRetries > 0)
      {
      /* recycle after an error */

      if (con >= 0)
        svr_disconnect(con);

      /* check pbs_errno from previous attempt */

      if (should_retry_route(pbs_errno) == -1)
        {
        sprintf(log_buffer, "child failed in previous commit request for job %s",
                jobp->ji_qs.ji_jobid);

        log_err(pbs_errno, id, log_buffer);

        exit(1); /* fatal error, don't retry */
        }

      sleep(1 << NumRetries);
      }

    /* NOTE:  on node hangs, svr_connect is successful */

    if ((con = svr_connect(hostaddr, port, 0, cntype)) == PBS_NET_RC_FATAL)
      {
      sprintf(log_buffer, "send_job failed to %lx port %d",
        hostaddr,
        port);

      log_err(pbs_errno, id, log_buffer);

      exit(1);
      }

    if (con == PBS_NET_RC_RETRY)
      {
      pbs_errno = 0; /* should retry */

      continue;
      }

    /*
     * if the job is substate JOB_SUBSTATE_TRNOUTCM which means
     * we are recovering after being down or a late failure, we
     * just want to send the "ready-to-commit/commit"
     */

    if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUTCM)
      {
      if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUT)
        {
        jobp->ji_qs.ji_substate = JOB_SUBSTATE_TRNOUT;

        job_save(jobp, SAVEJOB_QUICK);
        }

      pqjatr = &((svrattrl *)GET_NEXT(attrl))->al_atopl;

      if ((pc = PBSD_queuejob(
                  con,
                  jobp->ji_qs.ji_jobid,
                  destin,
                  pqjatr,
                  NULL)) == NULL)
        {
        if ((pbs_errno == PBSE_EXPIRED) || (pbs_errno == PBSE_READ_REPLY_TIMEOUT))
          {
          /* queue job timeout based on pbs_tcp_timeout */

          Timeout = TRUE;
          }

        if ((pbs_errno == PBSE_JOBEXIST) && (move_type == MOVE_TYPE_Exec))
          {
          /* already running, mark it so */

          log_event(
            PBSEVENT_ERROR,
            PBS_EVENTCLASS_JOB,
            jobp->ji_qs.ji_jobid,
            "MOM reports job already running");

          exit(0);
          }

        sprintf(log_buffer, "send of job to %s failed error = %d",
          destin,
          pbs_errno);

        log_event(
          PBSEVENT_JOB,
          PBS_EVENTCLASS_JOB,
          jobp->ji_qs.ji_jobid,
          log_buffer);

        continue;
        }  /* END if ((pc = PBSD_queuejob() == NULL) */

      free(pc);

      if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT)
        {
        if (PBSD_jscript(con, script_name, jobp->ji_qs.ji_jobid) != 0)
          continue;
        }

      /* XXX may need to change the logic below, if we are sending the job to
         a mom on the same host and the mom and server are not sharing the same
         spool directory, then we still need to move the file */

      if ((move_type == MOVE_TYPE_Exec) &&
          (jobp->ji_qs.ji_svrflags & JOB_SVFLG_HASRUN) &&
          (hostaddr != pbs_server_addr))
        {
        /* send files created on prior run */

        if ((move_job_file(con,jobp,StdOut) != 0) ||
            (move_job_file(con,jobp,StdErr) != 0) ||
            (move_job_file(con,jobp,Checkpoint) != 0))
          {
          continue;
          }
        }

      /* ignore signals */

      if (sigprocmask(SIG_BLOCK, &all_set, NULL) == -1)
        log_err(errno, id, "sigprocmask\n");

      jobp->ji_qs.ji_substate = JOB_SUBSTATE_TRNOUTCM;

      job_save(jobp, SAVEJOB_QUICK);
      }
    else
      {
      /* ignore signals */

      if (sigprocmask(SIG_BLOCK, &all_set, NULL) == -1)
        log_err(errno, id, "sigprocmask\n");
      }

    if (PBSD_rdytocmt(con, jobp->ji_qs.ji_jobid) != 0)
      {
      if (sigprocmask(SIG_UNBLOCK, &all_set, NULL) == -1)
        log_err(errno, id, "sigprocmask\n");

      continue;
      }


    if ((rc = PBSD_commit(con, jobp->ji_qs.ji_jobid)) != 0)
      {
      int errno2;

      /* NOTE:  errno is modified by log_err */

      errno2 = errno;

      sprintf(log_buffer, "send_job commit failed, rc=%d (%s)",
              rc,
              (connection[con].ch_errtxt != NULL) ? connection[con].ch_errtxt : "N/A");

      log_ext(errno2, id, log_buffer, LOG_WARNING);

      /* if failure occurs, pbs_mom should purge job and pbs_server should set *
         job state to idle w/error msg */

      if (errno2 == EINPROGRESS)
        {
        /* request is still being processed */

        /* increase tcp_timeout in qmgr? */

        Timeout = TRUE;

        /* do we need a continue here? */

        sprintf(log_buffer, "child commit request timed-out for job %s, increase tcp_timeout?",
                jobp->ji_qs.ji_jobid);

        log_ext(errno2, id, log_buffer, LOG_WARNING);

        /* don't retry on timeout--break out and report error! */

        break;
        }
      else
        {
        sprintf(log_buffer, "child failed in commit request for job %s",
                jobp->ji_qs.ji_jobid);

        log_ext(errno2, id, log_buffer, LOG_CRIT);

        /* FAILURE */

        exit(1);
        }
      }    /* END if ((rc = PBSD_commit(con,jobp->ji_qs.ji_jobid)) != 0) */

    svr_disconnect(con);

    /* child process is done */

    /* SUCCESS */

    exit(0);
    }  /* END for (NumRetries) */

  if (con >= 0)
    svr_disconnect(con);

  if (Timeout == TRUE)
    {
    /* 10 indicates that job migrate timed out, server will mark node down *
          and abort the job - see post_sendmom() */

    sprintf(log_buffer, "child timed-out attempting to start job %s",
            jobp->ji_qs.ji_jobid);

    log_ext(pbs_errno, id, log_buffer, LOG_WARNING);

    exit(10);
    }

  if (should_retry_route(pbs_errno) == -1)
    {
    sprintf(log_buffer, "child failed and will not retry job %s",
      jobp->ji_qs.ji_jobid);

    log_err(pbs_errno, id, log_buffer);

    exit(1);
    }

  exit(2);

  /*NOTREACHED*/

  return(ROUTE_SUCCESS);
  }  /* END send_job() */
Example #30
acl_int64 aio_timer_callback::trigger(void)
{
	// sanity check
	if (tasks_.empty())
		return TIMER_EMPTY;

	acl_assert(length_ > 0);

	set_time();

	std::list<aio_timer_task*>::iterator it, next;
	std::list<aio_timer_task*> tasks;
	aio_timer_task* task;

	// pull the timer tasks whose deadline has arrived out of the timer
	for (it = tasks_.begin(); it != tasks_.end(); it = next)
	{
		if ((*it)->when > present_)
			break;
		next = it;
		++next;
		task = *it;
		tasks_.erase(it);
		length_--;
		tasks.push_back(task);
	}

	// the expired tasks may already have been removed by the user
	if (tasks.empty())
	{
		acl_assert(!tasks_.empty());

		aio_timer_task* first = tasks_.front();
		acl_int64 delay = first->when - present_;
		return delay < 0 ? 0 : delay;
	}

	// put the expired timer tasks back into the timer's task list
	// and start triggering all of them

	// mark the trigger as busy first, to keep the subclass from
	// running this object's destructor inside the callback
	set_locked();

	// clear the destroy-on-unlock flag: while this timer is locked,
	// other objects must not destroy it directly; once it is
	// unlocked, if the flag has been set to true, the object
	// should destroy itself automatically
	destroy_on_unlock_ = false;

	for (it = tasks.begin(); it != tasks.end(); ++it)
	{
		set_task(*it);
		timer_callback((*it)->id);
	}

	tasks.clear();

	// from here on the subclass may invoke the destructor again
	unset_locked();

	// the subclass may have removed all timer tasks inside timer_callback
	if (tasks_.empty())
		return TIMER_EMPTY;

	aio_timer_task* first = tasks_.front();
	acl_int64 delay = first->when - present_;

	// if outside code requested this object's release while it was locked, release it here
	if (destroy_on_unlock_)
	{
		destroy();
		return -1;
	}
	return delay < 0 ? 0 : delay;
}
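The set_locked()/destroy_on_unlock_ choreography above lets a subclass safely request destruction of the timer from inside its own callback. The idiom reduced to a sketch (acl's real implementation differs in detail):

class callback_owner
{
public:
	callback_owner() : locked_(false), destroy_on_unlock_(false) {}

	// request destruction; honored at once unless a callback is running
	void destroy(void)
	{
		if (locked_)
		{
			destroy_on_unlock_ = true;  // defer until the callback returns
			return;
		}
		delete this;
	}

	void run_callbacks(void)
	{
		locked_ = true;                 // block destruction while calling out
		destroy_on_unlock_ = false;
		on_timer();                     // may call destroy() on this object
		locked_ = false;
		if (destroy_on_unlock_)
			delete this;                // perform the deferred destruction
	}

protected:
	virtual ~callback_owner() {}
	virtual void on_timer(void) = 0;

private:
	bool locked_;
	bool destroy_on_unlock_;
};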