Example #1
0
void *send_power_state_to_mom(
    
  void *arg)

  {
  struct batch_request  *pRequest = (struct batch_request *)arg;
  struct pbsnode        *pNode = find_nodebyname(pRequest->rq_host);

  if (pNode == NULL)
    {
    free_br(pRequest);
    return NULL;
    }

  int handle = 0;
  int local_errno = 0;

  handle = svr_connect(pNode->nd_addrs[0],pNode->nd_mom_port,&local_errno,pNode,NULL);
  if (handle < 0)
    {
    unlock_node(pNode, __func__, "Error connecting", LOGLEVEL);
    return NULL;
    }

  unlock_node(pNode, __func__, "Done connecting", LOGLEVEL);
  issue_Drequest(handle, pRequest, true);

  return NULL;
  }
void *check_if_orphaned(

  void *vp)

  {
  char                 *rsv_id = (char *)vp;
  char                  job_id[PBS_MAXSVRJOBID];
  struct batch_request *preq;
  int                   handle = -1;
  int                   retries = 0;
  struct pbsnode       *pnode;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];

  if (is_orphaned(rsv_id, job_id) == TRUE)
    {
    if((preq = alloc_br(PBS_BATCH_DeleteReservation)) == NULL)
      return NULL;
    preq->rq_extend = rsv_id;

    /* Assume the request will be successful and remove the RSV from the hash table */
    remove_alps_reservation(rsv_id);

    if ((pnode = get_next_login_node(NULL)) != NULL)
      {
      struct in_addr hostaddr;
      int            local_errno;
      pbs_net_t      momaddr;

      memcpy(&hostaddr, &pnode->nd_sock_addr.sin_addr, sizeof(hostaddr));
      momaddr = ntohl(hostaddr.s_addr);

      snprintf(log_buf, sizeof(log_buf),
        "Found orphan ALPS reservation ID %s for job %s; asking %s to remove it",
        rsv_id,
        job_id,
        pnode->nd_name);
      log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, __func__, log_buf);

      while ((handle < 0) &&
             (retries < 3))
        {
        handle = svr_connect(momaddr, pnode->nd_mom_port, &local_errno, pnode, NULL, ToServerDIS);
        retries++;
        }

      /* unlock before the network transaction */
      unlock_node(pnode, __func__, NULL, LOGLEVEL);
      
      if (handle >= 0)
        issue_Drequest(handle, preq, true);
        
      free_br(preq);
      }
    }
  else
    free(rsv_id);

  return(NULL);
  } /* END check_if_orphaned() */
Example #3
0
int
relay_to_mom2(job *pjob, struct batch_request *request,
	     void (*func)(struct work_task *), struct work_task **ppwt)
{
	int	rc;
	int	conn;	/* a client style connection handle */
	pbs_net_t    momaddr;
	unsigned int momport;
	struct work_task *pwt;
	int prot;
	mominfo_t *pmom = 0;
	pbs_list_head	*mom_tasklist_ptr = NULL;

	momaddr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr;
	momport = pjob->ji_qs.ji_un.ji_exect.ji_momport;

	if (pbs_conf.pbs_use_tcp == 1) {
		prot = PROT_RPP;
		pmom = tfind2((unsigned long) momaddr, momport, &ipaddrs);
		if (!pmom || (((mom_svrinfo_t *) (pmom->mi_data))->msr_state & INUSE_DOWN)) {
			return (PBSE_NORELYMOM);
		}
		mom_tasklist_ptr = &(((mom_svrinfo_t *) (pmom->mi_data))->msr_deferred_cmds);
	} else {
		prot = PROT_TCP;
	}

	conn = svr_connect(momaddr, momport, process_Dreply, ToServerDIS, prot);
	if (conn < 0) {
		log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_REQUEST, LOG_WARNING, "", msg_norelytomom);
		return (PBSE_NORELYMOM);
	}

	request->rq_orgconn = request->rq_conn;	/* save client socket */
	pbs_errno = 0;
	rc = issue_Drequest(conn, request, func, &pwt, prot);
	if ((rc == 0) && (func != release_req)) {
		/* work-task entry job related rpp, link to the job's list */
		append_link(&pjob->ji_svrtask, &pwt->wt_linkobj, pwt);
		if (prot == PROT_RPP)
			append_link(mom_tasklist_ptr, &pwt->wt_linkobj2, pwt); /* if rpp, link to mom list as well */
	}

	if (ppwt != NULL)
		*ppwt = pwt;

	/*
	 * We do not want req_reject() to send non PBSE error numbers.
	 * Check for internal errors and when found return PBSE_SYSTEM.
	 */
	if ((rc != 0) && (pbs_errno == 0))
		return (PBSE_SYSTEM);
	else
		return (rc);
}
Example #4
0
int
issue_to_svr(char *servern, struct batch_request *preq, void (*replyfunc)(struct work_task *))
{
	int	  do_retry = 0;
	int	  handle;
	pbs_net_t svraddr;
	char	 *svrname;
	unsigned int  port = pbs_server_port_dis;
	struct work_task *pwt;
	extern int pbs_failover_active;
	extern char primary_host[];
	extern char server_host[];

	(void)strcpy(preq->rq_host, servern);
	preq->rq_fromsvr = 1;
	preq->rq_perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR | ATR_DFLAG_SvWR;
	svrname = parse_servername(servern, &port);

	if ((pbs_failover_active != 0) && (svrname != NULL)) {
		/* we are the active secondary server in a failover config    */
		/* if the message is going to the primary,then redirect to me */
		size_t len;

		len = strlen(svrname);
		if (strncasecmp(svrname, primary_host, len) == 0) {
			if ((primary_host[(int)len] == '\0') ||
				(primary_host[(int)len] == '.'))
				svrname = server_host;
		}
	}
	svraddr = get_hostaddr(svrname);
	if (svraddr == (pbs_net_t)0) {
		if (pbs_errno == PBS_NET_RC_RETRY)
			/* Non fatal error - retry */
			do_retry = 1;
	} else {
		handle = svr_connect(svraddr, port, process_Dreply, ToServerDIS, PROT_TCP);
		if (handle >= 0)
			return (issue_Drequest(handle, preq, replyfunc, 0, 0));
		else if (handle == PBS_NET_RC_RETRY)
			do_retry = 1;
	}

	/* if reached here, it didn`t go, do we retry? */

	if (do_retry) {
		pwt = set_task(WORK_Timed, (long)(time_now+(2*PBS_NET_RETRY_TIME)),
			reissue_to_svr, (void *)preq);
		pwt->wt_parm2 = (void *)replyfunc;
		return (0);
	} else
		return (-1);
}
Example #5
0
void *check_if_orphaned(

  void *vp)

  {
  char                 *rsv_id = (char *)vp;
  struct batch_request *preq;
  int                   handle = -1;
  int                   retries = 0;
  struct pbsnode       *pnode;

  if (is_orphaned(rsv_id) == TRUE)
    {
    preq = alloc_br(PBS_BATCH_DeleteReservation);
    preq->rq_extend = rsv_id;

    if ((pnode = get_next_login_node(NULL)) != NULL)
      {
      struct in_addr hostaddr;
      int            local_errno;
      pbs_net_t      momaddr;

      memcpy(&hostaddr, &pnode->nd_sock_addr.sin_addr, sizeof(hostaddr));
      momaddr = ntohl(hostaddr.s_addr);

      while ((handle < 0) &&
             (retries < 3))
        {
        handle = svr_connect(momaddr, pnode->nd_mom_port, &local_errno, pnode, NULL, ToServerDIS);
        retries++;
        }

      /* unlock before the network transaction */
      unlock_node(pnode, __func__, NULL, 0);
      
      if (handle >= 0)
        {
        issue_Drequest(handle, preq, release_req, 0);
        }
      else
        free_br(preq);
      }
    }
  else
    free(rsv_id);

  return(NULL);
  } /* END check_if_orphaned() */
Example #6
0
int relay_to_mom(

    job                   **pjob_ptr,
    struct batch_request   *request, /* the request to send */
    void                  (*func)(struct work_task *))

{
    int             handle; /* a client style connection handle */
    int             rc;
    int             local_errno = 0;
    pbs_net_t       addr;
    unsigned short  port;
    job            *pjob = *pjob_ptr;
    char            jobid[PBS_MAXSVRJOBID + 1];
    char           *job_momname = NULL;

    struct pbsnode *node;
    char            log_buf[LOCAL_LOG_BUF_SIZE];
    std::string     node_name;

    if (pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str == NULL)
    {
        snprintf(log_buf, sizeof(log_buf),
                 "attempting to send a request to %s's mom but no exec_host list?",
                 pjob->ji_qs.ji_jobid);
        log_err(PBSE_BADSTATE, __func__, log_buf);

        return(PBSE_BADSTATE);
    }

    /* if MOM is down don't try to connect */
    addr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr;
    port = pjob->ji_qs.ji_un.ji_exect.ji_momport;
    job_momname = strdup(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str);
    if (job_momname == NULL)
        return PBSE_MEM_MALLOC;

    if ((node = tfind_addr(addr, port, job_momname)) == NULL)
    {
        free(job_momname);
        return(PBSE_NORELYMOM);
    }
    free(job_momname);

    if ((node != NULL) &&
            ((node->nd_state & INUSE_NOT_READY)||
             (node->nd_power_state != POWER_STATE_RUNNING)))
    {
        node->unlock_node(__func__, "no relay mom", LOGLEVEL);
        return(PBSE_NORELYMOM);
    }

    if (LOGLEVEL >= 7)
    {
        char *tmp = netaddr_pbs_net_t(pjob->ji_qs.ji_un.ji_exect.ji_momaddr);
        sprintf(log_buf, "momaddr=%s",tmp);

        log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf);

        free(tmp);
    }

    node_name = node->get_name();

    node->unlock_node(__func__, "after svr_connect", LOGLEVEL);

    strcpy(jobid, pjob->ji_qs.ji_jobid);
    unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);
    *pjob_ptr = NULL;

    handle = svr_connect(addr, port, &local_errno, NULL, NULL);

    if (handle < 0)
    {
        update_failure_counts(node_name.c_str(), -1);
        log_event(PBSEVENT_ERROR,PBS_EVENTCLASS_REQUEST,"",msg_norelytomom);

        return(PBSE_NORELYMOM);
    }

    request->rq_orgconn = request->rq_conn; /* save client socket */

    rc = issue_Drequest(handle, request, true);

    if (request->rq_reply.brp_code == PBSE_TIMEOUT)
        update_failure_counts(node_name.c_str(), PBSE_TIMEOUT);
    else
        update_failure_counts(node_name.c_str(), 0);

    *pjob_ptr = svr_find_job(jobid, TRUE);

    return(rc);
}  /* END relay_to_mom() */
int relay_to_mom(

  job                   **pjob_ptr,
  struct batch_request   *request, /* the request to send */
  void                  (*func)(struct work_task *))

  {
  int             handle; /* a client style connection handle */
  int             rc;
  int             local_errno = 0;
  pbs_net_t       addr;
  unsigned short  port;
  job            *pjob = *pjob_ptr;
  char            jobid[PBS_MAXSVRJOBID + 1];
  char *job_momname = NULL;

  struct pbsnode *node;
  char            log_buf[LOCAL_LOG_BUF_SIZE];

  /* if MOM is down don't try to connect */
  addr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr;
  port = pjob->ji_qs.ji_un.ji_exect.ji_momport;
  job_momname = strdup(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str);
  if (job_momname == NULL)
    return PBSE_MEM_MALLOC;

  if ((node = tfind_addr(addr, port, job_momname)) == NULL)
    {
    free(job_momname);
    return(PBSE_NORELYMOM);
    }
  free(job_momname);

  if ((node != NULL) &&
      (node->nd_state & INUSE_DOWN))
    {
    unlock_node(node, __func__, "no rely mom", LOGLEVEL);
    return(PBSE_NORELYMOM);
    }

  if (LOGLEVEL >= 7)
    {
    char *tmp = netaddr_pbs_net_t(pjob->ji_qs.ji_un.ji_exect.ji_momaddr);
    sprintf(log_buf, "momaddr=%s",tmp);

    log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf);

    free(tmp);
    }

  unlock_node(node, __func__, "after svr_connect", LOGLEVEL);
  handle = svr_connect(
           pjob->ji_qs.ji_un.ji_exect.ji_momaddr,
           pjob->ji_qs.ji_un.ji_exect.ji_momport,
           &local_errno,
           NULL,
           NULL,
           ToServerDIS);
    

  if (handle < 0)
    {
    log_event(PBSEVENT_ERROR,PBS_EVENTCLASS_REQUEST,"",msg_norelytomom);

    return(PBSE_NORELYMOM);
    }

  strcpy(jobid, pjob->ji_qs.ji_jobid);
  unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);

  request->rq_orgconn = request->rq_conn; /* save client socket */

  rc = issue_Drequest(handle, request);

  *pjob_ptr = svr_find_job(jobid, TRUE);

  return(rc);
  }  /* END relay_to_mom() */
int issue_to_svr(

  char                 *servern,                  /* I */
  struct batch_request *preq,                     /* I */
  void (*replyfunc)    (struct work_task *))      /* I */

  {
  int               rc = PBSE_NONE;
  int               do_retry = 0;
  int               handle;
  int               my_err = 0;
  pbs_net_t         svraddr;
  char             *svrname;
  unsigned int      port = pbs_server_port_dis;

  struct work_task *pwt;
  time_t            time_now = time(NULL);

  snprintf(preq->rq_host, sizeof(preq->rq_host), "%s", servern);

  preq->rq_fromsvr = 1;
  preq->rq_perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR | ATR_DFLAG_SvWR;

  svrname = parse_servername(servern, &port); 
  svraddr = get_hostaddr(&my_err,svrname);

  free(svrname);

  if (svraddr == (pbs_net_t)0)
    {
    if (my_err == PBS_NET_RC_RETRY)
      {
      /* Non fatal error - retry */

      do_retry = 1;
      }
    }
  else
    {
    handle = svr_connect(svraddr, port, &my_err, NULL, NULL, ToServerDIS);

    if (handle >= 0)
      {
      if (((rc = issue_Drequest(handle, preq)) == PBSE_NONE) &&
          (handle != PBS_LOCAL_CONNECTION))
        {
        /* preq is already freed if handle == PBS_LOCAL_CONNECTION - a reply 
         * has always been sent */
        rc = preq->rq_reply.brp_code;
        }

      return(rc);
      }
    else if (handle == PBS_NET_RC_RETRY)
      {
      do_retry = 1;
      }
    }

  /* if reached here, it didn`t go, do we retry? */

  if (do_retry)
    {
    if (preq->rq_id == NULL)
      get_batch_request_id(preq);

    pwt = set_task(WORK_Timed, (long)(time_now + PBS_NET_RETRY_TIME), reissue_to_svr, preq->rq_id, TRUE);

    pwt->wt_parmfunc = replyfunc;

    pthread_mutex_unlock(pwt->wt_mutex);

    return(PBSE_NONE);
    }

  /* FAILURE */

  return(PBSE_INTERNAL);
  }  /* END issue_to_svr() */
Example #9
0
int stat_to_mom(

  job              *pjob,  /* I */
  struct stat_cntl *cntl)  /* I/O */

  {

  struct batch_request *newrq;
  int          rc;

  struct work_task     *pwt = 0;

  struct pbsnode       *node;

  if ((newrq = alloc_br(PBS_BATCH_StatusJob)) == NULL)
    {
    return(PBSE_SYSTEM);
    }

  /* set up status request, save address of cntl in request for later */

  newrq->rq_extra = (void *)cntl;

  if (cntl->sc_type == 1)
    strcpy(newrq->rq_ind.rq_status.rq_id, pjob->ji_qs.ji_jobid);
  else
    newrq->rq_ind.rq_status.rq_id[0] = '\0';  /* get stat of all */

  CLEAR_HEAD(newrq->rq_ind.rq_status.rq_attr);

  /* if MOM is down just return stale information */

  if (((node = tfind_addr(pjob->ji_qs.ji_un.ji_exect.ji_momaddr)) != NULL) &&
      (node->nd_state & (INUSE_DELETED | INUSE_DOWN)))
    {
    if (LOGLEVEL >= 6)
      {
      sprintf(log_buffer, "node '%s' is allocated to job but in state '%s'",
              node->nd_name,
              (node->nd_state & INUSE_DELETED) ? "deleted" : "down");

      log_event(
        PBSEVENT_SYSTEM,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        log_buffer);
      }

    return(PBSE_NORELYMOM);
    }

  /* get connection to MOM */

  cntl->sc_conn = svr_connect(
                    pjob->ji_qs.ji_un.ji_exect.ji_momaddr,
                    pbs_mom_port,
                    process_Dreply,
                    ToServerDIS);

  if ((rc = cntl->sc_conn) >= 0)
    rc = issue_Drequest(cntl->sc_conn, newrq, stat_update, &pwt);

  if (rc != 0)
    {
    /* request failed */

    if (pwt)
      delete_task(pwt);

    free_br(newrq);

    if (cntl->sc_conn >= 0)
      svr_disconnect(cntl->sc_conn);
    }  /* END if (rc != NULL) */

  return(rc);
  }  /* END stat_to_mom() */
void *check_if_orphaned(

  void *vp)

  {
  char           *node_name = (char *)vp;
  char           *rsv_id = NULL;
  std::string     job_id;
  batch_request  *preq;
  int             handle = -1;
  int             retries = 0;
  struct pbsnode *pnode;
  char            log_buf[LOCAL_LOG_BUF_SIZE];

  if ((rsv_id = strchr(node_name, ':')) != NULL)
    {
    *rsv_id = '\0';
    rsv_id++;
    }
  else
    {
    free(node_name);
    return(NULL);
    }

  if (alps_reservations.is_orphaned(rsv_id, job_id) == true)
    {
    // Make sure the node with the orphan is not available for jobs
    if ((pnode = find_nodebyname(node_name)) != NULL)
      {
      if ((pnode->nd_state & (INUSE_BUSY | INUSE_DOWN)) == 0)
        {
        snprintf(log_buf, sizeof(log_buf),
          "Node %s has an orphan but wasn't marked as busy. Marking as busy now.",
          node_name);
        log_err(-1, __func__, log_buf);

        update_node_state(pnode, INUSE_BUSY);
        }

      pnode->unlock_node(__func__, NULL, LOGLEVEL);
      }

    if ((preq = alloc_br(PBS_BATCH_DeleteReservation)) == NULL)
      {
      free(node_name);
      alps_reservations.remove_from_orphaned_list(rsv_id);
      return(NULL);
      }

    preq->rq_extend = strdup(rsv_id);

    if ((pnode = get_next_login_node(NULL)) != NULL)
      {
      struct in_addr hostaddr;
      int            local_errno;
      pbs_net_t      momaddr;

      memcpy(&hostaddr, &pnode->nd_sock_addr.sin_addr, sizeof(hostaddr));
      momaddr = ntohl(hostaddr.s_addr);

      snprintf(log_buf, sizeof(log_buf),
        "Found orphan ALPS reservation ID %s for job %s; asking %s to remove it",
        rsv_id,
        job_id.c_str(),
        pnode->get_name());
      log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, __func__, log_buf);

      while ((handle < 0) &&
             (retries < 3))
        {
        handle = svr_connect(momaddr, pnode->nd_mom_port, &local_errno, pnode, NULL);
        retries++;
        }

      /* unlock before the network transaction */
      pnode->unlock_node(__func__, NULL, LOGLEVEL);
      
      if (handle >= 0)
        issue_Drequest(handle, preq, true);
        
      free_br(preq);
      }

    alps_reservations.remove_from_orphaned_list(rsv_id);
    }

  free(node_name);

  return(NULL);
  } /* END check_if_orphaned() */
Example #11
0
int stat_to_mom(

  char             *job_id,
  struct stat_cntl *cntl)  /* M */

  {
  struct batch_request *newrq;
  int                   rc = PBSE_NONE;
  unsigned long         addr;
  char                  log_buf[LOCAL_LOG_BUF_SIZE+1];
  struct pbsnode       *node;
  int                   handle = -1;
  unsigned long         job_momaddr = -1;
  unsigned short        job_momport = -1;
  char                 *job_momname = NULL;
  job                  *pjob = NULL;

  if ((pjob = svr_find_job(job_id, FALSE)) == NULL)
    return(PBSE_JOBNOTFOUND);

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  if ((pjob->ji_qs.ji_un.ji_exect.ji_momaddr == 0) || 
      (!pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str))
    {
    job_mutex.unlock();
    snprintf(log_buf, sizeof(log_buf),
      "Job %s missing MOM's information. Skipping statting on this job", pjob->ji_qs.ji_jobid);
    log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
    return PBSE_BAD_PARAMETER;
    }

  job_momaddr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr;
  job_momport = pjob->ji_qs.ji_un.ji_exect.ji_momport;
  job_momname = strdup(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str);
  job_mutex.unlock();

  if (job_momname == NULL)
    return PBSE_MEM_MALLOC;

  if ((newrq = alloc_br(PBS_BATCH_StatusJob)) == NULL)
    {
    free(job_momname);
    return PBSE_MEM_MALLOC;
    }

  if (cntl->sc_type == 1)
    snprintf(newrq->rq_ind.rq_status.rq_id, sizeof(newrq->rq_ind.rq_status.rq_id), "%s", job_id);
  else
    newrq->rq_ind.rq_status.rq_id[0] = '\0';  /* get stat of all */

  CLEAR_HEAD(newrq->rq_ind.rq_status.rq_attr);

  /* if MOM is down just return stale information */
  addr = job_momaddr;

  node = tfind_addr(addr,job_momport,job_momname);
  free(job_momname);

  if (node == NULL)
    return PBSE_UNKNODE;
  if ((node->nd_state & INUSE_DOWN)||(node->nd_power_state != POWER_STATE_RUNNING))
    {
    if (LOGLEVEL >= 6)
      {
      snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
          "node '%s' is allocated to job but in state 'down'",
          node->nd_name);

      log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_JOB,job_id,log_buf);
      }

    unlock_node(node, __func__, "no rely mom", LOGLEVEL);
    free_br(newrq);

    return PBSE_NORELYMOM;
    }

  /* get connection to MOM */
  unlock_node(node, __func__, "before svr_connect", LOGLEVEL);
  handle = svr_connect(job_momaddr, job_momport, &rc, NULL, NULL);

  if (handle >= 0)
    {
    if ((rc = issue_Drequest(handle, newrq, true)) == PBSE_NONE)
      {
      stat_update(newrq, cntl);
      }
    }
  else
    rc = PBSE_CONNECT;

  if (rc == PBSE_SYSTEM)
    rc = PBSE_MEM_MALLOC;

  free_br(newrq);

  return(rc);
  }  /* END stat_to_mom() */
Example #12
0
int issue_to_svr(

  char                 *servern,                  /* I */
  struct batch_request *preq,                     /* I */
  void (*replyfunc)    (struct work_task *))      /* I */

  {
  int   do_retry = 0;
  int   handle;
  pbs_net_t svraddr;
  char  *svrname;
  unsigned int  port = pbs_server_port_dis;

  struct work_task *pwt;

  strcpy(preq->rq_host, servern);

  preq->rq_fromsvr = 1;
  preq->rq_perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR | ATR_DFLAG_SvWR;

  svrname = parse_servername(servern, &port); 
  svraddr = get_hostaddr(svrname);

  if (svraddr == (pbs_net_t)0)
    {
    if (pbs_errno == PBS_NET_RC_RETRY)
      {
      /* Non fatal error - retry */

      do_retry = 1;
      }
    }
  else
    {
    handle = svr_connect(svraddr, port, process_Dreply, ToServerDIS);

    if (handle >= 0)
      {
      return(issue_Drequest(handle, preq, replyfunc, NULL));
      }
    else if (handle == PBS_NET_RC_RETRY)
      {
      do_retry = 1;
      }
    }

  /* if reached here, it didn`t go, do we retry? */

  if (do_retry)
    {
    pwt = set_task(
            WORK_Timed,
            (long)(time_now + PBS_NET_RETRY_TIME),
            reissue_to_svr,
            (void *)preq);

    pwt->wt_parmfunc = replyfunc;

    return(0);
    }

  /* FAILURE */

  return(-1);
  }  /* END issue_to_svr() */
Example #13
0
int req_gpuctrl_svr(
    
  struct batch_request *preq)

  {
  int rc = PBSE_NONE;
  char  *nodename = NULL;
  char  *gpuid = NULL;
  int    gpumode = -1;
  int    reset_perm = -1;
  int    reset_vol = -1;
  char   log_buf[LOCAL_LOG_BUF_SIZE+1];
  int    local_errno = 0;
  struct pbsnode *pnode = NULL;
  int    gpuidx = -1;
  int    conn;

  if ((preq->rq_perm &
       (ATR_DFLAG_MGWR | ATR_DFLAG_MGRD | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR)) == 0)
    {
    rc = PBSE_PERM;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
        "invalid permissions (ATR_DFLAG_MGWR | ATR_DFLAG_MGRD | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR)");
    req_reject(rc, 0, preq, NULL, log_buf);
    return rc;
    }

  nodename = preq->rq_ind.rq_gpuctrl.rq_momnode;
  gpuid = preq->rq_ind.rq_gpuctrl.rq_gpuid;
  gpumode = preq->rq_ind.rq_gpuctrl.rq_gpumode;
  reset_perm = preq->rq_ind.rq_gpuctrl.rq_reset_perm;
  reset_vol = preq->rq_ind.rq_gpuctrl.rq_reset_vol;

  if (LOGLEVEL >= 7)
    {
    sprintf(
      log_buf,
      "GPU control request for node %s gpuid %s mode %d reset_perm %d reset_vol %d",
      nodename,
      gpuid,
      gpumode,
      reset_perm,
      reset_vol);

    log_ext(-1, __func__, log_buf, LOG_INFO);
    }

  /* validate mom node exists */

  pnode = find_nodebyname(nodename);

  if (pnode == NULL)
    {
    req_reject(PBSE_UNKNODE, 0, preq, NULL, NULL);
    return PBSE_UNKNODE;
    }

  /* validate that the node is up */

  if ((pnode->nd_state & (INUSE_DOWN | INUSE_OFFLINE | INUSE_UNKNOWN))||(pnode->nd_power_state != POWER_STATE_RUNNING))
    {
    rc = PBSE_UNKREQ;
    sprintf(log_buf,"Node %s is not available",pnode->nd_name);
    req_reject(rc, 0, preq, NULL, log_buf);
    unlock_node(pnode, __func__, NULL, LOGLEVEL);
    return rc;
    }

  /* validate that the node has real gpus not virtual */

  if (!pnode->nd_gpus_real)
    {
    rc = PBSE_UNKREQ;
    req_reject(rc, 0, preq, NULL, "Not allowed for virtual gpus");
    unlock_node(pnode, __func__, NULL, LOGLEVEL);
    return rc;
    }

  /* validate the gpuid exists */

  if ((gpuidx = gpu_entry_by_id(pnode, gpuid, FALSE)) == -1)
    {
    rc = PBSE_UNKREQ;
    req_reject(rc, 0, preq, NULL, "GPU ID does not exist on node");
    unlock_node(pnode, __func__, NULL, LOGLEVEL);
    return rc;
    }

  /* validate that we have a real request */

  if ((gpumode == -1) && (reset_perm == -1) && (reset_vol == -1))
    {
    rc = PBSE_UNKREQ;
    req_reject(rc, 0, preq, NULL, "No action specified");
    unlock_node(pnode, __func__, NULL, LOGLEVEL);
    return rc;
    }

  /* for mode changes validate the mode with the driver_version */

  if ((pnode->nd_gpusn[gpuidx].driver_ver == 260) && (gpumode > 2))
    {
    rc = PBSE_UNKREQ;
    req_reject(rc, 0, preq, NULL, "GPU driver version does not support mode 3");
    unlock_node(pnode, __func__, NULL, LOGLEVEL);
    return rc;
    }

  /* we need to relay request to the mom for processing */
  /* have MOM attempt to change the gpu mode */

  preq->rq_orgconn = preq->rq_conn;  /* restore client socket */

  unlock_node(pnode, __func__, NULL, LOGLEVEL);
  conn = svr_connect(
           pnode->nd_addrs[0],
           pbs_mom_port,
           &local_errno,
           NULL,
           NULL);
    

  if (conn >= 0)
    {
    if ((rc = issue_Drequest(conn, preq)) != PBSE_NONE)
      req_reject(rc, 0, preq, NULL, NULL);
    else
      process_gpu_request_reply(preq);
    }
  else
    {
    req_reject(PBSE_UNKREQ, 0, preq, NULL, "Failed to get connection to mom");
    }

  return rc;
  }
Example #14
0
/**
 * @brief
 * 	main - the initialization and main loop of pbs_daemon
 */
int
main(int argc, char *argv[])
{
	char		jobfile[MAXPATHLEN+1];
	char		jobfile_full[MAXPATHLEN+1];
	pbs_net_t	hostaddr = 0;
	int			port = -1;
	int			move_type = -1;
	pbs_list_head	attrl;
	enum conn_type  cntype = ToServerDIS;
	int		con = -1;
	char		*destin;
	int			encode_type;
	int			i;
	job			*jobp;
	char		 job_id[PBS_MAXSVRJOBID+1];
	attribute	*pattr;
	struct attropl  *pqjatr;      /* list (single) of attropl for quejob */
	char		 script_name[MAXPATHLEN+1];
	int			in_server = -1;
	char		*param_name, *param_val;
	char		buf[4096];
	struct  hostent *hp;
	struct in_addr  addr;
	char            *credbuf = NULL;
	size_t		credlen = 0;
	int 		prot = PROT_TCP;
	/*the real deal or output version and exit?*/

	execution_mode(argc, argv);

	/* If we are not run with real and effective uid of 0, forget it */

	pbs_loadconf(0);

	if (!isAdminPrivilege(getlogin())) {
		fprintf(stderr, "%s: Must be run by root\n", argv[0]);
		exit(SEND_JOB_FATAL);
	}

	/* initialize the pointers in the resource_def array */
	for (i = 0; i < (svr_resc_size - 1); ++i)
		svr_resc_def[i].rs_next = &svr_resc_def[i+1];
	/* last entry is left with null pointer */
	/* set single threaded mode */
	pbs_client_thread_set_single_threaded_mode();
	/* disable attribute verification */
	set_no_attribute_verification();

	/* initialize the thread context */
	if (pbs_client_thread_init_thread_context() != 0) {
		fprintf(stderr, "%s: Unable to initialize thread context\n",
			argv[0]);
		exit(SEND_JOB_FATAL);
	}

	if(set_msgdaemonname("PBS_send_job")) {
		fprintf(stderr, "Out of memory\n");
		return 1;
	}

	winsock_init();

	connection_init();

	while (fgets(buf, sizeof(buf), stdin) != NULL) {
		buf[strlen(buf)-1] = '\0';	/* gets rid of newline */

		param_name = buf;
		param_val = strchr(buf, '=');
		if (param_val) {
			*param_val = '\0';
			param_val++;
		} else {	/* bad param_val -- skipping */
			break;
		}

		if (strcmp(param_name, "jobfile") == 0) {
			jobfile[0] = '\0';
			strncpy(jobfile, param_val, MAXPATHLEN);
		} else if (strcmp(param_name, "destaddr") == 0) {
			hostaddr = atol(param_val);
		} else if (strcmp(param_name, "destport") == 0) {
			port = atoi(param_val);
		} else if (strcmp(param_name, "move_type") == 0) {
			move_type = atoi(param_val);
		} else if (strcmp(param_name, "in_server") == 0) {
			in_server = atoi(param_val);
		} else if (strcmp(param_name, "server_name") == 0) {
			server_name[0] = '\0';
			strncpy(server_name, param_val, PBS_MAXSERVERNAME);
		} else if (strcmp(param_name, "server_host") == 0) {
			server_host[0] = '\0';
			strncpy(server_host, param_val, (sizeof(server_host) - 1));
		} else if (strcmp(param_name, "server_addr") == 0) {
			pbs_server_addr = atol(param_val);
		} else if (strcmp(param_name, "server_port") == 0) {
			pbs_server_port_dis = atoi(param_val);
		} else if (strcmp(param_name, "log_file") == 0) {
			log_file = strdup(param_val);
		} else if (strcmp(param_name, "path_log") == 0) {
			path_log[0] = '\0';
			strncpy(path_log, param_val, MAXPATHLEN);
		} else if (strcmp(param_name, "path_jobs") == 0) {
			path_jobs = strdup(param_val);
		} else if (strcmp(param_name, "path_spool") == 0) {
			path_spool = strdup(param_val);
		} else if (strcmp(param_name, "path_rescdef") == 0) {
			path_rescdef = strdup(param_val);
		} else if (strcmp(param_name, "path_users") == 0) {
			path_users = strdup(param_val);
		} else if (strcmp(param_name, "path_hooks_workdir") == 0) {
			path_hooks_workdir = strdup(param_val);
			if (path_hooks_workdir == NULL)
				exit(SEND_JOB_FATAL);
		} else if (strcmp(param_name, "svr_history_enable") == 0) {
			svr_history_enable = atol(param_val);
		} else if (strcmp(param_name, "svr_history_duration") == 0) {
			svr_history_duration = atol(param_val);
		} else if (strcmp(param_name, "single_signon_password_enable") == 0) {
			if (decode_b(&server.sv_attr[(int)SRV_ATR_ssignon_enable],
				NULL, NULL, param_val) != 0) {
				fprintf(stderr, "%s: failed to set ssignon_password_enable\n", argv[0]);
				exit(SEND_JOB_FATAL);
			}
		} else if (strcmp(param_name, "script_name") == 0) {
			strncpy(script_name, param_val, MAXPATHLEN + 1);
		} else
			break;
	}

	time(&time_now);

	(void)log_open_main(log_file, path_log, 1); /* silent open */

	if (setup_resc(1) == -1) {
		/* log_buffer set in setup_resc */
		log_err(-1, "pbsd_send_job(setup_resc)", log_buffer);
		return (-1);
	}

	if( strlen(jobfile) == 0 || hostaddr == 0 || port == 0 ||  move_type == -1 || \
			in_server == -1 || strlen(server_name) == 0 || strlen(server_host) == 0 || \
			pbs_server_addr == 0 || pbs_server_port_dis == 0 || \
			strlen(path_log) == 0 || path_jobs == NULL || \
			path_spool == NULL || path_users == NULL ) {
		log_err(-1, "pbs_send_job", "error on one of the parameters");
		log_close(0);	/* silent close */
		exit(SEND_JOB_FATAL);
	}

	CLEAR_HEAD(task_list_immed);
	CLEAR_HEAD(task_list_timed);
	CLEAR_HEAD(task_list_event);
	CLEAR_HEAD(svr_queues);
	CLEAR_HEAD(svr_alljobs);
	CLEAR_HEAD(svr_newjobs);
	CLEAR_HEAD(svr_allresvs);
	CLEAR_HEAD(svr_newresvs);
	CLEAR_HEAD(svr_deferred_req);
	CLEAR_HEAD(svr_unlicensedjobs);

	strcpy(jobfile_full, path_jobs);
	strcat(jobfile_full, jobfile);

	if (chk_save_file(jobfile_full) != 0) {
		sprintf(log_buffer, "Error opening jobfile=%s", jobfile);
		log_err(-1, __func__, log_buffer);
		goto fatal_exit;
	}

	if ((jobp=job_recov_fs(jobfile, RECOV_SUBJOB)) == NULL) {
		sprintf(log_buffer, "Failed to recreate job in jobfile=%s", jobfile);
		log_err(-1, __func__, log_buffer);
		goto fatal_exit;
	}

	/* now delete the temp job file that was created by job_save_fs in server code
	 * jobs are in database now, no need to keep in filesystem
	 */
	unlink(jobfile_full);

	if (in_server)
		append_link(&svr_alljobs, &jobp->ji_alljobs, jobp);


	/* select attributes/resources to send based on move type */

	if (move_type == MOVE_TYPE_Exec) {
		resc_access_perm = ATR_DFLAG_MOM;
		encode_type = ATR_ENCODE_MOM;
		cntype = ToServerDIS;
	} else {
		resc_access_perm = ATR_DFLAG_USWR | ATR_DFLAG_OPWR |
			ATR_DFLAG_MGWR | ATR_DFLAG_SvRD;
		encode_type = ATR_ENCODE_SVR;
		svr_dequejob(jobp);
	}

	CLEAR_HEAD(attrl);
	pattr = jobp->ji_wattr;
	for (i=0; i < (int)JOB_ATR_LAST; i++) {
		if ((job_attr_def+i)->at_flags & resc_access_perm) {
			(void)(job_attr_def+i)->at_encode(pattr+i, &attrl,
				(job_attr_def+i)->at_name, NULL,
				encode_type, NULL);
		}
	}
	attrl_fixlink(&attrl);

	/* script name is passed from parent */

	/* get host name */
	pbs_loadconf(0);
	addr.s_addr = htonl(hostaddr);
	hp = gethostbyaddr((void *)&addr, sizeof(struct in_addr), AF_INET);
	if (hp == NULL) {
		sprintf(log_buffer, "%s: h_errno=%d",
			inet_ntoa(addr), h_errno);
		log_err(-1, __func__, log_buffer);
	}
	else {
		/* read any credential file */
		(void)get_credential(hp->h_name, jobp,  PBS_GC_BATREQ,
			&credbuf, &credlen);
	}
	/* save the job id for when after we purge the job */

	(void)strcpy(job_id, jobp->ji_qs.ji_jobid);

	con = -1;

	DIS_tcparray_init();

	for (i=0; i<RETRY; i++) {

		pbs_errno = 0;
		/* connect to receiving server with retries */

		if (i > 0) {	/* recycle after an error */
			if (con >= 0)
				svr_disconnect(con);
			if (should_retry_route(pbs_errno) == -1) {
				goto fatal_exit;	/* fatal error, don't retry */
			}
			sleep(1<<i);
		}
		if ((con = svr_connect(hostaddr, port, 0, cntype, prot)) ==
			PBS_NET_RC_FATAL) {
			(void)sprintf(log_buffer, "send_job failed to %lx port %d",
				hostaddr, port);
			log_err(pbs_errno, __func__, log_buffer);
			goto fatal_exit;
		} else if (con == PBS_NET_RC_RETRY) {
			pbs_errno = WSAECONNREFUSED;	/* should retry */
			continue;
		}
		/*
		 * if the job is substate JOB_SUBSTATE_TRNOUTCM which means
		 * we are recovering after being down or a late failure, we
		 * just want to send the "read-to-commit/commit"
		 */


		if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUTCM) {

			if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUT) {
				jobp->ji_qs.ji_substate = JOB_SUBSTATE_TRNOUT;
			}

			pqjatr = &((svrattrl *)GET_NEXT(attrl))->al_atopl;
			destin = jobp->ji_qs.ji_destin;

			if (PBSD_queuejob(con, jobp->ji_qs.ji_jobid, destin, pqjatr, NULL, prot, NULL)== 0) {
				if (pbs_errno == PBSE_JOBEXIST && move_type == MOVE_TYPE_Exec) {
					/* already running, mark it so */
					log_event(PBSEVENT_ERROR,
						PBS_EVENTCLASS_JOB, LOG_INFO,
						jobp->ji_qs.ji_jobid, "Mom reports job already running");
					goto ok_exit;

				} else if ((pbs_errno == PBSE_HOOKERROR) ||
					(pbs_errno == PBSE_HOOK_REJECT)  ||
					(pbs_errno == PBSE_HOOK_REJECT_RERUNJOB)  ||
					(pbs_errno == PBSE_HOOK_REJECT_DELETEJOB)) {
					char		name_buf[MAXPATHLEN+1];
					int		rfd;
					int		len;
					char		*reject_msg;
					int		err;

					err = pbs_errno;

					reject_msg = pbs_geterrmsg(con);
					(void)snprintf(log_buffer, sizeof(log_buffer),
						"send of job to %s failed error = %d reject_msg=%s",
						destin, err,
						reject_msg?reject_msg:"");
					log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
						LOG_INFO, jobp->ji_qs.ji_jobid,
						log_buffer);

					(void)strcpy(name_buf, path_hooks_workdir);
					(void)strcat(name_buf, jobp->ji_qs.ji_jobid);
					(void)strcat(name_buf, HOOK_REJECT_SUFFIX);

					if ((reject_msg != NULL) &&
						(reject_msg[0] != '\0')) {

						if ((rfd = open(name_buf,
							O_RDWR|O_CREAT|O_TRUNC, 0600)) == -1) {
							snprintf(log_buffer,
								sizeof(log_buffer),
								"open of reject file %s failed: errno %d",
								name_buf, errno);
							log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, jobp->ji_qs.ji_jobid, log_buffer);
						} else {
							secure_file(name_buf, "Administrators",
								READS_MASK|WRITES_MASK|STANDARD_RIGHTS_REQUIRED);
							setmode(rfd, O_BINARY);
							len = strlen(reject_msg)+1;
							/* write also trailing null char */
							if (write(rfd, reject_msg, len) != len) {
								snprintf(log_buffer,
									sizeof(log_buffer),
									"write to file %s incomplete: errno %d", name_buf, errno);
								log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, jobp->ji_qs.ji_jobid, log_buffer);
							}
							close(rfd);
						}
					}

					if (err == PBSE_HOOKERROR)
						exit(SEND_JOB_HOOKERR);
					if (err == PBSE_HOOK_REJECT)
						exit(SEND_JOB_HOOK_REJECT);
					if (err == PBSE_HOOK_REJECT_RERUNJOB)
						exit(SEND_JOB_HOOK_REJECT_RERUNJOB);
					if (err == PBSE_HOOK_REJECT_DELETEJOB)
						exit(SEND_JOB_HOOK_REJECT_DELETEJOB);
				} else {
					(void)sprintf(log_buffer, "send of job to %s failed error = %d", destin, pbs_errno);
					log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, jobp->ji_qs.ji_jobid, log_buffer);
					continue;
				}
			}

			if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT) {
				if (PBSD_jscript(con, script_name, prot, NULL) != 0)
					continue;
			}

			if (credlen > 0) {
				int     ret;

				ret = PBSD_jcred(con,
					jobp->ji_extended.ji_ext.ji_credtype,
					credbuf, credlen, prot, NULL);
				if ((ret == 0) || (i == (RETRY - 1)))
					free(credbuf);	/* free credbuf if credbuf is sent successfully OR */
				/* at the end of all retry attempts */
				if (ret != 0)
					continue;
			}

			if ((move_type == MOVE_TYPE_Exec) &&
				(jobp->ji_qs.ji_svrflags & JOB_SVFLG_HASRUN) &&
				(hostaddr !=  pbs_server_addr)) {
				/* send files created on prior run */
				if ((move_job_file(con, jobp, StdOut, prot) != 0) ||
					(move_job_file(con, jobp, StdErr, prot) != 0) ||
					(move_job_file(con, jobp, Chkpt, prot) != 0))
					continue;
			}

			jobp->ji_qs.ji_substate = JOB_SUBSTATE_TRNOUTCM;
		}

		if (PBSD_rdytocmt(con, job_id, prot, NULL) != 0)
			continue;

		if (PBSD_commit(con, job_id, prot, NULL) != 0)
			goto fatal_exit;
		goto ok_exit;	/* This child process is all done */
	}
	if (con >= 0)
		svr_disconnect(con);
	/*
	 * If connection is actively refused by the execution node(or mother superior) OR
	 * the execution node(or mother superior) is rejecting request with error
	 * PBSE_BADHOST(failing to authorize server host), the node should be marked down.
	 */
	if ((move_type == MOVE_TYPE_Exec) && (pbs_errno == WSAECONNREFUSED || pbs_errno == PBSE_BADHOST)) {
		i = SEND_JOB_NODEDW;
	} else if (should_retry_route(pbs_errno) == -1) {
		i = SEND_JOB_FATAL;
	} else {
		i = SEND_JOB_RETRY;
	}
	(void)sprintf(log_buffer, "send_job failed with error %d", pbs_errno);
	log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_NOTICE,
		jobp->ji_qs.ji_jobid, log_buffer);
	log_close(0);
	net_close(-1);
	unlink(script_name);
	exit(i);

fatal_exit:
	if (con >= 0)
		svr_disconnect(con);
	log_close(0);
	net_close(-1);
	unlink(script_name);
	exit(SEND_JOB_FATAL);

ok_exit:
	if (con >= 0)
		svr_disconnect(con);
	log_close(0);
	net_close(-1);
	unlink(script_name);
	exit(SEND_JOB_OK);
}
Example #15
0
/**
 * @brief
 * 		Send execution job on connected rpp stream.
 *
 * @param[in]	jobp	-	pointer to the job being sent
 * @param[in]	hostaddr	-	the address of host to send job to, host byte order
 * @param[in]	port	-	the destination port, host byte order
 * @param[in]	request	-	The batch request associated with this send job call
 *
 * @return	int
 * @retval  2	: success
 * @retval  -1	: failure (pbs_errno set to error number)
 *
 */
int
send_job_exec(job *jobp, pbs_net_t hostaddr, int port, struct batch_request *request)
{
	pbs_list_head attrl;
	attribute *pattr;
	mominfo_t *pmom = NULL;
	int stream = -1;
	int encode_type;
	char *destin = jobp->ji_qs.ji_destin;
	int i;
	size_t		 credlen = 0;
	char		*credbuf = NULL;
	char job_id[PBS_MAXSVRJOBID + 1];
	struct attropl *pqjatr; /* list (single) of attropl for quejob */
	int rc;
	int rpp = 1;
	char *jobid = NULL;
	char *script = NULL;
	char *msgid = NULL;
	char *dup_msgid = NULL;
	struct work_task *ptask = NULL;

	/* if job has a script read it from database */
	if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT) {
		/*
		 * copy the job script from database to a temp file
		 * PBSD_jscript works with a file
		 * delete it at the end of the send
		 */
		if ((script = svr_load_jobscript(jobp)) == NULL) {
			pbs_errno = PBSE_SYSTEM;
			snprintf(log_buffer, sizeof(log_buffer),
				"Failed to load job script for job %s",
				jobp->ji_qs.ji_jobid);
			log_err(pbs_errno, "send_job", log_buffer);
			goto send_err;
		}
	}

	stream = svr_connect(hostaddr, port, NULL, ToServerDIS, rpp);
	if (stream < 0) {
		log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_REQUEST, LOG_WARNING, "", "Could not connect to Mom");
		goto send_err;
	}

	pmom = tfind2((unsigned long) jobp->ji_qs.ji_un.ji_exect.ji_momaddr,
		jobp->ji_qs.ji_un.ji_exect.ji_momport,
		&ipaddrs);
	if (!pmom || (((mom_svrinfo_t *)(pmom->mi_data))->msr_state & INUSE_DOWN))
		goto send_err;

	CLEAR_HEAD(attrl);

	resc_access_perm = ATR_DFLAG_MOM;
	encode_type = ATR_ENCODE_MOM;

	pattr = jobp->ji_wattr;
	for (i = 0; i < (int) JOB_ATR_LAST; i++) {
		if ((job_attr_def + i)->at_flags & resc_access_perm) {
			(void)(job_attr_def + i)->at_encode(pattr + i, &attrl,
				(job_attr_def + i)->at_name, (char *) 0, encode_type,
				NULL);
		}
	}
	attrl_fixlink(&attrl);
	/* save the job id for when after we purge the job */

	/* read any credential file */
	(void)get_credential(pmom->mi_host, jobp, PBS_GC_BATREQ, &credbuf, &credlen);

	(void) strcpy(job_id, jobp->ji_qs.ji_jobid);

	pbs_errno = 0;

	pqjatr = &((svrattrl *) GET_NEXT(attrl))->al_atopl;
	jobid = PBSD_queuejob(stream, jobp->ji_qs.ji_jobid, destin, pqjatr, (char *) 0, rpp, &msgid);
	free_attrlist(&attrl);
	if (jobid == NULL)
		goto send_err;

	rpp_add_close_func(stream, process_DreplyRPP); /* register a close handler */

	/* adding msgid to deferred list, dont free msgid */
	if ((ptask = add_mom_deferred_list(stream, pmom, post_sendmom, msgid, request, jobp)) == NULL)
		goto send_err;

	/* add to pjob->svrtask list so its automatically cleared when job is purged */
	append_link(&jobp->ji_svrtask, &ptask->wt_linkobj, ptask);

	/* we cannot use the same msgid, since it is not part of the preq,
	 * make a dup of it, and we can freely free it
	 */
	if ((dup_msgid = strdup(msgid)) == NULL)
		goto send_err;

	/*
	 * henceforth use the same msgid, since we mean to say all this is
	 * part of a single logical request to the mom
	 * and we will be hanging off one request to be answered to finally
	 */
	if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT) {
		if (PBSD_jscript_direct(stream, script, rpp, &dup_msgid) != 0)
			goto send_err;
	}
	free(script);
	script = NULL;

	if (credlen > 0) {
		rc = PBSD_jcred(stream, jobp->ji_extended.ji_ext.ji_credtype, credbuf, credlen, rpp, &dup_msgid);
		if (credbuf)
			free(credbuf);
		if (rc != 0)
			goto send_err;
	}

	if ((jobp->ji_qs.ji_svrflags & JOB_SVFLG_HASRUN)
		&& (hostaddr != pbs_server_addr)) {
		if ((move_job_file(stream, jobp, StdOut, rpp, &dup_msgid) != 0) ||
			(move_job_file(stream, jobp, StdErr, rpp, &dup_msgid) != 0) ||
			(move_job_file(stream, jobp, Chkpt, rpp, &dup_msgid) != 0))
			goto send_err;
	}

	if (PBSD_commit(stream, job_id, rpp, &dup_msgid) != 0)
		goto send_err;

	free(dup_msgid); /* free this as it is not part of any work task */

	return 2;

send_err:
	if (dup_msgid)
		free(dup_msgid);

	if (script)
		free(script);

	if (ptask) {
		if (ptask->wt_event2)
			free(ptask->wt_event2);
		delete_task(ptask);
	}

	sprintf(log_buffer, "send of job to %s failed error = %d", destin, pbs_errno);
	log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, jobp->ji_qs.ji_jobid, log_buffer);
	return (-1);
}
Example #16
0
int set_node_power_state(
    
  struct pbsnode **ppNode,
  unsigned short   newState)

  {
  struct pbsnode *pNode = *ppNode;
  if (pNode->nd_addrs == NULL)
    {
    return PBSE_BAD_PARAMETER;
    }

  if (newState == POWER_STATE_RUNNING)
    {
    static std::string interface;
    static unsigned char mac_addr[6];
    if (interface.length() == 0)
      {
      if (!getMacAddr(interface,mac_addr))
        {
        return PBSE_SYSTEM;
        }
      }

    int sock;
    if ((sock = socket(AF_INET,SOCK_PACKET,SOCK_PACKET)) < 0)
      {
      return PBSE_SYSTEM;
      }

    unsigned char outpack[1000];

    memcpy(outpack+6,mac_addr,6);
    memcpy(outpack,pNode->nd_mac_addr,6);
    outpack[12] = 0x08;
    outpack[13] = 0x42;
    int offset = 14;
    memset(outpack + offset,0xff,6);
    offset += 6;

    for (int i = 0;i < 16;i++)
      {
      memcpy(outpack + offset,pNode->nd_mac_addr,6);
      offset += 6;
      }

    int one = 1;
    if (setsockopt(sock, SOL_SOCKET, SO_BROADCAST, (char *)&one, sizeof(one)) < 0)
      {
      close(sock);
      return PBSE_SYSTEM;
      }

    struct sockaddr whereto;
    whereto.sa_family = 0;
    snprintf(whereto.sa_data, sizeof(whereto.sa_data), "%s", interface.c_str());

    if (sendto(sock, outpack, offset, 0, &whereto, sizeof(whereto)) < 0)
      {
      close(sock);
      return PBSE_SYSTEM;
      }

    close(sock);
    return PBSE_NONE;
    }

  if (pNode->nd_job_usages.size() != 0)
    {
    //Can't change the power state on a node with running jobs.
    return PBSE_CANT_CHANGE_POWER_STATE_WITH_JOBS_RUNNING;
    }
  struct batch_request *request = alloc_br(PBS_BATCH_ChangePowerState);
  if (request == NULL)
    {
    return PBSE_SYSTEM;
    }

  request->rq_ind.rq_powerstate = newState;
  pNode->nd_power_state_change_time = time(NULL);

  snprintf(request->rq_host, sizeof(request->rq_host), "%s", pNode->nd_name);
  std::string hostname(request->rq_host);
  int rc = PBSE_NONE;

  {
    int handle = 0;
    int local_errno = 0;
    handle = svr_connect(pNode->nd_addrs[0],pNode->nd_mom_port,&local_errno,pNode,NULL);
    if(handle < 0)
      {
      unlock_node(pNode, __func__, "Error connecting", LOGLEVEL);
      *ppNode = NULL;
      return local_errno;
      }
    unlock_node(pNode, __func__, "Done connecting", LOGLEVEL);
    *ppNode = NULL;
    rc = issue_Drequest(handle, request,true);
    if(rc == PBSE_NONE)
      {
      rc = request->rq_reply.brp_code;
      if(rc < 0) rc = -rc;
      }
  }
  pNode = find_nodebyname(hostname.c_str());
  *ppNode = pNode;
  if ((rc == PBSE_NONE)&&(pNode != NULL))
    {
    pNode->nd_power_state = newState;
    }

  return(rc);
  }
Example #17
0
/**
 *
 * @brief
 * 		Send a job over the network to some other server or MOM.
 * @par
 * 		Under Linux/Unix, this starts a child process to do the work.
 *		Connect to the destination host and port,
 * 		and go through the protocol to transfer the job.
 * 		Signals are blocked.
 *
 * @param[in]	jobp	-	pointer to the job being sent.
 * @param[in]	hostaddr	-	the address of host to send job to, host byte order.
 * @param[in]	port	-	the destination port, host byte order
 * @param[in]	move_type	-	the type of move (e.g. MOVE_TYPE_exec)
 * @param[in]	post_func	-	the function to execute once the child process
 *								sending job completes (Linux/Unix only)
 * @param[in]	data	-	input data to 'post_func'
 *
 * @return	int
 * @retval	2	parent	: success (child forked)
 * @retval	-1	parent	: on failure (pbs_errno set to error number)
 * @retval	SEND_JOB_OK	child	: 0 success, job sent
 * @retval	SEND_JOB_FATAL	child	: 1 permenent failure or rejection,
 * @retval	SEND_JOB_RETRY	child	: 2 failed but try again
 * @retval	SEND_JOB_NODEDW child	: 3 execution node down, retry different node
 */
int
send_job(job *jobp, pbs_net_t hostaddr, int port, int move_type,
	void (*post_func)(struct work_task *), struct batch_request *preq)
{

#ifdef WIN32
	char	cmdline[80];
	pio_handles	pio;
	char	buf[4096];
	struct work_task *ptask;
	int	newstate;
	int	newsub;
	long	tempval;
	char	script_name[MAXPATHLEN+1];
	int 		gridproxy_cred = 0;

#ifdef  PBS_CRED_GRIDPROXY
	if (jobp->ji_extended.ji_ext.ji_credtype == PBS_CREDTYPE_GRIDPROXY)
		gridproxy_cred = 1;
#endif

	if (pbs_conf.pbs_use_tcp == 1 && move_type == MOVE_TYPE_Exec && gridproxy_cred == 0) {
		return (send_job_exec(jobp, hostaddr, port, preq));
	}

	sprintf(cmdline, "%s/sbin/pbs_send_job", pbs_conf.pbs_exec_path);

	if (win_popen(cmdline, "w", &pio, NULL) == 0) {
		errno = GetLastError();
		pbs_errno = errno;
		(void)sprintf(log_buffer, "executing %s for job %s failed errno=%d", cmdline, jobp->ji_qs.ji_jobid, errno);
		log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_ERR,
			jobp->ji_qs.ji_jobid, log_buffer);
		/* force re-eval of job state out of Transit */
		svr_evaljobstate(jobp, &newstate, &newsub, 1);
		svr_setjobstate(jobp, newstate, newsub);

		win_pclose(&pio);
		return (-1);
	}

	ptask = set_task(WORK_Deferred_Child, (long)pio.pi.hProcess, post_func, preq);
	if (!ptask) {
		log_err(errno, __func__, msg_err_malloc);
		errno = ENOMEM;
		pbs_errno = errno;
		win_pclose(&pio);
		/* force re-eval of job state out of Transit */
		svr_evaljobstate(jobp, &newstate, &newsub, 1);
		svr_setjobstate(jobp, newstate, newsub);
		return (-1);
	} else {
		ptask->wt_parm2 = jobp;
		append_link(&((job *)jobp)->ji_svrtask, &ptask->wt_linkobj, ptask);
	}

	script_name[0] = '\0';
	/* if job has a script read it from database */
	if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT) {
		/*
		 * copy the job script from database to a temp file
		 * PBSD_jscript works with a file
		 * delete it at the end of the send
		 */
		if (svr_create_tmp_jobscript(jobp, &script_name) != 0) {
			pbs_errno = PBSE_SYSTEM;
			snprintf(log_buffer, sizeof(log_buffer),
				"Failed to create temporary job script for job %s",
				jobp->ji_qs.ji_jobid);
			log_err(pbs_errno, "send_job", log_buffer);
			win_pclose2(&pio);
			return (-1);
		}
	}

	addpid(pio.pi.hProcess);

	/* our job is to calc eligible time accurately and save it */
	/* on new server, accrue type should be calc afresh */
	/* Note: if job is being sent for execution on mom, then don't calc eligible time */

	if ((jobp->ji_wattr[(int)JOB_ATR_accrue_type].at_val.at_long == JOB_ELIGIBLE) &&
		(server.sv_attr[(int)SRV_ATR_EligibleTimeEnable].at_val.at_long == 1) &&
		(move_type != MOVE_TYPE_Exec)) {
		tempval = ((long)time_now - jobp->ji_wattr[(int)JOB_ATR_sample_starttime].at_val.at_long);
		jobp->ji_wattr[(int)JOB_ATR_eligible_time].at_val.at_long += tempval;
		jobp->ji_wattr[(int)JOB_ATR_eligible_time].at_flags |= ATR_VFLAG_MODCACHE;
	}

	/* in windows code, a child process "w32_send_job" handles the send
	 * This needs the job information, so we save using the filesystem
	 * This avoids the child process from having to "connect" to the database again
	 * The file is deleted by the send_job child process when it has done recovering the job
	 */
	job_save_fs(jobp, SAVEJOB_FULLFORCE);	/* so the spawned process can get a fresh copy of job */

	if (*jobp->ji_qs.ji_fileprefix != '\0')
		sprintf(buf, "jobfile=%s%s\n", jobp->ji_qs.ji_fileprefix, JOB_FILE_SUFFIX);
	else
		sprintf(buf, "jobfile=%s%s\n", jobp->ji_qs.ji_jobid, JOB_FILE_SUFFIX);

	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "destaddr=%ld\n", hostaddr);
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "destport=%d\n", port);
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "move_type=%d\n", move_type);
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "in_server=%d\n", is_linked(&svr_alljobs, &jobp->ji_alljobs));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "server_name=%s\n", (server_name?server_name:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "server_host=%s\n", (server_host?server_host:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "server_addr=%ld\n", pbs_server_addr);
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "server_port=%d\n", pbs_server_port_dis);
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "log_file=%s\n", (log_file?log_file:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "path_log=%s\n", (path_log?path_log:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "path_jobs=%s\n", (path_jobs?path_jobs:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "path_spool=%s\n", (path_spool?path_spool:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "path_rescdef=%s\n", (path_rescdef?path_rescdef:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "path_users=%s\n", (path_users?path_users:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "path_hooks_workdir=%s\n",
		(path_hooks_workdir?path_hooks_workdir:""));
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "svr_history_enable=%ld\n", svr_history_enable);
	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "svr_history_duration=%ld\n", svr_history_duration);
	win_pwrite(&pio, buf, strlen(buf));

	if ( (server.sv_attr[SRV_ATR_ssignon_enable].at_flags & \
                                                ATR_VFLAG_SET) && \
             (server.sv_attr[SRV_ATR_ssignon_enable].at_val.at_long == 1) )
		strcpy(buf, "single_signon_password_enable=1\n");
	else
		strcpy(buf, "single_signon_password_enable=0\n");

	win_pwrite(&pio, buf, strlen(buf));

	sprintf(buf, "script_name=%s\n", script_name);
	win_pwrite(&pio, buf, strlen(buf));

	strcpy(buf, "quit\n");
	win_pwrite(&pio, buf, strlen(buf));
	win_pclose2(&pio);	/* closes all handles except the process handle */
	return (2);
#else
	pbs_list_head	 attrl;
	enum conn_type   cntype = ToServerDIS;
	int		 con;
	char		*credbuf = NULL;
	size_t		 credlen = 0;
	char		*destin = jobp->ji_qs.ji_destin;
	int		 encode_type;
	int		 i;
	char		 job_id[PBS_MAXSVRJOBID+1];
	attribute	*pattr;
	pid_t		 pid;
	struct attropl  *pqjatr;      /* list (single) of attropl for quejob */
	char		 script_name[MAXPATHLEN+1];
	struct work_task *ptask;
	struct  hostent *hp;
	struct in_addr   addr;
	long		 tempval;
	int 		gridproxy_cred = 0;
	int 		rpp = 0;

#ifdef  PBS_CRED_GRIDPROXY
	if (jobp->ji_extended.ji_ext.ji_credtype == PBS_CREDTYPE_GRIDPROXY)
		gridproxy_cred = 1;
#endif

	if (pbs_conf.pbs_use_tcp == 1 && move_type == MOVE_TYPE_Exec && gridproxy_cred == 0) {
		return (send_job_exec(jobp, hostaddr, port, preq));
	}

	script_name[0] = '\0';
	/* if job has a script read it from database */
	if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT) {
		/*
		 * copy the job script from database to a temp file
		 * PBSD_jscript works with a file
		 * delete it at the end of the send
		 */
		if (svr_create_tmp_jobscript(jobp, script_name) != 0) {
			pbs_errno = PBSE_SYSTEM;
			snprintf(log_buffer, sizeof(log_buffer),
				"Failed to create temporary job script for job %s",
				jobp->ji_qs.ji_jobid);
			log_err(pbs_errno, "send_job", log_buffer);
			return -1;
		}
	}

	pid = fork();
	if (pid == -1) {	/* Error on fork */
		log_err(errno, __func__, "fork failed\n");
		pbs_errno = PBSE_SYSTEM;
		return -1;
	}

	if (pid != 0) {		/* The parent (main server) */

		ptask = set_task(WORK_Deferred_Child, pid, post_func, preq);
		if (!ptask) {
			log_err(errno, __func__, msg_err_malloc);
			return (-1);
		} else {
			ptask->wt_parm2 = jobp;
			append_link(&((job *)jobp)->ji_svrtask,
				&ptask->wt_linkobj, ptask);
		}
		return 2;
	}

	/*
	 * the child process
	 *
	 * set up signal cather for error return
	 */
	DBPRT(("%s: child started, sending to port %d\n", __func__, port))
	rpp_terminate();

	/* Unprotect child from being killed by kernel */
	daemon_protect(0, PBS_DAEMON_PROTECT_OFF);

#ifdef WIN32
	/* get host name */
	/*
	 * If host address is loopback address then do not resolve with dns
	 * Use "localhost" as the host name.
	 */
	if ((htonl(hostaddr) == loopback_addr->sin_addr.s_addr)) {
		(void)get_credential(LOCALHOST_SHORTNAME, jobp, PBS_GC_BATREQ,
			&credbuf, &credlen);
	} else {
#endif
		addr.s_addr = htonl(hostaddr);
		hp = gethostbyaddr((void *)&addr, sizeof(struct in_addr), AF_INET);
		if (hp == NULL) {
			sprintf(log_buffer, "%s: h_errno=%d",
				inet_ntoa(addr), h_errno);
			log_err(-1, __func__, log_buffer);
		} else {
			/* read any credential file */
			(void)get_credential(hp->h_name, jobp, PBS_GC_BATREQ,
				&credbuf, &credlen);
		}
#ifdef WIN32
	}
#endif

	/* encode job attributes to be moved */

	CLEAR_HEAD(attrl);

	/* select attributes/resources to send based on move type */

	if (move_type == MOVE_TYPE_Exec) {
		resc_access_perm = ATR_DFLAG_MOM;
		encode_type = ATR_ENCODE_MOM;
		cntype = ToServerDIS;
	} else {
		resc_access_perm = ATR_DFLAG_USWR | ATR_DFLAG_OPWR |
			ATR_DFLAG_MGWR | ATR_DFLAG_SvRD;
		encode_type = ATR_ENCODE_SVR;
		svr_dequejob(jobp);	/* clears default resource settings */
	}

	/* our job is to calc eligible time accurately and save it */
	/* on new server, accrue type should be calc afresh */
	/* Note: if job is being sent for execution on mom, then don't calc eligible time */

	if ((jobp->ji_wattr[(int)JOB_ATR_accrue_type].at_val.at_long == JOB_ELIGIBLE) &&
		(server.sv_attr[(int)SRV_ATR_EligibleTimeEnable].at_val.at_long == 1) &&
		(move_type != MOVE_TYPE_Exec)) {
		tempval = ((long)time_now - jobp->ji_wattr[(int)JOB_ATR_sample_starttime].at_val.at_long);
		jobp->ji_wattr[(int)JOB_ATR_eligible_time].at_val.at_long += tempval;
		jobp->ji_wattr[(int)JOB_ATR_eligible_time].at_flags |= ATR_VFLAG_MODCACHE;
	}

	pattr = jobp->ji_wattr;
	for (i=0; i < (int)JOB_ATR_LAST; i++) {
		if ((job_attr_def+i)->at_flags & resc_access_perm) {
			(void)(job_attr_def+i)->at_encode(pattr+i, &attrl,
				(job_attr_def+i)->at_name, (char *)0,
				encode_type, NULL);
		}
	}
	attrl_fixlink(&attrl);


	/* save the job id for when after we purge the job */

	(void)strcpy(job_id, jobp->ji_qs.ji_jobid);

	pbs_errno = 0;
	con = -1;

	for (i=0; i<RETRY; i++) {

		/* connect to receiving server with retries */

		if (i > 0) {	/* recycle after an error */
			if (con >= 0)
				svr_disconnect(con);
			if (should_retry_route(pbs_errno) == -1) {
				/* delete the temp script file */
				unlink(script_name);
				exit(SEND_JOB_FATAL);	/* fatal error, don't retry */
			}
			sleep(1<<i);
		}
		if ((con = svr_connect(hostaddr, port, 0, cntype, rpp)) ==
			PBS_NET_RC_FATAL) {
			(void)sprintf(log_buffer, "send_job failed to %lx port %d",
				hostaddr, port);
			log_err(pbs_errno, __func__, log_buffer);

			/* delete the temp script file */
			unlink(script_name);

			if ((move_type == MOVE_TYPE_Exec) && (pbs_errno == PBSE_BADCRED))
				exit(SEND_JOB_NODEDW);

			exit(SEND_JOB_FATAL);
		} else if (con == PBS_NET_RC_RETRY) {
			pbs_errno = ECONNREFUSED;	/* should retry */
			continue;
		}

		/*
		 * if the job is substate JOB_SUBSTATE_TRNOUTCM which means
		 * we are recovering after being down or a late failure, we
		 * just want to send the commit"
		 */

		if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUTCM) {

			if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUT) {
				jobp->ji_qs.ji_substate = JOB_SUBSTATE_TRNOUT;
			}

			pqjatr = &((svrattrl *)GET_NEXT(attrl))->al_atopl;
			if (PBSD_queuejob(con, jobp->ji_qs.ji_jobid, destin,
				pqjatr, (char *)0, rpp, NULL) == 0) {
				if (pbs_errno == PBSE_JOBEXIST &&
					move_type == MOVE_TYPE_Exec) {
					/* already running, mark it so */
					log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_JOB,
						LOG_INFO, jobp->ji_qs.ji_jobid,
						"Mom reports job already running");
					exit(SEND_JOB_OK);
				}
				else if ((pbs_errno == PBSE_HOOKERROR) ||
					(pbs_errno == PBSE_HOOK_REJECT)  ||
					(pbs_errno == PBSE_HOOK_REJECT_RERUNJOB)  ||
					(pbs_errno == PBSE_HOOK_REJECT_DELETEJOB)) {
					char		name_buf[MAXPATHLEN+1];
					int		rfd;
					int		len;
					char		*reject_msg;
					int		err;

					err = pbs_errno;

					reject_msg = pbs_geterrmsg(con);
					(void)sprintf(log_buffer,
						"send of job to %s failed error = %d reject_msg=%s",
						destin, err,
						reject_msg?reject_msg:"");
					log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
						LOG_INFO, jobp->ji_qs.ji_jobid,
						log_buffer);

					(void)strcpy(name_buf, path_hooks_workdir);
					(void)strcat(name_buf, jobp->ji_qs.ji_jobid);
					(void)strcat(name_buf, HOOK_REJECT_SUFFIX);

					if ((reject_msg != NULL) &&
						(reject_msg[0] != '\0')) {

						if ((rfd = open(name_buf,
							O_RDWR|O_CREAT|O_TRUNC, 0600)) == -1) {
							sprintf(log_buffer,
								"open of reject file %s failed: errno %d",
								name_buf, errno);
							log_event(PBSEVENT_JOB,
								PBS_EVENTCLASS_JOB,
								LOG_INFO, jobp->ji_qs.ji_jobid,
								log_buffer);
						} else {
#ifdef WIN32
							secure_file(name_buf, "Administrators",
								READS_MASK|WRITES_MASK|STANDARD_RIGHTS_REQUIRED);
							setmode(rfd, O_BINARY);
#endif
							len = strlen(reject_msg)+1;
							/* write also trailing null char */
							if (write(rfd, reject_msg, len) != len) {
								sprintf(log_buffer,
									"write to file %s incomplete: errno %d", name_buf, errno);
								log_event(PBSEVENT_JOB,
									PBS_EVENTCLASS_JOB,
									LOG_INFO, jobp->ji_qs.ji_jobid,
									log_buffer);
							}
							close(rfd);
						}
					}

					if (err == PBSE_HOOKERROR)
						exit(SEND_JOB_HOOKERR);
					if (err == PBSE_HOOK_REJECT)
						exit(SEND_JOB_HOOK_REJECT);
					if (err == PBSE_HOOK_REJECT_RERUNJOB)
						exit(SEND_JOB_HOOK_REJECT_RERUNJOB);
					if (err == PBSE_HOOK_REJECT_DELETEJOB)
						exit(SEND_JOB_HOOK_REJECT_DELETEJOB);
				}
				else {
					(void)sprintf(log_buffer,
						"send of job to %s failed error = %d",
						destin, pbs_errno);
					log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
						LOG_INFO, jobp->ji_qs.ji_jobid,
						log_buffer);
					continue;
				}
			}

			if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT) {
				if (PBSD_jscript(con, script_name, rpp, NULL) != 0)
					continue;
			}

			if (credlen > 0) {
				int	ret;

				ret = PBSD_jcred(con,
					jobp->ji_extended.ji_ext.ji_credtype,
					credbuf, credlen, rpp, NULL);
				if ((ret == 0) || (i == (RETRY - 1)))
					free(credbuf);	/* free credbuf if cred info is sent successfully OR */
				/* at the end of all retry attempts */
				if (ret != 0)
					continue;
			}

			if ((move_type == MOVE_TYPE_Exec) &&
				(jobp->ji_qs.ji_svrflags & JOB_SVFLG_HASRUN) &&
				(hostaddr !=  pbs_server_addr)) {
				/* send files created on prior run */
				if ((move_job_file(con, jobp, StdOut, rpp, NULL) != 0) ||
					(move_job_file(con, jobp, StdErr, rpp, NULL) != 0) ||
					(move_job_file(con, jobp, Chkpt, rpp, NULL) != 0))
					continue;
			}

			jobp->ji_qs.ji_substate = JOB_SUBSTATE_TRNOUTCM;
		}

		if (PBSD_rdytocmt(con, job_id, rpp, NULL) != 0)
			continue;

		if (PBSD_commit(con, job_id, rpp, NULL) != 0) {
			/* delete the temp script file */
			unlink(script_name);
			exit(SEND_JOB_FATAL);
		}
		svr_disconnect(con);

		/* delete the temp script file */
		unlink(script_name);

		exit(SEND_JOB_OK);	/* This child process is all done */
	}
	if (con >= 0)
		svr_disconnect(con);
	/*
	 * If connection is actively refused by the execution node(or mother superior) OR
	 * the execution node(or mother superior) is rejecting request with error
	 * PBSE_BADHOST(failing to authorize server host), the node should be marked down.
	 */
	if ((move_type == MOVE_TYPE_Exec) && (pbs_errno == ECONNREFUSED  || pbs_errno == PBSE_BADHOST)) {
		i = SEND_JOB_NODEDW;
	} else if (should_retry_route(pbs_errno) == -1) {
		i = SEND_JOB_FATAL;
	} else {
		i = SEND_JOB_RETRY;
	}
	(void)sprintf(log_buffer, "send_job failed with error %d", pbs_errno);
	log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_NOTICE,
		jobp->ji_qs.ji_jobid, log_buffer);

	/* delete the temp script file */
	unlink(script_name);

	exit(i);
	return -1;		/* NOT REACHED */

#endif /* !WIN32 */
}
Example #18
0
int stat_to_mom(

  char             *job_id,
  struct stat_cntl *cntl)  /* M */

  {
  struct batch_request *newrq;
  int                   rc = PBSE_NONE;
  unsigned long         addr;
  char                  log_buf[LOCAL_LOG_BUF_SIZE+1];
  struct pbsnode       *node;
  int handle = -1;
  unsigned long job_momaddr = -1;
  unsigned short job_momport = -1;
  char *job_momname = NULL;
  job *pjob = NULL;

  if ((pjob = svr_find_job(job_id, FALSE)) == NULL)
    return PBSE_JOBNOTFOUND;

  job_momaddr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr;
  job_momport = pjob->ji_qs.ji_un.ji_exect.ji_momport;
  job_momname = strdup(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str);
  unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

  if (job_momname == NULL)
    return PBSE_MEM_MALLOC;

  if ((newrq = alloc_br(PBS_BATCH_StatusJob)) == NULL)
    {
    free(job_momname);
    return PBSE_MEM_MALLOC;
    }

  if (cntl->sc_type == 1)
    strcpy(newrq->rq_ind.rq_status.rq_id, job_id);
  else
    newrq->rq_ind.rq_status.rq_id[0] = '\0';  /* get stat of all */

  CLEAR_HEAD(newrq->rq_ind.rq_status.rq_attr);

  /* if MOM is down just return stale information */
  addr = job_momaddr;

  node = tfind_addr(addr,job_momport,job_momname);
  free(job_momname);

  if (node == NULL)
    return PBSE_UNKNODE;
  if (node->nd_state & INUSE_DOWN)
    {
    if (LOGLEVEL >= 6)
      {
      snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
          "node '%s' is allocated to job but in state 'down'",
          node->nd_name);

      log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_JOB,job_id,log_buf);
      }

    unlock_node(node, __func__, "no rely mom", LOGLEVEL);
    free_br(newrq);

    return PBSE_NORELYMOM;
    }

  /* get connection to MOM */
  unlock_node(node, __func__, "before svr_connect", LOGLEVEL);
  handle = svr_connect(job_momaddr, job_momport, &rc, NULL, NULL, ToServerDIS);

  /* Unlock job here */
  if (handle >= 0)
    {
    if ((rc = issue_Drequest(handle, newrq)) == PBSE_NONE)
      {
      stat_update(newrq, cntl);
      }
    }
  else
    rc = PBSE_CONNECT;

  if (rc == PBSE_SYSTEM)
    rc = PBSE_MEM_MALLOC;

  free_br(newrq);

  return rc;
  }  /* END stat_to_mom() */
Example #19
0
int issue_to_svr(

    const char            *servern,                  /* I */
    struct batch_request **preq_ptr,                 /* I */
    void (*replyfunc)      (struct work_task *))     /* I */

{
    int             rc = PBSE_NONE;
    bool            do_retry = false;
    int             handle;
    int             my_err = 0;
    pbs_net_t       svraddr;
    char           *svrname;
    unsigned int    port = pbs_server_port_dis;
    batch_request  *preq = *preq_ptr;

    snprintf(preq->rq_host, sizeof(preq->rq_host), "%s", servern);

    preq->rq_fromsvr = 1;
    preq->rq_perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR | ATR_DFLAG_SvWR;

    svrname = parse_servername(servern, &port);
    svraddr = get_hostaddr(&my_err,svrname);

    free(svrname);

    if (svraddr == (pbs_net_t)0)
    {
        if (my_err == PBS_NET_RC_RETRY)
        {
            /* Non fatal error - retry */

            do_retry = true;
        }
    }
    else
    {
        handle = svr_connect(svraddr, port, &my_err, NULL, NULL);

        if (handle >= 0)
        {
            if (((rc = issue_Drequest(handle, preq, true)) == PBSE_NONE) &&
                    (handle != PBS_LOCAL_CONNECTION))
            {
                /* preq is already freed if handle == PBS_LOCAL_CONNECTION - a reply
                 * has always been sent */
                rc = preq->rq_reply.brp_code;
            }
            else if (handle == PBS_LOCAL_CONNECTION)
                *preq_ptr = NULL;

            return(rc);
        }
        else if (handle == PBS_NET_RC_RETRY)
            do_retry = true;
    }

    /* if reached here, it didn`t go, do we retry? */

    if (do_retry)
    {
        queue_a_retry_task(preq, replyfunc);

        return(PBSE_NONE);
    }

    /* FAILURE */

    return(PBSE_INTERNAL);
}  /* END issue_to_svr() */
Example #20
0
void req_gpuctrl(

  struct batch_request *preq)

  {
  char   *id = "req_gpuctrl";

  char  *nodename = NULL;
  char  *gpuid = NULL;
  int    gpumode = -1;
  int    reset_perm = -1;
  int    reset_vol = -1;
#ifdef NVIDIA_GPUS
  struct pbsnode *pnode = NULL;
  int    gpuidx = -1;
  int    rc = 0;
  int    conn;
#endif  /* NVIDIA_GPUS */

  if ((preq->rq_perm &
       (ATR_DFLAG_MGWR | ATR_DFLAG_MGRD | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR)) == 0)
    {
    req_reject(PBSE_PERM, 0, preq, NULL, NULL);
    return;
    }

  nodename = preq->rq_ind.rq_gpuctrl.rq_momnode;
  gpuid = preq->rq_ind.rq_gpuctrl.rq_gpuid;
  gpumode = preq->rq_ind.rq_gpuctrl.rq_gpumode;
  reset_perm = preq->rq_ind.rq_gpuctrl.rq_reset_perm;
  reset_vol = preq->rq_ind.rq_gpuctrl.rq_reset_vol;

#ifdef NVIDIA_GPUS

  if (LOGLEVEL >= 7)
    {
    sprintf(
      log_buffer,
      "GPU control request for node %s gpuid %s mode %d reset_perm %d reset_vol %d",
      nodename,
      gpuid,
      gpumode,
      reset_perm,
      reset_vol);

    log_ext(-1, id, log_buffer, LOG_INFO);
    }

  /* validate mom node exists */

  pnode = find_nodebyname(nodename);

  if (pnode == NULL)
    {
    req_reject(PBSE_UNKNODE, 0, preq, NULL, NULL);
    return;
    }

  /* validate that the node is up */

  if (pnode->nd_state & (INUSE_DELETED | INUSE_DOWN | INUSE_OFFLINE | INUSE_UNKNOWN))
    {
    sprintf(
      log_buffer,
      "Node %s is not available",
      pnode->nd_name);
    req_reject(PBSE_UNKREQ, 0, preq, NULL, log_buffer);
    return;
    }


  /* validate that the node has real gpus not virtual */

  if (!pnode->nd_gpus_real)
    {
    req_reject(PBSE_UNKREQ, 0, preq, NULL, "Not allowed for virtual gpus");
    return;
    }

  /* validate the gpuid exists */

  if ((gpuidx = gpu_entry_by_id(pnode, gpuid, FALSE)) == -1)
    {
    req_reject(PBSE_UNKREQ, 0, preq, NULL, "GPU ID does not exist on node");
    return;
    }

  /* validate that we have a real request */

  if ((gpumode == -1) && (reset_perm == -1) && (reset_vol == -1))
    {
    req_reject(PBSE_UNKREQ, 0, preq, NULL, "No action specified");
    return;
    }

  /* for mode changes validate the mode with the driver_version */

  if ((pnode->nd_gpusn[gpuidx].driver_ver == 260) && (gpumode > 2))
    {
    req_reject(PBSE_UNKREQ, 0, preq, NULL, "GPU driver version does not support mode 3");
    return;
    }

  /* we need to relay request to the mom for processing */
  /* have MOM attempt to change the gpu mode */

  preq->rq_orgconn = preq->rq_conn;  /* restore client socket */

  conn = svr_connect(
           pnode->nd_addrs[0],
           pbs_mom_port,
           process_Dreply,
           ToServerDIS);

  if (conn >= 0)
    {
    if ((rc = issue_Drequest(conn, preq, process_gpu_request_reply, NULL)) != 0)
      {
      req_reject(rc, 0, preq, NULL, NULL);
      }
    }
  else
    {
    req_reject(PBSE_UNKREQ, 0, preq, NULL, "Failed to get connection to mom");
    }

#else

    sprintf(
      log_buffer,
      "GPU control request not supported: node %s gpuid %s mode %d reset_perm %d reset_vol %d",
      nodename,
      gpuid,
      gpumode,
      reset_perm,
      reset_vol);

  if (LOGLEVEL >= 3)
    {
      log_ext(-1, id, log_buffer, LOG_INFO);
    }

  req_reject(PBSE_NOSUP, 0, preq, NULL, NULL);

#endif  /* NVIDIA_GPUS */

  return;
  }
Example #21
0
int send_job_over_network_with_retries(
    
  char           *job_id,
  char           *job_destin,
  tlist_head     &attrl,
  bool           &attempt_to_queue_job,
  bool           &change_substate_on_attempt_to_queue,
  bool           &timeout,
  const char     *script_name,
  bool            need_to_send_job_script,
  bool            job_has_run,
  unsigned long   job_momaddr,
  unsigned short  job_momport,
  char           *stdout_path,
  char           *stderr_path,
  char           *chkpt_path,
  int             type,
  int            *my_err,
  int            *mom_err)

  {
  int  con = PBS_NET_RC_UNSET;
  char log_buf[LOCAL_LOG_BUF_SIZE];
  int  rc = LOCUTION_RETRY;

  for (int NumRetries = 0; NumRetries < RETRY; NumRetries++)
    {
    /* connect to receiving server with retries */
    if (NumRetries > 0)
      {
      /* recycle after an error */
      if (con >= 0)
        {
        svr_disconnect(con);
        con = PBS_NET_RC_UNSET;
        }

      /* check my_err from previous attempt */
      if ((should_retry_route(*my_err) == -1) ||
          (should_retry_route(*mom_err) == -1))
        {
        sprintf(log_buf, "child failed in previous commit request for job %s", job_id);

        log_err(*my_err, __func__, log_buf);
        break;
        }

      sleep(1 << NumRetries);
      }

    /* make sure this is zero at the point that we're retrying */
    *my_err = 0;

    if ((con = svr_connect(job_momaddr, job_momport, my_err, NULL, NULL)) == PBS_NET_RC_FATAL)
      {
      sprintf(log_buf, "send_job failed to host %s, %lx port %d",
        (job_destin[0] != '\0') ? job_destin : "unknown host",
        job_momaddr,
        job_momport);

      log_err(*my_err, __func__, log_buf);
      rc = LOCUTION_FAIL;

      break;
      }

    if (con == PBS_NET_RC_RETRY)
      {
      *my_err = 0; /* should retry */

      continue;
      }

    if (con == PBS_LOCAL_CONNECTION)
      {
      log_err(-1, __func__, "attempting to run the job on pbs_server???");
      return(PBSE_SYSTEM);
      }

    rc = send_job_over_network(job_id, 
                               con,
                               job_destin,
                               attrl,
                               attempt_to_queue_job,
                               change_substate_on_attempt_to_queue,
                               timeout,
                               script_name,
                               need_to_send_job_script,
                               job_has_run,
                               job_momaddr,
                               stdout_path,
                               stderr_path,
                               chkpt_path,
                               type,
                               my_err,
                               mom_err);

    if (rc == LOCUTION_SUCCESS)
      break;
    }  /* END for (NumRetries) */
  
  if (con >= 0)
    svr_disconnect(con);

  return(rc);
  } /* END send_job_over_network_with_retries() */
Example #22
0
int send_job(

  job       *jobp,
  pbs_net_t  hostaddr, /* host address, host byte order */
  int        port, /* service port, host byte order */
  int        move_type, /* move, route, or execute */
  void (*post_func)(struct work_task *),     /* after move */
  void      *data)  /* ptr to optional batch_request to be put */
                    /* in the work task structure */

  {
  tlist_head  attrl;
  enum conn_type cntype = ToServerDIS;
  int    con;
  char  *destin = jobp->ji_qs.ji_destin;
  int    encode_type;
  int    i;
  int    NumRetries;

  char  *id = "send_job";

  attribute *pattr;

  pid_t  pid;

  struct attropl *pqjatr;      /* list (single) of attropl for quejob */
  char  *safail = "sigaction failed\n";
  char  *spfail = "sigprocmask failed\n";
  char   script_name[MAXPATHLEN + 1];
  sigset_t  child_set, all_set;

  struct  sigaction child_action;

  struct work_task *ptask;

  mbool_t        Timeout = FALSE;

  char          *pc;

  sigemptyset(&child_set);
  sigaddset(&child_set, SIGCHLD);
  sigfillset(&all_set);

  /* block SIGCHLD until work task is established */

  if (sigprocmask(SIG_BLOCK, &child_set, NULL) == -1)
    {
    log_err(errno,id,spfail);

    pbs_errno = PBSE_SYSTEM;

    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      jobp->ji_qs.ji_jobid,
      "cannot set signal mask");

    return(ROUTE_PERM_FAILURE);
    }

  if (LOGLEVEL >= 6)
    {
    sprintf(log_buffer,"about to send job - type=%d",
      move_type);
 
    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      jobp->ji_qs.ji_jobid,
      "forking in send_job");
    }

  pid = fork();

  if (pid == -1)
    {
    /* error on fork */

    log_err(errno, id, "fork failed\n");

    if (sigprocmask(SIG_UNBLOCK, &child_set, NULL) == -1)
      log_err(errno, id, spfail);

    pbs_errno = PBSE_SYSTEM;

    return(ROUTE_PERM_FAILURE);
    }

  if (pid != 0)
    {
    /* The parent (main server) */

    /* create task to monitor job startup */

    /* CRI:   need way to report to scheduler job is starting, not started */

    ptask = set_task(WORK_Deferred_Child, pid, post_func, jobp);

    if (ptask == NULL)
      {
      log_err(errno, id, msg_err_malloc);

      return(ROUTE_PERM_FAILURE);
      }

    ptask->wt_parm2 = data;

    append_link(
      &((job *)jobp)->ji_svrtask,
      &ptask->wt_linkobj,
      ptask);

    /* now can unblock SIGCHLD */

    if (sigprocmask(SIG_UNBLOCK, &child_set, NULL) == -1)
      log_err(errno, id, spfail);

    if (LOGLEVEL >= 1)
      {
      extern long   DispatchTime[];
      extern job   *DispatchJob[];
      extern char  *DispatchNode[];

      extern time_t time_now;

      struct pbsnode *NP;

      /* record job dispatch time */

      int jindex;

      for (jindex = 0;jindex < 20;jindex++)
        {
        if (DispatchJob[jindex] == NULL)
          {
          DispatchTime[jindex] = time_now;

          DispatchJob[jindex] = jobp;

          if ((NP = PGetNodeFromAddr(hostaddr)) != NULL)
            DispatchNode[jindex] = NP->nd_name;
          else
            DispatchNode[jindex] = NULL;

          break;
          }
        }
      }

    /* SUCCESS */

    return(ROUTE_DEFERRED);
    }  /* END if (pid != 0) */

  /*
   * the child process
   *
   * set up signal catcher for error return
   */

  rpp_terminate();

  child_action.sa_handler = net_move_die;

  sigfillset(&child_action.sa_mask);

  child_action.sa_flags = 0;

  if (sigaction(SIGHUP, &child_action, NULL))
    log_err(errno, id, safail);

  if (sigaction(SIGINT, &child_action, NULL))
    log_err(errno, id, safail);

  if (sigaction(SIGQUIT, &child_action, NULL))
    log_err(errno, id, safail);

  /* signal handling is set, now unblock */

  if (sigprocmask(SIG_UNBLOCK, &child_set, NULL) == -1)
    log_err(errno, id, spfail);

  /* encode job attributes to be moved */

  CLEAR_HEAD(attrl);

  /* select attributes/resources to send based on move type */

  if (move_type == MOVE_TYPE_Exec)
    {
    /* moving job to MOM - ie job start */

    resc_access_perm = ATR_DFLAG_MOM;
    encode_type = ATR_ENCODE_MOM;
    cntype = ToServerDIS;
    }
  else
    {
    /* moving job to alternate server? */

    resc_access_perm =
      ATR_DFLAG_USWR |
      ATR_DFLAG_OPWR |
      ATR_DFLAG_MGWR |
      ATR_DFLAG_SvRD;

    encode_type = ATR_ENCODE_SVR;

    /* clear default resource settings */

    svr_dequejob(jobp);
    }

  pattr = jobp->ji_wattr;

  for (i = 0;i < JOB_ATR_LAST;i++)
    {
    if (((job_attr_def + i)->at_flags & resc_access_perm) ||
      ((strncmp((job_attr_def + i)->at_name,"session_id",10) == 0) &&
      (jobp->ji_wattr[JOB_ATR_checkpoint_name].at_flags & ATR_VFLAG_SET)))
      {
      (job_attr_def + i)->at_encode(
        pattr + i,
        &attrl,
        (job_attr_def + i)->at_name,
        NULL,
        encode_type);
      }
    }    /* END for (i) */

  attrl_fixlink(&attrl);

  /* put together the job script file name */

  strcpy(script_name, path_jobs);

  if (jobp->ji_wattr[JOB_ATR_job_array_request].at_flags & ATR_VFLAG_SET)
    {
    strcat(script_name, jobp->ji_arraystruct->ai_qs.fileprefix);
    }
  else
    {
    strcat(script_name, jobp->ji_qs.ji_fileprefix);
    }

  strcat(script_name, JOB_SCRIPT_SUFFIX);


  pbs_errno = 0;
  con = -1;

  for (NumRetries = 0;NumRetries < RETRY;NumRetries++)
    {
    int rc;

    /* connect to receiving server with retries */

    if (NumRetries > 0)
      {
      /* recycle after an error */

      if (con >= 0)
        svr_disconnect(con);

      /* check pbs_errno from previous attempt */

      if (should_retry_route(pbs_errno) == -1)
        {
        sprintf(log_buffer, "child failed in previous commit request for job %s",
                jobp->ji_qs.ji_jobid);

        log_err(pbs_errno, id, log_buffer);

        exit(1); /* fatal error, don't retry */
        }

      sleep(1 << NumRetries);
      }

    /* NOTE:  on node hangs, svr_connect is successful */

    if ((con = svr_connect(hostaddr, port, 0, cntype)) == PBS_NET_RC_FATAL)
      {
      sprintf(log_buffer, "send_job failed to %lx port %d",
        hostaddr,
        port);

      log_err(pbs_errno, id, log_buffer);

      exit(1);
      }

    if (con == PBS_NET_RC_RETRY)
      {
      pbs_errno = 0; /* should retry */

      continue;
      }

    /*
     * if the job is substate JOB_SUBSTATE_TRNOUTCM which means
     * we are recovering after being down or a late failure, we
     * just want to send the "ready-to-commit/commit"
     */

    if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUTCM)
      {
      if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUT)
        {
        jobp->ji_qs.ji_substate = JOB_SUBSTATE_TRNOUT;

        job_save(jobp, SAVEJOB_QUICK);
        }

      pqjatr = &((svrattrl *)GET_NEXT(attrl))->al_atopl;

      if ((pc = PBSD_queuejob(
                  con,
                  jobp->ji_qs.ji_jobid,
                  destin,
                  pqjatr,
                  NULL)) == NULL)
        {
        if ((pbs_errno == PBSE_EXPIRED) || (pbs_errno == PBSE_READ_REPLY_TIMEOUT))
          {
          /* queue job timeout based on pbs_tcp_timeout */

          Timeout = TRUE;
          }

        if ((pbs_errno == PBSE_JOBEXIST) && (move_type == MOVE_TYPE_Exec))
          {
          /* already running, mark it so */

          log_event(
            PBSEVENT_ERROR,
            PBS_EVENTCLASS_JOB,
            jobp->ji_qs.ji_jobid,
            "MOM reports job already running");

          exit(0);
          }

        sprintf(log_buffer, "send of job to %s failed error = %d",
          destin,
          pbs_errno);

        log_event(
          PBSEVENT_JOB,
          PBS_EVENTCLASS_JOB,
          jobp->ji_qs.ji_jobid,
          log_buffer);

        continue;
        }  /* END if ((pc = PBSD_queuejob() == NULL) */

      free(pc);

      if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT)
        {
        if (PBSD_jscript(con, script_name, jobp->ji_qs.ji_jobid) != 0)
          continue;
        }

      /* XXX may need to change the logic below, if we are sending the job to
         a mom on the same host and the mom and server are not sharing the same
         spool directory, then we still need to move the file */

      if ((move_type == MOVE_TYPE_Exec) &&
          (jobp->ji_qs.ji_svrflags & JOB_SVFLG_HASRUN) &&
          (hostaddr != pbs_server_addr))
        {
        /* send files created on prior run */

        if ((move_job_file(con,jobp,StdOut) != 0) ||
            (move_job_file(con,jobp,StdErr) != 0) ||
            (move_job_file(con,jobp,Checkpoint) != 0))
          {
          continue;
          }
        }

      /* ignore signals */

      if (sigprocmask(SIG_BLOCK, &all_set, NULL) == -1)
        log_err(errno, id, "sigprocmask\n");

      jobp->ji_qs.ji_substate = JOB_SUBSTATE_TRNOUTCM;

      job_save(jobp, SAVEJOB_QUICK);
      }
    else
      {
      /* ignore signals */

      if (sigprocmask(SIG_BLOCK, &all_set, NULL) == -1)
        log_err(errno, id, "sigprocmask\n");
      }

    if (PBSD_rdytocmt(con, jobp->ji_qs.ji_jobid) != 0)
      {
      if (sigprocmask(SIG_UNBLOCK, &all_set, NULL) == -1)
        log_err(errno, id, "sigprocmask\n");

      continue;
      }


    if ((rc = PBSD_commit(con, jobp->ji_qs.ji_jobid)) != 0)
      {
      int errno2;

      /* NOTE:  errno is modified by log_err */

      errno2 = errno;

      sprintf(log_buffer, "send_job commit failed, rc=%d (%s)",
              rc,
              (connection[con].ch_errtxt != NULL) ? connection[con].ch_errtxt : "N/A");

      log_ext(errno2, id, log_buffer, LOG_WARNING);

      /* if failure occurs, pbs_mom should purge job and pbs_server should set *
         job state to idle w/error msg */

      if (errno2 == EINPROGRESS)
        {
        /* request is still being processed */

        /* increase tcp_timeout in qmgr? */

        Timeout = TRUE;

        /* do we need a continue here? */

        sprintf(log_buffer, "child commit request timed-out for job %s, increase tcp_timeout?",
                jobp->ji_qs.ji_jobid);

        log_ext(errno2, id, log_buffer, LOG_WARNING);

        /* don't retry on timeout--break out and report error! */

        break;
        }
      else
        {
        sprintf(log_buffer, "child failed in commit request for job %s",
                jobp->ji_qs.ji_jobid);

        log_ext(errno2, id, log_buffer, LOG_CRIT);

        /* FAILURE */

        exit(1);
        }
      }    /* END if ((rc = PBSD_commit(con,jobp->ji_qs.ji_jobid)) != 0) */

    svr_disconnect(con);

    /* child process is done */

    /* SUCCESS */

    exit(0);
    }  /* END for (NumRetries) */

  if (con >= 0)
    svr_disconnect(con);

  if (Timeout == TRUE)
    {
    /* 10 indicates that job migrate timed out, server will mark node down *
          and abort the job - see post_sendmom() */

    sprintf(log_buffer, "child timed-out attempting to start job %s",
            jobp->ji_qs.ji_jobid);

    log_ext(pbs_errno, id, log_buffer, LOG_WARNING);

    exit(10);
    }

  if (should_retry_route(pbs_errno) == -1)
    {
    sprintf(log_buffer, "child failed and will not retry job %s",
      jobp->ji_qs.ji_jobid);

    log_err(pbs_errno, id, log_buffer);

    exit(1);
    }

  exit(2);

  /*NOTREACHED*/

  return(ROUTE_SUCCESS);
  }  /* END send_job() */
Example #23
0
int relay_to_mom(

  job *pjob,
  struct batch_request  *request, /* the request to send */
  void (*func)(struct work_task *))

  {
  char *id = "relay_to_mom";

  int conn; /* a client style connection handle */
  int   rc;
  pbs_net_t addr;

  struct pbsnode *node;

  /* if MOM is down don't try to connect */

  addr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr;

  node = tfind_addr(addr,pjob->ji_qs.ji_un.ji_exect.ji_momport,pjob);

  if ((node != NULL) &&
      (node->nd_state & (INUSE_DELETED|INUSE_DOWN)))
    {
    return(PBSE_NORELYMOM);
    }

  if (LOGLEVEL >= 7)
    {
    sprintf(log_buffer, "momaddr=%s",
            netaddr_pbs_net_t(pjob->ji_qs.ji_un.ji_exect.ji_momaddr));

    log_record(
      PBSEVENT_SCHED,
      PBS_EVENTCLASS_REQUEST,
      id,
      log_buffer);
    }

  conn = svr_connect(

           pjob->ji_qs.ji_un.ji_exect.ji_momaddr,
           pjob->ji_qs.ji_un.ji_exect.ji_momport,
           process_Dreply,
           ToServerDIS);

  if (conn < 0)
    {
    LOG_EVENT(
      PBSEVENT_ERROR,
      PBS_EVENTCLASS_REQUEST,
      "",
      msg_norelytomom);

    return(PBSE_NORELYMOM);
    }

  request->rq_orgconn = request->rq_conn; /* save client socket */

  rc = issue_Drequest(conn, request, func, NULL);

  return(rc);
  }  /* END relay_to_mom() */