示例#1
0
/*
 * post_job_delete_nanny - process the reply to a delete signal that the
 * job delete nanny sent to a MOM.
 *
 * @param preq_sig - the signal request holding the MOM's reply (may be NULL).
 *                   It is released with free_br() on every path past the
 *                   NULL check.
 *
 * If the MOM answered PBSE_UNKJOBID the job's nodes and resources are
 * released and the job is purged; otherwise the job found by
 * svr_find_job() is simply unlocked again.
 */
void post_job_delete_nanny(

  batch_request *preq_sig)

  {
  int                   rc;
  job                  *pjob;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  long                  nanny = 0;

  if (preq_sig == NULL)
    return;

  rc = preq_sig->rq_reply.brp_code;

  get_svr_attr_l(SRV_ATR_JobNanny, &nanny);

  if (!nanny)
    {
    /* the admin disabled nanny within the last minute or so */
    free_br(preq_sig);

    return;
    }

  /* extract job id from task */
  pjob = svr_find_job(preq_sig->rq_ind.rq_signal.rq_jid, FALSE);

  if (pjob == NULL)
    {
    sprintf(log_buf, "job delete nanny: the job disappeared (this is a BUG!)");

    log_event(PBSEVENT_ERROR,PBS_EVENTCLASS_JOB,preq_sig->rq_ind.rq_signal.rq_jid,log_buf);
    }
  else if (rc == PBSE_UNKJOBID)
    {
    sprintf(log_buf, "job delete nanny returned, but does not exist on mom");

    log_event(PBSEVENT_ERROR,PBS_EVENTCLASS_JOB,preq_sig->rq_ind.rq_signal.rq_jid,log_buf);

    free_nodes(pjob);

    set_resc_assigned(pjob, DECR);

    free_br(preq_sig);

    svr_job_purge(pjob);

    return;
    }
  else
    {
    /* only unlock when a job was actually found; there is nothing to
     * unlock when the lookup failed */
    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
    }

  /* free task */
  free_br(preq_sig);

  return;
  } /* END post_job_delete_nanny() */
示例#2
0
/*
 * post_modify_arrayreq - handle the MOM's reply to a modify request.
 *
 * @param preq - the request carrying the reply (may be NULL). It is always
 *               released with free_br() once the reply has been examined.
 *
 * Any error other than PBSE_UNKJOBID is logged as a bad modify. For
 * PBSE_UNKJOBID the job is looked up and, if still present, its state is
 * logged while the job mutex is held.
 */
void post_modify_arrayreq(

  batch_request *preq)

  {
  char  log_buf[LOCAL_LOG_BUF_SIZE];

  if (preq == NULL)
    return;

  /* restore socket to client */
  preq->rq_conn = preq->rq_orgconn;

  if (preq->rq_reply.brp_code == PBSE_UNKJOBID)
    {
    job *pjob = svr_find_job(preq->rq_ind.rq_modify.rq_objname, FALSE);

    if (pjob != NULL)
      {
      /* the manager releases the job mutex when this scope closes,
       * i.e. before the request is freed below */
      mutex_mgr job_mutex = mutex_mgr(pjob->ji_mutex, true);

      if (LOGLEVEL >= 0)
        {
        sprintf(log_buf, "post_modify_req: PBSE_UNKJOBID for job %s in state %s-%s, dest = %s",
          pjob->ji_qs.ji_jobid,
          PJobState[pjob->ji_qs.ji_state],
          PJobSubState[pjob->ji_qs.ji_substate],
          pjob->ji_qs.ji_destin);

        log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);
        }
      }
    }
  else if (preq->rq_reply.brp_code)
    {
    /* the MOM rejected the modify for some other reason */
    sprintf(log_buf, msg_mombadmodify, preq->rq_reply.brp_code);

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,preq->rq_ind.rq_modify.rq_objname,log_buf);
    }

  free_br(preq);
  }  /* END post_modify_arrayreq() */
示例#3
0
/*
 * reply_send_async - send a reply, using the asynchronous writer only for
 * connections that are active client DIS connections.
 *
 * @param request - the request whose rq_reply is to be sent.
 * @return the result of reply_send() when the synchronous path is taken,
 *         otherwise the result of dis_reply_write_async().
 *
 * On the asynchronous path the request is released with free_br() unless
 * it is a PBS_BATCH_AsyModifyJob request that still expects a reply.
 */
int reply_send_async(struct batch_request *request)
  {
  int      sfds = request->rq_conn;  /* socket */

  /* Validate the descriptor BEFORE using it as an index into svr_conn:
   * a negative socket or the local-connection sentinel is never a client
   * connection, so those go to the synchronous version, as does anything
   * that is not an active client DIS connection. */
  if ((sfds < 0) ||
      (sfds == PBS_LOCAL_CONNECTION) ||
      (svr_conn[sfds].cn_active != FromClientDIS))
    return reply_send(request);

  /* only thread client responses */
  int rc = dis_reply_write_async(sfds, &request->rq_reply);

  if ((request->rq_type != PBS_BATCH_AsyModifyJob) || (request->rq_noreply == TRUE))
    {
    free_br(request);
    }

  return rc;
  }
示例#4
0
/*
 * post_rerun - handle the MOM's reply to a rerun signal.
 *
 * @param preq - the signal request carrying the reply (may be NULL); it is
 *               released with free_br() before returning.
 *
 * A non-zero reply code is logged and, if the job can still be found, its
 * state is re-evaluated and set while the job mutex is held.
 */
void post_rerun(

  batch_request *preq)

  {
  char  log_buf[LOCAL_LOG_BUF_SIZE];
  int   state;
  int   substate;
  job  *pjob;

  if (preq == NULL)
    return;

  if (preq->rq_reply.brp_code == 0)
    {
    /* the MOM accepted the signal: nothing to repair */
    free_br(preq);

    return;
    }

  sprintf(log_buf, "rerun signal reject by mom: %d", preq->rq_reply.brp_code);

  log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,preq->rq_ind.rq_signal.rq_jid,log_buf);

  pjob = svr_find_job(preq->rq_ind.rq_signal.rq_jid, FALSE);

  if (pjob != NULL)
    {
    /* the mutex is released when this scope closes */
    mutex_mgr job_mutex(pjob->ji_mutex, true);

    svr_evaljobstate(pjob, &state, &substate, 1);
    svr_setjobstate(pjob, state, substate, FALSE);
    }

  free_br(preq);
  }  /* END post_rerun() */
示例#5
0
/*
 * chkpt_xfr_hold - follow-up after a checkpoint copy: release the request
 * and queue an immediate mom_cleanup_checkpoint_hold task for the job.
 *
 * @param preq - the copy request; nothing is done if it is NULL or its
 *               rq_extra is NULL.
 * @param pjob - the job the copy belongs to; nothing is done if NULL.
 */
void chkpt_xfr_hold(

  batch_request *preq,
  job           *pjob)

  {
  char   log_buf[LOCAL_LOG_BUF_SIZE];
  char  *jobid_copy;

  if (preq == NULL)
    return;

  if (preq->rq_extra == NULL)
    return;

  if (pjob == NULL)
    return;

  if (LOGLEVEL >= 7)
    {
    sprintf(log_buf,
      "BLCR copy completed (state is %s-%s)",
      PJobState[pjob->ji_qs.ji_state],
      PJobSubState[pjob->ji_qs.ji_substate]);

    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
    }

  free_br(preq);

  /* the task receives its own heap copy of the job id */
  jobid_copy = strdup(pjob->ji_qs.ji_jobid);

  set_task(WORK_Immed, 0, mom_cleanup_checkpoint_hold, jobid_copy, FALSE);
  }  /* END chkpt_xfr_hold() */
示例#6
0
/*
 * chkpt_xfr_done - completion handler that only releases the request.
 *
 * @param preq - the request to hand to free_br().
 */
void chkpt_xfr_done(

  batch_request *preq)

  {
  /* no further processing is needed: just drop the request */
  free_br(preq);

  return;
  }  /* END chkpt_xfr_done() */
示例#7
0
/*
 * release_req - work-task handler that frees the batch request stored in
 * wt_parm1 and, where applicable, disconnects the task's connection.
 *
 * @param pwt - the work task; wt_parm1 holds the batch request and
 *              wt_event the connection handle (-1 when there is none).
 */
void
release_req(struct work_task *pwt)
{
	struct batch_request *req = (struct batch_request *)pwt->wt_parm1;

	free_br(req);

	/* no connection recorded for this task */
	if (pwt->wt_event == -1)
		return;

	/* wt_aux2 == 1 marks an rpp task, which is not disconnected here */
	if (pwt->wt_aux2 == 1)
		return;

	svr_disconnect(pwt->wt_event);
}
示例#8
0
/*
 * issue_track - send a Track Job request for pjob to the server named in
 * the job id (the text following the first '.').
 *
 * @param pjob - the job to report; its hop count, id and state attribute
 *               are copied into the request.
 *
 * Nothing is sent when the job id contains no '.', because no destination
 * server can be derived from it, or when the request cannot be allocated.
 */
void issue_track(
    
  job *pjob)

  {

  struct batch_request   *preq;
  char                   *pc;

  /* Locate the server suffix first. strchr() stops at the terminating NUL,
   * so an id without a '.' is rejected instead of scanning past the end
   * of the buffer. */
  pc = strchr(pjob->ji_qs.ji_jobid, '.');

  if (pc == NULL)
    return;

  preq = alloc_br(PBS_BATCH_TrackJob);

  if (preq == NULL)
    return;

  preq->rq_ind.rq_track.rq_hopcount = pjob->ji_wattr[JOB_ATR_hopcount].at_val.at_long;

  strcpy(preq->rq_ind.rq_track.rq_jid, pjob->ji_qs.ji_jobid);
  strcpy(preq->rq_ind.rq_track.rq_location, server_name);

  preq->rq_ind.rq_track.rq_state[0] = pjob->ji_wattr[JOB_ATR_state].at_val.at_char;

  /* pc + 1 is the first character of the server name */
  issue_to_svr(pc + 1, preq, NULL);
  free_br(preq);
  }
示例#9
0
/*
 * send_power_state_to_mom - thread entry point that forwards a batch
 * request to the MOM on the node named in the request's rq_host.
 *
 * @param arg - a struct batch_request * (may be NULL, in which case
 *              nothing is done).
 * @return always NULL.
 *
 * The request is released here whenever it cannot be issued: when the
 * node is not found or when the connection to it fails.
 */
void *send_power_state_to_mom(
    
  void *arg)

  {
  struct batch_request  *pRequest = (struct batch_request *)arg;
  struct pbsnode        *pNode;

  if (pRequest == NULL)
    return NULL;

  pNode = find_nodebyname(pRequest->rq_host);

  if (pNode == NULL)
    {
    free_br(pRequest);
    return NULL;
    }

  int handle = 0;
  int local_errno = 0;

  handle = svr_connect(pNode->nd_addrs[0],pNode->nd_mom_port,&local_errno,pNode,NULL);
  if (handle < 0)
    {
    unlock_node(pNode, __func__, "Error connecting", LOGLEVEL);

    /* the request was never handed to issue_Drequest(), so it must be
     * released here just like in the node-not-found case */
    free_br(pRequest);
    return NULL;
    }

  /* unlock the node before the network transaction */
  unlock_node(pNode, __func__, "Done connecting", LOGLEVEL);
  issue_Drequest(handle, pRequest, true);

  return NULL;
  }
示例#10
0
/*
 * req_messagejob - service a Message Job request by relaying a copy of it
 * to the job's MOM.
 *
 * @param preq - the client request (I).
 * @return always NULL.
 *
 * The job must be running. A duplicate of the request is relayed; on
 * success the duplicate goes to post_message_req() and the original is
 * freed, on failure the original is rejected and the duplicate is freed
 * here.
 */
void *req_messagejob(
    
  batch_request *preq) /* I */

  {
  batch_request *mom_req = NULL;
  int            status;
  job           *pjob = chk_job_request(preq->rq_ind.rq_message.rq_jid, preq);

  if (pjob == NULL)
    return(NULL);

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  /* the job must be running */
  if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
    req_reject(PBSE_BADSTATE, 0, preq, NULL, NULL);

    return(NULL);
    }

  status = copy_batchrequest(&mom_req, preq, 0, -1);

  if (status != 0)
    {
    req_reject(PBSE_MEM_MALLOC, 0, preq, NULL, NULL);
    }
  else
    {
    /* pass the copy on to MOM */
    status = relay_to_mom(&pjob, mom_req, NULL);

    if (status == PBSE_NONE)
      {
      post_message_req(mom_req);
      free_br(preq);
      }
    else
      {
      /* unable to get to MOM */
      req_reject(status, 0, preq, NULL, NULL);
      free_br(mom_req);
      }
    }

  /* relay_to_mom() may have cleared the job pointer; in that case the
   * mutex manager must not touch the mutex on exit */
  if (pjob == NULL)
    job_mutex.set_lock_on_exit(false);

  return(NULL);
  } /* END req_messagejob() */
示例#11
0
/*
 * check_if_orphaned - if the given ALPS reservation is orphaned, ask a
 * login node's MOM to delete it.
 *
 * @param vp - heap-allocated reservation id string (char *). This function
 *             takes ownership: it is either freed here or attached to the
 *             request as rq_extend.
 * @return always NULL.
 *
 * The delete request is released on every path once it has been allocated,
 * including when no login node is available.
 */
void *check_if_orphaned(

  void *vp)

  {
  char                 *rsv_id = (char *)vp;
  char                  job_id[PBS_MAXSVRJOBID];
  struct batch_request *preq;
  int                   handle = -1;
  int                   retries = 0;
  struct pbsnode       *pnode;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];

  if (is_orphaned(rsv_id, job_id) == TRUE)
    {
    if((preq = alloc_br(PBS_BATCH_DeleteReservation)) == NULL)
      {
      /* the id was never attached to a request, so release it here */
      free(rsv_id);
      return NULL;
      }

    preq->rq_extend = rsv_id;

    /* Assume the request will be successful and remove the RSV from the hash table */
    remove_alps_reservation(rsv_id);

    if ((pnode = get_next_login_node(NULL)) != NULL)
      {
      struct in_addr hostaddr;
      int            local_errno;
      pbs_net_t      momaddr;

      memcpy(&hostaddr, &pnode->nd_sock_addr.sin_addr, sizeof(hostaddr));
      momaddr = ntohl(hostaddr.s_addr);

      snprintf(log_buf, sizeof(log_buf),
        "Found orphan ALPS reservation ID %s for job %s; asking %s to remove it",
        rsv_id,
        job_id,
        pnode->nd_name);
      log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, __func__, log_buf);

      /* try the connection up to three times */
      while ((handle < 0) &&
             (retries < 3))
        {
        handle = svr_connect(momaddr, pnode->nd_mom_port, &local_errno, pnode, NULL, ToServerDIS);
        retries++;
        }

      /* unlock before the network transaction */
      unlock_node(pnode, __func__, NULL, LOGLEVEL);
      
      if (handle >= 0)
        issue_Drequest(handle, preq, true);
      }

    /* release the request whether or not a login node was available */
    free_br(preq);
    }
  else
    free(rsv_id);

  return(NULL);
  } /* END check_if_orphaned() */
示例#12
0
/*
 * release_req - work-task handler that frees the batch request stored in
 * wt_parm1 and disconnects the task's connection when one is recorded.
 *
 * @param pwt - the work task; wt_event is the connection handle, with -1
 *              meaning there is nothing to disconnect.
 */
void release_req(

  struct work_task *pwt)

  {
  struct batch_request *req = (struct batch_request *)pwt->wt_parm1;

  free_br(req);

  if (pwt->wt_event == -1)
    return;

  svr_disconnect(pwt->wt_event);
  }
示例#13
0
/*
 * issue_track - send a Track Job request for pjob to the server named in
 * the job id (the text following the first '.').
 *
 * @param pjob - the job to report.
 *
 * An error is logged and nothing is sent when the job id has no server
 * suffix or when the suffix names this server.
 */
void issue_track(

    job *pjob)

{
    char                  log_buf[LOCAL_LOG_BUF_SIZE];
    struct batch_request *preq;
    char                 *sname;
    char                 *dot = strchr(pjob->ji_qs.ji_jobid, '.');

    if (dot == NULL)
    {
        snprintf(log_buf, sizeof(log_buf),
                 "Remote job routing is not compatible with display_job_server_suffix set to false. Cannot track %s",
                 pjob->ji_qs.ji_jobid);
        log_err(-1, __func__, log_buf);

        return;
    }

    /* the server name starts right after the first '.' */
    sname = dot + 1;

    /* do not issue track requests to ourselves */
    if (strcmp(sname, server_name) == 0)
    {
        snprintf(log_buf, sizeof(log_buf),
                 "%s erroneously called for local job %s",
                 __func__, pjob->ji_qs.ji_jobid);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);

        return;
    }

    preq = alloc_br(PBS_BATCH_TrackJob);

    if (preq == NULL)
        return;

    preq->rq_ind.rq_track.rq_hopcount = pjob->ji_wattr[JOB_ATR_hopcount].at_val.at_long;
    preq->rq_ind.rq_track.rq_state[0] = pjob->ji_wattr[JOB_ATR_state].at_val.at_char;

    strcpy(preq->rq_ind.rq_track.rq_jid, pjob->ji_qs.ji_jobid);
    strcpy(preq->rq_ind.rq_track.rq_location, server_name);

    /* sname is the character after the first '.' of the job id */
    issue_to_svr(sname, preq, NULL);
    free_br(preq);
}
示例#14
0
/*
 * issue_signal - build a Signal Job request for a job and relay it to the
 * job's MOM.
 *
 * @param pjob_ptr - in/out pointer to the job. On success it is replaced
 *                   by the result of a fresh svr_find_job() lookup; it is
 *                   set to NULL when relay_to_mom() cleared the job.
 * @param signame  - name of the signal to send.
 * @param func     - handler invoked with the request after a successful
 *                   relay (called with the job unlocked).
 * @param extra    - stored in the request's rq_extra.
 * @return PBSE_SYSTEM if the request cannot be allocated, otherwise the
 *         result of relay_to_mom().
 */
int issue_signal(

  job  **pjob_ptr,
  char  *signame, /* name of the signal to send */
  void  (*func)(batch_request *),
  void  *extra) /* extra parameter to be stored in sig request */

  {
  job                  *pjob = *pjob_ptr;
  char                  jobid[PBS_MAXSVRJOBID + 1];
  struct batch_request *sigreq = alloc_br(PBS_BATCH_SignalJob);
  int                   rc;

  if (sigreq == NULL)
    return(PBSE_SYSTEM);  /* FAILURE */

  /* fill in the Signal Job request */
  sigreq->rq_extra = extra;

  strcpy(sigreq->rq_ind.rq_signal.rq_jid, pjob->ji_qs.ji_jobid);

  snprintf(sigreq->rq_ind.rq_signal.rq_signame, sizeof(sigreq->rq_ind.rq_signal.rq_signame), "%s", signame);

  rc = relay_to_mom(&pjob, sigreq, NULL);

  if ((rc != PBSE_NONE) ||
      (pjob == NULL))
    {
    /* not delivered, or the job went away: drop the request here */
    free_br(sigreq);

    if (pjob == NULL)
      *pjob_ptr = NULL;

    return(rc);
    }

  /* remember the id so the job can be looked up again after the handler
   * has run without the job mutex held */
  strcpy(jobid, pjob->ji_qs.ji_jobid);
  unlock_ji_mutex(pjob, __func__, NULL, 0);
  func(sigreq);

  *pjob_ptr = svr_find_job((char *)jobid, TRUE);

  return(rc);
  }  /* END issue_signal() */
示例#15
0
/*
 * remove_stagein - ask the job's MOM to delete the files that were staged
 * in for the job.
 *
 * @param pjob_ptr - pointer to the job pointer (I).
 *
 * A copy request is built from the stage-in attribute and turned into a
 * PBS_BATCH_DelFiles request. On a successful relay the StagedIn flag is
 * cleared; on failure the problem is logged. The request is released here
 * in both cases.
 *
 * NOTE(review): relay_to_mom() can apparently clear the local job pointer
 * (the success path already checks for that); *pjob_ptr is not updated in
 * that case - confirm whether callers need it to be.
 */
void remove_stagein(

  job **pjob_ptr)  /* I */

  {

  struct batch_request *preq = NULL;
  job                  *pjob = *pjob_ptr;

  preq = cpy_stage(preq, pjob, JOB_ATR_stagein, 0);

  if (preq != NULL)
    {
    /* have files to delete  */

    /* change the request type from copy to delete  */

    preq->rq_type = PBS_BATCH_DelFiles;

    preq->rq_extra = NULL;

    if (relay_to_mom(&pjob, preq, NULL) == PBSE_NONE)
      {
      if (pjob != NULL)
        pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_StagedIn;
      }
    else if (pjob != NULL)
      {
      /* log that we were unable to remove the files; skipped when the
       * job pointer was cleared, since there is no job id left to read */

      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_FILE,
        pjob->ji_qs.ji_jobid,
        "unable to remove staged in files for job");
      }

    free_br(preq);
    }

  return;
  }  /* END remove_stagein() */
示例#16
0
/*
 * remove_stagein - ask the job's MOM to delete the files that were staged
 * in for the job.
 *
 * @param pjob - the job whose staged-in files should be removed (I).
 *
 * A copy request is built from the stage-in attribute and turned into a
 * PBS_BATCH_DelFiles request that is relayed with release_req as the reply
 * handler. On a successful relay the StagedIn flag is cleared; on failure
 * the problem is logged and the request is freed here.
 */
void remove_stagein(

  job *pjob)  /* I */

  {

  struct batch_request *preq = NULL;

  preq = cpy_stage(preq, pjob, JOB_ATR_stagein, 0);

  if (preq != NULL)
    {
    /* have files to delete  */

    /* change the request type from copy to delete  */

    preq->rq_type = PBS_BATCH_DelFiles;

    preq->rq_extra = NULL;

    if (relay_to_mom(
          pjob,
          preq,
          release_req) == 0)
      {
      pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_StagedIn;
      }
    else
      {
      /* log that we were unable to remove the files */

      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_FILE,
        pjob->ji_qs.ji_jobid,
        "unable to remove staged in files for job");

      free_br(preq);
      }
    }

  return;
  }  /* END remove_stagein() */
示例#17
0
/*
 * check_if_orphaned - if the given reservation is orphaned, ask a login
 * node's MOM to delete it.
 *
 * @param vp - heap-allocated reservation id string (char *). This function
 *             takes ownership: it is either freed here or attached to the
 *             request as rq_extend.
 * @return always NULL.
 *
 * Once issued, the request is released by the release_req reply handler;
 * on every other path it is released here.
 */
void *check_if_orphaned(

  void *vp)

  {
  char                 *rsv_id = (char *)vp;
  struct batch_request *preq;
  int                   handle = -1;
  int                   retries = 0;
  struct pbsnode       *pnode;

  if (is_orphaned(rsv_id) == TRUE)
    {
    preq = alloc_br(PBS_BATCH_DeleteReservation);

    if (preq == NULL)
      {
      /* no request to attach the id to, so release it here */
      free(rsv_id);

      return(NULL);
      }

    preq->rq_extend = rsv_id;

    if ((pnode = get_next_login_node(NULL)) != NULL)
      {
      struct in_addr hostaddr;
      int            local_errno;
      pbs_net_t      momaddr;

      memcpy(&hostaddr, &pnode->nd_sock_addr.sin_addr, sizeof(hostaddr));
      momaddr = ntohl(hostaddr.s_addr);

      /* try the connection up to three times */
      while ((handle < 0) &&
             (retries < 3))
        {
        handle = svr_connect(momaddr, pnode->nd_mom_port, &local_errno, pnode, NULL, ToServerDIS);
        retries++;
        }

      /* unlock before the network transaction */
      unlock_node(pnode, __func__, NULL, 0);
      
      if (handle >= 0)
        {
        issue_Drequest(handle, preq, release_req, 0);
        }
      else
        free_br(preq);
      }
    else
      {
      /* no login node available: the request was never issued */
      free_br(preq);
      }
    }
  else
    free(rsv_id);

  return(NULL);
  } /* END check_if_orphaned() */
示例#18
0
/*
 * copy_attribute_list - copy every attribute entry of preq's manager
 * attribute list onto preq_tmp's manager attribute list.
 *
 * @param preq     - request whose rq_ind.rq_manager.rq_attr list is read.
 * @param preq_tmp - request that receives the copies. NOTE: on allocation
 *                   failure preq_tmp is released with free_br() and must
 *                   not be used by the caller afterwards.
 * @return PBSE_NONE on success, PBSE_SYSTEM if an allocation fails.
 *
 * Each copy is a single allocation: the svrattrl header is followed by the
 * name, then the resource (at name + al_nameln), then the value
 * (at name + al_nameln + al_rescln).
 */
int copy_attribute_list(

  batch_request *preq,
  batch_request *preq_tmp)

  {
  svrattrl             *pal = (svrattrl *)GET_NEXT(preq->rq_ind.rq_manager.rq_attr);
  tlist_head           *phead = &preq_tmp->rq_ind.rq_manager.rq_attr;
  svrattrl             *newpal = NULL;

  while (pal != NULL)
    {
    newpal = (svrattrl *)calloc(1, pal->al_tsize + 1);
    if (!newpal)
      {
      free_br(preq_tmp);
      return(PBSE_SYSTEM);
      }

    CLEAR_LINK(newpal->al_link);

    newpal->al_atopl.next = 0;
    newpal->al_tsize = pal->al_tsize + 1;
    newpal->al_flags  = pal->al_flags;

    /* name */
    newpal->al_nameln = pal->al_nameln;
    newpal->al_atopl.name = (char *)newpal + sizeof(svrattrl);
    strcpy((char *)newpal->al_atopl.name, pal->al_atopl.name);

    /* resource (optional) */
    newpal->al_atopl.resource = newpal->al_atopl.name + newpal->al_nameln;

    if (pal->al_atopl.resource != NULL)
      strcpy((char *)newpal->al_atopl.resource, pal->al_atopl.resource);

    newpal->al_rescln = pal->al_rescln;

    /* value; the buffer is zero-filled by calloc(), so a missing source
     * value simply leaves an empty string */
    newpal->al_atopl.value = newpal->al_atopl.name + newpal->al_nameln + newpal->al_rescln;

    if (pal->al_atopl.value != NULL)
      strcpy((char *)newpal->al_atopl.value, pal->al_atopl.value);

    newpal->al_valln = pal->al_valln;
    newpal->al_atopl.op = pal->al_atopl.op;

    /* link every copy as it is built, so that no entry is dropped and
     * leaked when the source list has more than one attribute */
    append_link(phead, &newpal->al_link, newpal);

    pal = (struct svrattrl *)GET_NEXT(pal->al_link);
    }

  return(PBSE_NONE);
  } /* END copy_attribute_list() */
示例#19
0
END_TEST

/* Unit test: alloc_br() must return a request initialised with the given
 * type and the default field values checked below, and free_br() must
 * release its attribute list. */
START_TEST(test_alloc_br)
  {
  batch_request *preq = alloc_br(PBS_BATCH_QueueJob);

  /* the request type is the one passed to alloc_br() */
  fail_unless(preq->rq_type == PBS_BATCH_QueueJob);
  /* both connection fields start out as -1 */
  fail_unless(preq->rq_conn == -1);
  fail_unless(preq->rq_orgconn == -1);
  /* the reply starts out with no choice selected and replies enabled */
  fail_unless(preq->rq_reply.brp_choice == BATCH_REPLY_CHOICE_NULL);
  fail_unless(preq->rq_noreply == FALSE);
  /* a positive timestamp has been recorded */
  fail_unless(preq->rq_time > 0);

  free_br(preq);
  /* presumably a counter bumped by a stubbed free_attrlist() - verify
   * against the test scaffolding */
  fail_unless(free_attrlist_called > 0);
  }
/*
 * release_req - work-task handler that frees the batch request identified
 * by the id in wt_parm1, disconnects the task's connection when one is
 * recorded, and then frees the work task itself.
 *
 * @param pwt - the work task; it and its wt_mutex are freed here.
 */
void release_req(

  struct work_task *pwt)

  {
  char          *request_id = pwt->wt_parm1;
  batch_request *preq = get_remove_batch_request(request_id);

  /* the request may already have been removed by someone else */
  if (preq != NULL)
    free_br(preq);

  /* -1 means no connection is associated with this task */
  if (pwt->wt_event != -1)
    svr_disconnect(pwt->wt_event);

  free(pwt->wt_mutex);
  free(pwt);
  } /* END release_req() */
示例#21
0
/*
 * post_checkpoint - handle the MOM's reply to a checkpoint request.
 *
 * @param preq - the request carrying the reply (may be NULL); it is
 *               released with free_br() before returning.
 *
 * On success with a non-zero auxcode the job's checkpoint flags are
 * updated. On failure the checkpoint-file flag is cleared, the substate
 * is reset to running and, if the job is running, rerun_or_kill() is
 * invoked. The job, if still set, is unlocked at the end.
 */
void post_checkpoint(

  batch_request *preq)

  {
  job *pjob;

  if (preq == NULL)
    return;

  pjob = svr_find_job(preq->rq_ind.rq_hold.rq_orig.rq_objname, FALSE);

  if (preq->rq_reply.brp_code != 0)
    {
    /* need to try rerun if possible or just abort the job */
    if (pjob)
      {
      pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHECKPOINT_FILE;
      pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING;

      if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
        rerun_or_kill(&pjob, msg_on_shutdown);
      }
    }
  else if ((preq->rq_reply.brp_auxcode) && (pjob != NULL))
    {
    /* checkpointed ok and the checkpoint can be moved */
    pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHECKPOINT_FILE;
    pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_MIGRATEABLE;
    }

  free_br(preq);

  /* rerun_or_kill() receives &pjob, so the pointer is re-checked here
   * before unlocking */
  if (pjob != NULL)
    unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
  }  /* END post_checkpoint() */
示例#22
0
/*
 * req_messagejob - service a Message Job request by relaying a copy of it
 * to the job's MOM, with post_message_req as the reply handler.
 *
 * @param vp - the client request (struct batch_request *).
 * @return always NULL.
 *
 * The job must be running. On a successful relay the original request is
 * freed; otherwise it is rejected. The job is unlocked before returning
 * if the pointer is still set.
 */
void *req_messagejob(

    void *vp)

{
    struct batch_request *preq = (struct batch_request *)vp;
    struct batch_request *mom_req = NULL;
    int                   status;
    job                  *pjob = chk_job_request(preq->rq_ind.rq_message.rq_jid, preq);

    if (pjob == NULL)
        return(NULL);

    /* the job must be running */
    if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
        req_reject(PBSE_BADSTATE, 0, preq, NULL, NULL);

        unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

        return(NULL);
    }

    status = copy_batchrequest(&mom_req, preq, 0, -1);

    if (status != 0)
    {
        req_reject(PBSE_MEM_MALLOC, 0, preq, NULL, NULL);
    }
    else
    {
        /* pass the copy on to MOM; after MOM acts and replies we pick up
         * in post_message_req() */
        status = relay_to_mom(&pjob, mom_req, post_message_req);

        if (status == 0)
            free_br(preq);
        else
            req_reject(status, 0, preq, NULL, NULL); /* unable to get to MOM */
    }

    /* relay_to_mom() receives &pjob, so re-check before unlocking */
    if (pjob != NULL)
        unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);

    return(NULL);
} /* END req_messagejob() */
示例#23
0
文件: req_modify.c 项目: dhill12/test
/*
 * chkpt_xfr_hold - work-task handler run after a checkpoint copy: releases
 * the request and queues an immediate mom_cleanup_checkpoint_hold task for
 * the job named in the request's rq_extra.
 *
 * @param ptask - the work task; wt_parm1 identifies the batch request. The
 *                task and its wt_mutex are freed here.
 *
 * The request obtained from get_remove_batch_request() is released on
 * every path, including the early exits.
 */
void chkpt_xfr_hold(

  struct work_task *ptask)

  {
  job                  *pjob;

  struct batch_request *preq;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];

  preq = get_remove_batch_request(ptask->wt_parm1);

  free(ptask->wt_mutex);
  free(ptask);

  if (preq == NULL)
    return;

  if (preq->rq_extra == NULL)
    {
    /* no job id attached: nothing to do, but the request must not leak */
    free_br(preq);

    return;
    }

  if ((pjob = svr_find_job(preq->rq_extra, FALSE)) == NULL)
    {
    /* the job is gone: drop the request */
    free_br(preq);

    return;
    }

  if (LOGLEVEL >= 7)
    {
    sprintf(log_buf,
      "BLCR copy completed (state is %s-%s)",
      PJobState[pjob->ji_qs.ji_state],
      PJobSubState[pjob->ji_qs.ji_substate]);

    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
    }
  
  free_br(preq);

  /* the task receives its own heap copy of the job id */
  set_task(WORK_Immed, 0, mom_cleanup_checkpoint_hold, strdup(pjob->ji_qs.ji_jobid), FALSE);

  unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

  return;
  }  /* END chkpt_xfr_hold() */
示例#24
0
文件: reply_send.c 项目: dbeer/torque
/*
 * reply_send_svr - write the reply for a request to its connection and
 * release the request.
 *
 * @param request - the request whose rq_reply is sent (I, usually freed).
 * @return 0 when nothing was written, otherwise the result of
 *         dis_reply_write().
 *
 * Nothing is written for a negative socket or when rq_noreply is TRUE.
 * The request is kept only for the asynchronous modify/run/signal request
 * types that still expect a reply; in every other case it is freed.
 */
int reply_send_svr(
  
  struct batch_request *request)  /* I (freed) */

  {
  char              log_buf[LOCAL_LOG_BUF_SIZE];
  int               sfds = request->rq_conn;  /* socket */
  int               rc = 0;
  int               keep_request;

  /* Handle remote replies - local batch requests no longer create work tasks */
  if ((sfds >= 0) &&
      (request->rq_noreply != TRUE))
    {
    rc = dis_reply_write(sfds, &request->rq_reply);

    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "Reply sent for request type %s on socket %d",
        reqtype_to_txt(request->rq_type),
        sfds);

      log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
      }
    }

  /* asynchronous requests that still expect a reply stay alive */
  keep_request = ((request->rq_type == PBS_BATCH_AsyModifyJob) ||
                  (request->rq_type == PBS_BATCH_AsyrunJob) ||
                  (request->rq_type == PBS_BATCH_AsySignalJob)) &&
                 (request->rq_noreply != TRUE);

  if (!keep_request)
    free_br(request);

  return(rc);
  }  /* END reply_send_svr() */
示例#25
0
文件: reply_send.c 项目: dbeer/torque
/*
 * reply_send_mom - write the reply for a request to its connection and
 * free the request.
 *
 * @param request - the request whose rq_reply is sent (I, freed).
 * @return PBSE_SYSTEM for the local-connection sentinel, the result of
 *         dis_reply_write() for a non-negative socket, and 0 otherwise.
 */
int reply_send_mom(

  struct batch_request *request)  /* I (freed) */

  {
  int      sfds = request->rq_conn;  /* socket */
  int      rc;

  /* determine where the reply should go, remote or local */
  if (sfds == PBS_LOCAL_CONNECTION)
    {
    rc = PBSE_SYSTEM;
    }
  else if (sfds < 0)
    {
    /* no usable socket: nothing is written */
    rc = 0;
    }
  else
    {
    /* the reply is to be sent to a remote client */
    rc = dis_reply_write(sfds, &request->rq_reply);
    }

  free_br(request);

  return(rc);
  }  /* END reply_send_mom() */
示例#26
0
文件: req_delete.c 项目: dhill12/test
/*
 * post_delete_mom1 - first-phase handler for the MOM's reply to the
 * signal sent during a job delete.
 *
 * @param pwt - the work task; wt_parm1 identifies the signal request. The
 *              task and its wt_mutex are freed here.
 *
 * The signal request is released, then the original client request (whose
 * id travels in the signal request's rq_extra) is retrieved and answered:
 *   - job gone                 -> reject with PBSE_UNKJOBID
 *   - MOM replied PBSE_UNKJOBID -> release resources, purge the job, ack
 *   - MOM replied other error  -> reject with that error
 *   - success                  -> ack, then schedule post_delete_mom2
 *     after the kill delay and reschedule the delete nanny 60 seconds
 *     after that.
 */
static void post_delete_mom1(

  struct work_task *pwt)

  {
  int                   delay = 0;
  int                   dellen = strlen(deldelaystr);
  job                  *pjob;

  pbs_queue            *pque;

  char                 *preq_clt_id;
  struct batch_request *preq_sig;         /* signal request to MOM */

  struct batch_request *preq_clt = NULL;  /* original client request */
  int                   rc;
  time_t                time_now = time(NULL);

  preq_sig = get_remove_batch_request((char *)pwt->wt_parm1);
  
  free(pwt->wt_mutex);
  free(pwt);

  if (preq_sig == NULL)
    return;

  /* keep what is needed from the signal request before releasing it */
  rc          = preq_sig->rq_reply.brp_code;
  preq_clt_id = preq_sig->rq_extra;

  free_br(preq_sig);

  if (preq_clt_id != NULL)
    {
    preq_clt = get_remove_batch_request(preq_clt_id);
    free(preq_clt_id);
    }

  /* the client request has been handled another way, nothing left to do */
  if (preq_clt == NULL)
    return;

  pjob = svr_find_job(preq_clt->rq_ind.rq_delete.rq_objname, FALSE);

  if (pjob == NULL)
    {
    /* job has gone away */
    req_reject(PBSE_UNKJOBID, 0, preq_clt, NULL, NULL);

    return;
    }

  if (rc)
    {
    /* mom rejected request */

    if (rc == PBSE_UNKJOBID)
      {
      /* MOM claims no knowledge, so just purge it */
      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        "MOM rejected signal during delete");

      /* removed the resources assigned to job */

      free_nodes(pjob);

      set_resc_assigned(pjob, DECR);

      svr_job_purge(pjob);

      reply_ack(preq_clt);
      }
    else
      {
      req_reject(rc, 0, preq_clt, NULL, NULL);

      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
      }

    return;
    }

  /* an explicit delay may be given in the request's extend string */
  if (preq_clt->rq_extend)
    {
    if (strncmp(preq_clt->rq_extend, deldelaystr, dellen) == 0)
      {
      delay = atoi(preq_clt->rq_extend + dellen);
      }
    }

  reply_ack(preq_clt);  /* dont need it, reply now */

  /*
   * if no delay specified in original request, see if kill_delay
   * queue attribute is set.
   */
  if (delay == 0)
    {
    if ((pque = get_jobs_queue(&pjob)) != NULL)
      {
      pthread_mutex_lock(server.sv_attr_mutex);
      delay = attr_ifelse_long(&pque->qu_attr[QE_ATR_KillDelay],
                             &server.sv_attr[SRV_ATR_KillDelay],
                             2);
      pthread_mutex_unlock(server.sv_attr_mutex);
      unlock_queue(pque, __func__, NULL, LOGLEVEL);
      }
    else if (pjob == NULL)
      {
      /* get_jobs_queue() cleared the job pointer: there is no job left to
       * schedule the second phase for, and nothing to unlock. When the
       * job is still set we fall through with a zero delay so that it is
       * processed and unlocked below. */
      return;
      }
    }

  set_task(WORK_Timed, delay + time_now, post_delete_mom2, strdup(pjob->ji_qs.ji_jobid), FALSE);

  /*
   * Since the first signal has succeeded, let's reschedule the
   * nanny to be 1 minute after the second phase.
   */
  apply_job_delete_nanny(pjob, time_now + delay + 60);

  unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
  }  /* END post_delete_mom1() */
示例#27
0
文件: req_delete.c 项目: dhill12/test
/*
 * handle_delete_all - apply a delete request to every job in alljobs and
 * send one summary reply on the original request.
 *
 * @param preq     - the client's delete request; answered once at the end
 *                   (ack when no delete failed, reject otherwise).
 * @param preq_tmp - optional request that is acknowledged immediately;
 *                   when given, preq is marked rq_noreply.
 * @param Msg      - passed through to execute_job_delete().
 * @return always PBSE_NONE.
 *
 * Per-job work is done on duplicates of preq (marked rq_noreply) so that
 * the original is still available for the final reply.
 *
 * NOTE(review): duplicate_request() results are dereferenced without a
 * NULL check, and it is not visible here whether forced_jobpurge()
 * consumes preq_dup when it returns something other than PBSE_NONE or
 * PURGE_SUCCESS - confirm before changing the ownership logic.
 */
int handle_delete_all(

  struct batch_request *preq,
  struct batch_request *preq_tmp,
  char                 *Msg)

  {
  /* don't use the actual request so we can reply about all of the jobs */
  struct batch_request *preq_dup = duplicate_request(preq);
  job                  *pjob;
  int                   iter = -1;
  int                   failed_deletes = 0;
  int                   total_jobs = 0;
  int                   rc = PBSE_NONE;
  char                  tmpLine[MAXLINE];

  preq_dup->rq_noreply = TRUE;
  
  if (preq_tmp != NULL)
    {
    reply_ack(preq_tmp);
    preq->rq_noreply = TRUE; /* set for no more replies */
    }
  
  while ((pjob = next_job(&alljobs, &iter)) != NULL)
    {
    /* a successful forced purge needs no further handling for this job */
    if ((rc = forced_jobpurge(pjob, preq_dup)) == PURGE_SUCCESS)
      {
      continue;
      }

    /* jobs already exiting (or later) are skipped and not counted */
    if (pjob->ji_qs.ji_state >= JOB_STATE_EXITING)
      {
      unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
      
      continue;
      }
    
    total_jobs++;
    
    /* mutex is freed below */
    if (rc == PBSE_NONE)
      {
      if ((rc = execute_job_delete(pjob, Msg, preq_dup)) == PBSE_NONE)
        reply_ack(preq_dup);
       
      /* mark this as NULL because it has been freed */
      preq_dup = NULL;
      }
    
    if (rc != PURGE_SUCCESS)
      {
      /* duplicate the preq so we don't have a problem with double frees */
      preq_dup = duplicate_request(preq);
      preq_dup->rq_noreply = TRUE;
      
      /* only these two results count as failed deletes */
      if ((rc == MOM_DELETE) ||
          (rc == ROUTE_DELETE))
        failed_deletes++;
      }
    }
  
  if (failed_deletes == 0)
    {
    reply_ack(preq);

    /* PURGE SUCCESS means this was qdel -p all. In this case no reply_*() 
     * functions have been called */
    if (rc == PURGE_SUCCESS)
      {
      free_br(preq_dup);
      preq_dup = NULL;
      }
    }
  else
    {
    snprintf(tmpLine,sizeof(tmpLine),"Deletes failed for %d of %d jobs",
      failed_deletes,
      total_jobs);
    
    req_reject(PBSE_SYSTEM, 0, preq, NULL, tmpLine);
    }
    
  /* a fresh duplicate is created at the end of each loop pass, so free
   * the extra one if it is there */
  if (preq_dup != NULL)
    free_br(preq_dup);

  return(PBSE_NONE);
  } /* END handle_delete_all() */
示例#28
0
int stat_to_mom(

  char             *job_id,
  struct stat_cntl *cntl)  /* M */

  {
  struct batch_request *newrq;
  int                   rc = PBSE_NONE;
  unsigned long         addr;
  char                  log_buf[LOCAL_LOG_BUF_SIZE+1];
  struct pbsnode       *node;
  int                   handle = -1;
  unsigned long         job_momaddr = -1;
  unsigned short        job_momport = -1;
  char                 *job_momname = NULL;
  job                  *pjob = NULL;

  if ((pjob = svr_find_job(job_id, FALSE)) == NULL)
    return(PBSE_JOBNOTFOUND);

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  if ((pjob->ji_qs.ji_un.ji_exect.ji_momaddr == 0) || 
      (!pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str))
    {
    job_mutex.unlock();
    snprintf(log_buf, sizeof(log_buf),
      "Job %s missing MOM's information. Skipping statting on this job", pjob->ji_qs.ji_jobid);
    log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
    return PBSE_BAD_PARAMETER;
    }

  job_momaddr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr;
  job_momport = pjob->ji_qs.ji_un.ji_exect.ji_momport;
  job_momname = strdup(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str);
  job_mutex.unlock();

  if (job_momname == NULL)
    return PBSE_MEM_MALLOC;

  if ((newrq = alloc_br(PBS_BATCH_StatusJob)) == NULL)
    {
    free(job_momname);
    return PBSE_MEM_MALLOC;
    }

  if (cntl->sc_type == 1)
    snprintf(newrq->rq_ind.rq_status.rq_id, sizeof(newrq->rq_ind.rq_status.rq_id), "%s", job_id);
  else
    newrq->rq_ind.rq_status.rq_id[0] = '\0';  /* get stat of all */

  CLEAR_HEAD(newrq->rq_ind.rq_status.rq_attr);

  /* if MOM is down just return stale information */
  addr = job_momaddr;

  node = tfind_addr(addr,job_momport,job_momname);
  free(job_momname);

  if (node == NULL)
    return PBSE_UNKNODE;
  if ((node->nd_state & INUSE_DOWN)||(node->nd_power_state != POWER_STATE_RUNNING))
    {
    if (LOGLEVEL >= 6)
      {
      snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
          "node '%s' is allocated to job but in state 'down'",
          node->nd_name);

      log_event(PBSEVENT_SYSTEM,PBS_EVENTCLASS_JOB,job_id,log_buf);
      }

    unlock_node(node, __func__, "no rely mom", LOGLEVEL);
    free_br(newrq);

    return PBSE_NORELYMOM;
    }

  /* get connection to MOM */
  unlock_node(node, __func__, "before svr_connect", LOGLEVEL);
  handle = svr_connect(job_momaddr, job_momport, &rc, NULL, NULL);

  if (handle >= 0)
    {
    if ((rc = issue_Drequest(handle, newrq, true)) == PBSE_NONE)
      {
      stat_update(newrq, cntl);
      }
    }
  else
    rc = PBSE_CONNECT;

  if (rc == PBSE_SYSTEM)
    rc = PBSE_MEM_MALLOC;

  free_br(newrq);

  return(rc);
  }  /* END stat_to_mom() */
示例#29
0
void
process_request(int sfds)
{
	int		      rc;
	struct batch_request *request;
	conn_t		     *conn;


	time_now = time(NULL);

	conn = get_conn(sfds);

	if (!conn) {
		log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_REQUEST, LOG_ERR,
			"process_request", "did not find socket in connection table");
#ifdef WIN32
		(void)closesocket(sfds);
#else
		(void)close(sfds);
#endif
		return;
	}

	if ((request = alloc_br(0)) == NULL) {
		log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_REQUEST, LOG_ERR,
			"process_request", "Unable to allocate request structure");
		close_conn(sfds);
		return;
	}
	request->rq_conn = sfds;

	/*
	 * Read in the request and decode it to the internal request structure.
	 */

	if (get_connecthost(sfds, request->rq_host, PBS_MAXHOSTNAME)) {

		(void)sprintf(log_buffer, "%s: %lu", msg_reqbadhost,
			get_connectaddr(sfds));
		log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_REQUEST, LOG_DEBUG,
			"", log_buffer);
		req_reject(PBSE_BADHOST, 0, request);
		return;
	}

#ifndef PBS_MOM

	if (conn->cn_active == FromClientDIS) {
		rc = dis_request_read(sfds, request);
	} else {
		log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_REQUEST, LOG_ERR,
			"process_req", "request on invalid type of connection");
		close_conn(sfds);
		free_br(request);
		return;
	}
#else	/* PBS_MOM */
	rc = dis_request_read(sfds, request);
#endif	/* PBS_MOM */

	if (rc == -1) {		/* End of file */
		close_client(sfds);
		free_br(request);
		return;

	} else if ((rc == PBSE_SYSTEM) || (rc == PBSE_INTERNAL)) {

		/* read error, likely cannot send reply so just disconnect */

		/* ??? not sure about this ??? */

		close_client(sfds);
		free_br(request);
		return;

	} else if (rc > 0) {

		/*
		 * request didn't decode, either garbage or  unknown
		 * request type, in ether case, return reject-reply
		 */

		req_reject(rc, 0, request);
		close_client(sfds);
		return;
	}

#ifndef PBS_MOM
	/* If the request is coming on the socket we opened to the  */
	/* scheduler,  change the "user" from "root" to "Scheduler" */
	if (find_sched_from_sock(request->rq_conn) != NULL) {
		strncpy(request->rq_user, PBS_SCHED_DAEMON_NAME, PBS_MAXUSER);
		request->rq_user[PBS_MAXUSER] = '\0';
	}
#endif	/* PBS_MOM */

	(void)sprintf(log_buffer, msg_request, request->rq_type,
		request->rq_user, request->rq_host, sfds);
	log_event(PBSEVENT_DEBUG2, PBS_EVENTCLASS_REQUEST, LOG_DEBUG,
		"", log_buffer);

	/* is the request from a host acceptable to the server */
	if (request->rq_type == PBS_BATCH_AuthExternal) {
		rc = authenticate_external(conn, request);
		if (rc == 0)
			reply_ack(request);
		else if (rc == -2)
			req_reject(PBSE_NOSUP, 0, request);
		else
			req_reject(PBSE_BADCRED, 0, request);
		return;
	}

#ifndef PBS_MOM
	if (server.sv_attr[(int)SRV_ATR_acl_host_enable].at_val.at_long) {
		/* acl enabled, check it; always allow myself	*/

		struct pbsnode *isanode = NULL;
		if ((server.sv_attr[SRV_ATR_acl_host_moms_enable].at_flags & ATR_VFLAG_SET) &&
			(server.sv_attr[(int)SRV_ATR_acl_host_moms_enable].at_val.at_long == 1)) {
			isanode = find_nodebyaddr(get_connectaddr(sfds));

			if ((isanode != NULL) && (isanode->nd_state & INUSE_DELETED))
				isanode = NULL;
		}

		if (isanode == NULL) {
			if ((acl_check(&server.sv_attr[(int)SRV_ATR_acl_hosts],
				request->rq_host, ACL_Host) == 0) &&
				(strcasecmp(server_host, request->rq_host) != 0)) {
					req_reject(PBSE_BADHOST, 0, request);
					close_client(sfds);
					return;
			}
                }
	}

	/*
	 * determine source (user client or another server) of request.
	 * set the permissions granted to the client
	 */
	if (conn->cn_authen & PBS_NET_CONN_FROM_PRIVIL) {

		/* request came from another server */

		request->rq_fromsvr = 1;
		request->rq_perm = ATR_DFLAG_USRD | ATR_DFLAG_USWR |
				   ATR_DFLAG_OPRD | ATR_DFLAG_OPWR |
				   ATR_DFLAG_MGRD | ATR_DFLAG_MGWR |
				   ATR_DFLAG_SvWR;

	} else {

		/* request not from another server */

		request->rq_fromsvr = 0;

		/*
		 * Client must be authenticated by an Authenticate User Request,
		 * if not, reject request and close connection.
		 * -- The following is retained for compat with old cmds --
		 * The exception to this is of course the Connect Request which
		 * cannot have been authenticated, because it contains the
		 * needed ticket; so trap it here.  Of course, there is no
		 * prior authentication on the Authenticate User request either,
		 * but it comes over a reserved port and appears from another
		 * server, hence is automatically granted authorization.

		 */

		if (request->rq_type == PBS_BATCH_Connect) {
			req_connect(request);
			return;
		}

		if ((conn->cn_authen & PBS_NET_CONN_AUTHENTICATED) ==0) {
			rc = PBSE_BADCRED;
		} else {
			rc = authenticate_user(request, conn);
		}
		if (rc != 0) {
			req_reject(rc, 0, request);
			if (rc == PBSE_BADCRED)
				close_client(sfds);
			return;
		}

		request->rq_perm =
			svr_get_privilege(request->rq_user, request->rq_host);
	}

	/* if server shutting down, disallow new jobs and new running */

	if (server.sv_attr[(int)SRV_ATR_State].at_val.at_long > SV_STATE_RUN) {
		switch (request->rq_type) {
			case PBS_BATCH_AsyrunJob:
			case PBS_BATCH_JobCred:
			case PBS_BATCH_UserCred:
			case PBS_BATCH_UserMigrate:
			case PBS_BATCH_MoveJob:
			case PBS_BATCH_QueueJob:
			case PBS_BATCH_RunJob:
			case PBS_BATCH_StageIn:
			case PBS_BATCH_jobscript:
				req_reject(PBSE_SVRDOWN, 0, request);
				return;
		}
	}


#else	/* THIS CODE FOR MOM ONLY */

	/* check connecting host against allowed list of ok clients */
	if (!addrfind(conn->cn_addr)) {
		req_reject(PBSE_BADHOST, 0, request);
		close_client(sfds);
		return;
	}

	request->rq_fromsvr = 1;
	request->rq_perm = ATR_DFLAG_USRD | ATR_DFLAG_USWR |
			   ATR_DFLAG_OPRD | ATR_DFLAG_OPWR |
			   ATR_DFLAG_MGRD | ATR_DFLAG_MGWR |
			   ATR_DFLAG_SvWR | ATR_DFLAG_MOM;
#endif

	/*
	 * dispatch the request to the correct processing function.
	 * The processing function must call reply_send() to free
	 * the request structure.
	 */

	dispatch_request(sfds, request);
	return;
}
示例#30
0
文件: req_modify.c 项目: dhill12/test
/*
 * modify_whole_array()
 * Applies an attribute modification to every job of a job array.
 *
 * Walks pa->job_ids for ai_qs.array_size slots. Empty slots are skipped;
 * ids that svr_find_job() can no longer resolve are freed and cleared from
 * the array. Every job that is found is handed to modify_job() with
 * NO_MOM_RELAY. When modify_job() answers PBSE_RELAYED_TO_MOM, a copy of
 * preq is built with copy_batchrequest() and sent through relay_to_mom()
 * with post_modify_arrayreq as the completion callback.
 *
 * preq->rq_refcount is incremented once per relayed copy, plus one extra
 * time on the first relay; that extra count is dropped after the loop, and
 * preq is freed here if the count then reaches zero.
 *
 * @param pa             (I/O) array whose jobs are modified; stale job_ids
 *                       entries are freed and set to NULL
 * @param plist          (I) attribute list passed to modify_job()
 * @param preq           (I) originating request; its rq_refcount is adjusted
 * @param checkpoint_req (I) passed through to modify_job()
 *
 * @return PBSE_RELAYED_TO_MOM when at least one job was relayed and no
 *         relay step failed; the error from copy_batchrequest() or
 *         relay_to_mom() on the first such failure (the loop stops there);
 *         otherwise the result of the last modify_job() call, or 0 if
 *         modify_job() was never called.
 *
 * @SEE req_modify_array PARENT
 */ 
int modify_whole_array(

  job_array *pa,              /* I/O */
  svrattrl  *plist,           /* I */
  struct batch_request *preq, /* I */
  int        checkpoint_req)  /* I */

  {
  int   i;
  int   rc = 0;
  int   mom_relay = 0;  /* number of jobs for which a relay to the mom was attempted */
  char  log_buf[LOCAL_LOG_BUF_SIZE];
  job  *pjob;

  for (i = 0; i < pa->ai_qs.array_size; i++)
    {
    if (pa->job_ids[i] == NULL)
      continue;

    if ((pjob = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
      {
      /* the job is gone: drop its id so later passes skip this slot */
      free(pa->job_ids[i]);
      pa->job_ids[i] = NULL;
      }
    else
      {
      /* NO_MOM_RELAY will prevent modify_job from calling relay_to_mom */
      rc = modify_job((void **)&pjob, plist, preq, checkpoint_req, NO_MOM_RELAY);

      if (rc == PBSE_RELAYED_TO_MOM)
        {
        struct batch_request *array_req = NULL;
        /* We told modify_job not to call relay_to_mom
         * so we need to contact the mom */
        rc = copy_batchrequest(&array_req, preq, 0, i);
        if (rc != 0)
          {
          /* modify_job() was given &pjob and presumably may clear it;
           * guard the unlock the same way the other unlock sites in
           * this function do */
          if (pjob != NULL)
            unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

          /* NOTE(review): counts already added to preq->rq_refcount by
           * earlier iterations are not released on this path - confirm
           * the caller accounts for that */
          return(rc);
          }

        /* one reference for the copy about to be relayed ... */
        preq->rq_refcount++;
        if (mom_relay == 0)
          {
          /* ... and one extra on the first relay, released after the loop */
          preq->rq_refcount++;
          }
        mom_relay++;
        /* The array_req is freed in relay_to_mom (failure)
         * or in issue_Drequest (success) */
        if ((rc = relay_to_mom(&pjob, array_req, post_modify_arrayreq)))
          {
          /* relay_to_mom() was given &pjob; only log and unlock if the
           * job pointer is still set */
          if (pjob != NULL)
            {
            snprintf(log_buf,sizeof(log_buf),
              "Unable to relay information to mom for job '%s'\n",
              pjob->ji_qs.ji_jobid);
            log_err(rc, __func__, log_buf);
            unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL);
            }

          return(rc); /* unable to get to MOM */
          }
        }

      if (pjob != NULL)
        unlock_ji_mutex(pjob, __func__, "3", LOGLEVEL);
      }
    } /* END foreach job in array */

  if (mom_relay)
    {
    /* release the extra reference taken on the first relay */
    preq->rq_refcount--;
    if (preq->rq_refcount == 0)
      {
      free_br(preq);
      }
    return(PBSE_RELAYED_TO_MOM);
    }

  return(rc);
  } /* END modify_whole_array() */