Exemplo n.º 1
0
/*
 * issue_to_svr - resolve the named server, connect to it and issue the
 * batch request; if the failure is transient, schedule a timed retry.
 *
 * servern   - (I) server name, handed to parse_servername() together with
 *                 the default port pbs_server_port_dis
 * preq      - (I) request to send; rq_host, rq_fromsvr and rq_perm are
 *                 overwritten here before the request is issued
 * replyfunc - (I) callback passed to issue_Drequest(), or stored on the
 *                 retry work task in wt_parmfunc
 *
 * Returns the result of issue_Drequest() when the connection succeeds,
 * 0 when a retry has been scheduled, and -1 on failure (including the
 * case where the retry task could not be created).
 */
int issue_to_svr(

  char                 *servern,                  /* I */
  struct batch_request *preq,                     /* I */
  void (*replyfunc)    (struct work_task *))      /* I */

  {
  int   do_retry = 0;
  int   handle;
  pbs_net_t svraddr;
  char  *svrname;
  unsigned int  port = pbs_server_port_dis;

  struct work_task *pwt;

  /* Bounded copy so an over-long server name cannot overrun rq_host.
   * NOTE(review): assumes rq_host is a char array member of
   * struct batch_request (sizeof yields its capacity) - confirm. */
  strncpy(preq->rq_host, servern, sizeof(preq->rq_host) - 1);
  preq->rq_host[sizeof(preq->rq_host) - 1] = '\0';

  preq->rq_fromsvr = 1;
  preq->rq_perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR | ATR_DFLAG_SvWR;

  /* NOTE(review): ownership of the string returned by parse_servername()
   * is not visible from here; it is not freed, matching the original. */
  svrname = parse_servername(servern, &port);
  svraddr = get_hostaddr(svrname);

  if (svraddr == (pbs_net_t)0)
    {
    if (pbs_errno == PBS_NET_RC_RETRY)
      {
      /* Non fatal error - retry */

      do_retry = 1;
      }
    }
  else
    {
    handle = svr_connect(svraddr, port, process_Dreply, ToServerDIS);

    if (handle >= 0)
      {
      return(issue_Drequest(handle, preq, replyfunc, NULL));
      }
    else if (handle == PBS_NET_RC_RETRY)
      {
      do_retry = 1;
      }
    }

  /* if reached here, it didn't go, do we retry? */

  if (do_retry)
    {
    pwt = set_task(
            WORK_Timed,
            (long)(time_now + PBS_NET_RETRY_TIME),
            reissue_to_svr,
            (void *)preq);

    if (pwt == NULL)
      {
      /* could not create the retry task - report failure rather than
       * dereferencing a NULL work task */

      return(-1);
      }

    pwt->wt_parmfunc = replyfunc;

    return(0);
    }

  /* FAILURE */

  return(-1);
  }  /* END issue_to_svr() */
Exemplo n.º 2
0
/*
 * finalize_rerunjob - complete a rerun request once the outcome of the
 * requeue attempt (rc) is known.
 *
 * preq - the rerun batch request; it is rejected or acknowledged here
 * pjob - the job being rerun; PBSE_BAD_PARAMETER is returned if NULL
 * rc   - result of the preceding rerun attempt:
 *          -1           completed job was requeued
 *           0           requeue succeeded, substate becomes JOB_SUBSTATE_RERUN
 *          PBSE_SYSTEM  request rejected with PBSE_MEM_MALLOC
 *          other        rejected with PBSE_MOMREJECT unless rq_extend starts
 *                       with RERUNFORCE, in which case the job is forcibly
 *                       requeued
 *
 * Returns PBSE_BAD_PARAMETER, PBSE_MEM_MALLOC or PBSE_MOMREJECT on the
 * rejection paths; otherwise returns rc unchanged after acknowledging the
 * request and writing the accounting record (so a forced requeue still
 * returns the original non-zero rc).
 */
int finalize_rerunjob(

    batch_request *preq,
    job           *pjob,
    int            rc)

{
    int       Force;
    char      log_buf[LOCAL_LOG_BUF_SIZE];

    if (pjob == NULL)
        return(PBSE_BAD_PARAMETER);

    /* NOTE(review): presumably releases pjob->ji_mutex when it goes out of
     * scope; the meaning of the `true` argument is not visible here */
    mutex_mgr pjob_mutex = mutex_mgr(pjob->ji_mutex, true);

    if (preq->rq_extend && !strncasecmp(preq->rq_extend, RERUNFORCE, strlen(RERUNFORCE)))
        Force = 1;
    else
        Force = 0;

    switch (rc)
    {

    case -1:

        /* completed job was requeued */

        /* NOTE(review): the original comment here mentioned clearing the
           job completion time, but no code does so in this function */
        break;

    case 0:

        /* requeue request successful */

        pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

        break;

    case PBSE_SYSTEM: /* This may not be accurate...*/
        rc = PBSE_MEM_MALLOC;
        snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Can not allocate memory");
        req_reject(rc, 0, preq, NULL, log_buf);
        return rc;

    default:

        if (Force == 0)
        {
            rc = PBSE_MOMREJECT;
            snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Rejected by mom");
            req_reject(rc, 0, preq, NULL, log_buf);
            return rc;
        }
        else
        {
            int           newstate;
            int           newsubst;
            unsigned int  dummy;
            char         *tmp;
            size_t        used;
            long          cray_enabled = FALSE;

            get_svr_attr_l(SRV_ATR_CrayEnabled, &cray_enabled);

            /* pick the host name to report: the login node when Cray
               support is enabled and one is recorded, else the exec host */
            if ((cray_enabled == TRUE) &&
                    (pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str != NULL))
                tmp = parse_servername(pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str, &dummy);
            else
                tmp = parse_servername(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, &dummy);

            /* Cannot communicate with MOM, forcibly requeue job.
               This is a relatively disgusting thing to do */

            /* bounded format; never hand a NULL pointer to %s */
            snprintf(log_buf, sizeof(log_buf),
                     "rerun req to %s failed (rc=%d), forcibly requeueing job",
                     (tmp != NULL) ? tmp : "(null)", rc);

            /* the string returned by parse_servername() is released here */
            free(tmp);

            log_event(
                PBSEVENT_ERROR | PBSEVENT_ADMIN | PBSEVENT_JOB,
                PBS_EVENTCLASS_JOB,
                pjob->ji_qs.ji_jobid,
                log_buf);

            log_err(-1, __func__, log_buf);

            /* append the mail-only suffix without overrunning log_buf */
            used = strlen(log_buf);
            snprintf(log_buf + used, sizeof(log_buf) - used, "%s",
                     ", previous output files may be lost");

            svr_mailowner(pjob, MAIL_OTHER, MAIL_FORCE, log_buf);

            svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_RERUN3, FALSE);

            rel_resc(pjob); /* free resc assigned to job */

            pjob->ji_modified = 1;    /* force full job save */

            pjob->ji_momhandle = -1;
            pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_StagedIn;

            /* newstate/newsubst are filled in by svr_evaljobstate() */
            svr_evaljobstate(*pjob, newstate, newsubst, 0);
            svr_setjobstate(pjob, newstate, newsubst, FALSE);
        }

        break;
    }  /* END switch (rc) */

    /* drop the checkpoint-related flags and mark the job as having run */
    pjob->ji_qs.ji_svrflags = (pjob->ji_qs.ji_svrflags &
                               ~(JOB_SVFLG_CHECKPOINT_FILE |JOB_SVFLG_CHECKPOINT_MIGRATEABLE |
                                 JOB_SVFLG_CHECKPOINT_COPIED)) | JOB_SVFLG_HASRUN;

    snprintf(log_buf, sizeof(log_buf), msg_manager, msg_jobrerun, preq->rq_user, preq->rq_host);
    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    reply_ack(preq);

    /* note in accounting file */
    account_record(PBS_ACCT_RERUN, pjob, NULL);

    return rc;
}  /* END finalize_rerunjob() */