Ejemplo n.º 1
0
/*
 * account_jobend - write the end-of-job (PBS_ACCT_END) accounting record.
 *
 * The record text is assembled in a dynamic string in this order:
 *   1. the general job information packed by acct_job(),
 *   2. "session=<id> ",
 *   3. "alt_id=<id> " (only when the alternate-id attribute is set),
 *   4. "end=<time> "  (start time + recorded walltime when built with
 *      USESAVEDRESOURCES, otherwise the current time),
 *   5. the caller-supplied resource usage text.
 *
 * If any step fails, no record is written; the dynamic string is released
 * on every path once it has been obtained.
 *
 * pjob - job whose end is being recorded
 * used - job usage information, see req_jobobit()
 */
void account_jobend(

  job  *pjob,
  char *used) /* job usage information, see req_jobobit() */

  {
  time_t              time_now = time(NULL);
  dynamic_string     *ds;
  char                local_buf[MAXLINE * 4];
#ifdef USESAVEDRESOURCES
  pbs_attribute      *pattr;
  long                walltime_val = 0;
#endif

  /* pack in general information about the job */
  if ((ds = get_dynamic_string(-1, NULL)) == NULL)
    return;

  if ((acct_job(pjob, ds)) != PBSE_NONE)
    goto done;

  /* session */
  snprintf(local_buf, sizeof(local_buf), "session=%ld ",
    pjob->ji_wattr[JOB_ATR_session_id].at_val.at_long);

  if (append_dynamic_string(ds, local_buf) != PBSE_NONE)
    goto done;

  /* Alternate id if present */
  if (pjob->ji_wattr[JOB_ATR_altid].at_flags & ATR_VFLAG_SET)
    {
    /* the attribute string has no visible length limit, so the write
     * into the fixed-size buffer must be bounded */
    snprintf(local_buf, sizeof(local_buf), "alt_id=%s ",
      pjob->ji_wattr[JOB_ATR_altid].at_val.at_str);

    if (append_dynamic_string(ds, local_buf) != PBSE_NONE)
      goto done;
    }

  /* add the execution end time */
#ifdef USESAVEDRESOURCES
  pattr = &pjob->ji_wattr[JOB_ATR_resc_used];

  if (pattr->at_flags & ATR_VFLAG_SET)
    {
    resource *pres;
    char     *pname;

    pres = (resource *)GET_NEXT(pattr->at_val.at_list);
    
    /* find the walltime resource */
    for (;pres != NULL;pres = (resource *)GET_NEXT(pres->rs_link))
      {
      pname = pres->rs_defin->rs_name;
      
      if (strcmp(pname, "walltime") == 0)
        {
        /* found walltime */
        walltime_val = pres->rs_value.at_val.at_long;
        break;
        }
      }
    }

  /* walltime_val stays 0 when no walltime resource was found */
  snprintf(local_buf, sizeof(local_buf), "end=%ld ",
    (long)pjob->ji_qs.ji_stime + walltime_val);
#else
  snprintf(local_buf, sizeof(local_buf), "end=%ld ", (long)time_now);
#endif /* USESAVEDRESOURCES */

  if (append_dynamic_string(ds, local_buf) != PBSE_NONE)
    goto done;

  /* finally add on resources used from req_jobobit() */
  if (append_dynamic_string(ds, used) != PBSE_NONE)
    goto done;

  account_record(PBS_ACCT_END, pjob, ds->str);

done:
  /* single exit point once the dynamic string exists */
  free_dynamic_string(ds);
  }  /* END account_jobend() */
Ejemplo n.º 2
0
/*
 * finalize_rerunjob - finish a rerun request once the outcome (rc) of the
 * rerun attempt is known.
 *
 * preq - the client's batch request; rq_extend may carry RERUNFORCE
 * pjob - the job being rerun, may be NULL
 * rc   - result of the rerun attempt:
 *          -1           completed job was requeued
 *           0           requeue succeeded; substate becomes JOB_SUBSTATE_RERUN
 *          PBSE_SYSTEM  request rejected, PBSE_MEM_MALLOC returned
 *          other        rejected with PBSE_MOMREJECT unless the request was
 *                       forced, in which case the job is forcibly requeued
 *
 * Returns the (possibly remapped) rc. In the forced case the original
 * non-zero rc is returned even though the request is acknowledged.
 * When pjob is NULL on the non-rejecting paths, PBSE_JOB_RERUN is returned
 * and preq is neither acknowledged nor rejected here.
 *
 * Whenever pjob is non-NULL, unlock_ji_mutex() is called on it before
 * returning.
 */
int finalize_rerunjob(struct batch_request *preq,job *pjob,int rc)
  {
  int     Force;
  char    log_buf[LOCAL_LOG_BUF_SIZE];

  if (preq->rq_extend && !strncasecmp(preq->rq_extend, RERUNFORCE, strlen(RERUNFORCE)))
    Force = 1;
  else
    Force = 0;

  switch (rc)
    {

    case -1:

      /* completed job was requeued */

      /* clear out job completion time if there is one */
      break;

    case 0:

      /* requeue request successful */

      if (pjob != NULL)
        pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

      break;

    case PBSE_SYSTEM: /* This may not be accurate...*/
      rc = PBSE_MEM_MALLOC;
      snprintf(log_buf, sizeof(log_buf), "Can not allocate memory");
      req_reject(rc, 0, preq, NULL, log_buf);
      if (pjob != NULL)
        unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);
      return rc;

    default:

      if (Force == 0)
        {
        rc = PBSE_MOMREJECT;
        snprintf(log_buf, sizeof(log_buf), "Rejected by mom");
        req_reject(rc, 0, preq, NULL, log_buf);
        if (pjob != NULL)
          unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL);
        return rc;
        }
      else
        {
        int           newstate;
        int           newsubst;
        unsigned int  dummy;
        char         *tmp;
        size_t        len;
        long          cray_enabled = FALSE;
       
        if (pjob != NULL)
          {
          get_svr_attr_l(SRV_ATR_CrayEnabled, &cray_enabled);

          if ((cray_enabled == TRUE) &&
              (pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str != NULL))
            tmp = parse_servername(pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str, &dummy);
          else
            tmp = parse_servername(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, &dummy);
          
          /* Cannot communicate with MOM, forcibly requeue job.
             This is a relatively disgusting thing to do */

          /* parse_servername()'s failure behavior is not visible from here,
           * so never hand a NULL pointer to %s; the write is bounded because
           * the host name length is not controlled by this function */
          snprintf(log_buf, sizeof(log_buf),
            "rerun req to %s failed (rc=%d), forcibly requeueing job",
            (tmp != NULL) ? tmp : "unknown", rc);

          free(tmp);
  
          log_event(
            PBSEVENT_ERROR | PBSEVENT_ADMIN | PBSEVENT_JOB,
            PBS_EVENTCLASS_JOB,
            pjob->ji_qs.ji_jobid,
            log_buf);
          
          log_err(-1, __func__, log_buf);

          /* bounded append of the extra text used only for the mail */
          len = strlen(log_buf);
          snprintf(log_buf + len, sizeof(log_buf) - len, "%s",
            ", previous output files may be lost");
  
          svr_mailowner(pjob, MAIL_OTHER, MAIL_FORCE, log_buf);
  
          svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_RERUN3, FALSE);
  
          rel_resc(pjob); /* free resc assigned to job */
          
          if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_HOTSTART) == 0)
            {
            /* in case of server shutdown, don't clear exec_host */
            /* will use it on hotstart when next comes up        */
            
            job_attr_def[JOB_ATR_exec_host].at_free(&pjob->ji_wattr[JOB_ATR_exec_host]);
  
            job_attr_def[JOB_ATR_session_id].at_free(&pjob->ji_wattr[JOB_ATR_session_id]);
            
            job_attr_def[JOB_ATR_exec_gpus].at_free(&pjob->ji_wattr[JOB_ATR_exec_gpus]);          
            }
          
          pjob->ji_modified = 1;    /* force full job save */
          
          pjob->ji_momhandle = -1;
          pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_StagedIn;
          
          svr_evaljobstate(pjob, &newstate, &newsubst, 0);
          svr_setjobstate(pjob, newstate, newsubst, FALSE);
          }
        }

      break;
    }  /* END switch (rc) */

  /* So job has run and is to be rerun (not restarted) */
  if (pjob == NULL)
    {
    rc = PBSE_JOB_RERUN;
    }
  else
    {
    /* drop all checkpoint flags and remember that the job has run */
    pjob->ji_qs.ji_svrflags = (pjob->ji_qs.ji_svrflags &
        ~(JOB_SVFLG_CHECKPOINT_FILE |JOB_SVFLG_CHECKPOINT_MIGRATEABLE |
          JOB_SVFLG_CHECKPOINT_COPIED)) | JOB_SVFLG_HASRUN;
    
    snprintf(log_buf, sizeof(log_buf), msg_manager, msg_jobrerun, preq->rq_user, preq->rq_host);
    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    reply_ack(preq);
  
    /* note in accounting file */
    account_record(PBS_ACCT_RERUN, pjob, NULL);
    unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL);
    }

  return rc;
  }  /* END finalize_rerunjob() */
Ejemplo n.º 3
0
/*
 * execute_job_delete - carry out a delete request against a single job.
 *
 * pjob - job to delete (M); may be set to NULL by the helpers called here
 *        when the job goes away
 * Msg  - optional text from the request extension (I), appended to the
 *        owner mail and passed on to job_abt()/delete_inactive_job()
 * preq - the client's batch request (I)
 *
 * Flow, as visible in this function:
 *   - permission check (pjob is NULL afterwards when the check fails)
 *   - jobs in PRERUN / RERUN* substates: the delete is re-queued as a timed
 *     task one second later (returns ROUTE_DELETE); a static "cycle check"
 *     lets the delete proceed anyway once the same job has been stuck for
 *     more than 10 seconds
 *   - accounting record, event log and (for non-owners) mail
 *   - running jobs: install the delete nanny, send SIGTERM to the MOM and
 *     return -1; the reply is finished when the MOM answers
 *   - non-running jobs: optional forced-cancel task, optional release of one
 *     slot-limit-held sibling in the job array, then end-of-job processing
 *
 * Returns ROUTE_DELETE when the request was re-queued, PBSE_NONE when the
 * non-running path completed, and -1 otherwise.
 *
 * The job mutex is managed through job_mutex; whenever pjob has become NULL
 * the manager is told not to unlock on exit.
 */
int execute_job_delete(

  job                  *pjob,            /* M */
  char                 *Msg,             /* I */
  struct batch_request *preq)            /* I */

  {
  struct work_task *pwtnew;

  int               rc;
  const char      *sigt = "SIGTERM";
  const char      *del = "delete";


  char              log_buf[LOCAL_LOG_BUF_SIZE];
  time_t            time_now = time(NULL);
  long              force_cancel = FALSE;
  long              array_compatible = FALSE;

  chk_job_req_permissions(&pjob,preq);

  if (pjob == NULL)
    {
    /* preq is rejected in chk_job_req_permissions here */
    return(-1);
    }

  mutex_mgr job_mutex(pjob->ji_mutex, true);

  if (LOGLEVEL >= 10)
    log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_QUEUE, __func__, pjob->ji_qs.ji_jobid);

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /* see note in req_delete - not sure this is possible still,
     * but the deleted code is irrelevant now. I will leave this
     * part --dbeer */
    return(-1);
    }

  if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN1 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN2 ||
      pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN3 )
    {
    /* If JOB_SUBSTATE_PRERUN being sent to MOM, wait till she gets it going */
    /* retry in one second                            */
    /* If JOB_SUBSTATE_RERUN, RERUN1, RERUN2 or RERUN3 the
       job is being requeued. Wait until finished */

    static time_t  cycle_check_when = 0;
    static char    cycle_check_jid[PBS_MAXSVRJOBID + 1];

    if (cycle_check_when != 0)
      {
      if (!strcmp(pjob->ji_qs.ji_jobid, cycle_check_jid) &&
          (time_now - cycle_check_when > 10))
        {
        /* state not updated after 10 seconds */

        /* did the mom ever get it? delete it anyways... */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;

        goto jump;
        }

      if (time_now - cycle_check_when > 20)
        {
        /* give up after 20 seconds */

        cycle_check_jid[0] = '\0';
        cycle_check_when  = 0;
        }
      }    /* END if (cycle_check_when != 0) */

    if (cycle_check_when == 0)
      {
      /* new PRERUN job located */

      cycle_check_when = time_now;
      snprintf(cycle_check_jid, sizeof(cycle_check_jid), "%s", pjob->ji_qs.ji_jobid);
      }

    snprintf(log_buf, sizeof(log_buf),
      "job cannot be deleted, state=PRERUN, requeuing delete request");

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

    pwtnew = set_task(WORK_Timed,time_now + 1,post_delete_route,preq,FALSE);

    if (pwtnew == NULL)
      {
      req_reject(PBSE_SYSTEM, 0, preq, NULL, NULL);

      return(-1);
      }
    else
      {
      return(ROUTE_DELETE);
      }
    }  /* END if substate is PRERUN or one of the RERUN substates */

jump:

  /*
   * Log delete and if requesting client is not job owner, send mail.
   */

  snprintf(log_buf, sizeof(log_buf), "requestor=%s@%s", preq->rq_user, preq->rq_host);


  /* NOTE:  should annotate accounting record with extend message (NYI) */
  account_record(PBS_ACCT_DEL, pjob, log_buf);

  snprintf(log_buf, sizeof(log_buf), msg_manager, msg_deletejob, preq->rq_user, preq->rq_host);

  log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

  /* NOTE:  should incorporate job delete message */

  if (Msg != NULL)
    {
    /* have text message in request extension, add it */
    int len = strlen(log_buf);
    snprintf(log_buf + len, sizeof(log_buf) - len, "\n%s", Msg);
    }

  if ((svr_chk_owner(preq, pjob) != 0) &&
      (pjob->ji_has_delete_nanny == FALSE))
    {
    /* only send email if owner did not delete job and job deleted
       has not been previously attempted */

    svr_mailowner(pjob, MAIL_DEL, MAIL_FORCE, log_buf);
    /*
     * If we sent mail and already sent the extra message
     * then reset message so we don't trigger a redundant email
     * in job_abt()
    */

    if (Msg != NULL)
      {
      Msg = NULL;
      }
    }

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, change restart comment if failed */

    change_restart_comment_if_needed(pjob);
    }

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    /*
     * setup a nanny task to make sure the job is actually deleted (see the
     * comments at job_delete_nanny()).
     */

    if (pjob->ji_has_delete_nanny == TRUE)
      {
      req_reject(PBSE_IVALREQ, 0, preq, NULL, "job cancel in progress");

      return(-1);
      }

    apply_job_delete_nanny(pjob, time_now + 60);

    /*
     * Send signal request to MOM.  The server will automagically
     * pick up and "finish" off the client request when MOM replies.
     */
    get_batch_request_id(preq);

    if ((rc = issue_signal(&pjob, sigt, post_delete_mom1,strdup(del), strdup(preq->rq_id))))
      {
      /* cant send to MOM */

      req_reject(rc, 0, preq, NULL, NULL);
      }

    /* normally will ack reply when mom responds */
    if (pjob != NULL)
      {
      snprintf(log_buf, sizeof(log_buf), msg_delrunjobsig, sigt);
      log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);
      }
    else
      job_mutex.set_unlock_on_exit(false);

    return(-1);
    }  /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* make a cleanup task if set */
  get_svr_attr_l(SRV_ATR_JobForceCancelTime, &force_cancel);
  if (force_cancel > 0)
    {
    char *dup_jobid = strdup(pjob->ji_qs.ji_jobid);
 
    set_task(WORK_Timed, time_now + force_cancel, ensure_deleted, dup_jobid, FALSE);    
    }

  /* if configured, and this job didn't have a slot limit hold, free a job
   * held with the slot limit hold */
  get_svr_attr_l(SRV_ATR_MoabArrayCompatible, &array_compatible);
  if ((array_compatible != FALSE) &&
      ((pjob->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l) == FALSE))
    {
    if ((pjob->ji_arraystructid[0] != '\0') &&
        (pjob->ji_is_array_template == FALSE))
      {
      int        i;
      int        newstate;
      int        newsub;
      job       *tmp;
      job_array *pa = get_jobs_array(&pjob);

      if (pjob == NULL)
        {
        job_mutex.set_unlock_on_exit(false);
        return(-1);
        }

      /* keep a private copy of the id: the job is unlocked below and has to
       * be looked up again by name */
      std::string dup_job_id(pjob->ji_qs.ji_jobid);

      if (pa != NULL)
        {
        for (i = 0; i < pa->ai_qs.array_size; i++)
          {
          if (pa->job_ids[i] == NULL)
            continue;

          if (!strcmp(pa->job_ids[i], pjob->ji_qs.ji_jobid))
            continue;

          job_mutex.unlock();
          if ((tmp = svr_find_job(pa->job_ids[i], FALSE)) == NULL)
            {
            free(pa->job_ids[i]);
            pa->job_ids[i] = NULL;
            }
          else
            {
            if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long & HOLD_l)
              {
              tmp->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_l;

              if (tmp->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
                {
                tmp->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
                }

              svr_evaljobstate(*tmp, newstate, newsub, 1);
              svr_setjobstate(tmp, newstate, newsub, FALSE);
              job_save(tmp, SAVEJOB_FULL, 0);

              unlock_ji_mutex(tmp, __func__, "5", LOGLEVEL);
              pjob = svr_find_job((char *)dup_job_id.c_str(),FALSE);  //Job might have disappeared.
              job_mutex.set_lock_state(true);

              break;
              }

            unlock_ji_mutex(tmp, __func__, "6", LOGLEVEL);
            }
          if ((pjob = svr_find_job((char *)dup_job_id.c_str(),FALSE)) == NULL) //Job disappeared.
            {
            break;
            }
          job_mutex.set_lock_state(true);
          }

        /* both loop exits above can leave pjob == NULL (job disappeared
         * while it was unlocked), so it must be checked before use; the
         * array mutex is still released below in that case */
        if ((pjob != NULL) &&
            (pjob->ji_qs.ji_state != JOB_STATE_RUNNING))
          {
          long job_atr_hold = pjob->ji_wattr[JOB_ATR_hold].at_val.at_long;
          int job_exit_status = pjob->ji_qs.ji_un.ji_exect.ji_exitstat;
          int job_state = pjob->ji_qs.ji_state;

          job_mutex.unlock();
          update_array_values(pa,job_state,aeTerminate,
            (char*)dup_job_id.c_str(), job_atr_hold, job_exit_status);

          if((pjob = svr_find_job((char *)dup_job_id.c_str(),FALSE)) != NULL)
            job_mutex.mark_as_locked();
          }

        unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
        }
      }
    } /* END MoabArrayCompatible check */

  if (pjob == NULL)
    {
    job_mutex.set_unlock_on_exit(false);
    return -1;
    }

  depend_on_term(pjob);

  if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) != 0)
    {
    /* job has restart file at mom, do end job processing */
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_EXITING, FALSE);

    /* force new connection */
    pjob->ji_momhandle = -1;

    if (LOGLEVEL >= 7)
      {
      snprintf(log_buf, sizeof(log_buf), "calling on_job_exit from %s", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
      }

    set_task(WORK_Immed, 0, on_job_exit_task, strdup(pjob->ji_qs.ji_jobid), FALSE);
    }
  else if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn) != 0)
    {
    /* job has staged-in file, should remove them */
    remove_stagein(&pjob);

    job_mutex.set_unlock_on_exit(false);

    if (pjob != NULL)
      job_abt(&pjob, Msg);
    }

  /* NOTE(review): this call is reached after both branches above as well,
   * possibly with pjob already NULL; confirm that delete_inactive_job()
   * tolerates that, or whether it was meant to be a final else branch */
  delete_inactive_job(&pjob, Msg);

  if (pjob == NULL)
    job_mutex.set_unlock_on_exit(false);

  return(PBSE_NONE);
  } /* END execute_job_delete() */
Ejemplo n.º 4
0
static void process_hold_reply(

  struct work_task *pwt)
  {
  job       *pjob;

  struct batch_request *preq;
  int   newstate;
  int   newsub;
  attribute temphold;
  char *pset;
  int rc;

  svr_disconnect(pwt->wt_event); /* close connection to MOM */

  preq = pwt->wt_parm1;
  preq->rq_conn = preq->rq_orgconn;  /* restore client socket */

  if ((pjob = find_job(preq->rq_ind.rq_hold.rq_orig.rq_objname)) == (job *)0)
    {
    LOG_EVENT(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB,
              preq->rq_ind.rq_hold.rq_orig.rq_objname,
              msg_postmomnojob);
    req_reject(PBSE_UNKJOBID, 0, preq, NULL, msg_postmomnojob);
    }
  else if (preq->rq_reply.brp_code != 0)
    {

    rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &pset, &temphold);

    if (rc == 0)
      {
      rc = job_attr_def[(int)JOB_ATR_hold].at_set(&pjob->ji_wattr[(int)JOB_ATR_hold],
           &temphold, DECR);
      }

    pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING;  /* reset it */

    pjob->ji_modified = 1;    /* indicate attributes changed */
    svr_evaljobstate(pjob, &newstate, &newsub, 0);
    svr_setjobstate(pjob, newstate, newsub); /* saves job */

    if (preq->rq_reply.brp_code != PBSE_NOSUP)
      {
      sprintf(log_buffer, msg_mombadhold, preq->rq_reply.brp_code);
      LOG_EVENT(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB,
                pjob->ji_qs.ji_jobid, log_buffer);
      req_reject(preq->rq_reply.brp_code, 0, preq, NULL, log_buffer);
      }
    else
      {
      reply_ack(preq);
      }
    }
  else
    {
    /* record that MOM has a checkpoint file */

    /* PBS_CHECKPOINT_MIGRATEABLE is defined as zero therefore this code will never fire.
     * And if these flags are not set, start_exec will not try to run the job from
     * the checkpoint image file.
     */

    pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHECKPOINT_FILE;

    if (preq->rq_reply.brp_auxcode)  /* checkpoint can be moved */
      {
      pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHECKPOINT_FILE;
      pjob->ji_qs.ji_svrflags |=  JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_MIGRATEABLE;
      }

    pjob->ji_modified = 1;    /* indicate attributes changed     */

    svr_evaljobstate(pjob, &newstate, &newsub, 0);
    svr_setjobstate(pjob, newstate, newsub); /* saves job */

    account_record(PBS_ACCT_CHKPNT, pjob, "Checkpointed and held"); /* note in accounting file */
    reply_ack(preq);
    }
  }