bool cqueue_calculate_summary(const lListElem *cqueue, 
                                     const lList *exechost_list,
                                     const lList *centry_list,
                                     double *load, 
                                     bool *is_load_available, 
                                     u_long32 *used,
                                     u_long32 *resv,
                                     u_long32 *total,
                                     u_long32 *suspend_manual, 
                                     u_long32 *suspend_threshold,
                                     u_long32 *suspend_on_subordinate,
                                     u_long32 *suspend_calendar,
                                     u_long32 *unknown,
                                     u_long32 *load_alarm, 
                                     u_long32 *disabled_manual,
                                     u_long32 *disabled_calendar,
                                     u_long32 *ambiguous,
                                     u_long32 *orphaned,
                                     u_long32 *error,
                                     u_long32 *available,
                                     u_long32 *temp_disabled,
                                     u_long32 *manual_intervention)

{
   bool ret = true;
   
   DENTER(TOP_LAYER, "cqueue_calculate_summary");
   if (cqueue != NULL) {
      lList *qinstance_list = lGetList(cqueue, CQ_qinstances);
      lListElem *qinstance = NULL;
      double host_load_avg = 0.0;
      u_long32 load_slots = 0;
      u_long32 used_available = 0;
      u_long32 used_slots = 0;
      u_long32 resv_slots = 0;

      *load = 0.0;
      *is_load_available = false;
      *used = *total = *resv = 0;
      *available = *temp_disabled = *manual_intervention = 0;
      *suspend_manual = *suspend_threshold = *suspend_on_subordinate = 0;
      *suspend_calendar = *unknown = *load_alarm = 0;
      *disabled_manual = *disabled_calendar = *ambiguous = 0;
      *orphaned = *error = 0; 
      for_each(qinstance, qinstance_list) {
         u_long32 slots = lGetUlong(qinstance, QU_job_slots);
         bool has_value_from_object;

         used_slots = qinstance_slots_used(qinstance);
         resv_slots = qinstance_slots_reserved_now(qinstance);
         (*used) += used_slots;
         (*resv) += resv_slots;
         (*total) += slots;

         if (!sge_get_double_qattr(&host_load_avg, LOAD_ATTR_NP_LOAD_AVG, 
                                   qinstance, exechost_list, centry_list, 
                                   &has_value_from_object)) {
            if (has_value_from_object) {
               *is_load_available = true;
               load_slots += slots;
               *load += host_load_avg * slots;
            } 
         } 

         /*
          * manual_intervention: cdsuE
          * temp_disabled: aoACDS
          */
         if (qinstance_state_is_manual_suspended(qinstance) ||
             qinstance_state_is_unknown(qinstance) ||
             qinstance_state_is_manual_disabled(qinstance) ||
             qinstance_state_is_ambiguous(qinstance) ||
             qinstance_state_is_error(qinstance)) {
            *manual_intervention += slots;
         } else if (qinstance_state_is_alarm(qinstance) ||
                    qinstance_state_is_cal_disabled(qinstance) ||
                    qinstance_state_is_orphaned(qinstance) ||
                    qinstance_state_is_susp_on_sub(qinstance) ||
                    qinstance_state_is_cal_suspended(qinstance) ||
                    qinstance_state_is_suspend_alarm(qinstance)) {
            *temp_disabled += slots;
         } else {
            *available += slots;
            used_available += used_slots;
         }
         if (qinstance_state_is_unknown(qinstance)) {
            *unknown += slots;
         }
         if (qinstance_state_is_alarm(qinstance)) {
            *load_alarm += slots;
         }
         if (qinstance_state_is_manual_disabled(qinstance)) {
            *disabled_manual += slots;
         }
         if (qinstance_state_is_cal_disabled(qinstance)) {
            *disabled_calendar += slots;
         }
         if (qinstance_state_is_ambiguous(qinstance)) {
            *ambiguous += slots;
         }
         if (qinstance_state_is_orphaned(qinstance)) {
            *orphaned += slots;
         }
         if (qinstance_state_is_manual_suspended(qinstance)) {
            *suspend_manual += slots;
         }
         if (qinstance_state_is_susp_on_sub(qinstance)) {
            *suspend_on_subordinate += slots;
         }
         if (qinstance_state_is_cal_suspended(qinstance)) {
            *suspend_calendar += slots;
         }
         if (qinstance_state_is_suspend_alarm(qinstance)) {
            *suspend_threshold += slots;
         }
         if (qinstance_state_is_error(qinstance)) {
            *error += slots;
         }
      }  
Exemple #2
0
/************************************************************************
 Master routine for job exit

 We need a rusage struct filled.
 In normal cases this is done by the execd, sending this structure
 to notify master about job finish.

 In case of an error noticed by the master which needs the job to be 
 removed we can fill this structure by hand. We need:

 rusage->job_number
 rusage->qname to clean up the queue (if we didn't find it we nevertheless
               clean up the job

 for functions regarding rusage see sge_rusage.c
 ************************************************************************/
void sge_job_exit(sge_gdi_ctx_class_t *ctx, lListElem *jr, lListElem *jep, lListElem *jatep, monitoring_t *monitor) 
{
   lListElem *queueep = NULL;
   const char *err_str = NULL;
   const char *qname = NULL;  
   const char *hostname = MSG_OBJ_UNKNOWNHOST;
   u_long32 jobid, jataskid;
   lListElem *hep = NULL;
   u_long32 timestamp;
   object_description *object_base = object_type_get_object_description();

   u_long32 failed, general_failure;
   lList *saved_gdil;

   DENTER(TOP_LAYER, "sge_job_exit");

   /* JG: TODO: we'd prefer some more precise timestamp, e.g. from jr */
   timestamp = sge_get_gmt();
                     
   qname = lGetString(jr, JR_queue_name);
   if (qname == NULL) {
      qname = (char *)MSG_OBJ_UNKNOWNQ;
   }

   err_str = lGetString(jr, JR_err_str);
   if (err_str == NULL) {
      err_str = MSG_UNKNOWNREASON;
   }

   jobid = lGetUlong(jr, JR_job_number);
   jataskid = lGetUlong(jr, JR_ja_task_number);
   failed = lGetUlong(jr, JR_failed);
   general_failure = lGetUlong(jr, JR_general_failure);

   cancel_job_resend(jobid, jataskid);

   /* This only has a meaning for Hibernator jobs. The job pid must
    * be saved accross restarts, since jobs get their old pid
    */
   lSetUlong(jatep, JAT_pvm_ckpt_pid, lGetUlong(jr, JR_job_pid));

   DPRINTF(("reaping job "sge_u32"."sge_u32" in queue >%s< job_pid %d\n", 
      jobid, jataskid, qname, (int) lGetUlong(jatep, JAT_pvm_ckpt_pid)));

   queueep = cqueue_list_locate_qinstance(*object_base[SGE_TYPE_CQUEUE].list, qname);
   if (queueep == NULL) {
      ERROR((SGE_EVENT, MSG_JOB_WRITEJFINISH_S, qname));
   }

   sge_job_remove_enforce_limit_trigger(jobid, jataskid);

   /* retrieve hostname for later use */
   if (queueep != NULL) {
      hostname = lGetHost(queueep, QU_qhostname);
   }

   if (failed) {        /* a problem occured */
      WARNING((SGE_EVENT, MSG_JOB_FAILEDONHOST_UUSSSS, sge_u32c(jobid), 
               sge_u32c(jataskid), 
               hostname,
               general_failure ? MSG_GENERAL : "",
               get_sstate_description(failed), err_str));
   } else {
      INFO((SGE_EVENT, MSG_JOB_JFINISH_UUS,  sge_u32c(jobid), sge_u32c(jataskid), hostname));
   }

   /*-------------------------------------------------*/

   /* test if this job is in state JRUNNING or JTRANSFERING */
   if (lGetUlong(jatep, JAT_status) != JRUNNING && 
       lGetUlong(jatep, JAT_status) != JTRANSFERING) {
      ERROR((SGE_EVENT, MSG_JOB_JEXITNOTRUN_UU, sge_u32c(lGetUlong(jep, JB_job_number)), sge_u32c(jataskid)));
      DRETURN_VOID;
   }

   saved_gdil = lCopyList("cpy", lGetList(jatep, JAT_granted_destin_identifier_list));

   /*
    * case 1: job being trashed because 
    *    --> failed starting interactive job
    *    --> job was deleted
    *    --> a failed batch job that explicitely shall not enter error state
    */
   if (((lGetUlong(jatep, JAT_state) & JDELETED) == JDELETED) ||
         (failed && !lGetString(jep, JB_exec_file)) ||
         (failed && general_failure==GFSTATE_JOB && JOB_TYPE_IS_NO_ERROR(lGetUlong(jep, JB_type)))) {
      reporting_create_acct_record(ctx, NULL, jr, jep, jatep, false);
      /* JG: TODO: we need more information in the log message */
      reporting_create_job_log(NULL, timestamp, JL_DELETED, MSG_EXECD, hostname, jr, jep, jatep, NULL, MSG_LOG_JREMOVED);

      sge_commit_job(ctx, jep, jatep, jr, COMMIT_ST_FINISHED_FAILED_EE, COMMIT_DEFAULT | COMMIT_NEVER_RAN, monitor);

      if (lGetUlong(jep, JB_ar) != 0 && (lGetUlong(jatep, JAT_state) & JDELETED) == JDELETED) {
         /* get AR and remove it if no other jobs are debited */
         lList *master_ar_list = *object_base[SGE_TYPE_AR].list;
         lListElem *ar = ar_list_locate(master_ar_list, lGetUlong(jep, JB_ar));

         if (ar != NULL && lGetUlong(ar, AR_state) == AR_DELETED) {
            lListElem *ar_queue;
            u_long32 ar_id = lGetUlong(ar, AR_id);

            for_each(ar_queue, lGetList(ar, AR_reserved_queues)) {
               if (qinstance_slots_used(ar_queue) != 0) {
                  break;
               }
            }
            if (ar_queue == NULL) {
               /* no jobs registered in advance reservation */
               dstring buffer = DSTRING_INIT;

               sge_dstring_sprintf(&buffer, sge_U32CFormat,
                                   sge_u32c(ar_id));

               ar_do_reservation(ar, false);

               reporting_create_ar_log_record(NULL, ar, ARL_DELETED, 
                                              "AR deleted",
                                              timestamp);
               reporting_create_ar_acct_records(NULL, ar, timestamp); 

               lRemoveElem(master_ar_list, &ar);

               sge_event_spool(ctx, NULL, 0, sgeE_AR_DEL, 
                      ar_id, 0, sge_dstring_get_string(&buffer), NULL, NULL,
                      NULL, NULL, NULL, true, true);
               sge_dstring_free(&buffer);
            }
         }