Example #1
0
/****** qmaster/setup_qmaster/sge_setup_job_resend() ***************************
*  NAME
*     sge_setup_job_resend() -- Setup job resend events.
*
*  SYNOPSIS
*     void sge_setup_job_resend(void)
*
*  FUNCTION
*     Register a job resend event for each job or array task which does have a
*     'JTRANSFERING' status.
*
*  INPUTS
*     void - none
*
*  RESULT
*     void - none
*
*  NOTES
*     MT-NOTE: sge_setup_job_resend() is not MT safe
*
*******************************************************************************/
void sge_setup_job_resend(void)
{
    lListElem *job = NULL;
    object_description *object_base = object_type_get_object_description();

    DENTER(TOP_LAYER, "sge_setup_job_resend");

    job = lFirst(*object_base[SGE_TYPE_JOB].list);

    while (NULL != job)
    {
        lListElem *task;
        u_long32 job_num;

        job_num = lGetUlong(job, JB_job_number);

        task = lFirst(lGetList(job, JB_ja_tasks));

        while (NULL != task)
        {
            if (lGetUlong(task, JAT_status) == JTRANSFERING)
            {
                lListElem *granted_queue, *qinstance, *host;
                const char *qname;
                u_long32 task_num, when;
                te_event_t ev;

                task_num = lGetUlong(task, JAT_task_number);

                granted_queue = lFirst(lGetList(task, JAT_granted_destin_identifier_list));

                qname = lGetString(granted_queue, JG_qname);

                qinstance = cqueue_list_locate_qinstance(*object_base[SGE_TYPE_CQUEUE].list, qname);

                host = host_list_locate(*object_base[SGE_TYPE_EXECHOST].list, lGetHost(qinstance, QU_qhostname));

                when = lGetUlong(task, JAT_start_time);

                when += MAX(load_report_interval(host), MAX_JOB_DELIVER_TIME);

                ev = te_new_event((time_t)when, TYPE_JOB_RESEND_EVENT, ONE_TIME_EVENT, job_num, task_num, "job-resend_event");
                te_add_event(ev);
                te_free_event(&ev);

                DPRINTF(("Did add job resend for "sge_u32"/"sge_u32" at %d\n", job_num, task_num, when));
            }

            task = lNext(task);
        }

        job = lNext(job);
    }

    DEXIT;
    return;
} /* sge_setup_job_resend() */
Example #2
0
/************************************************************************
 Master routine for job exit

 We need a rusage struct filled.
 In normal cases this is done by the execd, sending this structure
 to notify master about job finish.

 In case of an error noticed by the master which needs the job to be 
 removed we can fill this structure by hand. We need:

 rusage->job_number
 rusage->qname to clean up the queue (if we didn't find it we nevertheless
               clean up the job

 for functions regarding rusage see sge_rusage.c
 ************************************************************************/
void sge_job_exit(sge_gdi_ctx_class_t *ctx, lListElem *jr, lListElem *jep, lListElem *jatep, monitoring_t *monitor) 
{
   lListElem *queueep = NULL;
   const char *err_str = NULL;
   const char *qname = NULL;  
   const char *hostname = MSG_OBJ_UNKNOWNHOST;
   u_long32 jobid, jataskid;
   lListElem *hep = NULL;
   u_long32 timestamp;
   object_description *object_base = object_type_get_object_description();

   u_long32 failed, general_failure;
   lList *saved_gdil;

   DENTER(TOP_LAYER, "sge_job_exit");

   /* JG: TODO: we'd prefer some more precise timestamp, e.g. from jr */
   timestamp = sge_get_gmt();
                     
   qname = lGetString(jr, JR_queue_name);
   if (qname == NULL) {
      qname = (char *)MSG_OBJ_UNKNOWNQ;
   }

   err_str = lGetString(jr, JR_err_str);
   if (err_str == NULL) {
      err_str = MSG_UNKNOWNREASON;
   }

   jobid = lGetUlong(jr, JR_job_number);
   jataskid = lGetUlong(jr, JR_ja_task_number);
   failed = lGetUlong(jr, JR_failed);
   general_failure = lGetUlong(jr, JR_general_failure);

   cancel_job_resend(jobid, jataskid);

   /* This only has a meaning for Hibernator jobs. The job pid must
    * be saved accross restarts, since jobs get their old pid
    */
   lSetUlong(jatep, JAT_pvm_ckpt_pid, lGetUlong(jr, JR_job_pid));

   DPRINTF(("reaping job "sge_u32"."sge_u32" in queue >%s< job_pid %d\n", 
      jobid, jataskid, qname, (int) lGetUlong(jatep, JAT_pvm_ckpt_pid)));

   queueep = cqueue_list_locate_qinstance(*object_base[SGE_TYPE_CQUEUE].list, qname);
   if (queueep == NULL) {
      ERROR((SGE_EVENT, MSG_JOB_WRITEJFINISH_S, qname));
   }

   sge_job_remove_enforce_limit_trigger(jobid, jataskid);

   /* retrieve hostname for later use */
   if (queueep != NULL) {
      hostname = lGetHost(queueep, QU_qhostname);
   }

   if (failed) {        /* a problem occured */
      WARNING((SGE_EVENT, MSG_JOB_FAILEDONHOST_UUSSSS, sge_u32c(jobid), 
               sge_u32c(jataskid), 
               hostname,
               general_failure ? MSG_GENERAL : "",
               get_sstate_description(failed), err_str));
   } else {
      INFO((SGE_EVENT, MSG_JOB_JFINISH_UUS,  sge_u32c(jobid), sge_u32c(jataskid), hostname));
   }

   /*-------------------------------------------------*/

   /* test if this job is in state JRUNNING or JTRANSFERING */
   if (lGetUlong(jatep, JAT_status) != JRUNNING && 
       lGetUlong(jatep, JAT_status) != JTRANSFERING) {
      ERROR((SGE_EVENT, MSG_JOB_JEXITNOTRUN_UU, sge_u32c(lGetUlong(jep, JB_job_number)), sge_u32c(jataskid)));
      DRETURN_VOID;
   }

   saved_gdil = lCopyList("cpy", lGetList(jatep, JAT_granted_destin_identifier_list));

   /*
    * case 1: job being trashed because 
    *    --> failed starting interactive job
    *    --> job was deleted
    *    --> a failed batch job that explicitely shall not enter error state
    */
   if (((lGetUlong(jatep, JAT_state) & JDELETED) == JDELETED) ||
         (failed && !lGetString(jep, JB_exec_file)) ||
         (failed && general_failure==GFSTATE_JOB && JOB_TYPE_IS_NO_ERROR(lGetUlong(jep, JB_type)))) {
      reporting_create_acct_record(ctx, NULL, jr, jep, jatep, false);
      /* JG: TODO: we need more information in the log message */
      reporting_create_job_log(NULL, timestamp, JL_DELETED, MSG_EXECD, hostname, jr, jep, jatep, NULL, MSG_LOG_JREMOVED);

      sge_commit_job(ctx, jep, jatep, jr, COMMIT_ST_FINISHED_FAILED_EE, COMMIT_DEFAULT | COMMIT_NEVER_RAN, monitor);

      if (lGetUlong(jep, JB_ar) != 0 && (lGetUlong(jatep, JAT_state) & JDELETED) == JDELETED) {
         /* get AR and remove it if no other jobs are debited */
         lList *master_ar_list = *object_base[SGE_TYPE_AR].list;
         lListElem *ar = ar_list_locate(master_ar_list, lGetUlong(jep, JB_ar));

         if (ar != NULL && lGetUlong(ar, AR_state) == AR_DELETED) {
            lListElem *ar_queue;
            u_long32 ar_id = lGetUlong(ar, AR_id);

            for_each(ar_queue, lGetList(ar, AR_reserved_queues)) {
               if (qinstance_slots_used(ar_queue) != 0) {
                  break;
               }
            }
            if (ar_queue == NULL) {
               /* no jobs registered in advance reservation */
               dstring buffer = DSTRING_INIT;

               sge_dstring_sprintf(&buffer, sge_U32CFormat,
                                   sge_u32c(ar_id));

               ar_do_reservation(ar, false);

               reporting_create_ar_log_record(NULL, ar, ARL_DELETED, 
                                              "AR deleted",
                                              timestamp);
               reporting_create_ar_acct_records(NULL, ar, timestamp); 

               lRemoveElem(master_ar_list, &ar);

               sge_event_spool(ctx, NULL, 0, sgeE_AR_DEL, 
                      ar_id, 0, sge_dstring_get_string(&buffer), NULL, NULL,
                      NULL, NULL, NULL, true, true);
               sge_dstring_free(&buffer);
            }
         }
Example #3
0
/************************************************************
  sge_mod_sharetree - Master code

  Modify a share tree
  lListElem *ep,   This is the new share tree 
  lList **lpp,     list to change 
 ************************************************************/
int sge_mod_sharetree(sge_gdi_ctx_class_t *ctx,
                      lListElem *ep, lList **lpp, lList **alpp, 
                      char *ruser, char *rhost ) 
{
   int ret;
   int prev_version;
   int adding; 
   lList *found = NULL;
   object_description *object_base = object_type_get_object_description();

   DENTER(TOP_LAYER, "sge_mod_sharetree");

   /* do some checks */
   if (!ep || !ruser || !rhost) {
      CRITICAL((SGE_EVENT, MSG_SGETEXT_NULLPTRPASSED_S, SGE_FUNC));
      answer_list_add(alpp, SGE_EVENT, STATUS_EUNKNOWN, ANSWER_QUALITY_ERROR);
      DEXIT;
      return STATUS_EUNKNOWN;
   }

   ret = check_sharetree(alpp, ep, *object_base[SGE_TYPE_USER].list, 
                         *object_base[SGE_TYPE_PROJECT].list, 
                         NULL, &found);
   lFreeList(&found);
   if (ret) {
      /* alpp gets filled by check_sharetree() */
      DRETURN(STATUS_EUNKNOWN);
   }

   /* check for presence of sharetree list and create if neccesary */
   if (!*lpp) {
      prev_version = 0;
      *lpp = lCreateList("sharetree list", STN_Type);
      adding = 1;
   } else if (lGetNumberOfElem(*lpp) == 0) {
      prev_version = 0;
      adding = 1;
   } else {
      lListElem *first = lFirst(*lpp);
      /* real action: change user or project
         We simply replace the old element with the new one. If there is no
         old we simle add the new. */
     
      prev_version = lGetUlong(lFirst(*lpp), STN_version); 
      lRemoveElem(*lpp, &first);
      adding = 0;
   }

   lSetUlong(ep, STN_version, prev_version+1);

   id_sharetree(alpp, ep, 0, NULL);

   /* now insert new element */
   lAppendElem(*lpp, lCopyElem(ep));
  
   /* write sharetree to file */
   if (!sge_event_spool(ctx,
                        alpp, 0, sgeE_NEW_SHARETREE,
                        0, 0, NULL, NULL, NULL,
                        ep, NULL, NULL, true, true)) {

      /* answer list gets filled in sge_event_spool() */
      DEXIT;
      return ret;
   }

   if (adding) {
      INFO((SGE_EVENT, MSG_STREE_ADDSTREE_SSII, 
         ruser, rhost, lGetNumberOfNodes(ep, NULL, STN_children), lGetNumberOfLeafs(ep, NULL, STN_children)));
   } else {
      INFO((SGE_EVENT, MSG_STREE_MODSTREE_SSII, 
         ruser, rhost, lGetNumberOfNodes(ep, NULL, STN_children), lGetNumberOfLeafs(ep, NULL, STN_children)));
   }

   answer_list_add(alpp, SGE_EVENT, STATUS_OK, ANSWER_QUALITY_INFO);

   DEXIT;
   return STATUS_OK;
}
/****** sge_advance_reservation/ar_validate() **********************************
*  NAME
*     ar_validate() -- validate a advance reservation
*
*  SYNOPSIS
*     bool ar_validate(lListElem *ar, lList **alpp, bool in_master)
*
*  FUNCTION
*     Ensures a new ar has valid start and end times
*
*  INPUTS
*     lListElem *ar   - the ar to check
*     lList **alpp    - answer list pointer
*     bool in_master  - are we in qmaster?
*     bool is_spool   - do we validate for spooling? 
*
*  RESULT
*     bool - true if OK, else false
*
*  NOTES
*     MT-NOTE: ar_validate() is MT safe
*******************************************************************************/
bool ar_validate(lListElem *ar, lList **alpp, bool in_master, bool is_spool)
{
   u_long32 start_time;
   u_long32 end_time;
   u_long32 duration;
   u_long32 now = sge_get_gmt();
   object_description *object_base = object_type_get_object_description();
   
   DENTER(TOP_LAYER, "ar_validate");

   /*   AR_start_time, SGE_ULONG        */
   if ((start_time = lGetUlong(ar, AR_start_time)) == 0) {
      start_time = now;
      lSetUlong(ar, AR_start_time, start_time);
   }

   /*   AR_end_time, SGE_ULONG        */
   end_time = lGetUlong(ar, AR_end_time);
   duration = lGetUlong(ar, AR_duration);
   
   if (end_time == 0 && duration == 0) {
      answer_list_add_sprintf(alpp, STATUS_EEXIST, ANSWER_QUALITY_ERROR,
                              MSG_AR_MISSING_VALUE_S, "end time or duration");
      goto ERROR;
   } else if (end_time == 0) {
      end_time = duration_add_offset(start_time, duration);
      duration = end_time  - start_time;
      lSetUlong(ar, AR_end_time, end_time);
      lSetUlong(ar, AR_duration, duration);
   } else if (duration == 0) {
      duration = end_time - start_time;
      lSetUlong(ar, AR_duration, duration);
   }

   if ((end_time - start_time) != duration) {
      answer_list_add_sprintf(alpp, STATUS_EEXIST, ANSWER_QUALITY_ERROR,
                              MSG_AR_START_END_DURATION_INVALID);
      goto ERROR;
   }

   if (start_time > end_time) {
      answer_list_add_sprintf(alpp, STATUS_EEXIST, ANSWER_QUALITY_ERROR,
                              MSG_AR_START_LATER_THAN_END);
      goto ERROR;
   }
   
   if (!is_spool) {
      if (start_time < now) {
         answer_list_add_sprintf(alpp, STATUS_EEXIST, ANSWER_QUALITY_ERROR,
                                 MSG_AR_START_IN_PAST);
         goto ERROR;
      }
   }
   /*   AR_owner, SGE_STRING */
   
   if (in_master) {
      /*    AR_name, SGE_STRING */
      NULL_OUT_NONE(ar, AR_name);
      if (object_verify_name(ar, alpp, AR_name, SGE_OBJ_AR)) {
         goto ERROR;
      }
      /*   AR_account, SGE_STRING */
      NULL_OUT_NONE(ar, AR_account);
      if (!lGetString(ar, AR_account)) {
         lSetString(ar, AR_account, DEFAULT_ACCOUNT);
      } else {
         if (verify_str_key(alpp, lGetString(ar, AR_account), MAX_VERIFY_STRING,
         "account string", QSUB_TABLE) != STATUS_OK) {
            goto ERROR;
         }
      }
      /*   AR_verify, SGE_ULONG              just verify the reservation or final case */
      /*   AR_error_handling, SGE_ULONG      how to deal with soft and hard exceptions */
      /*   AR_checkpoint_name, SGE_STRING    Named checkpoint */
      NULL_OUT_NONE(ar, AR_checkpoint_name);
      {
         /* request for non existing ckpt object will be refused */
         const char *ckpt_name = NULL;

         ckpt_name = lGetString(ar, AR_checkpoint_name);
         if (ckpt_name != NULL) {
            lList *master_ckpt_list = *object_base[SGE_TYPE_CKPT].list;
            lListElem *ckpt_ep = ckpt_list_locate(master_ckpt_list, ckpt_name);
            if (!ckpt_ep) {
               ERROR((SGE_EVENT, MSG_JOB_CKPTUNKNOWN_S, ckpt_name));
               answer_list_add(alpp, SGE_EVENT, STATUS_EUNKNOWN, ANSWER_QUALITY_ERROR);
               goto ERROR;
            }
          }
      }
      /*   AR_resource_list, SGE_LIST */
      {
         lList *master_centry_list = *object_base[SGE_TYPE_CENTRY].list;

         if (centry_list_fill_request(lGetList(ar, AR_resource_list),
                                      alpp, master_centry_list, false, true,
                                      false)) {
            goto ERROR;
         }
         if (compress_ressources(alpp, lGetList(ar, AR_resource_list), SGE_OBJ_AR)) {
            goto ERROR;
         }
         
         if (!centry_list_is_correct(lGetList(ar, AR_resource_list), alpp)) {
            goto ERROR;
         }
      }
      /*   AR_queue_list, SGE_LIST */
      if (!qref_list_is_valid(lGetList(ar, AR_queue_list), alpp)) {
         goto ERROR;
      }
      /*   AR_mail_options, SGE_ULONG   */
      /*   AR_mail_list, SGE_LIST */
      
      /*   AR_master_queue_list  -masterq wc_queue_list, SGE_LIST bind master task to queue(s) */
      if (!qref_list_is_valid(lGetList(ar, AR_master_queue_list), alpp)) {
         goto ERROR;
      }
       
      
      /*   AR_pe, SGE_STRING,  AR_pe_range, SGE_LIST */
      NULL_OUT_NONE(ar, AR_pe);
      {
         const char *pe_name = NULL;
         lList *pe_range = NULL;
         
         pe_name = lGetString(ar, AR_pe);
         if (pe_name) {
            const lListElem *pep;
            pep = pe_list_find_matching(*object_base[SGE_TYPE_PE].list, pe_name);
            if (!pep) {
               ERROR((SGE_EVENT, MSG_JOB_PEUNKNOWN_S, pe_name));
               answer_list_add(alpp, SGE_EVENT, STATUS_EUNKNOWN, ANSWER_QUALITY_ERROR);
               goto ERROR;
            }
            /* check pe_range */
            pe_range = lGetList(ar, AR_pe_range);
            if (object_verify_pe_range(alpp, pe_name, pe_range, SGE_OBJ_AR)!=STATUS_OK) {
               goto ERROR;
            }
         }
      }

      /*   AR_acl_list, SGE_LIST */
      if (userset_list_validate_access(lGetList(ar, AR_acl_list), ARA_name, alpp) != STATUS_OK) {
         goto ERROR;
      }
      
      /*   AR_xacl_list, SGE_LIST */
      if (userset_list_validate_access(lGetList(ar, AR_xacl_list), ARA_name, alpp) != STATUS_OK) {
         goto ERROR;
      }

      if (is_spool) {
         lListElem *jg;
         dstring cqueue_buffer = DSTRING_INIT;
         dstring hostname_buffer = DSTRING_INIT;
         for_each(jg, lGetList(ar, AR_granted_slots)){
            const char *hostname = NULL;
            const char *qname = lGetString(jg, JG_qname);
            bool has_hostname = false;
            bool has_domain = false;

            cqueue_name_split(qname, &cqueue_buffer, &hostname_buffer,
                              &has_hostname, &has_domain);
            hostname = sge_dstring_get_string(&hostname_buffer);
            lSetHost(jg, JG_qhostname, hostname);
         }
         sge_dstring_free(&cqueue_buffer);
         sge_dstring_free(&hostname_buffer);
      }
      /*   AR_type,  SGE_ULONG     */
      /*   AR_state, SGE_ULONG               state of the AR */
      if(lGetUlong(ar, AR_state) == ARL_UNKNOWN){
         lSetUlong(ar, AR_state, ARL_CREATION);  
      }
   }
   DRETURN(true);

ERROR:
   DRETURN(false);
}