/****** gdi/request_internal/sge_gdi_packet_get_pb_size() ********************
*  NAME
*     sge_gdi_packet_get_pb_size() -- returns the needed puckbuffer size 
*
*  SYNOPSIS
*     u_long32 sge_gdi_packet_get_pb_size(sge_gdi_packet_class_t *packet) 
*
*  FUNCTION
*     This function checks how lage a packbuffer needs to be so that
*     the given "packet" can be packed completely. 
*
*     This operation is needed to avoid continuous reallocation
*     when the real buffer should be allocated. 
*
*  INPUTS
*     sge_gdi_packet_class_t *packet - packet pointer 
*
*  RESULT
*     u_long32 - size in byte 
*
*  NOTES
*     MT-NOTE: sge_gdi_packet_get_pb_size() is MT safe 
*
*  SEE ALSO
*     gdi/request_internal/sge_gdi_packet_pack_task() 
*******************************************************************************/
u_long32
sge_gdi_packet_get_pb_size(sge_gdi_packet_class_t *packet)
{
   u_long32 ret = 0;

   DENTER(TOP_LAYER, "sge_gdi_packet_get_pb_size");
   if (packet != NULL) {
      bool local_ret;
      lList *local_answer_list = NULL;
      sge_pack_buffer pb;

      init_packbuffer(&pb, 0, 1);
      local_ret = sge_gdi_packet_pack(packet, &local_answer_list, &pb);
      if (local_ret) {
         ret = pb_used(&pb);
      }
      clear_packbuffer(&pb);
   }
   DRETURN(ret);
}
Exemple #2
0
int sge_send_ack_to_qmaster(sge_gdi_ctx_class_t *ctx, u_long32 type, u_long32 ulong_val, 
                            u_long32 ulong_val_2, const char *str, lList **alpp)
{
   int ret;
   sge_pack_buffer pb;
   const char* commproc = prognames[QMASTER];
   const char* rhost = ctx->get_master(ctx, false);
   int         id   = 1;
   
   DENTER(TOP_LAYER, "sge_gdi2_send_ack_to_qmaster");

   /* send an ack to the qmaster for the events */
   if (init_packbuffer(&pb, 1024, 0) != PACK_SUCCESS) {
      DRETURN(CL_RETVAL_MALLOC);
   }

   pack_ack(&pb, type, ulong_val, ulong_val_2, str);

   ret = sge_gdi2_send_any_request(ctx, 0, NULL, rhost, commproc, id, &pb, TAG_ACK_REQUEST, 0, alpp);
   clear_packbuffer(&pb);
   answer_list_output (alpp);

   DRETURN(ret);
}
void *
sge_worker_main(void *arg)
{
   bool do_endlessly = true;
   cl_thread_settings_t *thread_config = (cl_thread_settings_t*)arg;
   sge_gdi_ctx_class_t *ctx = NULL;
   monitoring_t monitor;
   monitoring_t *monitorp = &monitor;
   time_t next_prof_output = 0;

   DENTER(TOP_LAYER, "sge_worker_main");

   DPRINTF(("started"));
   cl_thread_func_startup(thread_config);
   sge_monitor_init(&monitor, thread_config->thread_name, GDI_EXT, MT_WARNING, MT_ERROR);
   sge_qmaster_thread_init(&ctx, QMASTER, WORKER_THREAD, true);

   /* register at profiling module */
   set_thread_name(pthread_self(), "Worker Thread");
   conf_update_thread_profiling("Worker Thread");
 
   while (do_endlessly) {
      sge_gdi_packet_class_t *packet = NULL;

      /*
       * Wait for packets. As long as packets are available cancelation 
       * of this thread is ignored. The shutdown procedure in the main 
       * thread takes care that packet producers will be terminated 
       * before all worker threads so that this won't be a problem.
       */
      MONITOR_IDLE_TIME(
         sge_tq_wait_for_task(Master_Task_Queue, 1, SGE_TQ_GDI_PACKET, (void *)&packet),
         &monitor, mconf_get_monitor_time(), mconf_is_monitor_message());

      MONITOR_SET_QLEN((monitorp), sge_tq_get_task_count(Master_Task_Queue));

      if (packet != NULL) {
         sge_gdi_task_class_t *task = packet->first_task;
         bool is_only_read_request = true;

         thread_start_stop_profiling();

#ifdef SEND_ANSWER_IN_LISTENER
#else
         /*
          * prepare buffer for sending an answer 
          */
         if (packet->is_intern_request == false && packet->is_gdi_request == true) {
            init_packbuffer(&(packet->pb), 0, 0);
         }
#endif

         MONITOR_MESSAGES((monitorp));

         if (packet->is_gdi_request == true) {
            /*
             * test if a write lock is necessary
             */
            task = packet->first_task;
            while (task != NULL) {
               u_long32 command = SGE_GDI_GET_OPERATION(task->command); 

               if (command != SGE_GDI_GET) {
                  is_only_read_request = false;
                  break;
               }
               task = task->next;            
            }
         } else {
            is_only_read_request = false;
         }

         /*
          * acquire the correct lock
          */
         if (is_only_read_request) {
            MONITOR_WAIT_TIME(SGE_LOCK(LOCK_GLOBAL, LOCK_READ), monitorp);
         } else {
            MONITOR_WAIT_TIME(SGE_LOCK(LOCK_GLOBAL, LOCK_WRITE), monitorp);
         }

         if (packet->is_gdi_request == true) {
            /*
             * do the GDI request
             */
            task = packet->first_task;
            while (task != NULL) {
               sge_c_gdi(ctx, packet, task, &(task->answer_list), &monitor);

               task = task->next;
            }
         } else {
            task = packet->first_task;
            sge_c_report(ctx, packet->host, packet->commproc, packet->commproc_id, 
                         task->data_list, &monitor);
         }

         /*
          * do unlock
          */
         if (is_only_read_request) {
            SGE_UNLOCK(LOCK_GLOBAL, LOCK_READ)
         } else {
            SGE_UNLOCK(LOCK_GLOBAL, LOCK_WRITE)
         }

         if (packet->is_gdi_request == true) {
#ifdef SEND_ANSWER_IN_LISTENER
            sge_gdi_packet_broadcast_that_handled(packet);
#else
            /*
             * Send the answer to the client
             */
            if (packet->is_intern_request == false) {
               MONITOR_MESSAGES_OUT(monitorp);
               sge_gdi2_send_any_request(ctx, 0, NULL,
                                         packet->host, packet->commproc, packet->commproc_id, 
                                         &(packet->pb), TAG_GDI_REQUEST, 
                                         packet->response_id, NULL);
               clear_packbuffer(&(packet->pb));
#  ifdef BLOCK_LISTENER
               sge_gdi_packet_broadcast_that_handled(packet);
#  else
               sge_gdi_packet_free(&packet);
#  endif
               /*
                * Code only for TS: 
                *
                * Following if-block will only be executed in testsuite if the qmaster
                * parameter __TEST_SLEEP_AFTER_REQUEST is defined. This will block the
                * worker thread if it handled a request. Only this makes sure that
                * other worker threads can handle incoming requests. Otherwise
                * it might be possible that one worker threads handles all requests
                * on fast qmaster hosts if testsuite is not fast enough to generate
                * gdi requests.
                */
               if (mconf_get_enable_test_sleep_after_request() == true) {
                  sleep(5);
               }
            } else {
               sge_gdi_packet_broadcast_that_handled(packet);
               /* this is an internal request, packet will get destroyed later,
                * where the caller waits for the answer
                * make sure it is no longer accessed here
                */
               packet = NULL;
            }
#endif
         } else {
            sge_gdi_packet_free(&packet);
         }
     
         thread_output_profiling("worker thread profiling summary:\n",
                                 &next_prof_output);

         sge_monitor_output(&monitor);
      } else { 
/****** sge_c_report() *******************************************************
*  NAME
*     sge_c_report() -- process execd load report
*
*  SYNOPSIS
*     void sge_c_report(char *rhost, char *commproc, int id, lList *report_list)
*
*  FUNCTION
*
*  INPUTS
*     char *rhost
*     char *commproc
*     int id
*     lList *report_list
*
*  RESULT
*     void - nothing
*
*  NOTES
*     MT-NOTE: sge_c_report() is MT safe
*
******************************************************************************/
void sge_c_report(sge_gdi_ctx_class_t *ctx, char *rhost, char *commproc, int id, lList *report_list, monitoring_t *monitor)
{
   lListElem *hep = NULL;
   u_long32 rep_type;
   lListElem *report;
   int ret = 0;
   u_long32 this_seqno, last_seqno;
   u_long32 rversion;
   sge_pack_buffer pb;   
   bool is_pb_used = false;
   bool send_tag_new_conf = false;

   DENTER(TOP_LAYER, "sge_c_report");

   if (lGetNumberOfElem(report_list) == 0) {
      DPRINTF(("received empty report\n"));
      if (rhost != NULL) {
         WARNING((SGE_EVENT, MSG_QMASTER_RECEIVED_EMPTY_LOAD_REPORT_S, rhost));
      } else {
         WARNING((SGE_EVENT, MSG_QMASTER_RECEIVED_EMPTY_LOAD_REPORT_S, "unknown"));
      }
      DRETURN_VOID;
   }

   /* accept reports only from execd's */
   if (strcmp(prognames[EXECD], commproc)) {
      ERROR((SGE_EVENT, MSG_GOTSTATUSREPORTOFUNKNOWNCOMMPROC_S, commproc));
      DRETURN_VOID;
   }

   /* do not process load reports from old execution daemons */
   rversion = lGetUlong(lFirst(report_list), REP_version);
   if (verify_request_version(NULL, rversion, rhost, commproc, id)) {
      DRETURN_VOID;
   }
   
   this_seqno = lGetUlong(lFirst(report_list), REP_seqno);

   /* need exec host for all types of reports */
   if (!(hep = host_list_locate(*object_type_get_master_list(SGE_TYPE_EXECHOST), rhost))) {
      ERROR((SGE_EVENT, MSG_GOTSTATUSREPORTOFUNKNOWNEXECHOST_S, rhost));
      DRETURN_VOID;
   }

   /* prevent old reports being proceeded 
      frequent loggings of outdated reports can be an indication 
      of too high message traffic arriving at qmaster */ 
   last_seqno = lGetUlong(hep, EH_report_seqno);

   if ((this_seqno < last_seqno && (last_seqno - this_seqno) <= 9000) &&
      !(last_seqno > 9990 && this_seqno < 10)) {
      /* this must be an old report, log and then ignore it */
      INFO((SGE_EVENT, MSG_QMASTER_RECEIVED_OLD_LOAD_REPORT_UUS, 
               sge_u32c(this_seqno), sge_u32c(last_seqno), rhost));
      DRETURN_VOID;
   }
   
   lSetUlong(hep, EH_report_seqno, this_seqno);

   /* RU: */
   /* tag all reschedule_unknown list entries we hope to 
      hear about in that job report */
   update_reschedule_unknown_list(ctx, hep);

   /*
   ** process the reports one after the other
   ** usually there will be a load report
   ** and a configuration version report
   */
   for_each(report, report_list) {
      rep_type = lGetUlong(report, REP_type);

      switch (rep_type) {
      case NUM_REP_REPORT_LOAD:
      case NUM_REP_FULL_REPORT_LOAD:
         MONITOR_ELOAD(monitor); 
         /* Now handle execds load reports */
         if (lGetUlong(hep, EH_lt_heard_from) == 0 && rep_type != NUM_REP_FULL_REPORT_LOAD) {
            host_notify_about_full_load_report(ctx, hep);
         } else {
            if (!is_pb_used) {
               is_pb_used = true;
               init_packbuffer(&pb, 1024, 0);
            }
            sge_update_load_values(ctx, rhost, lGetList(report, REP_list));

            if (mconf_get_simulate_execds()) {
               lList *master_exechost_list = *object_type_get_master_list(SGE_TYPE_EXECHOST);
               lListElem *shep;
               lListElem *simhostElem=NULL; 

               for_each(shep, master_exechost_list) {
                  simhostElem = lGetSubStr(shep, CE_name, "load_report_host", EH_consumable_config_list);
                  if (simhostElem != NULL) {
                     const char *real_host = lGetString(simhostElem, CE_stringval);
                     if (real_host != NULL && sge_hostcmp(real_host, rhost) == 0) {
                        const char* sim_host = lGetHost(shep, EH_name);
                        lListElem *clp = NULL;

                        DPRINTF(("Copy load values of %s to simulated host %s\n",
                                rhost, sim_host));

                        for_each(clp, lGetList(report, REP_list)) {
                           if (strcmp(lGetHost(clp, LR_host), SGE_GLOBAL_NAME) != 0) {
                              lSetHost(clp, LR_host, sim_host);
                           }
                        }
                        sge_update_load_values(ctx, sim_host, lGetList(report, REP_list));
                     }
                  }
               }
            }

            pack_ack(&pb, ACK_LOAD_REPORT, this_seqno, 0, NULL);
         }
         break;
      case NUM_REP_REPORT_CONF: 
         MONITOR_ECONF(monitor); 
         if (sge_compare_configuration(hep, lGetList(report, REP_list)) != 0) {
            DPRINTF(("%s: configuration on host %s is not up to date\n", SGE_FUNC, rhost));
            send_tag_new_conf = true;
         }
         break;
         
      case NUM_REP_REPORT_PROCESSORS:
         /*
         ** save number of processors
         */
         MONITOR_EPROC(monitor);
         ret = update_license_data(ctx, hep, lGetList(report, REP_list)); 
         if (ret) {
            ERROR((SGE_EVENT, MSG_LICENCE_ERRORXUPDATINGLICENSEDATA_I, ret));
         }
         break;

      case NUM_REP_REPORT_JOB:
         MONITOR_EJOB(monitor);
         if (!is_pb_used) {
            is_pb_used = true;
            init_packbuffer(&pb, 1024, 0);
         }
         process_job_report(ctx, report, hep, rhost, commproc, &pb, monitor);
         break;

      default:   
         DPRINTF(("received invalid report type %ld\n", (long) rep_type));
      }
Exemple #5
0
/****** gdi/sge/sge_qexecve() ************************************************
*  NAME
*     sge_qexecve() -- start a task in a tightly integrated par. job
*
*  SYNOPSIS
*     sge_tid_t sge_qexecve(const char *hostname, const char *queuename,
*                           const char *cwd, const lList *environment
*                           const lList *path_aliases)
*
*  FUNCTION
*     Starts a task in a tightly integrated job.
*     Builds a job object describing the task,
*     connects to the commd on the targeted execution host,
*     deliveres the job object and waits for an answer.
*     The answer from the execution daemon on the execution host
*     contains a task id that is returned to the caller of the function.
*
*  INPUTS
*     const char *hostname - name of the host on which to start the task
*     const lList *environment  - list containing environment variable
*                            settings for the task that override the
*                            default environment
*     const lList *path_aliases - optional a path alias list
*
*  RESULT
*     sge_tid_t - the task id, if the task can be executed,
*                 a value <= 0 indicates an error.
*
*  NOTES
*     MT-NOTE: sge_qexecve() is not MT safe
******************************************************************************/
sge_tid_t sge_qexecve(sge_gdi_ctx_class_t *ctx,
                      const char *hostname, const char *queuename,
                      const char *cwd, const lList *environment,
                      const lList *path_aliases)
{
    char myname[256];
    const char *s;
    int ret, uid;
    sge_tid_t tid = NULL;
    lListElem *petrep;
    lListElem *rt;
    sge_pack_buffer pb;
    u_long32 jobid, jataskid;
    u_long32 dummymid = 0;
    const char *env_var_name = "SGE_TASK_ID";

    DENTER(TOP_LAYER, "sge_qexecve");

    if (hostname == NULL) {
        sprintf(lasterror, MSG_GDI_INVALIDPARAMETER_SS, "sge_qexecve", "hostname");
        DRETURN(NULL);
    }

    /* resolve user */
    if (sge_uid2user((uid=getuid()), myname, sizeof(myname)-1, MAX_NIS_RETRIES)) {
        sprintf(lasterror, MSG_GDI_RESOLVINGUIDTOUSERNAMEFAILED_IS ,
                uid, strerror(errno));
        DRETURN(NULL);
    }

    if ((s=getenv("JOB_ID")) == NULL) {
        sprintf(lasterror, MSG_GDI_MISSINGINENVIRONMENT_S, "JOB_ID");
        DRETURN(NULL);
    }

    if (sscanf(s, sge_u32, &jobid) != 1) {
        sprintf(lasterror, MSG_GDI_STRINGISINVALID_SS, s, "JOB_ID");
        DRETURN(NULL);
    }

    if ((s=getenv(env_var_name)) != NULL) {
        if (strcmp(s, "undefined") == 0) {
            jataskid = 1;
        } else {
            if (sscanf(s, sge_u32, &jataskid) != 1) {
                sprintf(lasterror, MSG_GDI_STRINGISINVALID_SS, s, env_var_name);
                DRETURN(NULL);
            }
        }
    } else {
        sprintf(lasterror, MSG_GDI_MISSINGINENVIRONMENT_S, env_var_name);
        DRETURN(NULL);
    }

    /* ---- build up pe task request structure (see gdilib/sge_petaskL.h) */
    petrep = lCreateElem(PETR_Type);

    lSetUlong(petrep, PETR_jobid, jobid);
    lSetUlong(petrep, PETR_jataskid, jataskid);
    lSetString(petrep, PETR_owner, myname);
    lSetUlong(petrep, PETR_submission_time, sge_get_gmt());

    if (cwd != NULL) {
        lSetString(petrep, PETR_cwd, cwd);
    }

    if (environment != NULL) {
        lSetList(petrep, PETR_environment, lCopyList("environment", environment));
    }

    if (path_aliases != NULL) {
        lSetList(petrep, PETR_path_aliases, lCopyList("path_aliases", path_aliases));
    }


    if (queuename != NULL) {
        lSetString(petrep, PETR_queuename, queuename);
    }

    if (init_packbuffer(&pb, 1024, 0) != PACK_SUCCESS) {
        lFreeElem(&petrep);
        sprintf(lasterror, SFNMAX, MSG_GDI_OUTOFMEMORY);
        DRETURN(NULL);
    }

    pack_job_delivery(&pb, petrep);

    ret = gdi2_send_message_pb(ctx, 1, prognames[EXECD], 1, hostname,
                               TAG_JOB_EXECUTION, &pb, &dummymid);

    clear_packbuffer(&pb);

    lFreeElem(&petrep);

    if (ret != CL_RETVAL_OK) {
        sprintf(lasterror, MSG_GDI_SENDTASKTOEXECDFAILED_SS, hostname, cl_get_error_text(ret));
        DRETURN(NULL);
    }

    /* add list into our remote task list */
    rt = lAddElemStr(&remote_task_list, RT_tid, "none", RT_Type);
    lSetHost(rt, RT_hostname, hostname);
    lSetUlong(rt, RT_state, RT_STATE_WAIT4ACK);

    rcv_from_execd(ctx, OPT_SYNCHRON, TAG_JOB_EXECUTION);

    tid = (sge_tid_t) lGetString(rt, RT_tid);

    if (strcmp(tid, "none") == 0) {
        tid = NULL;
        sprintf(lasterror, MSG_GDI_EXECDONHOSTDIDNTACCEPTTASK_S, hostname);
    }

    /* now close message to execd */
    cl_commlib_shutdown_handle(cl_com_get_handle("execd_handle", 0), false);

    DRETURN(tid);
}