Esempio n. 1
0
static int shadowd_is_old_master_enrolled(int sge_test_heartbeat, int sge_qmaster_port, char *oldqmaster)
{
   cl_com_handle_t* handle = NULL;
   cl_com_SIRM_t* status = NULL;
   int ret;
   int is_up_and_running = 0;
   int commlib_error = CL_RETVAL_OK;

   DENTER(TOP_LAYER, "shadowd_is_old_master_enrolled");

   /*
    * This is for testsuite testing to simulate qmaster outage.
    * For testing the environment variable SGE_TEST_HEARTBEAT_TIMEOUT
    * has to be set!
    */
   if (sge_test_heartbeat > 0) {
      DRETURN(is_up_and_running);
   }

   handle=cl_com_create_handle(&commlib_error, CL_CT_TCP, CL_CM_CT_MESSAGE, CL_FALSE, sge_qmaster_port, CL_TCP_DEFAULT,(char*)prognames[SHADOWD] , 0, 1,0 );
   if (handle == NULL) {
      CRITICAL((SGE_EVENT, SFNMAX, cl_get_error_text(commlib_error)));
      DRETURN(is_up_and_running);
   }

   DPRINTF(("Try to send status information message to previous master host "SFQ" to port %ld\n", oldqmaster, sge_qmaster_port));
   ret = cl_commlib_get_endpoint_status(handle,oldqmaster ,(char*)prognames[QMASTER] , 1, &status);
   if (ret != CL_RETVAL_OK) {
      DPRINTF(("cl_commlib_get_endpoint_status() returned "SFQ"\n", cl_get_error_text(ret)));
      is_up_and_running = 0;
      DPRINTF(("old qmaster not responding - No master found\n"));   
   } else {
      DPRINTF(("old qmaster is still running\n"));   
      is_up_and_running = 1;
   }

   if (status != NULL) {
      DPRINTF(("endpoint is up since %ld seconds and has status %ld\n", status->runtime, status->application_status));
      cl_com_free_sirm_message(&status);
   }
 
   cl_commlib_shutdown_handle(handle,CL_FALSE);

   DRETURN(is_up_and_running);
}
Esempio n. 2
0
/*-------------------------------------------------------------------------*/
int main(int argc, char **argv)
{
   int ret;
   int my_pid;
   int ret_val;
   int printed_points = 0;
   int max_enroll_tries;
   static char tmp_err_file_name[SGE_PATH_MAX];
   time_t next_prof_output = 0;
   int execd_exit_state = 0;
   lList **master_job_list = NULL;
   sge_gdi_ctx_class_t *ctx = NULL;
   lList *alp = NULL;

   DENTER_MAIN(TOP_LAYER, "execd");

#if defined(LINUX)
   gen_procList ();
#endif

   prof_mt_init();

   set_thread_name(pthread_self(),"Execd Thread");

   prof_set_level_name(SGE_PROF_CUSTOM1, "Execd Thread", NULL); 
   prof_set_level_name(SGE_PROF_CUSTOM2, "Execd Dispatch", NULL); 

#ifdef __SGE_COMPILE_WITH_GETTEXT__  
   /* init language output for gettext() , it will use the right language */
   sge_init_language_func((gettext_func_type)        gettext,
                         (setlocale_func_type)      setlocale,
                         (bindtextdomain_func_type) bindtextdomain,
                         (textdomain_func_type)     textdomain);
   sge_init_language(NULL,NULL);   
#endif /* __SGE_COMPILE_WITH_GETTEXT__  */

   /* This needs a better solution */
   umask(022);
      
   /* Initialize path for temporary logging until we chdir to spool */
   my_pid = getpid();
   sprintf(tmp_err_file_name,"%s."sge_U32CFormat"", TMP_ERR_FILE_EXECD, sge_u32c(my_pid));
   log_state_set_log_file(tmp_err_file_name);

   /* exit func for SGE_EXIT() */
   sge_sig_handler_in_main_loop = 0;
   sge_setup_sig_handlers(EXECD);

   if (sge_setup2(&ctx, EXECD, MAIN_THREAD, &alp, false) != AE_OK) {
      answer_list_output(&alp);
      SGE_EXIT((void**)&ctx, 1);
   }
   ctx->set_exit_func(ctx, execd_exit_func);
   
#if defined(SOLARIS)
   /* Init shared SMF libs if necessary */
   if (sge_smf_used() == 1 && sge_smf_init_libs() != 0) {
       SGE_EXIT((void**)&ctx, 1);
   }
#endif

   /* prepare daemonize */
   if (!getenv("SGE_ND")) {
      sge_daemonize_prepare(ctx);
   }

   if ((ret=sge_occupy_first_three())>=0) {
      CRITICAL((SGE_EVENT, MSG_FILE_REDIRECTFD_I, ret));
      SGE_EXIT((void**)&ctx, 1);
   }

   lInit(nmv);

   /* unset XAUTHORITY if set */
   if (getenv("XAUTHORITY") != NULL) {
      sge_unsetenv("XAUTHORITY");
   }

   parse_cmdline_execd(argv);   
   
   /* exit if we can't get communication handle (bind port) */
   max_enroll_tries = 30;
   while (cl_com_get_handle(prognames[EXECD],1) == NULL) {
      ctx->prepare_enroll(ctx);
      max_enroll_tries--;

      if (max_enroll_tries <= 0 || shut_me_down) {
         /* exit after 30 seconds */
         if (printed_points != 0) {
            printf("\n");
         }
         CRITICAL((SGE_EVENT, MSG_COM_ERROR));
         SGE_EXIT((void**)&ctx, 1);
      }
      if (cl_com_get_handle(prognames[EXECD],1) == NULL) {
        /* sleep when prepare_enroll() failed */
        sleep(1);
        if (max_enroll_tries < 27) {
           printf(".");
           printed_points++;
           fflush(stdout);
        }
      }
   }

   if (printed_points != 0) {
      printf("\n");
   }

   /*
    * now the commlib up and running. Set execd application status function 
    * ( commlib callback function for qping status information response 
    *   messages (SIRM) )
    */
   ret_val = cl_com_set_status_func(sge_execd_application_status);
   if (ret_val != CL_RETVAL_OK) {
      ERROR((SGE_EVENT, cl_get_error_text(ret_val)) );
   }

   /* test connection */
   {
      cl_com_SIRM_t* status = NULL;
      ret_val = cl_commlib_get_endpoint_status(ctx->get_com_handle(ctx),
                                               (char *)ctx->get_master(ctx, true),
                                               (char*)prognames[QMASTER], 1, &status);
      if (ret_val != CL_RETVAL_OK) {
         ERROR((SGE_EVENT, cl_get_error_text(ret_val)));
         ERROR((SGE_EVENT, MSG_CONF_NOCONFBG));
      }
      cl_com_free_sirm_message(&status);
   }
   
   /* finalize daeamonize */
   if (!getenv("SGE_ND")) {
      sge_daemonize_finalize(ctx);
   }

   /* daemonizes if qmaster is unreachable */   
   sge_setup_sge_execd(ctx, tmp_err_file_name);

   /* are we using qidle or not */
   sge_ls_qidle(mconf_get_use_qidle());
   sge_ls_gnu_ls(1);
   
   DPRINTF(("use_qidle: %d\n", mconf_get_use_qidle()));

   /* test load sensor (internal or external) */
   {
      lList *report_list = sge_build_load_report(ctx->get_qualified_hostname(ctx), ctx->get_binary_path(ctx));
      lFreeList(&report_list);
   }
   
   /* here we have to wait for qmaster registration */
   while (sge_execd_register_at_qmaster(ctx, false) != 0) {
      if (sge_get_com_error_flag(EXECD, SGE_COM_ACCESS_DENIED, true)) {
         /* This is no error */
         DPRINTF(("*****  got SGE_COM_ACCESS_DENIED from qmaster  *****\n"));
      }
      if (sge_get_com_error_flag(EXECD, SGE_COM_ENDPOINT_NOT_UNIQUE, false)) {
         execd_exit_state = SGE_COM_ENDPOINT_NOT_UNIQUE;
         break;
      }
      if (shut_me_down != 0) {
         break;
      }
      sleep(30);
   }

   /* 
    * Terminate on SIGTERM or hard communication error
    */
   if (execd_exit_state != 0 || shut_me_down != 0) {
      sge_shutdown((void**)&ctx, execd_exit_state);
      DRETURN(execd_exit_state);
   }

   /*
    * We write pid file when we are connected to qmaster. Otherwise an old
    * execd might overwrite our pidfile.
    */
   sge_write_pid(EXECD_PID_FILE);

   /*
    * At this point we are sure we are the only sge_execd and we are connected
    * to the current qmaster. First we have to report any reaped children
    * that might exist.
    */
   starting_up();

   /*
    * Log a warning message if execd hasn't been started by a superuser
    */
   if (!sge_is_start_user_superuser()) {
      WARNING((SGE_EVENT, MSG_SWITCH_USER_NOT_ROOT));
   }   

#ifdef COMPILE_DC
   if (ptf_init()) {
      CRITICAL((SGE_EVENT, MSG_EXECD_NOSTARTPTF));
      SGE_EXIT((void**)&ctx, 1);
   }
   INFO((SGE_EVENT, MSG_EXECD_STARTPDCANDPTF));
#endif

   master_job_list = object_type_get_master_list(SGE_TYPE_JOB);
   *master_job_list = lCreateList("Master_Job_List", JB_Type);
   job_list_read_from_disk(master_job_list, "Master_Job_List",
                           0, SPOOL_WITHIN_EXECD, 
                          job_initialize_job);
   

   /* clean up jobs hanging around (look in active_dir) */
   clean_up_old_jobs(ctx, 1);
   execd_trash_load_report();
   sge_set_flush_lr_flag(true);

   sge_sig_handler_in_main_loop = 1;

   if (thread_prof_active_by_id(pthread_self())) {
      prof_start(SGE_PROF_CUSTOM1, NULL);
      prof_start(SGE_PROF_CUSTOM2, NULL);
      prof_start(SGE_PROF_GDI_REQUEST, NULL);
   } else {
      prof_stop(SGE_PROF_CUSTOM1, NULL);
      prof_stop(SGE_PROF_CUSTOM2, NULL);
      prof_stop(SGE_PROF_GDI_REQUEST, NULL);
   }

   PROF_START_MEASUREMENT(SGE_PROF_CUSTOM1);

   /* Start dispatching */
   execd_exit_state = sge_execd_process_messages(ctx);


   /*
    * This code is only reached when dispatcher terminates and execd goes down.
    */

   /* log if we received SIGPIPE signal */
   if (sge_sig_handler_sigpipe_received) {
       sge_sig_handler_sigpipe_received = 0;
       INFO((SGE_EVENT, "SIGPIPE received\n"));
   }

#if defined(LINUX)
   free_procList();
#endif
   lFreeList(master_job_list);

   PROF_STOP_MEASUREMENT(SGE_PROF_CUSTOM1);
   if (prof_is_active(SGE_PROF_ALL)) {
     time_t now = (time_t)sge_get_gmt();

      if (now > next_prof_output) {
         prof_output_info(SGE_PROF_ALL, false, "profiling summary:\n");
         prof_reset(SGE_PROF_ALL,NULL);
         next_prof_output = now + 60;
      }
   }
   sge_prof_cleanup();

   sge_shutdown((void**)&ctx, execd_exit_state);
   DRETURN(execd_exit_state);
}