/****** qmaster/sge_qmaster_heartbeat/increment_heartbeat() *************************
*  NAME
*     increment_heartbeat() -- Event handler for heartbeat events
*
*  SYNOPSIS
*     void increment_heartbeat(te_event_t anEvent) 
*
*  FUNCTION
*     Update qmaster heartbeat file.
*
*  INPUTS
*     te_event_t anEvent - heartbeat event 
*
*  RESULT
*     void - none 
*
*  NOTES
*     MT-NOTE: increment_hearbeat() is NOT MT safe. This function is only
*     MT-NOTE: invoked from within the event delivery thread.
*
*     We do assume that the system clock does NOT run backwards. However, we
*     do cope with a system clock which has been put back.
*
*******************************************************************************/
void 
increment_heartbeat(sge_gdi_ctx_class_t *ctx, te_event_t anEvent, monitoring_t *monitor)
{
   int retval = 0;
   int heartbeat = 0;
   int check_act_qmaster_file = 0;
   char act_qmaster_name[CL_MAXHOSTLEN];
   char act_resolved_qmaster_name[CL_MAXHOSTLEN];
   char err_str[SGE_PATH_MAX+128];
   const char *act_qmaster_file = ctx->get_act_qmaster_file(ctx);
   const char *qualified_hostname = ctx->get_qualified_hostname(ctx);

   DENTER(TOP_LAYER, "increment_heartbeat");

   retval = inc_qmaster_heartbeat(QMASTER_HEARTBEAT_FILE, 30, &heartbeat);

   switch(retval) {
      case 0: {
         DPRINTF(("(heartbeat) - incremented (or created) heartbeat file: %s(beat=%d)\n", QMASTER_HEARTBEAT_FILE, heartbeat));
         break;
      }
      default: {
         DPRINTF(("(heartbeat) - inc_qmaster_heartbeat() returned %d !!! (beat=%d)\n", retval, heartbeat));
         check_act_qmaster_file = 1;
         break;
      }
   }

   if (heartbeat % 20 == 0) {
      DPRINTF(("(heartbeat) - checking act_qmaster file this time\n"));
      check_act_qmaster_file = 1;
   }

   if (check_act_qmaster_file == 1) {
      strcpy(err_str,"");
      if (get_qm_name(act_qmaster_name, act_qmaster_file, err_str) == 0) {
         /* got qmaster name */
         if ( getuniquehostname(act_qmaster_name, act_resolved_qmaster_name, 0) == CL_RETVAL_OK &&
              sge_hostcmp(act_resolved_qmaster_name, qualified_hostname) != 0      ) {
            /* act_qmaster file has been changed */
            WARNING((SGE_EVENT, SFNMAX, MSG_HEART_ACT_QMASTER_FILE_CHANGED));
            if (sge_qmaster_shutdown_via_signal_thread(100) != 0) {
               ERROR((SGE_EVENT, SFNMAX, MSG_HEART_CANT_SIGNAL));
               /* TODO: here the ctx reference is not transported back
               **       event_handler functions should use &ctx instead
               */
               sge_shutdown((void**)&ctx, 1);
            }
         } else {
            DPRINTF(("(heartbeat) - act_qmaster file contains hostname "SFQ"\n", act_qmaster_name));
         }
      } else {
         WARNING((SGE_EVENT, MSG_HEART_CANNOT_READ_FILE_S, err_str ));
      }
   }

   DEXIT;
   return;
} /* increment_heartbeat() */
/****** qmaster/sge_qmaster_main/main() ****************************************
*  NAME
*     main() -- qmaster entry point 
*
*  SYNOPSIS
*     int main(int argc, char* argv[]) 
*
*  FUNCTION
*     Qmaster entry point.
*
*     NOTE: The main thread must block all signals before any additional thread
*     is created. Failure to do so will ruin signal handling!
*
*  INPUTS
*     int argc     - number of commandline arguments 
*     char* argv[] - commandline arguments 
*
*  RESULT
*     0 - success 
*
*  NOTES
*     We check whether 'SGE_ROOT' is set before we daemonize. Once qmaster is
*     a daemon, we are no longer connected to a terminal and hence can not
*     output an error message to stdout or stderr.
*
*     We need to inovke 'prepare_enroll()' *before* the user id is switched via
*     'become_admin_user()'. This is because qmaster must be able to bind a so
*     called reserved port (requires root privileges) if configured to do so.
*
*******************************************************************************/
int main(int argc, char* argv[])
{
   int max_enroll_tries;
   int ret_val;
   int file_descriptor_settings_result = 0;
   bool has_daemonized = false;
   sge_gdi_ctx_class_t *ctx = NULL;
   u_long32 start_time = sge_get_gmt();
   monitoring_t monitor;

   DENTER_MAIN(TOP_LAYER, "qmaster");

   sge_monitor_init(&monitor, "MAIN", NONE_EXT, MT_WARNING, MT_ERROR);
   prof_mt_init();

   sge_get_root_dir(true, NULL, 0, true);
   
#ifdef __SGE_COMPILE_WITH_GETTEXT__  
   sge_init_language_func((gettext_func_type)gettext, (setlocale_func_type)setlocale, (bindtextdomain_func_type)bindtextdomain, (textdomain_func_type)textdomain);
   sge_init_language(NULL,NULL);   
#endif 

   /* 
    * qmaster doesn't support any commandline anymore,
    * but we should show version string and -help option 
    */
   if (argc != 1) {
      sigset_t sig_set;
      sigfillset(&sig_set);
      pthread_sigmask(SIG_SETMASK, &sig_set, NULL);
      sge_qmaster_thread_init(&ctx, QMASTER, MAIN_THREAD, true);
      sge_process_qmaster_cmdline(argv);
      SGE_EXIT((void**)&ctx, 1);
   }

   /*
    * daemonize qmaster
    * set file descriptor limits
    * and initialize libraries to be used in multi threaded environment
    * also take care that finished child processed of this process become
    * zombie jobs
    */
   has_daemonized = sge_daemonize_qmaster();
   file_descriptor_settings_result = set_file_descriptor_limit();
#if !defined(INTERIX) && !defined(CYGWIN)
   init_sig_action_and_mask();
#endif

   /* init qmaster threads without becomming admin user */
   sge_qmaster_thread_init(&ctx, QMASTER, MAIN_THREAD, false);

   ctx->set_daemonized(ctx, has_daemonized);

   /* this must be done as root user to be able to bind ports < 1024 */
   max_enroll_tries = 30;
   while (cl_com_get_handle(prognames[QMASTER],1) == NULL) {
      ctx->prepare_enroll(ctx);
      max_enroll_tries--;
      if (max_enroll_tries <= 0) {
         /* exit after 30 seconds */
         CRITICAL((SGE_EVENT, MSG_QMASTER_COMMUNICATION_ERRORS ));
         SGE_EXIT((void**)&ctx, 1);
      }
      if (cl_com_get_handle(prognames[QMASTER],1) == NULL) {
        /* sleep when prepare_enroll() failed */
        sleep(1);
      }
   }

   /*
    * now the commlib up and running. Set qmaster application status function 
    * (commlib callback function for qping status information response 
    *  messages (SIRM))
    */
   ret_val = cl_com_set_status_func(sge_qmaster_application_status);
   if (ret_val != CL_RETVAL_OK) {
      ERROR((SGE_EVENT, cl_get_error_text(ret_val)));
   }

   /* 
    * now we become admin user change into the correct root directory set the
    * the target for logging messages
    */
   sge_become_admin_user(ctx->get_admin_user(ctx));
   sge_chdir_exit(ctx->get_qmaster_spool_dir(ctx), 1);
   log_state_set_log_file(ERR_FILE);
   ctx->set_exit_func(ctx, sge_exit_func);

#if defined(SOLARIS)
   /* Init shared SMF libs if necessary */
   if (sge_smf_used() == 1 && sge_smf_init_libs() != 0) {
       SGE_EXIT((void**)&ctx, 1);
   }
#endif

   /*
    * We do increment the heartbeat manually here. This is the 'startup heartbeat'. 
    * The first time the hearbeat will be incremented through the heartbeat event 
    * handler is after about HEARTBEAT_INTERVAL seconds. The hardbeat event handler
    * is setup during the initialisazion of the timer thread.
    */
   inc_qmaster_heartbeat(QMASTER_HEARTBEAT_FILE, HEARTBEAT_INTERVAL, NULL);
     
   /*
    * Event master module has to be initialized already here because
    * sge_setup_qmaster() might already access it although event delivery
    * thread is not running.
    *
    * Corresponding shutdown is done in sge_event_master_terminate();
    *
    * EB: In my opinion the init function should called in
    * sge_event_master_initialize(). Is it possible to move that call?
    */ 
   sge_event_master_init();

   sge_setup_qmaster(ctx, argv);

#ifndef USE_POLL
   if (file_descriptor_settings_result == 1) {
      WARNING((SGE_EVENT, MSG_QMASTER_FD_SETSIZE_LARGER_THAN_LIMIT_U, sge_u32c(FD_SETSIZE)));
      WARNING((SGE_EVENT, MSG_QMASTER_FD_SETSIZE_COMPILE_MESSAGE1_U, sge_u32c(FD_SETSIZE - 20)));
      WARNING((SGE_EVENT, MSG_QMASTER_FD_SETSIZE_COMPILE_MESSAGE2));
      WARNING((SGE_EVENT, MSG_QMASTER_FD_SETSIZE_COMPILE_MESSAGE3));
   }
#endif

   /*
    * Setup all threads and initialize corresponding modules. 
    * Order is important!
    */
   sge_signaler_initialize(ctx);
   sge_event_master_initialize(ctx);
   sge_timer_initialize(ctx, &monitor);
   sge_worker_initialize(ctx);
#if 0
   sge_test_initialize(ctx);
#endif
   sge_listener_initialize(ctx);
   sge_scheduler_initialize(ctx, NULL);
#ifndef NO_JNI
   sge_jvm_initialize(ctx, NULL);
#endif

   INFO((SGE_EVENT, "qmaster startup took "sge_u32" seconds", sge_get_gmt() - start_time));

   /*
    * Block till signal from signal thread arrives us
    */
   sge_thread_wait_for_signal();

   /* 
    * Shutdown all threads and shutdown corresponding modules.
    * Order is important!
    */
#ifndef NO_JNI
   sge_jvm_terminate(ctx, NULL);
#endif
   sge_scheduler_terminate(ctx, NULL);
   sge_listener_terminate();
#if 0
   sge_test_terminate(ctx);
#endif
   sge_worker_terminate(ctx);
   sge_timer_terminate();
   sge_event_master_terminate();
   sge_signaler_terminate();

   /*
    * Remaining shutdown operations
    */
   sge_clean_lists();
   sge_monitor_free(&monitor);

   sge_shutdown((void**)&ctx, sge_qmaster_get_exit_state());
   sge_prof_cleanup();

   DEXIT;
   return 0;
} /* main() */
Exemple #3
0
int main(int argc, char* argv[])
{
   int return_value = 0;
   int i;
   int runs = 0;
   char* filename = "test.txt";
   int   timeout  = 15;
   int todo = 0;
   struct timeval now;
   struct timeval last_time;
   int do_stop = 0;
   int beat_val;
   int only_write = 0;

   DENTER_MAIN(TOP_LAYER, "test_sge_qmaster_heartbeat");

   /* initialize last_time */
   gettimeofday(&last_time, NULL);

   if (argc==3) {
      if (strcmp(argv[1],"-only-write") == 0) {
         printf("only writing heartbeat file once\n");
         only_write=1;
         filename = argv[2];
      }
   }

   if ( only_write == 0) {
      /* delete file */
      unlink(filename);
   }
   
   /* now run till we start from 1 */
   while ( do_stop == 0 ) {
      return_value = inc_qmaster_heartbeat(filename, timeout, &beat_val);
      i            = get_qmaster_heartbeat(filename, timeout);
      if ( only_write == 1) {
         printf("incremented heartbeat file %s\n", filename);
         printf("heartbeat value is %d\n", i);
         exit(0);
      }

      todo++;
      if (beat_val != i) {
         printf("heartbeat value not correct\n");
         do_stop = 1;
         return_value = 20;
      }

      if (i <= 0) {
         printf("get_qmaster_heartbeat() returned %d\n", i);
         return_value = -100 + i;
      } else {
         if ( return_value != 0) {
            printf("(%d) inc_qmaster_heartbeat() returned %d\n", i, return_value);
         }
      }

      /* on error:  
       *
       * exit value > 100:   get_qmaster_hearbeat() returned:   - (exit value - 100)
       * exit value < 100:   inc_qmaster_heartbeat() returned:  - (exit value)
       * exit value == 20:   unexpected heartbeat value
       */
      if (return_value != 0) {
         unlink(filename);
         DEXIT;
         return (-return_value);
      }
      if (i==1 && runs++ != 0) {
         do_stop = 1;
      }
      gettimeofday(&now,NULL);
      if (now.tv_sec != last_time.tv_sec || do_stop != 0 ) {
         printf("%6.2f %% done\n", (double)(((double)todo/99999.0)*100.0));
         fflush(stdout);
         last_time.tv_sec = now.tv_sec;
      }
   }
   
   /* delete file */
   unlink(filename);

   DEXIT;
   return 0;
} /* main() */