/****** qmaster/sge_qmaster_heartbeat/increment_heartbeat() *************************
*  NAME
*     increment_heartbeat() -- Event handler for heartbeat events
*
*  SYNOPSIS
*     void increment_heartbeat(sge_gdi_ctx_class_t *ctx, te_event_t anEvent,
*                              monitoring_t *monitor)
*
*  FUNCTION
*     Update the qmaster heartbeat file. Whenever the update fails, and on
*     every beat whose value is a multiple of 20, the act_qmaster file is
*     read back and the host name found there is compared with the
*     qualified host name of this qmaster. If the resolved name differs,
*     a shutdown of this qmaster is requested.
*
*  INPUTS
*     sge_gdi_ctx_class_t *ctx - context providing the act_qmaster file path
*                                and the qualified host name
*     te_event_t anEvent       - heartbeat event (not evaluated here)
*     monitoring_t *monitor    - monitoring handle (not evaluated here)
*
*  RESULT
*     void - none
*
*  NOTES
*     MT-NOTE: increment_heartbeat() is NOT MT safe. This function is only
*     MT-NOTE: invoked from within the event delivery thread.
*
*     We do assume that the system clock does NOT run backwards. However, we
*     do cope with a system clock which has been put back.
*
*******************************************************************************/
void increment_heartbeat(sge_gdi_ctx_class_t *ctx, te_event_t anEvent, monitoring_t *monitor)
{
   int rc = 0;
   int beat = 0;
   int verify_act_qmaster = 0;
   int host_changed = 0;
   char file_host[CL_MAXHOSTLEN];
   char resolved_host[CL_MAXHOSTLEN];
   char error_text[SGE_PATH_MAX+128];
   const char *act_qmaster_path = ctx->get_act_qmaster_file(ctx);
   const char *own_hostname = ctx->get_qualified_hostname(ctx);

   DENTER(TOP_LAYER, "increment_heartbeat");

   rc = inc_qmaster_heartbeat(QMASTER_HEARTBEAT_FILE, 30, &beat);
   if (rc == 0) {
      DPRINTF(("(heartbeat) - incremented (or created) heartbeat file: %s(beat=%d)\n", QMASTER_HEARTBEAT_FILE, beat));
   } else {
      /* a failed update always forces a look at the act_qmaster file */
      DPRINTF(("(heartbeat) - inc_qmaster_heartbeat() returned %d !!! (beat=%d)\n", rc, beat));
      verify_act_qmaster = 1;
   }

   /* periodic verification on every 20th beat */
   if ((beat % 20) == 0) {
      DPRINTF(("(heartbeat) - checking act_qmaster file this time\n"));
      verify_act_qmaster = 1;
   }

   if (verify_act_qmaster != 1) {
      DEXIT;
      return;
   }

   /* start with an empty error text */
   error_text[0] = '\0';

   if (get_qm_name(file_host, act_qmaster_path, error_text) != 0) {
      WARNING((SGE_EVENT, MSG_HEART_CANNOT_READ_FILE_S, error_text ));
      DEXIT;
      return;
   }

   /* got qmaster name: it counts as changed only if it resolves and differs */
   host_changed = (getuniquehostname(file_host, resolved_host, 0) == CL_RETVAL_OK &&
                   sge_hostcmp(resolved_host, own_hostname) != 0);

   if (!host_changed) {
      DPRINTF(("(heartbeat) - act_qmaster file contains hostname "SFQ"\n", file_host));
      DEXIT;
      return;
   }

   /* act_qmaster file has been changed */
   WARNING((SGE_EVENT, SFNMAX, MSG_HEART_ACT_QMASTER_FILE_CHANGED));
   if (sge_qmaster_shutdown_via_signal_thread(100) != 0) {
      ERROR((SGE_EVENT, SFNMAX, MSG_HEART_CANT_SIGNAL));
      /* TODO: here the ctx reference is not transported back
      **       event_handler functions should use &ctx instead
      */
      sge_shutdown((void**)&ctx, 1);
   }

   DEXIT;
   return;
} /* increment_heartbeat() */
/****** qmaster/sge_qmaster_main/main() **************************************** * NAME * main() -- qmaster entry point * * SYNOPSIS * int main(int argc, char* argv[]) * * FUNCTION * Qmaster entry point. * * NOTE: The main thread must block all signals before any additional thread * is created. Failure to do so will ruin signal handling! * * INPUTS * int argc - number of commandline arguments * char* argv[] - commandline arguments * * RESULT * 0 - success * * NOTES * We check whether 'SGE_ROOT' is set before we daemonize. Once qmaster is * a daemon, we are no longer connected to a terminal and hence can not * output an error message to stdout or stderr. * * We need to inovke 'prepare_enroll()' *before* the user id is switched via * 'become_admin_user()'. This is because qmaster must be able to bind a so * called reserved port (requires root privileges) if configured to do so. * *******************************************************************************/ int main(int argc, char* argv[]) { int max_enroll_tries; int ret_val; int file_descriptor_settings_result = 0; bool has_daemonized = false; sge_gdi_ctx_class_t *ctx = NULL; u_long32 start_time = sge_get_gmt(); monitoring_t monitor; DENTER_MAIN(TOP_LAYER, "qmaster"); sge_monitor_init(&monitor, "MAIN", NONE_EXT, MT_WARNING, MT_ERROR); prof_mt_init(); sge_get_root_dir(true, NULL, 0, true); #ifdef __SGE_COMPILE_WITH_GETTEXT__ sge_init_language_func((gettext_func_type)gettext, (setlocale_func_type)setlocale, (bindtextdomain_func_type)bindtextdomain, (textdomain_func_type)textdomain); sge_init_language(NULL,NULL); #endif /* * qmaster doesn't support any commandline anymore, * but we should show version string and -help option */ if (argc != 1) { sigset_t sig_set; sigfillset(&sig_set); pthread_sigmask(SIG_SETMASK, &sig_set, NULL); sge_qmaster_thread_init(&ctx, QMASTER, MAIN_THREAD, true); sge_process_qmaster_cmdline(argv); SGE_EXIT((void**)&ctx, 1); } /* * daemonize qmaster * set file descriptor limits * and initialize 
libraries to be used in multi threaded environment * also take care that finished child processed of this process become * zombie jobs */ has_daemonized = sge_daemonize_qmaster(); file_descriptor_settings_result = set_file_descriptor_limit(); #if !defined(INTERIX) && !defined(CYGWIN) init_sig_action_and_mask(); #endif /* init qmaster threads without becomming admin user */ sge_qmaster_thread_init(&ctx, QMASTER, MAIN_THREAD, false); ctx->set_daemonized(ctx, has_daemonized); /* this must be done as root user to be able to bind ports < 1024 */ max_enroll_tries = 30; while (cl_com_get_handle(prognames[QMASTER],1) == NULL) { ctx->prepare_enroll(ctx); max_enroll_tries--; if (max_enroll_tries <= 0) { /* exit after 30 seconds */ CRITICAL((SGE_EVENT, MSG_QMASTER_COMMUNICATION_ERRORS )); SGE_EXIT((void**)&ctx, 1); } if (cl_com_get_handle(prognames[QMASTER],1) == NULL) { /* sleep when prepare_enroll() failed */ sleep(1); } } /* * now the commlib up and running. Set qmaster application status function * (commlib callback function for qping status information response * messages (SIRM)) */ ret_val = cl_com_set_status_func(sge_qmaster_application_status); if (ret_val != CL_RETVAL_OK) { ERROR((SGE_EVENT, cl_get_error_text(ret_val))); } /* * now we become admin user change into the correct root directory set the * the target for logging messages */ sge_become_admin_user(ctx->get_admin_user(ctx)); sge_chdir_exit(ctx->get_qmaster_spool_dir(ctx), 1); log_state_set_log_file(ERR_FILE); ctx->set_exit_func(ctx, sge_exit_func); #if defined(SOLARIS) /* Init shared SMF libs if necessary */ if (sge_smf_used() == 1 && sge_smf_init_libs() != 0) { SGE_EXIT((void**)&ctx, 1); } #endif /* * We do increment the heartbeat manually here. This is the 'startup heartbeat'. * The first time the hearbeat will be incremented through the heartbeat event * handler is after about HEARTBEAT_INTERVAL seconds. The hardbeat event handler * is setup during the initialisazion of the timer thread. 
*/ inc_qmaster_heartbeat(QMASTER_HEARTBEAT_FILE, HEARTBEAT_INTERVAL, NULL); /* * Event master module has to be initialized already here because * sge_setup_qmaster() might already access it although event delivery * thread is not running. * * Corresponding shutdown is done in sge_event_master_terminate(); * * EB: In my opinion the init function should called in * sge_event_master_initialize(). Is it possible to move that call? */ sge_event_master_init(); sge_setup_qmaster(ctx, argv); #ifndef USE_POLL if (file_descriptor_settings_result == 1) { WARNING((SGE_EVENT, MSG_QMASTER_FD_SETSIZE_LARGER_THAN_LIMIT_U, sge_u32c(FD_SETSIZE))); WARNING((SGE_EVENT, MSG_QMASTER_FD_SETSIZE_COMPILE_MESSAGE1_U, sge_u32c(FD_SETSIZE - 20))); WARNING((SGE_EVENT, MSG_QMASTER_FD_SETSIZE_COMPILE_MESSAGE2)); WARNING((SGE_EVENT, MSG_QMASTER_FD_SETSIZE_COMPILE_MESSAGE3)); } #endif /* * Setup all threads and initialize corresponding modules. * Order is important! */ sge_signaler_initialize(ctx); sge_event_master_initialize(ctx); sge_timer_initialize(ctx, &monitor); sge_worker_initialize(ctx); #if 0 sge_test_initialize(ctx); #endif sge_listener_initialize(ctx); sge_scheduler_initialize(ctx, NULL); #ifndef NO_JNI sge_jvm_initialize(ctx, NULL); #endif INFO((SGE_EVENT, "qmaster startup took "sge_u32" seconds", sge_get_gmt() - start_time)); /* * Block till signal from signal thread arrives us */ sge_thread_wait_for_signal(); /* * Shutdown all threads and shutdown corresponding modules. * Order is important! */ #ifndef NO_JNI sge_jvm_terminate(ctx, NULL); #endif sge_scheduler_terminate(ctx, NULL); sge_listener_terminate(); #if 0 sge_test_terminate(ctx); #endif sge_worker_terminate(ctx); sge_timer_terminate(); sge_event_master_terminate(); sge_signaler_terminate(); /* * Remaining shutdown operations */ sge_clean_lists(); sge_monitor_free(&monitor); sge_shutdown((void**)&ctx, sge_qmaster_get_exit_state()); sge_prof_cleanup(); DEXIT; return 0; } /* main() */
int main(int argc, char* argv[]) { int return_value = 0; int i; int runs = 0; char* filename = "test.txt"; int timeout = 15; int todo = 0; struct timeval now; struct timeval last_time; int do_stop = 0; int beat_val; int only_write = 0; DENTER_MAIN(TOP_LAYER, "test_sge_qmaster_heartbeat"); /* initialize last_time */ gettimeofday(&last_time, NULL); if (argc==3) { if (strcmp(argv[1],"-only-write") == 0) { printf("only writing heartbeat file once\n"); only_write=1; filename = argv[2]; } } if ( only_write == 0) { /* delete file */ unlink(filename); } /* now run till we start from 1 */ while ( do_stop == 0 ) { return_value = inc_qmaster_heartbeat(filename, timeout, &beat_val); i = get_qmaster_heartbeat(filename, timeout); if ( only_write == 1) { printf("incremented heartbeat file %s\n", filename); printf("heartbeat value is %d\n", i); exit(0); } todo++; if (beat_val != i) { printf("heartbeat value not correct\n"); do_stop = 1; return_value = 20; } if (i <= 0) { printf("get_qmaster_heartbeat() returned %d\n", i); return_value = -100 + i; } else { if ( return_value != 0) { printf("(%d) inc_qmaster_heartbeat() returned %d\n", i, return_value); } } /* on error: * * exit value > 100: get_qmaster_hearbeat() returned: - (exit value - 100) * exit value < 100: inc_qmaster_heartbeat() returned: - (exit value) * exit value == 20: unexpected heartbeat value */ if (return_value != 0) { unlink(filename); DEXIT; return (-return_value); } if (i==1 && runs++ != 0) { do_stop = 1; } gettimeofday(&now,NULL); if (now.tv_sec != last_time.tv_sec || do_stop != 0 ) { printf("%6.2f %% done\n", (double)(((double)todo/99999.0)*100.0)); fflush(stdout); last_time.tv_sec = now.tv_sec; } } /* delete file */ unlink(filename); DEXIT; return 0; } /* main() */