/*----------------------------------------------------------------- * shadowd_exit_func * function installed to be called just before exit() is called. *-----------------------------------------------------------------*/ static void shadowd_exit_func( void **ctx_ref, int i ) { #if defined(SOLARIS) if (sge_smf_used() == 1) { /* We don't do disable on svcadm restart */ if (sge_strnullcmp(sge_smf_get_instance_state(), SCF_STATE_STRING_ONLINE) == 0 && sge_strnullcmp(sge_smf_get_instance_next_state(), SCF_STATE_STRING_NONE) == 0) { sge_smf_temporary_disable_instance(); } } #endif exit(i); }
/*************************** sge_smf_contract_fork() *************************** * NAME * sge_smf_contract_fork() -- fork() with child in new contract * * SYNOPSIS * int sge_smf_contract_fork(char *err_str, int err_length) * * FUNCTION * Once initialize useSMF ad return it's value * * INPUTS * char *err_str - error message * int err_length - max error message length * * RESULT * int - result * -1 - fork failed * -2 - contract_pre_fork failed * -3 - contract_post_fork failed * -4 - smflibs could not be loaded * result of fork() otherwise * * NOTES * MT-NOTES: sge_smf_contract_fork is not MT-safe * * SEE ALSO * *******************************************************************************/ int sge_smf_contract_fork(char *err_str, int err_length) { int pid; int ctfd; /* * EB: IMPORTANT: Logging in this function is not allowed * * Reason: This function is used in sge_peopen(). * * In multi threaded environments the child might run into a deadlock situation * if it tries to get a lock which was hold by one of the parent threads * before the fork() call. * * Therefore between fork() (in this function) and exec() only aync-signal-safe * functions are allowed. * * File based operations (like done in logging) are therefore not alled here. */ if (sge_smf_used() == 0) { pid=fork(); return pid; } else { /* Check if shared libs were loaded */ if (libsLoaded == 0) { snprintf(err_str, err_length, MSG_SMF_LOAD_LIB_FAILED); return -4; } /* Create new contract template */ ctfd = contracts_pre_fork(); if (ctfd == -1) { /* Could not create new contract template */ snprintf(err_str, err_length, MSG_SMF_CONTRACT_CREATE_FAILED); return -2; } else { pid = fork(); /* Dispose of the template and immediatelly abandon the contract */ pid = contracts_post_fork(ctfd, pid, err_str, err_length); if (pid == -2) { /* post_fork failed */ return -3; } } } return pid; }
/*------------------------------------------------------------- * Function installed to be called just before exit() is called. * clean up *-------------------------------------------------------------*/ static void execd_exit_func(void **ctx_ref, int i) { DENTER(TOP_LAYER, "execd_exit_func"); sge_gdi2_shutdown(ctx_ref); /* trigger load sensors shutdown */ sge_ls_stop(0); #ifdef COMPILE_DC ptf_stop(); #endif #if defined(SOLARIS) if (sge_smf_used() == 1) { /* We don't do disable on svcadm restart */ if (sge_strnullcmp(sge_smf_get_instance_state(), SCF_STATE_STRING_ONLINE) == 0 && sge_strnullcmp(sge_smf_get_instance_next_state(), SCF_STATE_STRING_NONE) == 0) { sge_smf_temporary_disable_instance(); } } #endif DEXIT; }
/****** qmaster/sge_qmaster_main/main() **************************************** * NAME * main() -- qmaster entry point * * SYNOPSIS * int main(int argc, char* argv[]) * * FUNCTION * Qmaster entry point. * * NOTE: The main thread must block all signals before any additional thread * is created. Failure to do so will ruin signal handling! * * INPUTS * int argc - number of commandline arguments * char* argv[] - commandline arguments * * RESULT * 0 - success * * NOTES * We check whether 'SGE_ROOT' is set before we daemonize. Once qmaster is * a daemon, we are no longer connected to a terminal and hence can not * output an error message to stdout or stderr. * * We need to inovke 'prepare_enroll()' *before* the user id is switched via * 'become_admin_user()'. This is because qmaster must be able to bind a so * called reserved port (requires root privileges) if configured to do so. * *******************************************************************************/ int main(int argc, char* argv[]) { int max_enroll_tries; int ret_val; int file_descriptor_settings_result = 0; bool has_daemonized = false; sge_gdi_ctx_class_t *ctx = NULL; u_long32 start_time = sge_get_gmt(); monitoring_t monitor; DENTER_MAIN(TOP_LAYER, "qmaster"); sge_monitor_init(&monitor, "MAIN", NONE_EXT, MT_WARNING, MT_ERROR); prof_mt_init(); sge_get_root_dir(true, NULL, 0, true); #ifdef __SGE_COMPILE_WITH_GETTEXT__ sge_init_language_func((gettext_func_type)gettext, (setlocale_func_type)setlocale, (bindtextdomain_func_type)bindtextdomain, (textdomain_func_type)textdomain); sge_init_language(NULL,NULL); #endif /* * qmaster doesn't support any commandline anymore, * but we should show version string and -help option */ if (argc != 1) { sigset_t sig_set; sigfillset(&sig_set); pthread_sigmask(SIG_SETMASK, &sig_set, NULL); sge_qmaster_thread_init(&ctx, QMASTER, MAIN_THREAD, true); sge_process_qmaster_cmdline(argv); SGE_EXIT((void**)&ctx, 1); } /* * daemonize qmaster * set file descriptor limits * and initialize libraries to be used in multi threaded environment * also take care that finished child processed of this process become * zombie jobs */ has_daemonized = sge_daemonize_qmaster(); file_descriptor_settings_result = set_file_descriptor_limit(); #if !defined(INTERIX) && !defined(CYGWIN) init_sig_action_and_mask(); #endif /* init qmaster threads without becomming admin user */ sge_qmaster_thread_init(&ctx, QMASTER, MAIN_THREAD, false); ctx->set_daemonized(ctx, has_daemonized); /* this must be done as root user to be able to bind ports < 1024 */ max_enroll_tries = 30; while (cl_com_get_handle(prognames[QMASTER],1) == NULL) { ctx->prepare_enroll(ctx); max_enroll_tries--; if (max_enroll_tries <= 0) { /* exit after 30 seconds */ CRITICAL((SGE_EVENT, MSG_QMASTER_COMMUNICATION_ERRORS )); SGE_EXIT((void**)&ctx, 1); } if (cl_com_get_handle(prognames[QMASTER],1) == NULL) { /* sleep when prepare_enroll() failed */ sleep(1); } } /* * now the commlib up and running. Set qmaster application status function * (commlib callback function for qping status information response * messages (SIRM)) */ ret_val = cl_com_set_status_func(sge_qmaster_application_status); if (ret_val != CL_RETVAL_OK) { ERROR((SGE_EVENT, cl_get_error_text(ret_val))); } /* * now we become admin user change into the correct root directory set the * the target for logging messages */ sge_become_admin_user(ctx->get_admin_user(ctx)); sge_chdir_exit(ctx->get_qmaster_spool_dir(ctx), 1); log_state_set_log_file(ERR_FILE); ctx->set_exit_func(ctx, sge_exit_func); #if defined(SOLARIS) /* Init shared SMF libs if necessary */ if (sge_smf_used() == 1 && sge_smf_init_libs() != 0) { SGE_EXIT((void**)&ctx, 1); } #endif /* * We do increment the heartbeat manually here. This is the 'startup heartbeat'. * The first time the hearbeat will be incremented through the heartbeat event * handler is after about HEARTBEAT_INTERVAL seconds. The hardbeat event handler * is setup during the initialisazion of the timer thread. */ inc_qmaster_heartbeat(QMASTER_HEARTBEAT_FILE, HEARTBEAT_INTERVAL, NULL); /* * Event master module has to be initialized already here because * sge_setup_qmaster() might already access it although event delivery * thread is not running. * * Corresponding shutdown is done in sge_event_master_terminate(); * * EB: In my opinion the init function should called in * sge_event_master_initialize(). Is it possible to move that call? */ sge_event_master_init(); sge_setup_qmaster(ctx, argv); #ifndef USE_POLL if (file_descriptor_settings_result == 1) { WARNING((SGE_EVENT, MSG_QMASTER_FD_SETSIZE_LARGER_THAN_LIMIT_U, sge_u32c(FD_SETSIZE))); WARNING((SGE_EVENT, MSG_QMASTER_FD_SETSIZE_COMPILE_MESSAGE1_U, sge_u32c(FD_SETSIZE - 20))); WARNING((SGE_EVENT, MSG_QMASTER_FD_SETSIZE_COMPILE_MESSAGE2)); WARNING((SGE_EVENT, MSG_QMASTER_FD_SETSIZE_COMPILE_MESSAGE3)); } #endif /* * Setup all threads and initialize corresponding modules. * Order is important! */ sge_signaler_initialize(ctx); sge_event_master_initialize(ctx); sge_timer_initialize(ctx, &monitor); sge_worker_initialize(ctx); #if 0 sge_test_initialize(ctx); #endif sge_listener_initialize(ctx); sge_scheduler_initialize(ctx, NULL); #ifndef NO_JNI sge_jvm_initialize(ctx, NULL); #endif INFO((SGE_EVENT, "qmaster startup took "sge_u32" seconds", sge_get_gmt() - start_time)); /* * Block till signal from signal thread arrives us */ sge_thread_wait_for_signal(); /* * Shutdown all threads and shutdown corresponding modules. * Order is important! */ #ifndef NO_JNI sge_jvm_terminate(ctx, NULL); #endif sge_scheduler_terminate(ctx, NULL); sge_listener_terminate(); #if 0 sge_test_terminate(ctx); #endif sge_worker_terminate(ctx); sge_timer_terminate(); sge_event_master_terminate(); sge_signaler_terminate(); /* * Remaining shutdown operations */ sge_clean_lists(); sge_monitor_free(&monitor); sge_shutdown((void**)&ctx, sge_qmaster_get_exit_state()); sge_prof_cleanup(); DEXIT; return 0; } /* main() */
/****** qmaster/sge_qmaster_main/signal_thread() ******************************* * NAME * signal_thread() -- signal thread function * * SYNOPSIS * void* signal_thread(void* anArg) * * FUNCTION * Signal handling thread function. Establish recognized signal set. Enter * signal wait loop. Wait for signal. Handle signal. * * If signal is 'SIGINT' or 'SIGTERM', kick-off shutdown and invalidate * signal thread. * * NOTE: The signal thread will terminate on return of this function. * * INPUTS * void* anArg - not used * * RESULT * void* - none * * NOTES * MT-NOTE: signal_thread() is a thread function. Do NOT use this function * MT-NOTE: in any other way! * *******************************************************************************/ void* sge_signaler_main(void* arg) { cl_thread_settings_t *thread_config = (cl_thread_settings_t*)arg; bool is_continue = true; sigset_t sig_set; int sig_num; time_t next_prof_output = 0; monitoring_t monitor; sge_gdi_ctx_class_t *ctx = NULL; DENTER(TOP_LAYER, "sge_signaler_main"); cl_thread_func_startup(thread_config); sge_monitor_init(&monitor, thread_config->thread_name, NONE_EXT, ST_WARNING, ST_ERROR); sge_qmaster_thread_init(&ctx, QMASTER, SIGNALER_THREAD, true); sigemptyset(&sig_set); sigaddset(&sig_set, SIGINT); sigaddset(&sig_set, SIGTERM); /* register at profiling module */ set_thread_name(pthread_self(), "Signal Thread"); conf_update_thread_profiling("Signal Thread"); while (is_continue) { /* * Wait for signals */ sigwait(&sig_set, &sig_num); thread_start_stop_profiling(); DPRINTF(("got signal %d\n", sig_num)); switch (sig_num) { case SIGINT: case SIGTERM: /* * Notify the main thread. It is waiting and will shutdown all other * threads if it gets signalled */ sge_thread_notify_all_waiting(); #if defined(SOLARIS) if (sge_smf_used() == 1) { /* We don't do disable on svcadm restart */ if (sge_strnullcmp(sge_smf_get_instance_state(), SCF_STATE_STRING_ONLINE) == 0 && sge_strnullcmp(sge_smf_get_instance_next_state(), SCF_STATE_STRING_NONE) == 0) { sge_smf_temporary_disable_instance(); } } #endif /* * Now this thread can exit. The main thread does the remaining * shutdown activities */ is_continue = false; break; default: ERROR((SGE_EVENT, MSG_QMASTER_UNEXPECTED_SIGNAL_I, sig_num)); } sge_monitor_output(&monitor); thread_output_profiling("signal thread profiling summary:\n", &next_prof_output); } sge_monitor_free(&monitor); DRETURN(NULL); }
/*----------------------------------------------------------------------------*/ int main(int argc, char **argv) { int heartbeat = 0; int last_heartbeat = 0; int latest_heartbeat = 0; int ret = 0; int delay = 0; time_t now, last; /* const char *cp; */ char err_str[MAX_STRING_SIZE]; char shadowd_pidfile[SGE_PATH_MAX]; dstring ds; char buffer[256]; pid_t shadowd_pid; #if 1 static int check_interval = CHECK_INTERVAL; static int get_active_interval = GET_ACTIVE_INTERVAL; static int delay_time = DELAY_TIME; static int sge_test_heartbeat = 0; char binpath[SGE_PATH_MAX]; char oldqmaster[SGE_PATH_MAX]; char shadow_err_file[SGE_PATH_MAX]; char qmaster_out_file[SGE_PATH_MAX]; #endif lList *alp = NULL; sge_gdi_ctx_class_t *ctx = NULL; DENTER_MAIN(TOP_LAYER, "sge_shadowd"); sge_dstring_init(&ds, buffer, sizeof(buffer)); /* initialize recovery control variables */ { char *s; int val; if ((s=getenv("SGE_CHECK_INTERVAL")) && sscanf(s, "%d", &val) == 1) check_interval = val; if ((s=getenv("SGE_GET_ACTIVE_INTERVAL")) && sscanf(s, "%d", &val) == 1) get_active_interval = val; if ((s=getenv("SGE_DELAY_TIME")) && sscanf(s, "%d", &val) == 1) delay_time = val; if ((s=getenv("SGE_TEST_HEARTBEAT_TIMEOUT")) && sscanf(s, "%d", &val) == 1) sge_test_heartbeat = val; } /* This needs a better solution */ umask(022); #ifdef __SGE_COMPILE_WITH_GETTEXT__ /* init language output for gettext() , it will use the right language */ sge_init_language_func((gettext_func_type) gettext, (setlocale_func_type) setlocale, (bindtextdomain_func_type) bindtextdomain, (textdomain_func_type) textdomain); sge_init_language(NULL,NULL); #endif /* __SGE_COMPILE_WITH_GETTEXT__ */ log_state_set_log_file(TMP_ERR_FILE_SHADOWD); if (sge_setup2(&ctx, SHADOWD, MAIN_THREAD, &alp, false) != AE_OK) { answer_list_output(&alp); SGE_EXIT((void**)&ctx, 1); } /* AA: TODO: change this */ ctx->set_exit_func(ctx, shadowd_exit_func); sge_setup_sig_handlers(SHADOWD); #if defined(SOLARIS) /* Init shared SMF libs if necessary */ if (sge_smf_used() == 1 && sge_smf_init_libs() != 0) { SGE_EXIT((void**)&ctx, 1); } #endif if (ctx->get_qmaster_spool_dir(ctx) != NULL) { char *shadowd_name = SGE_SHADOWD; /* is there a running shadowd on this host (with unqualified name) */ sprintf(shadowd_pidfile, "%s/"SHADOWD_PID_FILE, ctx->get_qmaster_spool_dir(ctx), ctx->get_unqualified_hostname(ctx)); DPRINTF(("pidfilename: %s\n", shadowd_pidfile)); if ((shadowd_pid = sge_readpid(shadowd_pidfile))) { DPRINTF(("shadowd_pid: "sge_U32CFormat"\n", sge_u32c(shadowd_pid))); if (!sge_checkprog(shadowd_pid, shadowd_name, PSCMD)) { CRITICAL((SGE_EVENT, MSG_SHADOWD_FOUNDRUNNINGSHADOWDWITHPIDXNOTSTARTING_I, (int) shadowd_pid)); SGE_EXIT((void**)&ctx, 1); } } ctx->prepare_enroll(ctx); /* is there a running shadowd on this host (with aliased name) */ sprintf(shadowd_pidfile, "%s/"SHADOWD_PID_FILE, ctx->get_qmaster_spool_dir(ctx), ctx->get_qualified_hostname(ctx)); DPRINTF(("pidfilename: %s\n", shadowd_pidfile)); if ((shadowd_pid = sge_readpid(shadowd_pidfile))) { DPRINTF(("shadowd_pid: "sge_U32CFormat"\n", sge_u32c(shadowd_pid))); if (!sge_checkprog(shadowd_pid, shadowd_name, PSCMD)) { CRITICAL((SGE_EVENT, MSG_SHADOWD_FOUNDRUNNINGSHADOWDWITHPIDXNOTSTARTING_I, (int) shadowd_pid)); SGE_EXIT((void**)&ctx, 1); } } } else { ctx->prepare_enroll(ctx); } if (parse_cmdline_shadowd(argc, argv) == 1) { SGE_EXIT((void**)&ctx, 0); } if (ctx->get_qmaster_spool_dir(ctx) == NULL) { CRITICAL((SGE_EVENT, MSG_SHADOWD_CANTREADQMASTERSPOOLDIRFROMX_S, ctx->get_bootstrap_file(ctx))); SGE_EXIT((void**)&ctx, 1); } if (chdir(ctx->get_qmaster_spool_dir(ctx))) { CRITICAL((SGE_EVENT, MSG_SHADOWD_CANTCHANGETOQMASTERSPOOLDIRX_S, ctx->get_qmaster_spool_dir(ctx))); SGE_EXIT((void**)&ctx, 1); } if (sge_set_admin_username(ctx->get_admin_user(ctx), err_str)) { CRITICAL((SGE_EVENT, SFNMAX, err_str)); SGE_EXIT((void**)&ctx, 1); } if (sge_switch2admin_user()) { CRITICAL((SGE_EVENT, SFNMAX, MSG_SHADOWD_CANTSWITCHTOADMIN_USER)); SGE_EXIT((void**)&ctx, 1); } sprintf(shadow_err_file, "messages_shadowd.%s", ctx->get_unqualified_hostname(ctx)); sprintf(qmaster_out_file, "messages_qmaster.%s", ctx->get_unqualified_hostname(ctx)); sge_copy_append(TMP_ERR_FILE_SHADOWD, shadow_err_file, SGE_MODE_APPEND); unlink(TMP_ERR_FILE_SHADOWD); log_state_set_log_as_admin_user(1); log_state_set_log_file(shadow_err_file); { int* tmp_fd_array = NULL; unsigned long tmp_fd_count = 0; if (cl_com_set_handle_fds(cl_com_get_handle(prognames[SHADOWD] ,0), &tmp_fd_array, &tmp_fd_count) == CL_RETVAL_OK) { sge_daemonize(tmp_fd_array, tmp_fd_count, ctx); if (tmp_fd_array != NULL) { sge_free(&tmp_fd_array); } } else { sge_daemonize(NULL, 0, ctx); } } /* shadowd pid file will contain aliased name */ sge_write_pid(shadowd_pidfile); starting_up(); sge_setup_sig_handlers(SHADOWD); last_heartbeat = get_qmaster_heartbeat(QMASTER_HEARTBEAT_FILE, 30); last = (time_t) sge_get_gmt(); /* set time of last check time */ delay = 0; while (!shut_me_down) { sleep(check_interval); /* get current heartbeat file content */ heartbeat = get_qmaster_heartbeat(QMASTER_HEARTBEAT_FILE, 30); now = (time_t) sge_get_gmt(); /* Only check when we could read the heartbeat file at least two times * (last_heartbeat and heartbeat) without error */ if (last_heartbeat > 0 && heartbeat > 0) { /* * OK we have to heartbeat entries to check. Check times ... * now = current time * last = last check time */ if ( (now - last) >= (get_active_interval + delay) ) { delay = 0; if (last_heartbeat == heartbeat) { DPRINTF(("heartbeat not changed since seconds: "sge_U32CFormat"\n", sge_u32c(now - last))); delay = delay_time; /* set delay time */ /* * check if we are a possible new qmaster host (lock file of qmaster active, etc.) */ ret = check_if_valid_shadow(binpath, oldqmaster, ctx->get_act_qmaster_file(ctx), ctx->get_shadow_master_file(ctx), ctx->get_qualified_hostname(ctx), ctx->get_binary_path(ctx)); if (ret == 0) { /* we can start a qmaster on this host */ if (qmaster_lock(QMASTER_LOCK_FILE)) { ERROR((SGE_EVENT, SFNMAX, MSG_SHADOWD_FAILEDTOLOCKQMASTERSOMBODYWASFASTER)); } else { int out, err; /* still the old qmaster name in act_qmaster file and still the old heartbeat */ latest_heartbeat = get_qmaster_heartbeat( QMASTER_HEARTBEAT_FILE, 30); /* TODO: what do we when there is a timeout ??? */ DPRINTF(("old qmaster name in act_qmaster and old heartbeat\n")); if (!compare_qmaster_names(ctx->get_act_qmaster_file(ctx), oldqmaster) && !shadowd_is_old_master_enrolled(sge_test_heartbeat, sge_get_qmaster_port(NULL), oldqmaster) && (latest_heartbeat == heartbeat)) { char qmaster_name[256]; strcpy(qmaster_name, SGE_PREFIX); strcat(qmaster_name, prognames[QMASTER]); DPRINTF(("qmaster_name: "SFN"\n", qmaster_name)); /* * open logfile as admin user for initial qmaster/schedd * startup messages */ out = SGE_OPEN3(qmaster_out_file, O_CREAT|O_WRONLY|O_APPEND, 0644); err = out; if (out == -1) { /* * First priority is the master restart * => ignore this error */ out = 1; err = 2; } sge_switch2start_user(); ret = startprog(out, err, NULL, binpath, qmaster_name, NULL); sge_switch2admin_user(); if (ret) { ERROR((SGE_EVENT, SFNMAX, MSG_SHADOWD_CANTSTARTQMASTER)); } close(out); } else { qmaster_unlock(QMASTER_LOCK_FILE); } } } else { if (ret == -1) { /* just log the more important failures */ WARNING((SGE_EVENT, MSG_SHADOWD_DELAYINGSHADOWFUNCFORXSECONDS_U, sge_u32c(delay) )); } } } /* Begin a new interval, set timers and hearbeat to current values */ last = now; last_heartbeat = heartbeat; } } else { if (last_heartbeat < 0 || heartbeat < 0) { /* There was an error reading heartbeat or last_heartbeat */ DPRINTF(("can't read heartbeat file. last_heartbeat="sge_U32CFormat", heartbeat="sge_U32CFormat"\n", sge_u32c(last_heartbeat), sge_u32c(heartbeat))); } else { DPRINTF(("have to read the heartbeat file twice to check time differences\n")); } } } sge_shutdown((void**)&ctx, 0); DRETURN(EXIT_SUCCESS); }
/*-------------------------------------------------------------------------*/ int main(int argc, char **argv) { int ret; int my_pid; int ret_val; int printed_points = 0; int max_enroll_tries; static char tmp_err_file_name[SGE_PATH_MAX]; time_t next_prof_output = 0; int execd_exit_state = 0; lList **master_job_list = NULL; sge_gdi_ctx_class_t *ctx = NULL; lList *alp = NULL; DENTER_MAIN(TOP_LAYER, "execd"); #if defined(LINUX) gen_procList (); #endif prof_mt_init(); set_thread_name(pthread_self(),"Execd Thread"); prof_set_level_name(SGE_PROF_CUSTOM1, "Execd Thread", NULL); prof_set_level_name(SGE_PROF_CUSTOM2, "Execd Dispatch", NULL); #ifdef __SGE_COMPILE_WITH_GETTEXT__ /* init language output for gettext() , it will use the right language */ sge_init_language_func((gettext_func_type) gettext, (setlocale_func_type) setlocale, (bindtextdomain_func_type) bindtextdomain, (textdomain_func_type) textdomain); sge_init_language(NULL,NULL); #endif /* __SGE_COMPILE_WITH_GETTEXT__ */ /* This needs a better solution */ umask(022); /* Initialize path for temporary logging until we chdir to spool */ my_pid = getpid(); sprintf(tmp_err_file_name,"%s."sge_U32CFormat"", TMP_ERR_FILE_EXECD, sge_u32c(my_pid)); log_state_set_log_file(tmp_err_file_name); /* exit func for SGE_EXIT() */ sge_sig_handler_in_main_loop = 0; sge_setup_sig_handlers(EXECD); if (sge_setup2(&ctx, EXECD, MAIN_THREAD, &alp, false) != AE_OK) { answer_list_output(&alp); SGE_EXIT((void**)&ctx, 1); } ctx->set_exit_func(ctx, execd_exit_func); #if defined(SOLARIS) /* Init shared SMF libs if necessary */ if (sge_smf_used() == 1 && sge_smf_init_libs() != 0) { SGE_EXIT((void**)&ctx, 1); } #endif /* prepare daemonize */ if (!getenv("SGE_ND")) { sge_daemonize_prepare(ctx); } if ((ret=sge_occupy_first_three())>=0) { CRITICAL((SGE_EVENT, MSG_FILE_REDIRECTFD_I, ret)); SGE_EXIT((void**)&ctx, 1); } lInit(nmv); /* unset XAUTHORITY if set */ if (getenv("XAUTHORITY") != NULL) { sge_unsetenv("XAUTHORITY"); } parse_cmdline_execd(argv); /* exit if we can't get communication handle (bind port) */ max_enroll_tries = 30; while (cl_com_get_handle(prognames[EXECD],1) == NULL) { ctx->prepare_enroll(ctx); max_enroll_tries--; if (max_enroll_tries <= 0 || shut_me_down) { /* exit after 30 seconds */ if (printed_points != 0) { printf("\n"); } CRITICAL((SGE_EVENT, MSG_COM_ERROR)); SGE_EXIT((void**)&ctx, 1); } if (cl_com_get_handle(prognames[EXECD],1) == NULL) { /* sleep when prepare_enroll() failed */ sleep(1); if (max_enroll_tries < 27) { printf("."); printed_points++; fflush(stdout); } } } if (printed_points != 0) { printf("\n"); } /* * now the commlib up and running. Set execd application status function * ( commlib callback function for qping status information response * messages (SIRM) ) */ ret_val = cl_com_set_status_func(sge_execd_application_status); if (ret_val != CL_RETVAL_OK) { ERROR((SGE_EVENT, cl_get_error_text(ret_val)) ); } /* test connection */ { cl_com_SIRM_t* status = NULL; ret_val = cl_commlib_get_endpoint_status(ctx->get_com_handle(ctx), (char *)ctx->get_master(ctx, true), (char*)prognames[QMASTER], 1, &status); if (ret_val != CL_RETVAL_OK) { ERROR((SGE_EVENT, cl_get_error_text(ret_val))); ERROR((SGE_EVENT, MSG_CONF_NOCONFBG)); } cl_com_free_sirm_message(&status); } /* finalize daeamonize */ if (!getenv("SGE_ND")) { sge_daemonize_finalize(ctx); } /* daemonizes if qmaster is unreachable */ sge_setup_sge_execd(ctx, tmp_err_file_name); /* are we using qidle or not */ sge_ls_qidle(mconf_get_use_qidle()); sge_ls_gnu_ls(1); DPRINTF(("use_qidle: %d\n", mconf_get_use_qidle())); /* test load sensor (internal or external) */ { lList *report_list = sge_build_load_report(ctx->get_qualified_hostname(ctx), ctx->get_binary_path(ctx)); lFreeList(&report_list); } /* here we have to wait for qmaster registration */ while (sge_execd_register_at_qmaster(ctx, false) != 0) { if (sge_get_com_error_flag(EXECD, SGE_COM_ACCESS_DENIED, true)) { /* This is no error */ DPRINTF(("***** got SGE_COM_ACCESS_DENIED from qmaster *****\n")); } if (sge_get_com_error_flag(EXECD, SGE_COM_ENDPOINT_NOT_UNIQUE, false)) { execd_exit_state = SGE_COM_ENDPOINT_NOT_UNIQUE; break; } if (shut_me_down != 0) { break; } sleep(30); } /* * Terminate on SIGTERM or hard communication error */ if (execd_exit_state != 0 || shut_me_down != 0) { sge_shutdown((void**)&ctx, execd_exit_state); DRETURN(execd_exit_state); } /* * We write pid file when we are connected to qmaster. Otherwise an old * execd might overwrite our pidfile. */ sge_write_pid(EXECD_PID_FILE); /* * At this point we are sure we are the only sge_execd and we are connected * to the current qmaster. First we have to report any reaped children * that might exist. */ starting_up(); /* * Log a warning message if execd hasn't been started by a superuser */ if (!sge_is_start_user_superuser()) { WARNING((SGE_EVENT, MSG_SWITCH_USER_NOT_ROOT)); } #ifdef COMPILE_DC if (ptf_init()) { CRITICAL((SGE_EVENT, MSG_EXECD_NOSTARTPTF)); SGE_EXIT((void**)&ctx, 1); } INFO((SGE_EVENT, MSG_EXECD_STARTPDCANDPTF)); #endif master_job_list = object_type_get_master_list(SGE_TYPE_JOB); *master_job_list = lCreateList("Master_Job_List", JB_Type); job_list_read_from_disk(master_job_list, "Master_Job_List", 0, SPOOL_WITHIN_EXECD, job_initialize_job); /* clean up jobs hanging around (look in active_dir) */ clean_up_old_jobs(ctx, 1); execd_trash_load_report(); sge_set_flush_lr_flag(true); sge_sig_handler_in_main_loop = 1; if (thread_prof_active_by_id(pthread_self())) { prof_start(SGE_PROF_CUSTOM1, NULL); prof_start(SGE_PROF_CUSTOM2, NULL); prof_start(SGE_PROF_GDI_REQUEST, NULL); } else { prof_stop(SGE_PROF_CUSTOM1, NULL); prof_stop(SGE_PROF_CUSTOM2, NULL); prof_stop(SGE_PROF_GDI_REQUEST, NULL); } PROF_START_MEASUREMENT(SGE_PROF_CUSTOM1); /* Start dispatching */ execd_exit_state = sge_execd_process_messages(ctx); /* * This code is only reached when dispatcher terminates and execd goes down. */ /* log if we received SIGPIPE signal */ if (sge_sig_handler_sigpipe_received) { sge_sig_handler_sigpipe_received = 0; INFO((SGE_EVENT, "SIGPIPE received\n")); } #if defined(LINUX) free_procList(); #endif lFreeList(master_job_list); PROF_STOP_MEASUREMENT(SGE_PROF_CUSTOM1); if (prof_is_active(SGE_PROF_ALL)) { time_t now = (time_t)sge_get_gmt(); if (now > next_prof_output) { prof_output_info(SGE_PROF_ALL, false, "profiling summary:\n"); prof_reset(SGE_PROF_ALL,NULL); next_prof_output = now + 60; } } sge_prof_cleanup(); sge_shutdown((void**)&ctx, execd_exit_state); DRETURN(execd_exit_state); }