/****** qmaster/setup_qmaster/sge_setup_qmaster() ****************************** * NAME * sge_setup_qmaster() -- setup qmaster * * SYNOPSIS * int sge_setup_qmaster(char* anArgv[]) * * FUNCTION * Process commandline arguments. Remove qmaster lock file. Write qmaster * host to the 'act_qmaster' file. Initialize qmaster and reporting. Write * qmaster PID file. * * NOTE: Before this function is invoked, qmaster must become admin user. * * INPUTS * char* anArgv[] - commandline argument vector * * RESULT * 0 - success * * NOTES * MT-NOTE: sge_setup_qmaster() is NOT MT safe! * MT-NOTE: * MT-NOTE: This function must be called exclusively, with the qmaster main * MT-NOTE: thread being the *only* active thread. In other words, do not * MT-NOTE: invoke this function after any additional thread (directly or * MT-NOTE: indirectly) has been created. * * Do *not* write the qmaster pid file, before 'qmaster_init()' did return * successfully. Otherwise, if we do have a running qmaster and a second * qmaster is started (illegally) on the same host, the second qmaster will * overwrite the pid of the qmaster started first. The second qmaster will * detect it's insubordinate doing and terminate itself, thus leaving behind * a useless pid. * *******************************************************************************/ int sge_setup_qmaster(sge_gdi_ctx_class_t *ctx, char* anArgv[]) { char err_str[1024]; const char *qualified_hostname = ctx->get_qualified_hostname(ctx); const char *act_qmaster_file = ctx->get_act_qmaster_file(ctx); DENTER(TOP_LAYER, "sge_setup_qmaster"); umask(022); /* this needs a better solution */ process_cmdline(anArgv); INFO((SGE_EVENT, SFNMAX, MSG_STARTUP_BEGINWITHSTARTUP)); qmaster_unlock(QMASTER_LOCK_FILE); if (write_qm_name(qualified_hostname, act_qmaster_file, err_str)) { ERROR((SGE_EVENT, "%s\n", err_str)); SGE_EXIT(NULL, 1); } qmaster_init(ctx, anArgv); sge_write_pid(QMASTER_PID_FILE); DEXIT; return 0; } /* sge_setup_qmaster() */
static void qevent_dump_pid_file(void) { sge_write_pid("qevent.pid"); }
/*----------------------------------------------------------------------------*/ int main(int argc, char **argv) { int heartbeat = 0; int last_heartbeat = 0; int latest_heartbeat = 0; int ret = 0; int delay = 0; time_t now, last; /* const char *cp; */ char err_str[MAX_STRING_SIZE]; char shadowd_pidfile[SGE_PATH_MAX]; dstring ds; char buffer[256]; pid_t shadowd_pid; #if 1 static int check_interval = CHECK_INTERVAL; static int get_active_interval = GET_ACTIVE_INTERVAL; static int delay_time = DELAY_TIME; static int sge_test_heartbeat = 0; char binpath[SGE_PATH_MAX]; char oldqmaster[SGE_PATH_MAX]; char shadow_err_file[SGE_PATH_MAX]; char qmaster_out_file[SGE_PATH_MAX]; #endif lList *alp = NULL; sge_gdi_ctx_class_t *ctx = NULL; DENTER_MAIN(TOP_LAYER, "sge_shadowd"); sge_dstring_init(&ds, buffer, sizeof(buffer)); /* initialize recovery control variables */ { char *s; int val; if ((s=getenv("SGE_CHECK_INTERVAL")) && sscanf(s, "%d", &val) == 1) check_interval = val; if ((s=getenv("SGE_GET_ACTIVE_INTERVAL")) && sscanf(s, "%d", &val) == 1) get_active_interval = val; if ((s=getenv("SGE_DELAY_TIME")) && sscanf(s, "%d", &val) == 1) delay_time = val; if ((s=getenv("SGE_TEST_HEARTBEAT_TIMEOUT")) && sscanf(s, "%d", &val) == 1) sge_test_heartbeat = val; } /* This needs a better solution */ umask(022); #ifdef __SGE_COMPILE_WITH_GETTEXT__ /* init language output for gettext() , it will use the right language */ sge_init_language_func((gettext_func_type) gettext, (setlocale_func_type) setlocale, (bindtextdomain_func_type) bindtextdomain, (textdomain_func_type) textdomain); sge_init_language(NULL,NULL); #endif /* __SGE_COMPILE_WITH_GETTEXT__ */ log_state_set_log_file(TMP_ERR_FILE_SHADOWD); if (sge_setup2(&ctx, SHADOWD, MAIN_THREAD, &alp, false) != AE_OK) { answer_list_output(&alp); SGE_EXIT((void**)&ctx, 1); } /* AA: TODO: change this */ ctx->set_exit_func(ctx, shadowd_exit_func); sge_setup_sig_handlers(SHADOWD); #if defined(SOLARIS) /* Init shared SMF libs if necessary */ if (sge_smf_used() == 1 && sge_smf_init_libs() != 0) { SGE_EXIT((void**)&ctx, 1); } #endif if (ctx->get_qmaster_spool_dir(ctx) != NULL) { char *shadowd_name = SGE_SHADOWD; /* is there a running shadowd on this host (with unqualified name) */ sprintf(shadowd_pidfile, "%s/"SHADOWD_PID_FILE, ctx->get_qmaster_spool_dir(ctx), ctx->get_unqualified_hostname(ctx)); DPRINTF(("pidfilename: %s\n", shadowd_pidfile)); if ((shadowd_pid = sge_readpid(shadowd_pidfile))) { DPRINTF(("shadowd_pid: "sge_U32CFormat"\n", sge_u32c(shadowd_pid))); if (!sge_checkprog(shadowd_pid, shadowd_name, PSCMD)) { CRITICAL((SGE_EVENT, MSG_SHADOWD_FOUNDRUNNINGSHADOWDWITHPIDXNOTSTARTING_I, (int) shadowd_pid)); SGE_EXIT((void**)&ctx, 1); } } ctx->prepare_enroll(ctx); /* is there a running shadowd on this host (with aliased name) */ sprintf(shadowd_pidfile, "%s/"SHADOWD_PID_FILE, ctx->get_qmaster_spool_dir(ctx), ctx->get_qualified_hostname(ctx)); DPRINTF(("pidfilename: %s\n", shadowd_pidfile)); if ((shadowd_pid = sge_readpid(shadowd_pidfile))) { DPRINTF(("shadowd_pid: "sge_U32CFormat"\n", sge_u32c(shadowd_pid))); if (!sge_checkprog(shadowd_pid, shadowd_name, PSCMD)) { CRITICAL((SGE_EVENT, MSG_SHADOWD_FOUNDRUNNINGSHADOWDWITHPIDXNOTSTARTING_I, (int) shadowd_pid)); SGE_EXIT((void**)&ctx, 1); } } } else { ctx->prepare_enroll(ctx); } if (parse_cmdline_shadowd(argc, argv) == 1) { SGE_EXIT((void**)&ctx, 0); } if (ctx->get_qmaster_spool_dir(ctx) == NULL) { CRITICAL((SGE_EVENT, MSG_SHADOWD_CANTREADQMASTERSPOOLDIRFROMX_S, ctx->get_bootstrap_file(ctx))); SGE_EXIT((void**)&ctx, 1); } if (chdir(ctx->get_qmaster_spool_dir(ctx))) { CRITICAL((SGE_EVENT, MSG_SHADOWD_CANTCHANGETOQMASTERSPOOLDIRX_S, ctx->get_qmaster_spool_dir(ctx))); SGE_EXIT((void**)&ctx, 1); } if (sge_set_admin_username(ctx->get_admin_user(ctx), err_str)) { CRITICAL((SGE_EVENT, SFNMAX, err_str)); SGE_EXIT((void**)&ctx, 1); } if (sge_switch2admin_user()) { CRITICAL((SGE_EVENT, SFNMAX, MSG_SHADOWD_CANTSWITCHTOADMIN_USER)); SGE_EXIT((void**)&ctx, 1); } sprintf(shadow_err_file, "messages_shadowd.%s", ctx->get_unqualified_hostname(ctx)); sprintf(qmaster_out_file, "messages_qmaster.%s", ctx->get_unqualified_hostname(ctx)); sge_copy_append(TMP_ERR_FILE_SHADOWD, shadow_err_file, SGE_MODE_APPEND); unlink(TMP_ERR_FILE_SHADOWD); log_state_set_log_as_admin_user(1); log_state_set_log_file(shadow_err_file); { int* tmp_fd_array = NULL; unsigned long tmp_fd_count = 0; if (cl_com_set_handle_fds(cl_com_get_handle(prognames[SHADOWD] ,0), &tmp_fd_array, &tmp_fd_count) == CL_RETVAL_OK) { sge_daemonize(tmp_fd_array, tmp_fd_count, ctx); if (tmp_fd_array != NULL) { sge_free(&tmp_fd_array); } } else { sge_daemonize(NULL, 0, ctx); } } /* shadowd pid file will contain aliased name */ sge_write_pid(shadowd_pidfile); starting_up(); sge_setup_sig_handlers(SHADOWD); last_heartbeat = get_qmaster_heartbeat(QMASTER_HEARTBEAT_FILE, 30); last = (time_t) sge_get_gmt(); /* set time of last check time */ delay = 0; while (!shut_me_down) { sleep(check_interval); /* get current heartbeat file content */ heartbeat = get_qmaster_heartbeat(QMASTER_HEARTBEAT_FILE, 30); now = (time_t) sge_get_gmt(); /* Only check when we could read the heartbeat file at least two times * (last_heartbeat and heartbeat) without error */ if (last_heartbeat > 0 && heartbeat > 0) { /* * OK we have to heartbeat entries to check. Check times ... * now = current time * last = last check time */ if ( (now - last) >= (get_active_interval + delay) ) { delay = 0; if (last_heartbeat == heartbeat) { DPRINTF(("heartbeat not changed since seconds: "sge_U32CFormat"\n", sge_u32c(now - last))); delay = delay_time; /* set delay time */ /* * check if we are a possible new qmaster host (lock file of qmaster active, etc.) */ ret = check_if_valid_shadow(binpath, oldqmaster, ctx->get_act_qmaster_file(ctx), ctx->get_shadow_master_file(ctx), ctx->get_qualified_hostname(ctx), ctx->get_binary_path(ctx)); if (ret == 0) { /* we can start a qmaster on this host */ if (qmaster_lock(QMASTER_LOCK_FILE)) { ERROR((SGE_EVENT, SFNMAX, MSG_SHADOWD_FAILEDTOLOCKQMASTERSOMBODYWASFASTER)); } else { int out, err; /* still the old qmaster name in act_qmaster file and still the old heartbeat */ latest_heartbeat = get_qmaster_heartbeat( QMASTER_HEARTBEAT_FILE, 30); /* TODO: what do we when there is a timeout ??? */ DPRINTF(("old qmaster name in act_qmaster and old heartbeat\n")); if (!compare_qmaster_names(ctx->get_act_qmaster_file(ctx), oldqmaster) && !shadowd_is_old_master_enrolled(sge_test_heartbeat, sge_get_qmaster_port(NULL), oldqmaster) && (latest_heartbeat == heartbeat)) { char qmaster_name[256]; strcpy(qmaster_name, SGE_PREFIX); strcat(qmaster_name, prognames[QMASTER]); DPRINTF(("qmaster_name: "SFN"\n", qmaster_name)); /* * open logfile as admin user for initial qmaster/schedd * startup messages */ out = SGE_OPEN3(qmaster_out_file, O_CREAT|O_WRONLY|O_APPEND, 0644); err = out; if (out == -1) { /* * First priority is the master restart * => ignore this error */ out = 1; err = 2; } sge_switch2start_user(); ret = startprog(out, err, NULL, binpath, qmaster_name, NULL); sge_switch2admin_user(); if (ret) { ERROR((SGE_EVENT, SFNMAX, MSG_SHADOWD_CANTSTARTQMASTER)); } close(out); } else { qmaster_unlock(QMASTER_LOCK_FILE); } } } else { if (ret == -1) { /* just log the more important failures */ WARNING((SGE_EVENT, MSG_SHADOWD_DELAYINGSHADOWFUNCFORXSECONDS_U, sge_u32c(delay) )); } } } /* Begin a new interval, set timers and hearbeat to current values */ last = now; last_heartbeat = heartbeat; } } else { if (last_heartbeat < 0 || heartbeat < 0) { /* There was an error reading heartbeat or last_heartbeat */ DPRINTF(("can't read heartbeat file. last_heartbeat="sge_U32CFormat", heartbeat="sge_U32CFormat"\n", sge_u32c(last_heartbeat), sge_u32c(heartbeat))); } else { DPRINTF(("have to read the heartbeat file twice to check time differences\n")); } } } sge_shutdown((void**)&ctx, 0); DRETURN(EXIT_SUCCESS); }
/*-------------------------------------------------------------------------*/ int main(int argc, char **argv) { int ret; int my_pid; int ret_val; int printed_points = 0; int max_enroll_tries; static char tmp_err_file_name[SGE_PATH_MAX]; time_t next_prof_output = 0; int execd_exit_state = 0; lList **master_job_list = NULL; sge_gdi_ctx_class_t *ctx = NULL; lList *alp = NULL; DENTER_MAIN(TOP_LAYER, "execd"); #if defined(LINUX) gen_procList (); #endif prof_mt_init(); set_thread_name(pthread_self(),"Execd Thread"); prof_set_level_name(SGE_PROF_CUSTOM1, "Execd Thread", NULL); prof_set_level_name(SGE_PROF_CUSTOM2, "Execd Dispatch", NULL); #ifdef __SGE_COMPILE_WITH_GETTEXT__ /* init language output for gettext() , it will use the right language */ sge_init_language_func((gettext_func_type) gettext, (setlocale_func_type) setlocale, (bindtextdomain_func_type) bindtextdomain, (textdomain_func_type) textdomain); sge_init_language(NULL,NULL); #endif /* __SGE_COMPILE_WITH_GETTEXT__ */ /* This needs a better solution */ umask(022); /* Initialize path for temporary logging until we chdir to spool */ my_pid = getpid(); sprintf(tmp_err_file_name,"%s."sge_U32CFormat"", TMP_ERR_FILE_EXECD, sge_u32c(my_pid)); log_state_set_log_file(tmp_err_file_name); /* exit func for SGE_EXIT() */ sge_sig_handler_in_main_loop = 0; sge_setup_sig_handlers(EXECD); if (sge_setup2(&ctx, EXECD, MAIN_THREAD, &alp, false) != AE_OK) { answer_list_output(&alp); SGE_EXIT((void**)&ctx, 1); } ctx->set_exit_func(ctx, execd_exit_func); #if defined(SOLARIS) /* Init shared SMF libs if necessary */ if (sge_smf_used() == 1 && sge_smf_init_libs() != 0) { SGE_EXIT((void**)&ctx, 1); } #endif /* prepare daemonize */ if (!getenv("SGE_ND")) { sge_daemonize_prepare(ctx); } if ((ret=sge_occupy_first_three())>=0) { CRITICAL((SGE_EVENT, MSG_FILE_REDIRECTFD_I, ret)); SGE_EXIT((void**)&ctx, 1); } lInit(nmv); /* unset XAUTHORITY if set */ if (getenv("XAUTHORITY") != NULL) { sge_unsetenv("XAUTHORITY"); } parse_cmdline_execd(argv); /* exit if we can't get communication handle (bind port) */ max_enroll_tries = 30; while (cl_com_get_handle(prognames[EXECD],1) == NULL) { ctx->prepare_enroll(ctx); max_enroll_tries--; if (max_enroll_tries <= 0 || shut_me_down) { /* exit after 30 seconds */ if (printed_points != 0) { printf("\n"); } CRITICAL((SGE_EVENT, MSG_COM_ERROR)); SGE_EXIT((void**)&ctx, 1); } if (cl_com_get_handle(prognames[EXECD],1) == NULL) { /* sleep when prepare_enroll() failed */ sleep(1); if (max_enroll_tries < 27) { printf("."); printed_points++; fflush(stdout); } } } if (printed_points != 0) { printf("\n"); } /* * now the commlib up and running. Set execd application status function * ( commlib callback function for qping status information response * messages (SIRM) ) */ ret_val = cl_com_set_status_func(sge_execd_application_status); if (ret_val != CL_RETVAL_OK) { ERROR((SGE_EVENT, cl_get_error_text(ret_val)) ); } /* test connection */ { cl_com_SIRM_t* status = NULL; ret_val = cl_commlib_get_endpoint_status(ctx->get_com_handle(ctx), (char *)ctx->get_master(ctx, true), (char*)prognames[QMASTER], 1, &status); if (ret_val != CL_RETVAL_OK) { ERROR((SGE_EVENT, cl_get_error_text(ret_val))); ERROR((SGE_EVENT, MSG_CONF_NOCONFBG)); } cl_com_free_sirm_message(&status); } /* finalize daeamonize */ if (!getenv("SGE_ND")) { sge_daemonize_finalize(ctx); } /* daemonizes if qmaster is unreachable */ sge_setup_sge_execd(ctx, tmp_err_file_name); /* are we using qidle or not */ sge_ls_qidle(mconf_get_use_qidle()); sge_ls_gnu_ls(1); DPRINTF(("use_qidle: %d\n", mconf_get_use_qidle())); /* test load sensor (internal or external) */ { lList *report_list = sge_build_load_report(ctx->get_qualified_hostname(ctx), ctx->get_binary_path(ctx)); lFreeList(&report_list); } /* here we have to wait for qmaster registration */ while (sge_execd_register_at_qmaster(ctx, false) != 0) { if (sge_get_com_error_flag(EXECD, SGE_COM_ACCESS_DENIED, true)) { /* This is no error */ DPRINTF(("***** got SGE_COM_ACCESS_DENIED from qmaster *****\n")); } if (sge_get_com_error_flag(EXECD, SGE_COM_ENDPOINT_NOT_UNIQUE, false)) { execd_exit_state = SGE_COM_ENDPOINT_NOT_UNIQUE; break; } if (shut_me_down != 0) { break; } sleep(30); } /* * Terminate on SIGTERM or hard communication error */ if (execd_exit_state != 0 || shut_me_down != 0) { sge_shutdown((void**)&ctx, execd_exit_state); DRETURN(execd_exit_state); } /* * We write pid file when we are connected to qmaster. Otherwise an old * execd might overwrite our pidfile. */ sge_write_pid(EXECD_PID_FILE); /* * At this point we are sure we are the only sge_execd and we are connected * to the current qmaster. First we have to report any reaped children * that might exist. */ starting_up(); /* * Log a warning message if execd hasn't been started by a superuser */ if (!sge_is_start_user_superuser()) { WARNING((SGE_EVENT, MSG_SWITCH_USER_NOT_ROOT)); } #ifdef COMPILE_DC if (ptf_init()) { CRITICAL((SGE_EVENT, MSG_EXECD_NOSTARTPTF)); SGE_EXIT((void**)&ctx, 1); } INFO((SGE_EVENT, MSG_EXECD_STARTPDCANDPTF)); #endif master_job_list = object_type_get_master_list(SGE_TYPE_JOB); *master_job_list = lCreateList("Master_Job_List", JB_Type); job_list_read_from_disk(master_job_list, "Master_Job_List", 0, SPOOL_WITHIN_EXECD, job_initialize_job); /* clean up jobs hanging around (look in active_dir) */ clean_up_old_jobs(ctx, 1); execd_trash_load_report(); sge_set_flush_lr_flag(true); sge_sig_handler_in_main_loop = 1; if (thread_prof_active_by_id(pthread_self())) { prof_start(SGE_PROF_CUSTOM1, NULL); prof_start(SGE_PROF_CUSTOM2, NULL); prof_start(SGE_PROF_GDI_REQUEST, NULL); } else { prof_stop(SGE_PROF_CUSTOM1, NULL); prof_stop(SGE_PROF_CUSTOM2, NULL); prof_stop(SGE_PROF_GDI_REQUEST, NULL); } PROF_START_MEASUREMENT(SGE_PROF_CUSTOM1); /* Start dispatching */ execd_exit_state = sge_execd_process_messages(ctx); /* * This code is only reached when dispatcher terminates and execd goes down. */ /* log if we received SIGPIPE signal */ if (sge_sig_handler_sigpipe_received) { sge_sig_handler_sigpipe_received = 0; INFO((SGE_EVENT, "SIGPIPE received\n")); } #if defined(LINUX) free_procList(); #endif lFreeList(master_job_list); PROF_STOP_MEASUREMENT(SGE_PROF_CUSTOM1); if (prof_is_active(SGE_PROF_ALL)) { time_t now = (time_t)sge_get_gmt(); if (now > next_prof_output) { prof_output_info(SGE_PROF_ALL, false, "profiling summary:\n"); prof_reset(SGE_PROF_ALL,NULL); next_prof_output = now + 60; } } sge_prof_cleanup(); sge_shutdown((void**)&ctx, execd_exit_state); DRETURN(execd_exit_state); }