static int init_framework(sge_gdi_ctx_class_t *ctx, bdb_info *info) { int ret = EXIT_FAILURE; lList *answer_list = NULL; lListElem *spooling_context = NULL; const char *spooling_method = ctx->get_spooling_method(ctx); const char *spooling_lib = ctx->get_spooling_lib(ctx); const char *spooling_params = ctx->get_spooling_params(ctx); DENTER(TOP_LAYER, "init_framework"); #ifdef HP1164 sge_set_admin_username("none", NULL); #endif /* create spooling context */ spooling_context = spool_create_dynamic_context(&answer_list, spooling_method, spooling_lib, spooling_params); answer_list_output(&answer_list); if (!strcmp(bootstrap_get_spooling_method(),"classic")) { CRITICAL((SGE_EVENT, MSG_SPOOLDEFAULTS_CANTHANDLECLASSICSPOOLING)); } else if (spooling_context == NULL) { CRITICAL((SGE_EVENT, MSG_SPOOLDEFAULTS_CANNOTCREATECONTEXT)); } else { spool_set_default_context(spooling_context); spool_set_option(&answer_list, spooling_context, "recover=false"); answer_list_output(&answer_list); /* initialize spooling context */ if (!spool_startup_context(&answer_list, spooling_context, true)) { CRITICAL((SGE_EVENT, MSG_SPOOLDEFAULTS_CANNOTSTARTUPCONTEXT)); } else { /* search the berkeley db info - take it from any object type, * berkeleydb spools all objects using the same rule. */ lListElem *type = spool_context_search_type(spooling_context, SGE_TYPE_JOB); lListElem *rule = spool_type_search_default_rule(type); *info = (bdb_info)lGetRef(rule, SPR_clientdata); ret = EXIT_SUCCESS; } answer_list_output(&answer_list); } DRETURN(ret); }
/****** qmaster/setup_qmaster/sge_qmaster_thread_init() ************************ * NAME * sge_qmaster_thread_init() -- Initialize a qmaster thread. * * SYNOPSIS * int sge_qmaster_thread_init(bool switch_to_admin_user) * * FUNCTION * Subsume functions which need to be called immediately after thread * startup. This function does make sure that the thread local data * structures do contain reasonable values. * * INPUTS * bool switch_to_admin_user - become admin user if set to true * * RESULT * 0 - success * * NOTES * MT-NOTE: sge_qmaster_thread_init() is MT safe * MT-NOTE: * MT-NOTE: sge_qmaster_thread_init() should be invoked at the beginning * MT-NOTE: of a thread function. * *******************************************************************************/ int sge_qmaster_thread_init(sge_gdi_ctx_class_t **ctx_ref, u_long32 prog_id, u_long32 thread_id, bool switch_to_admin_user) { const char *admin_user = NULL; lList *alp = NULL; sge_gdi_ctx_class_t *ctx = NULL; DENTER(TOP_LAYER, "sge_qmaster_thread_init"); lInit(nmv); if (sge_setup2(ctx_ref, prog_id, thread_id, &alp, true) != AE_OK) { answer_list_output(&alp); SGE_EXIT((void**)ctx_ref, 1); } ctx = *ctx_ref; ctx->reresolve_qualified_hostname(ctx); DEBUG((SGE_EVENT,"%s: qualified hostname \"%s\"\n", SGE_FUNC, ctx->get_qualified_hostname(ctx))); admin_user = ctx->get_admin_user(ctx); if (switch_to_admin_user == true) { char str[MAX_STRING_SIZE]; if (sge_set_admin_username(admin_user, str) == -1) { CRITICAL((SGE_EVENT, SFNMAX, str)); SGE_EXIT((void**)ctx_ref, 1); } if (sge_switch2admin_user()) { CRITICAL((SGE_EVENT, SFNMAX, MSG_ERROR_CANTSWITCHTOADMINUSER)); SGE_EXIT((void**)ctx_ref, 1); } } DEXIT; return 0; } /* sge_qmaster_thread_init() */
/*-------------------------------------------------------------------*/ void sge_setup_sge_execd(sge_gdi_ctx_class_t *ctx, const char* tmp_err_file_name) { char err_str[MAX_STRING_SIZE]; int allowed_get_conf_errors = 5; char* spool_dir = NULL; const char *unqualified_hostname = ctx->get_unqualified_hostname(ctx); const char *admin_user = ctx->get_admin_user(ctx); DENTER(TOP_LAYER, "sge_setup_sge_execd"); /* TODO: is this the right place to switch the user ? ports below 1024 ok */ /* ** switch to admin user */ if (sge_set_admin_username(admin_user, err_str)) { CRITICAL((SGE_EVENT, SFNMAX, err_str)); /* TODO: remove */ SGE_EXIT(NULL, 1); } if (sge_switch2admin_user()) { CRITICAL((SGE_EVENT, SFNMAX, MSG_ERROR_CANTSWITCHTOADMINUSER)); /* TODO: remove */ SGE_EXIT(NULL, 1); } while (gdi2_wait_for_conf(ctx, &Execd_Config_List)) { if (allowed_get_conf_errors-- <= 0) { CRITICAL((SGE_EVENT, SFNMAX, MSG_EXECD_CANT_GET_CONFIGURATION_EXIT)); /* TODO: remove */ SGE_EXIT(NULL, 1); } sleep(1); ctx->get_master(ctx, true); } sge_show_conf(); /* get aliased hostname */ /* TODO: is this call needed ? */ ctx->reresolve_qualified_hostname(ctx); spool_dir = mconf_get_execd_spool_dir(); DPRINTF(("chdir(\"/\")----------------------------\n")); sge_chdir_exit("/",1); DPRINTF(("Making directories----------------------------\n")); sge_mkdir(spool_dir, 0755, 1, 0); DPRINTF(("chdir(\"%s\")----------------------------\n", spool_dir)); sge_chdir_exit(spool_dir,1); sge_mkdir(unqualified_hostname, 0755, 1, 0); DPRINTF(("chdir(\"%s\",me.unqualified_hostname)--------------------------\n", unqualified_hostname)); sge_chdir_exit(unqualified_hostname, 1); /* having passed the previous statement we may log messages into the ERR_FILE */ if ( tmp_err_file_name != NULL) { sge_copy_append((char*)tmp_err_file_name, ERR_FILE, SGE_MODE_APPEND); } sge_switch2start_user(); if ( tmp_err_file_name != NULL) { unlink(tmp_err_file_name); } sge_switch2admin_user(); log_state_set_log_as_admin_user(1); sprintf(execd_messages_file, "%s/%s/%s", spool_dir, unqualified_hostname, ERR_FILE); log_state_set_log_file(execd_messages_file); sprintf(execd_spool_dir, "%s/%s", spool_dir, unqualified_hostname); DPRINTF(("Making directories----------------------------\n")); sge_mkdir(EXEC_DIR, 0775, 1, 0); sge_mkdir(JOB_DIR, 0775, 1, 0); sge_mkdir(ACTIVE_DIR, 0775, 1, 0); #if defined(PLPA_LINUX) || defined(SOLARIS86) || defined(SOLARISAMD64) /* initialize processor topology */ if (initialize_topology() != true) { DPRINTF(("Couldn't initizalize topology-----------------------\n")); } #endif sge_free(&spool_dir); DRETURN_VOID; }
/*----------------------------------------------------------------------------*/ int main(int argc, char **argv) { int heartbeat = 0; int last_heartbeat = 0; int latest_heartbeat = 0; int ret = 0; int delay = 0; time_t now, last; /* const char *cp; */ char err_str[MAX_STRING_SIZE]; char shadowd_pidfile[SGE_PATH_MAX]; dstring ds; char buffer[256]; pid_t shadowd_pid; #if 1 static int check_interval = CHECK_INTERVAL; static int get_active_interval = GET_ACTIVE_INTERVAL; static int delay_time = DELAY_TIME; static int sge_test_heartbeat = 0; char binpath[SGE_PATH_MAX]; char oldqmaster[SGE_PATH_MAX]; char shadow_err_file[SGE_PATH_MAX]; char qmaster_out_file[SGE_PATH_MAX]; #endif lList *alp = NULL; sge_gdi_ctx_class_t *ctx = NULL; DENTER_MAIN(TOP_LAYER, "sge_shadowd"); sge_dstring_init(&ds, buffer, sizeof(buffer)); /* initialize recovery control variables */ { char *s; int val; if ((s=getenv("SGE_CHECK_INTERVAL")) && sscanf(s, "%d", &val) == 1) check_interval = val; if ((s=getenv("SGE_GET_ACTIVE_INTERVAL")) && sscanf(s, "%d", &val) == 1) get_active_interval = val; if ((s=getenv("SGE_DELAY_TIME")) && sscanf(s, "%d", &val) == 1) delay_time = val; if ((s=getenv("SGE_TEST_HEARTBEAT_TIMEOUT")) && sscanf(s, "%d", &val) == 1) sge_test_heartbeat = val; } /* This needs a better solution */ umask(022); #ifdef __SGE_COMPILE_WITH_GETTEXT__ /* init language output for gettext() , it will use the right language */ sge_init_language_func((gettext_func_type) gettext, (setlocale_func_type) setlocale, (bindtextdomain_func_type) bindtextdomain, (textdomain_func_type) textdomain); sge_init_language(NULL,NULL); #endif /* __SGE_COMPILE_WITH_GETTEXT__ */ log_state_set_log_file(TMP_ERR_FILE_SHADOWD); if (sge_setup2(&ctx, SHADOWD, MAIN_THREAD, &alp, false) != AE_OK) { answer_list_output(&alp); SGE_EXIT((void**)&ctx, 1); } /* AA: TODO: change this */ ctx->set_exit_func(ctx, shadowd_exit_func); sge_setup_sig_handlers(SHADOWD); #if defined(SOLARIS) /* Init shared SMF libs if necessary */ if (sge_smf_used() == 1 && sge_smf_init_libs() != 0) { SGE_EXIT((void**)&ctx, 1); } #endif if (ctx->get_qmaster_spool_dir(ctx) != NULL) { char *shadowd_name = SGE_SHADOWD; /* is there a running shadowd on this host (with unqualified name) */ sprintf(shadowd_pidfile, "%s/"SHADOWD_PID_FILE, ctx->get_qmaster_spool_dir(ctx), ctx->get_unqualified_hostname(ctx)); DPRINTF(("pidfilename: %s\n", shadowd_pidfile)); if ((shadowd_pid = sge_readpid(shadowd_pidfile))) { DPRINTF(("shadowd_pid: "sge_U32CFormat"\n", sge_u32c(shadowd_pid))); if (!sge_checkprog(shadowd_pid, shadowd_name, PSCMD)) { CRITICAL((SGE_EVENT, MSG_SHADOWD_FOUNDRUNNINGSHADOWDWITHPIDXNOTSTARTING_I, (int) shadowd_pid)); SGE_EXIT((void**)&ctx, 1); } } ctx->prepare_enroll(ctx); /* is there a running shadowd on this host (with aliased name) */ sprintf(shadowd_pidfile, "%s/"SHADOWD_PID_FILE, ctx->get_qmaster_spool_dir(ctx), ctx->get_qualified_hostname(ctx)); DPRINTF(("pidfilename: %s\n", shadowd_pidfile)); if ((shadowd_pid = sge_readpid(shadowd_pidfile))) { DPRINTF(("shadowd_pid: "sge_U32CFormat"\n", sge_u32c(shadowd_pid))); if (!sge_checkprog(shadowd_pid, shadowd_name, PSCMD)) { CRITICAL((SGE_EVENT, MSG_SHADOWD_FOUNDRUNNINGSHADOWDWITHPIDXNOTSTARTING_I, (int) shadowd_pid)); SGE_EXIT((void**)&ctx, 1); } } } else { ctx->prepare_enroll(ctx); } if (parse_cmdline_shadowd(argc, argv) == 1) { SGE_EXIT((void**)&ctx, 0); } if (ctx->get_qmaster_spool_dir(ctx) == NULL) { CRITICAL((SGE_EVENT, MSG_SHADOWD_CANTREADQMASTERSPOOLDIRFROMX_S, ctx->get_bootstrap_file(ctx))); SGE_EXIT((void**)&ctx, 1); } if (chdir(ctx->get_qmaster_spool_dir(ctx))) { CRITICAL((SGE_EVENT, MSG_SHADOWD_CANTCHANGETOQMASTERSPOOLDIRX_S, ctx->get_qmaster_spool_dir(ctx))); SGE_EXIT((void**)&ctx, 1); } if (sge_set_admin_username(ctx->get_admin_user(ctx), err_str)) { CRITICAL((SGE_EVENT, SFNMAX, err_str)); SGE_EXIT((void**)&ctx, 1); } if (sge_switch2admin_user()) { CRITICAL((SGE_EVENT, SFNMAX, MSG_SHADOWD_CANTSWITCHTOADMIN_USER)); SGE_EXIT((void**)&ctx, 1); } sprintf(shadow_err_file, "messages_shadowd.%s", ctx->get_unqualified_hostname(ctx)); sprintf(qmaster_out_file, "messages_qmaster.%s", ctx->get_unqualified_hostname(ctx)); sge_copy_append(TMP_ERR_FILE_SHADOWD, shadow_err_file, SGE_MODE_APPEND); unlink(TMP_ERR_FILE_SHADOWD); log_state_set_log_as_admin_user(1); log_state_set_log_file(shadow_err_file); { int* tmp_fd_array = NULL; unsigned long tmp_fd_count = 0; if (cl_com_set_handle_fds(cl_com_get_handle(prognames[SHADOWD] ,0), &tmp_fd_array, &tmp_fd_count) == CL_RETVAL_OK) { sge_daemonize(tmp_fd_array, tmp_fd_count, ctx); if (tmp_fd_array != NULL) { sge_free(&tmp_fd_array); } } else { sge_daemonize(NULL, 0, ctx); } } /* shadowd pid file will contain aliased name */ sge_write_pid(shadowd_pidfile); starting_up(); sge_setup_sig_handlers(SHADOWD); last_heartbeat = get_qmaster_heartbeat(QMASTER_HEARTBEAT_FILE, 30); last = (time_t) sge_get_gmt(); /* set time of last check time */ delay = 0; while (!shut_me_down) { sleep(check_interval); /* get current heartbeat file content */ heartbeat = get_qmaster_heartbeat(QMASTER_HEARTBEAT_FILE, 30); now = (time_t) sge_get_gmt(); /* Only check when we could read the heartbeat file at least two times * (last_heartbeat and heartbeat) without error */ if (last_heartbeat > 0 && heartbeat > 0) { /* * OK we have to heartbeat entries to check. Check times ... * now = current time * last = last check time */ if ( (now - last) >= (get_active_interval + delay) ) { delay = 0; if (last_heartbeat == heartbeat) { DPRINTF(("heartbeat not changed since seconds: "sge_U32CFormat"\n", sge_u32c(now - last))); delay = delay_time; /* set delay time */ /* * check if we are a possible new qmaster host (lock file of qmaster active, etc.) */ ret = check_if_valid_shadow(binpath, oldqmaster, ctx->get_act_qmaster_file(ctx), ctx->get_shadow_master_file(ctx), ctx->get_qualified_hostname(ctx), ctx->get_binary_path(ctx)); if (ret == 0) { /* we can start a qmaster on this host */ if (qmaster_lock(QMASTER_LOCK_FILE)) { ERROR((SGE_EVENT, SFNMAX, MSG_SHADOWD_FAILEDTOLOCKQMASTERSOMBODYWASFASTER)); } else { int out, err; /* still the old qmaster name in act_qmaster file and still the old heartbeat */ latest_heartbeat = get_qmaster_heartbeat( QMASTER_HEARTBEAT_FILE, 30); /* TODO: what do we when there is a timeout ??? */ DPRINTF(("old qmaster name in act_qmaster and old heartbeat\n")); if (!compare_qmaster_names(ctx->get_act_qmaster_file(ctx), oldqmaster) && !shadowd_is_old_master_enrolled(sge_test_heartbeat, sge_get_qmaster_port(NULL), oldqmaster) && (latest_heartbeat == heartbeat)) { char qmaster_name[256]; strcpy(qmaster_name, SGE_PREFIX); strcat(qmaster_name, prognames[QMASTER]); DPRINTF(("qmaster_name: "SFN"\n", qmaster_name)); /* * open logfile as admin user for initial qmaster/schedd * startup messages */ out = SGE_OPEN3(qmaster_out_file, O_CREAT|O_WRONLY|O_APPEND, 0644); err = out; if (out == -1) { /* * First priority is the master restart * => ignore this error */ out = 1; err = 2; } sge_switch2start_user(); ret = startprog(out, err, NULL, binpath, qmaster_name, NULL); sge_switch2admin_user(); if (ret) { ERROR((SGE_EVENT, SFNMAX, MSG_SHADOWD_CANTSTARTQMASTER)); } close(out); } else { qmaster_unlock(QMASTER_LOCK_FILE); } } } else { if (ret == -1) { /* just log the more important failures */ WARNING((SGE_EVENT, MSG_SHADOWD_DELAYINGSHADOWFUNCFORXSECONDS_U, sge_u32c(delay) )); } } } /* Begin a new interval, set timers and hearbeat to current values */ last = now; last_heartbeat = heartbeat; } } else { if (last_heartbeat < 0 || heartbeat < 0) { /* There was an error reading heartbeat or last_heartbeat */ DPRINTF(("can't read heartbeat file. last_heartbeat="sge_U32CFormat", heartbeat="sge_U32CFormat"\n", sge_u32c(last_heartbeat), sge_u32c(heartbeat))); } else { DPRINTF(("have to read the heartbeat file twice to check time differences\n")); } } } sge_shutdown((void**)&ctx, 0); DRETURN(EXIT_SUCCESS); }