int main(int argc, char **argv) { lList *pcmdline = NULL; lList *alp = NULL; sge_gdi_ctx_class_t *ctx = NULL; lList *ar_lp = NULL; lListElem *ar = NULL; DENTER_MAIN(TOP_LAYER, "qrsub"); /* Set up the program information name */ sge_setup_sig_handlers(QRSUB); log_state_set_log_gui(1); if (sge_gdi2_setup(&ctx, QRSUB, MAIN_THREAD, &alp) != AE_OK) { answer_list_output(&alp); goto error_exit; } /* ** stage 1 of commandline parsing */ { dstring file = DSTRING_INIT; const char *user = ctx->get_username(ctx); const char *cell_root = ctx->get_cell_root(ctx); /* arguments from SGE_ROOT/common/sge_ar_request file */ get_root_file_path(&file, cell_root, SGE_COMMON_DEF_AR_REQ_FILE); if ((alp = parse_script_file(QRSUB, sge_dstring_get_string(&file), "", &pcmdline, environ, FLG_HIGHER_PRIOR | FLG_IGN_NO_FILE)) == NULL) { /* arguments from $HOME/.sge_ar_request file */ if (get_user_home_file_path(&file, SGE_HOME_DEF_AR_REQ_FILE, user, &alp)) { lFreeList(&alp); alp = parse_script_file(QRSUB, sge_dstring_get_string(&file), "", &pcmdline, environ, FLG_HIGHER_PRIOR | FLG_IGN_NO_FILE); } } sge_dstring_free(&file); if (alp) { answer_list_output(&alp); lFreeList(&pcmdline); goto error_exit; } } alp = cull_parse_cmdline(QRSUB, argv+1, environ, &pcmdline, FLG_USE_PSEUDOS); if (answer_list_print_err_warn(&alp, NULL, "qrsub: ", MSG_WARNING) > 0) { lFreeList(&pcmdline); goto error_exit; } if (!pcmdline) { /* no command line option is present: print help to stderr */ sge_usage(QRSUB, stderr); fprintf(stderr, "%s\n", MSG_PARSE_NOOPTIONARGUMENT); goto error_exit; } /* ** stage 2 of command line parsing */ ar = lCreateElem(AR_Type); if (!sge_parse_qrsub(ctx, pcmdline, &alp, &ar)) { answer_list_output(&alp); lFreeList(&pcmdline); goto error_exit; } ar_lp = lCreateList(NULL, AR_Type); lAppendElem(ar_lp, ar); alp = ctx->gdi(ctx, SGE_AR_LIST, SGE_GDI_ADD | SGE_GDI_RETURN_NEW_VERSION, &ar_lp, NULL, NULL); lFreeList(&ar_lp); answer_list_on_error_print_or_exit(&alp, stdout); if (answer_list_has_error(&alp)) { sge_gdi2_shutdown((void**)&ctx); sge_prof_cleanup(); if (answer_list_has_status(&alp, STATUS_NOTOK_DOAGAIN)) { DRETURN(25); } else { DRETURN(1); } } sge_gdi2_shutdown((void**)&ctx); sge_prof_cleanup(); DRETURN(0); error_exit: sge_gdi2_shutdown((void**)&ctx); sge_prof_cleanup(); SGE_EXIT((void**)&ctx, 1); DRETURN(1); }
int main(int argc, char *argv[]) { qevent_options enabled_options; dstring errors = DSTRING_INIT; int i, gdi_setup; lList *alp = NULL; sge_gdi_ctx_class_t *ctx = NULL; sge_evc_class_t *evc = NULL; DENTER_MAIN(TOP_LAYER, "qevent"); /* sge_mt_init(); */ /* dump pid to file */ qevent_dump_pid_file(); /* parse command line */ enabled_options.error_message = &errors; qevent_set_option_struct(&enabled_options); qevent_parse_command_line(argc, argv, &enabled_options); /* check if help option is set */ if (enabled_options.help_option) { qevent_show_usage(); sge_dstring_free(enabled_options.error_message); SGE_EXIT((void**)&ctx, 0); } /* are there command line parsing errors ? */ if (sge_dstring_get_string(enabled_options.error_message)) { ERROR((SGE_EVENT, "%s", sge_dstring_get_string(enabled_options.error_message) )); qevent_show_usage(); sge_dstring_free(enabled_options.error_message); SGE_EXIT((void**)&ctx, 1); } log_state_set_log_gui(1); sge_setup_sig_handlers(QEVENT); /* setup event client */ gdi_setup = sge_gdi2_setup(&ctx, QEVENT, MAIN_THREAD, &alp); if (gdi_setup != AE_OK) { answer_list_output(&alp); sge_dstring_free(enabled_options.error_message); SGE_EXIT((void**)&ctx, 1); } /* TODO: how is the memory we allocate here released ???, SGE_EXIT doesn't */ if (false == sge_gdi2_evc_setup(&evc, ctx, EV_ID_ANY, &alp, NULL)) { answer_list_output(&alp); sge_dstring_free(enabled_options.error_message); SGE_EXIT((void**)&ctx, 1); } /* ok, start over ... */ /* check for testsuite option */ if (enabled_options.testsuite_option) { /* only for testsuite */ qevent_testsuite_mode(evc); sge_dstring_free(enabled_options.error_message); SGE_EXIT((void**)&ctx, 0); } /* check for subscribe option */ if (enabled_options.subscribe_option) { /* only for testsuite */ qevent_subscribe_mode(evc); sge_dstring_free(enabled_options.error_message); SGE_EXIT((void**)&ctx, 0); } if (enabled_options.trigger_option_count > 0) { lCondition *where =NULL; lEnumeration *what = NULL; sge_mirror_initialize(evc, EV_ID_ANY, "sge_mirror -trigger", true, NULL, NULL, NULL, NULL, NULL); evc->ec_set_busy_handling(evc, EV_BUSY_UNTIL_ACK); /* put out information about -trigger option */ for (i=0;i<enabled_options.trigger_option_count;i++) { INFO((SGE_EVENT, "trigger script for %s events: %s\n", qevent_get_event_name((enabled_options.trigger_option_events)[i]), (enabled_options.trigger_option_scripts)[i])); switch((enabled_options.trigger_option_events)[i]) { case QEVENT_JB_END: /* build mask for the job structure to contain only the needed elements */ where = NULL; what = lWhat("%T(%I %I %I %I %I %I %I %I)", JB_Type, JB_job_number, JB_ja_tasks, JB_ja_structure, JB_ja_n_h_ids, JB_ja_u_h_ids, JB_ja_s_h_ids,JB_ja_o_h_ids, JB_ja_template); /* register for job events */ sge_mirror_subscribe(evc, SGE_TYPE_JOB, analyze_jatask_event, NULL, NULL, where, what); evc->ec_set_flush(evc, sgeE_JOB_DEL,true, 1); /* the mirror interface registers more events, than we need, thus we free the ones, we do not need */ /* evc->ec_unsubscribe(evc, sgeE_JOB_LIST); */ evc->ec_unsubscribe(evc, sgeE_JOB_MOD); evc->ec_unsubscribe(evc, sgeE_JOB_MOD_SCHED_PRIORITY); evc->ec_unsubscribe(evc, sgeE_JOB_USAGE); evc->ec_unsubscribe(evc, sgeE_JOB_FINAL_USAGE); /* evc->ec_unsubscribe(evc, sgeE_JOB_ADD); */ /* free the what and where mask */ lFreeWhere(&where); lFreeWhat(&what); break; case QEVENT_JB_TASK_END: /* build mask for the job structure to contain only the needed elements */ where = NULL; what = lWhat("%T(%I)", JAT_Type, JAT_status); /* register for JAT events */ sge_mirror_subscribe(evc, SGE_TYPE_JATASK, analyze_jatask_event, NULL, NULL, where, what); evc->ec_set_flush(evc, sgeE_JATASK_DEL,true, 1); /* the mirror interface registers more events, than we need, thus we free the ones, we do not need */ evc->ec_unsubscribe(evc, sgeE_JATASK_ADD); evc->ec_unsubscribe(evc, sgeE_JATASK_MOD); /* free the what and where mask */ lFreeWhere(&where); lFreeWhat(&what); break; } } while(!shut_me_down) { sge_mirror_error error = sge_mirror_process_events(evc); if (error == SGE_EM_TIMEOUT && !shut_me_down ) { sleep(10); continue; } } sge_mirror_shutdown(evc); sge_dstring_free(enabled_options.error_message); sge_prof_cleanup(); SGE_EXIT((void**)&ctx, 0); return 0; } ERROR((SGE_EVENT, "no option selected\n" )); qevent_show_usage(); sge_dstring_free(enabled_options.error_message); sge_prof_cleanup(); SGE_EXIT((void**)&ctx, 1); return 1; }
int main(int argc, char **argv) { int ret = 0; lList *pcmdline = NULL; lList *answer_list = NULL; sge_gdi_ctx_class_t *ctx = NULL; qrstat_env_t qrstat_env; DENTER_MAIN(TOP_LAYER, "qrsub"); /* Set up the program information name */ sge_setup_sig_handlers(QRSTAT); log_state_set_log_gui(1); if (sge_gdi2_setup(&ctx, QRSTAT, MAIN_THREAD, &answer_list) != AE_OK) { answer_list_output(&answer_list); goto error_exit; } qrstat_filter_init(&qrstat_env); qrstat_filter_set_ctx(&qrstat_env, ctx); /* * stage 1: commandline parsing */ { dstring file = DSTRING_INIT; const char *user = ctx->get_username(ctx); const char *cell_root = ctx->get_cell_root(ctx); /* arguments from SGE_ROOT/common/sge_qrstat file */ get_root_file_path(&file, cell_root, SGE_COMMON_DEF_QRSTAT_FILE); if (sge_parse_from_file_qrstat(sge_dstring_get_string(&file), &pcmdline, &answer_list) == true) { /* arguments from $HOME/.sge_qrstat file */ if (get_user_home_file_path(&file, SGE_HOME_DEF_QRSTAT_FILE, user, &answer_list)) { sge_parse_from_file_qrstat(sge_dstring_get_string(&file), &pcmdline, &answer_list); } } sge_dstring_free(&file); if (answer_list) { answer_list_output(&answer_list); lFreeList(&pcmdline); sge_prof_cleanup(); SGE_EXIT((void**)&ctx, 1); } } answer_list = cull_parse_cmdline(QRSTAT, argv+1, environ, &pcmdline, FLG_USE_PSEUDOS); if (answer_list != NULL) { answer_list_output(&answer_list); lFreeList(&pcmdline); goto error_exit; } /* * stage 2: evalutate switches and modify qrstat_env */ if (!sge_parse_qrstat(ctx, &answer_list, &qrstat_env, &pcmdline)) { answer_list_output(&answer_list); lFreeList(&pcmdline); goto error_exit; } /* * stage 3: fetch data from master */ { answer_list = ctx->gdi(ctx, SGE_AR_LIST, SGE_GDI_GET, &qrstat_env.ar_list, qrstat_env.where_AR_Type, qrstat_env.what_AR_Type, false); if (answer_list_has_error(&answer_list)) { answer_list_output(&answer_list); goto error_exit; } } /* * stage 4: create output in correct format */ { qrstat_report_handler_t *handler = NULL; if (qrstat_env.is_xml) { handler = qrstat_create_report_handler_xml(&qrstat_env, &answer_list); } else { handler = qrstat_create_report_handler_stdout(&qrstat_env, &answer_list); } if (!qrstat_print(&answer_list, handler, &qrstat_env)) { ret = 1; } if (qrstat_env.is_xml) { qrstat_destroy_report_handler_xml(&handler, &answer_list); } else { qrstat_destroy_report_handler_stdout(&handler, &answer_list); } } sge_gdi2_shutdown((void**)&ctx); sge_prof_cleanup(); DRETURN(ret); error_exit: sge_gdi2_shutdown((void**)&ctx); sge_prof_cleanup(); SGE_EXIT((void**)&ctx, 1); DRETURN(1); }
int main(int argc, char **argv) { lList *opts_cmdline = NULL; lList *opts_defaults = NULL; lList *opts_scriptfile = NULL; lList *opts_all = NULL; lListElem *job = NULL; lList *alp = NULL; lListElem *ep; int exit_status = 0; int just_verify; int tmp_ret; int wait_for_job = 0, is_immediate = 0; dstring session_key_out = DSTRING_INIT; dstring diag = DSTRING_INIT; dstring jobid = DSTRING_INIT; u_long32 start, end, step; u_long32 num_tasks; int count, stat; char *jobid_string = NULL; bool has_terse; drmaa_attr_values_t *jobids = NULL; u_long32 prog_number = 0; u_long32 myuid = 0; const char *sge_root = NULL; const char *cell_root = NULL; const char *username = NULL; const char *qualified_hostname = NULL; const char *unqualified_hostname = NULL; const char *mastername = NULL; DENTER_MAIN(TOP_LAYER, "qsub"); prof_mt_init(); /* Set up the program information name */ sge_setup_sig_handlers(QSUB); DPRINTF(("Initializing JAPI\n")); if (japi_init(NULL, NULL, NULL, QSUB, false, NULL, &diag) != DRMAA_ERRNO_SUCCESS) { fprintf(stderr, "\n"); fprintf(stderr, MSG_QSUB_COULDNOTINITIALIZEENV_S, sge_dstring_get_string(&diag)); fprintf(stderr, "\n"); DEXIT; SGE_EXIT((void**)&ctx, 1); } prog_number = ctx->get_who(ctx); myuid = ctx->get_uid(ctx); sge_root = ctx->get_sge_root(ctx); cell_root = ctx->get_cell_root(ctx); username = ctx->get_username(ctx); qualified_hostname = ctx->get_qualified_hostname(ctx); unqualified_hostname = ctx->get_unqualified_hostname(ctx); mastername = ctx->get_master(ctx, false); /* * read switches from the various defaults files */ opt_list_append_opts_from_default_files(prog_number, cell_root, username, &opts_defaults, &alp, environ); tmp_ret = answer_list_print_err_warn(&alp, NULL, NULL, MSG_WARNING); if (tmp_ret > 0) { DEXIT; SGE_EXIT((void**)&ctx, tmp_ret); } /* * append the commandline switches to the list */ opt_list_append_opts_from_qsub_cmdline(prog_number, &opts_cmdline, &alp, argv + 1, environ); tmp_ret = answer_list_print_err_warn(&alp, NULL, "qsub: ", MSG_QSUB_WARNING_S); if (tmp_ret > 0) { DEXIT; SGE_EXIT((void**)&ctx, tmp_ret); } /* * show usage if -help was in commandline */ if (opt_list_has_X(opts_cmdline, "-help")) { sge_usage(QSUB, stdout); DEXIT; SGE_EXIT((void**)&ctx, 0); } /* * We will only read commandline options from scripfile if the script * itself should not be handled as binary */ if (opt_list_is_X_true(opts_cmdline, "-b") || (!opt_list_has_X(opts_cmdline, "-b") && opt_list_is_X_true(opts_defaults, "-b"))) { DPRINTF(("Skipping options from script due to -b option\n")); } else { opt_list_append_opts_from_script(prog_number, &opts_scriptfile, &alp, opts_cmdline, environ); tmp_ret = answer_list_print_err_warn(&alp, NULL, MSG_QSUB_COULDNOTREADSCRIPT_S, MSG_WARNING); if (tmp_ret > 0) { DEXIT; SGE_EXIT((void**)&ctx, tmp_ret); } } /* * Merge all commandline options and interprete them */ opt_list_merge_command_lines(&opts_all, &opts_defaults, &opts_scriptfile, &opts_cmdline); /* * Check if -terse is requested */ has_terse = opt_list_has_X(opts_all, "-terse"); /* If "-sync y" is set, wait for the job to end. */ /* Remove all -sync switches since cull_parse_job_parameter() * doesn't know what to do with them. */ while ((ep = lGetElemStr(opts_all, SPA_switch, "-sync"))) { if (lGetInt(ep, SPA_argval_lIntT) == TRUE) { wait_for_job = 1; } lRemoveElem(opts_all, &ep); } if (wait_for_job) { DPRINTF(("Wait for job end\n")); } alp = cull_parse_job_parameter(myuid, username, cell_root, unqualified_hostname, qualified_hostname, opts_all, &job); tmp_ret = answer_list_print_err_warn(&alp, NULL, "qsub: ", MSG_WARNING); if (tmp_ret > 0) { DEXIT; SGE_EXIT((void**)&ctx, tmp_ret); } if (set_sec_cred(sge_root, mastername, job, &alp) != 0) { answer_list_output(&alp); DEXIT; SGE_EXIT((void**)&ctx, 1); } /* Check if job is immediate */ is_immediate = (int)JOB_TYPE_IS_IMMEDIATE(lGetUlong(job, JB_type)); DPRINTF(("Job is%s immediate\n", is_immediate ? "" : " not")); DPRINTF(("Everything ok\n")); if (lGetUlong(job, JB_verify)) { cull_show_job(job, 0, false); DEXIT; SGE_EXIT((void**)&ctx, 0); } if (is_immediate || wait_for_job) { pthread_t sigt; qsub_setup_sig_handlers(); if (pthread_create(&sigt, NULL, sig_thread, (void *)NULL) != 0) { fprintf(stderr, "\n"); fprintf(stderr, MSG_QSUB_COULDNOTINITIALIZEENV_S, " error preparing signal handling thread"); fprintf(stderr, "\n"); exit_status = 1; goto Error; } if (japi_enable_job_wait(username, unqualified_hostname, NULL, &session_key_out, error_handler, &diag) == DRMAA_ERRNO_DRM_COMMUNICATION_FAILURE) { const char *msg = sge_dstring_get_string(&diag); fprintf(stderr, "\n"); fprintf(stderr, MSG_QSUB_COULDNOTINITIALIZEENV_S, msg?msg:" error starting event client thread"); fprintf(stderr, "\n"); exit_status = 1; goto Error; } } job_get_submit_task_ids(job, &start, &end, &step); num_tasks = (end - start) / step + 1; if (num_tasks > 1) { int error = japi_run_bulk_jobs(&jobids, &job, start, end, step, &diag); if (error != DRMAA_ERRNO_SUCCESS) { /* No active session here means that japi_enable_job_wait() was * interrupted by the signal handler, in which case we just break out * quietly. */ if (error != DRMAA_ERRNO_NO_ACTIVE_SESSION) { fprintf(stderr, MSG_QSUB_COULDNOTRUNJOB_S, sge_dstring_get_string(&diag)); fprintf(stderr, "\n"); } /* BUGFIX: Issuezilla #1013 * To quickly fix this issue, I'm mapping the JAPI/DRMAA error code * back into a GDI error code. This is the easy solution. The * correct solution would be to address issue #859, presumably by * having JAPI reuse the GDI error codes instead of the JAPI error * codes. */ if (error == DRMAA_ERRNO_TRY_LATER) { exit_status = STATUS_NOTOK_DOAGAIN; } else { exit_status = 1; } goto Error; } DPRINTF(("job id is: %ld\n", jobids->it.ji.jobid)); jobid_string = get_bulk_jobid_string((long)jobids->it.ji.jobid, start, end, step); } else if (num_tasks == 1) { int error = japi_run_job(&jobid, &job, &diag); if (error != DRMAA_ERRNO_SUCCESS) { if (error != DRMAA_ERRNO_NO_ACTIVE_SESSION) { fprintf(stderr, MSG_QSUB_COULDNOTRUNJOB_S, sge_dstring_get_string(&diag)); fprintf(stderr, "\n"); } /* BUGFIX: Issuezilla #1013 * To quickly fix this issue, I'm mapping the JAPI/DRMAA error code * back into a GDI error code. This is the easy solution. The * correct solution would be to address issue #859, presumably by * having JAPI reuse the GDI error codes instead of the DRMAA error * codes. */ if (error == DRMAA_ERRNO_TRY_LATER) { exit_status = STATUS_NOTOK_DOAGAIN; } else { exit_status = 1; } goto Error; } jobid_string = strdup(sge_dstring_get_string(&jobid)); DPRINTF(("job id is: %s\n", jobid_string)); sge_dstring_free(&jobid); } else { fprintf(stderr, MSG_QSUB_COULDNOTRUNJOB_S, "invalid task structure"); fprintf(stderr, "\n"); exit_status = 1; goto Error; } /* only success message is printed to stdout */ just_verify = (lGetUlong(job, JB_verify_suitable_queues)==JUST_VERIFY || lGetUlong(job, JB_verify_suitable_queues)==POKE_VERIFY); DPRINTF(("Just verifying job\n")); if (!just_verify) { const char *output = sge_dstring_get_string(&diag); /* print the tersed output */ if (has_terse) { printf("%s", jobid_string); } else if (output != NULL) { printf("%s", output); } else { printf(MSG_QSUB_YOURJOBHASBEENSUBMITTED_SS, jobid_string, lGetString(job, JB_job_name)); } printf("\n"); } else { printf(MSG_JOB_VERIFYFOUNDQ); printf("\n"); } if ((wait_for_job || is_immediate) && !just_verify) { int event; if (is_immediate) { fprintf(stderr, "%s\n", MSG_QSUB_WAITINGFORIMMEDIATEJOBTOBESCHEDULED); /* We only need to wait for the first task to be scheduled to be able * to say that the job is running. */ tmp_ret = japi_wait(DRMAA_JOB_IDS_SESSION_ANY, &jobid, &stat, DRMAA_TIMEOUT_WAIT_FOREVER, JAPI_JOB_START, &event, NULL, &diag); if ((tmp_ret == DRMAA_ERRNO_SUCCESS) && (event == JAPI_JOB_START)) { fprintf(stderr, "\n"); fprintf(stderr, MSG_QSUB_YOURIMMEDIATEJOBXHASBEENSUCCESSFULLYSCHEDULED_S, jobid_string); fprintf(stderr, "\n"); } /* A job finish event here means that the job was rejected. */ else if ((tmp_ret == DRMAA_ERRNO_SUCCESS) && (event == JAPI_JOB_FINISH)) { fprintf(stderr, "\n%s\n", MSG_QSUB_YOURQSUBREQUESTCOULDNOTBESCHEDULEDDTRYLATER); exit_status = 1; goto Error; } else { /* Since we told japi_wait to wait forever, we know that if it gets * a timeout, it's because it's been interrupted to exit, in which * case we don't complain. Same for no active session. */ if ((tmp_ret != DRMAA_ERRNO_EXIT_TIMEOUT) && (tmp_ret != DRMAA_ERRNO_NO_ACTIVE_SESSION)) { fprintf(stderr, "\n"); fprintf(stderr, MSG_QSUB_COULDNOTWAITFORJOB_S, sge_dstring_get_string(&diag)); fprintf(stderr, "\n"); } exit_status = 1; goto Error; } } if (wait_for_job) { /* Rather than using japi_synchronize on ALL for bulk jobs, we use * japi_wait on ANY num_tasks times because with synchronize, we would * have to wait for all the tasks to finish before we know if any * finished. */ for (count = 0; count < num_tasks; count++) { /* Since there's only one running job in the session, we can just * wait for ANY. */ if ((tmp_ret = japi_wait(DRMAA_JOB_IDS_SESSION_ANY, &jobid, &stat, DRMAA_TIMEOUT_WAIT_FOREVER, JAPI_JOB_FINISH, &event, NULL, &diag)) != DRMAA_ERRNO_SUCCESS) { if ((tmp_ret != DRMAA_ERRNO_EXIT_TIMEOUT) && (tmp_ret != DRMAA_ERRNO_NO_ACTIVE_SESSION)) { fprintf(stderr, "\n"); fprintf(stderr, MSG_QSUB_COULDNOTWAITFORJOB_S, sge_dstring_get_string(&diag)); fprintf(stderr, "\n"); } exit_status = 1; goto Error; } /* report how job finished */ /* If the job is an array job, use the first non-zero exit code as * the exit code for qsub. */ if (exit_status == 0) { exit_status = report_exit_status(stat, sge_dstring_get_string(&jobid)); } /* If we've already found a non-zero exit code, just print the exit * info for the task. */ else { report_exit_status(stat, sge_dstring_get_string(&jobid)); } } } } Error: FREE(jobid_string); lFreeList(&alp); lFreeList(&opts_all); if ((tmp_ret = japi_exit(JAPI_EXIT_NO_FLAG, &diag)) != DRMAA_ERRNO_SUCCESS) { if (tmp_ret != DRMAA_ERRNO_NO_ACTIVE_SESSION) { fprintf(stderr, "\n"); fprintf(stderr, MSG_QSUB_COULDNOTFINALIZEENV_S, sge_dstring_get_string(&diag)); fprintf(stderr, "\n"); } else { struct timespec ts; /* We know that if we get a DRMAA_ERRNO_NO_ACTIVE_SESSION here, it's * because the signal handler thread called japi_exit(). We know this * because if the call to japi_init() fails, we just exit directly. * If the call to japi_init() succeeds, then we have an active session, * so coming here because of an error would not result in the * DRMAA_ERRNO_NO_ACTIVE_SESSION error. */ DPRINTF(("Sleeping for 15 seconds to wait for the exit to finish.\n")); sge_relative_timespec(15, &ts); sge_mutex_lock("qsub_exit_mutex", SGE_FUNC, __LINE__, &exit_mutex); while (!exited) { if (pthread_cond_timedwait(&exit_cv, &exit_mutex, &ts) == ETIMEDOUT) { DPRINTF(("Exit has not finished after 15 seconds. Exiting.\n")); break; } } sge_mutex_unlock("qsub_exit_mutex", SGE_FUNC, __LINE__, &exit_mutex); } } sge_prof_cleanup(); /* This is an exit() instead of an SGE_EXIT() because when the qmaster is * supended, SGE_EXIT() hangs. */ exit(exit_status); DEXIT; return exit_status; }
/*----------------------------------------------------------------------------*/ int main(int argc, char **argv) { int heartbeat = 0; int last_heartbeat = 0; int latest_heartbeat = 0; int ret = 0; int delay = 0; time_t now, last; /* const char *cp; */ char err_str[MAX_STRING_SIZE]; char shadowd_pidfile[SGE_PATH_MAX]; dstring ds; char buffer[256]; pid_t shadowd_pid; #if 1 static int check_interval = CHECK_INTERVAL; static int get_active_interval = GET_ACTIVE_INTERVAL; static int delay_time = DELAY_TIME; static int sge_test_heartbeat = 0; char binpath[SGE_PATH_MAX]; char oldqmaster[SGE_PATH_MAX]; char shadow_err_file[SGE_PATH_MAX]; char qmaster_out_file[SGE_PATH_MAX]; #endif lList *alp = NULL; sge_gdi_ctx_class_t *ctx = NULL; DENTER_MAIN(TOP_LAYER, "sge_shadowd"); sge_dstring_init(&ds, buffer, sizeof(buffer)); /* initialize recovery control variables */ { char *s; int val; if ((s=getenv("SGE_CHECK_INTERVAL")) && sscanf(s, "%d", &val) == 1) check_interval = val; if ((s=getenv("SGE_GET_ACTIVE_INTERVAL")) && sscanf(s, "%d", &val) == 1) get_active_interval = val; if ((s=getenv("SGE_DELAY_TIME")) && sscanf(s, "%d", &val) == 1) delay_time = val; if ((s=getenv("SGE_TEST_HEARTBEAT_TIMEOUT")) && sscanf(s, "%d", &val) == 1) sge_test_heartbeat = val; } /* This needs a better solution */ umask(022); #ifdef __SGE_COMPILE_WITH_GETTEXT__ /* init language output for gettext() , it will use the right language */ sge_init_language_func((gettext_func_type) gettext, (setlocale_func_type) setlocale, (bindtextdomain_func_type) bindtextdomain, (textdomain_func_type) textdomain); sge_init_language(NULL,NULL); #endif /* __SGE_COMPILE_WITH_GETTEXT__ */ log_state_set_log_file(TMP_ERR_FILE_SHADOWD); if (sge_setup2(&ctx, SHADOWD, MAIN_THREAD, &alp, false) != AE_OK) { answer_list_output(&alp); SGE_EXIT((void**)&ctx, 1); } /* AA: TODO: change this */ ctx->set_exit_func(ctx, shadowd_exit_func); sge_setup_sig_handlers(SHADOWD); #if defined(SOLARIS) /* Init shared SMF libs if necessary */ if (sge_smf_used() == 1 && sge_smf_init_libs() != 0) { SGE_EXIT((void**)&ctx, 1); } #endif if (ctx->get_qmaster_spool_dir(ctx) != NULL) { char *shadowd_name = SGE_SHADOWD; /* is there a running shadowd on this host (with unqualified name) */ sprintf(shadowd_pidfile, "%s/"SHADOWD_PID_FILE, ctx->get_qmaster_spool_dir(ctx), ctx->get_unqualified_hostname(ctx)); DPRINTF(("pidfilename: %s\n", shadowd_pidfile)); if ((shadowd_pid = sge_readpid(shadowd_pidfile))) { DPRINTF(("shadowd_pid: "sge_U32CFormat"\n", sge_u32c(shadowd_pid))); if (!sge_checkprog(shadowd_pid, shadowd_name, PSCMD)) { CRITICAL((SGE_EVENT, MSG_SHADOWD_FOUNDRUNNINGSHADOWDWITHPIDXNOTSTARTING_I, (int) shadowd_pid)); SGE_EXIT((void**)&ctx, 1); } } ctx->prepare_enroll(ctx); /* is there a running shadowd on this host (with aliased name) */ sprintf(shadowd_pidfile, "%s/"SHADOWD_PID_FILE, ctx->get_qmaster_spool_dir(ctx), ctx->get_qualified_hostname(ctx)); DPRINTF(("pidfilename: %s\n", shadowd_pidfile)); if ((shadowd_pid = sge_readpid(shadowd_pidfile))) { DPRINTF(("shadowd_pid: "sge_U32CFormat"\n", sge_u32c(shadowd_pid))); if (!sge_checkprog(shadowd_pid, shadowd_name, PSCMD)) { CRITICAL((SGE_EVENT, MSG_SHADOWD_FOUNDRUNNINGSHADOWDWITHPIDXNOTSTARTING_I, (int) shadowd_pid)); SGE_EXIT((void**)&ctx, 1); } } } else { ctx->prepare_enroll(ctx); } if (parse_cmdline_shadowd(argc, argv) == 1) { SGE_EXIT((void**)&ctx, 0); } if (ctx->get_qmaster_spool_dir(ctx) == NULL) { CRITICAL((SGE_EVENT, MSG_SHADOWD_CANTREADQMASTERSPOOLDIRFROMX_S, ctx->get_bootstrap_file(ctx))); SGE_EXIT((void**)&ctx, 1); } if (chdir(ctx->get_qmaster_spool_dir(ctx))) { CRITICAL((SGE_EVENT, MSG_SHADOWD_CANTCHANGETOQMASTERSPOOLDIRX_S, ctx->get_qmaster_spool_dir(ctx))); SGE_EXIT((void**)&ctx, 1); } if (sge_set_admin_username(ctx->get_admin_user(ctx), err_str)) { CRITICAL((SGE_EVENT, SFNMAX, err_str)); SGE_EXIT((void**)&ctx, 1); } if (sge_switch2admin_user()) { CRITICAL((SGE_EVENT, SFNMAX, MSG_SHADOWD_CANTSWITCHTOADMIN_USER)); SGE_EXIT((void**)&ctx, 1); } sprintf(shadow_err_file, "messages_shadowd.%s", ctx->get_unqualified_hostname(ctx)); sprintf(qmaster_out_file, "messages_qmaster.%s", ctx->get_unqualified_hostname(ctx)); sge_copy_append(TMP_ERR_FILE_SHADOWD, shadow_err_file, SGE_MODE_APPEND); unlink(TMP_ERR_FILE_SHADOWD); log_state_set_log_as_admin_user(1); log_state_set_log_file(shadow_err_file); { int* tmp_fd_array = NULL; unsigned long tmp_fd_count = 0; if (cl_com_set_handle_fds(cl_com_get_handle(prognames[SHADOWD] ,0), &tmp_fd_array, &tmp_fd_count) == CL_RETVAL_OK) { sge_daemonize(tmp_fd_array, tmp_fd_count, ctx); if (tmp_fd_array != NULL) { sge_free(&tmp_fd_array); } } else { sge_daemonize(NULL, 0, ctx); } } /* shadowd pid file will contain aliased name */ sge_write_pid(shadowd_pidfile); starting_up(); sge_setup_sig_handlers(SHADOWD); last_heartbeat = get_qmaster_heartbeat(QMASTER_HEARTBEAT_FILE, 30); last = (time_t) sge_get_gmt(); /* set time of last check time */ delay = 0; while (!shut_me_down) { sleep(check_interval); /* get current heartbeat file content */ heartbeat = get_qmaster_heartbeat(QMASTER_HEARTBEAT_FILE, 30); now = (time_t) sge_get_gmt(); /* Only check when we could read the heartbeat file at least two times * (last_heartbeat and heartbeat) without error */ if (last_heartbeat > 0 && heartbeat > 0) { /* * OK we have to heartbeat entries to check. Check times ... * now = current time * last = last check time */ if ( (now - last) >= (get_active_interval + delay) ) { delay = 0; if (last_heartbeat == heartbeat) { DPRINTF(("heartbeat not changed since seconds: "sge_U32CFormat"\n", sge_u32c(now - last))); delay = delay_time; /* set delay time */ /* * check if we are a possible new qmaster host (lock file of qmaster active, etc.) */ ret = check_if_valid_shadow(binpath, oldqmaster, ctx->get_act_qmaster_file(ctx), ctx->get_shadow_master_file(ctx), ctx->get_qualified_hostname(ctx), ctx->get_binary_path(ctx)); if (ret == 0) { /* we can start a qmaster on this host */ if (qmaster_lock(QMASTER_LOCK_FILE)) { ERROR((SGE_EVENT, SFNMAX, MSG_SHADOWD_FAILEDTOLOCKQMASTERSOMBODYWASFASTER)); } else { int out, err; /* still the old qmaster name in act_qmaster file and still the old heartbeat */ latest_heartbeat = get_qmaster_heartbeat( QMASTER_HEARTBEAT_FILE, 30); /* TODO: what do we when there is a timeout ??? */ DPRINTF(("old qmaster name in act_qmaster and old heartbeat\n")); if (!compare_qmaster_names(ctx->get_act_qmaster_file(ctx), oldqmaster) && !shadowd_is_old_master_enrolled(sge_test_heartbeat, sge_get_qmaster_port(NULL), oldqmaster) && (latest_heartbeat == heartbeat)) { char qmaster_name[256]; strcpy(qmaster_name, SGE_PREFIX); strcat(qmaster_name, prognames[QMASTER]); DPRINTF(("qmaster_name: "SFN"\n", qmaster_name)); /* * open logfile as admin user for initial qmaster/schedd * startup messages */ out = SGE_OPEN3(qmaster_out_file, O_CREAT|O_WRONLY|O_APPEND, 0644); err = out; if (out == -1) { /* * First priority is the master restart * => ignore this error */ out = 1; err = 2; } sge_switch2start_user(); ret = startprog(out, err, NULL, binpath, qmaster_name, NULL); sge_switch2admin_user(); if (ret) { ERROR((SGE_EVENT, SFNMAX, MSG_SHADOWD_CANTSTARTQMASTER)); } close(out); } else { qmaster_unlock(QMASTER_LOCK_FILE); } } } else { if (ret == -1) { /* just log the more important failures */ WARNING((SGE_EVENT, MSG_SHADOWD_DELAYINGSHADOWFUNCFORXSECONDS_U, sge_u32c(delay) )); } } } /* Begin a new interval, set timers and hearbeat to current values */ last = now; last_heartbeat = heartbeat; } } else { if (last_heartbeat < 0 || heartbeat < 0) { /* There was an error reading heartbeat or last_heartbeat */ DPRINTF(("can't read heartbeat file. last_heartbeat="sge_U32CFormat", heartbeat="sge_U32CFormat"\n", sge_u32c(last_heartbeat), sge_u32c(heartbeat))); } else { DPRINTF(("have to read the heartbeat file twice to check time differences\n")); } } } sge_shutdown((void**)&ctx, 0); DRETURN(EXIT_SUCCESS); }
int main(int argc, char **argv) { int ret = STATUS_OK; lList *alp = NULL; lList *request_list = NULL; lList *cmdline = NULL; lListElem *aep; int all_jobs = 0; int all_users = 0; u_long32 gdi_cmd = SGE_GDI_MOD; int tmp_ret; int me_who; sge_gdi_ctx_class_t *ctx = NULL; DENTER_MAIN(TOP_LAYER, "qalter"); prof_mt_init(); /* ** get command name: qalter or qresub */ if (!strcmp(sge_basename(argv[0], '/'), "qresub")) { DPRINTF(("QRESUB\n")); me_who = QRESUB; } else if (!strcmp(sge_basename(argv[0], '/'), "qhold")) { DPRINTF(("QHOLD\n")); me_who = QHOLD; } else if (!strcmp(sge_basename(argv[0], '/'), "qrls")) { DPRINTF(("QRLS\n")); me_who = QRLS; } else { DPRINTF(("QALTER\n")); me_who = QALTER; } log_state_set_log_gui(1); sge_setup_sig_handlers(me_who); if (sge_gdi2_setup(&ctx, me_who, MAIN_THREAD, &alp) != AE_OK) { answer_list_output(&alp); SGE_EXIT((void**)&ctx, 1); } /* ** begin to work */ opt_list_append_opts_from_qalter_cmdline(me_who, &cmdline, &alp, argv + 1, environ); tmp_ret = answer_list_print_err_warn(&alp, MSG_QALTER, MSG_QALTER, MSG_QALTERWARNING); if (tmp_ret > 0) { SGE_EXIT((void**)&ctx, tmp_ret); } /* handling the case that no command line parameter was specified */ if ((me_who == QHOLD || me_who == QRLS) && lGetNumberOfElem(cmdline) == 1) { /* -h option is set implicitly for QHOLD and QRLS */ sge_usage(me_who, stderr); fprintf(stderr, "%s\n", MSG_PARSE_NOOPTIONARGUMENT); SGE_EXIT((void**)&ctx, 1); } else if ((me_who == QRESUB || me_who == QALTER) && lGetNumberOfElem(cmdline) == 0) { /* qresub and qalter have nothing set */ sge_usage(me_who, stderr); fprintf(stderr, "%s\n", MSG_PARSE_NOOPTIONARGUMENT); SGE_EXIT((void**)&ctx, 1); } else if (opt_list_has_X(cmdline, "-help")) { /* -help was specified */ sge_usage(me_who, stdout); SGE_EXIT((void**)&ctx, 0); } alp = qalter_parse_job_parameter(me_who, cmdline, &request_list, &all_jobs, &all_users); DPRINTF(("all_jobs = %d, all_user = %d\n", all_jobs, all_users)); if (request_list && verify) { /* got a request list containing one element for each job to be modified save jobid all fields contain the same fields so we may use show_job() with the first job in our list The jobid's in our request list get printed before show_job() */ cull_show_job(lFirst(request_list), FLG_QALTER, false); sge_prof_cleanup(); SGE_EXIT((void**)&ctx, 0); } tmp_ret = answer_list_print_err_warn(&alp, NULL, NULL, MSG_WARNING); if (tmp_ret > 0) { SGE_EXIT((void**)&ctx, tmp_ret); } if ((me_who == QALTER) || (me_who == QHOLD) || (me_who == QRLS) ) { DPRINTF(("QALTER\n")); gdi_cmd = SGE_GDI_MOD; } else if (me_who == QRESUB){ DPRINTF(("QRESUB\n")); gdi_cmd = SGE_GDI_COPY; } else { printf("unknown binary name.\n"); SGE_EXIT((void**)&ctx, 1); } if (all_jobs) gdi_cmd |= SGE_GDI_ALL_JOBS; if (all_users) gdi_cmd |= SGE_GDI_ALL_USERS; alp = ctx->gdi(ctx, SGE_JB_LIST, gdi_cmd, &request_list, NULL, NULL); for_each (aep, alp) { printf("%s\n", lGetString(aep, AN_text)); if (ret == STATUS_OK) { ret = lGetUlong(aep, AN_status); } }
/*-------------------------------------------------------------------------*/ int main(int argc, char **argv) { int ret; int my_pid; int ret_val; int printed_points = 0; int max_enroll_tries; static char tmp_err_file_name[SGE_PATH_MAX]; time_t next_prof_output = 0; int execd_exit_state = 0; lList **master_job_list = NULL; sge_gdi_ctx_class_t *ctx = NULL; lList *alp = NULL; DENTER_MAIN(TOP_LAYER, "execd"); #if defined(LINUX) gen_procList (); #endif prof_mt_init(); set_thread_name(pthread_self(),"Execd Thread"); prof_set_level_name(SGE_PROF_CUSTOM1, "Execd Thread", NULL); prof_set_level_name(SGE_PROF_CUSTOM2, "Execd Dispatch", NULL); #ifdef __SGE_COMPILE_WITH_GETTEXT__ /* init language output for gettext() , it will use the right language */ sge_init_language_func((gettext_func_type) gettext, (setlocale_func_type) setlocale, (bindtextdomain_func_type) bindtextdomain, (textdomain_func_type) textdomain); sge_init_language(NULL,NULL); #endif /* __SGE_COMPILE_WITH_GETTEXT__ */ /* This needs a better solution */ umask(022); /* Initialize path for temporary logging until we chdir to spool */ my_pid = getpid(); sprintf(tmp_err_file_name,"%s."sge_U32CFormat"", TMP_ERR_FILE_EXECD, sge_u32c(my_pid)); log_state_set_log_file(tmp_err_file_name); /* exit func for SGE_EXIT() */ sge_sig_handler_in_main_loop = 0; sge_setup_sig_handlers(EXECD); if (sge_setup2(&ctx, EXECD, MAIN_THREAD, &alp, false) != AE_OK) { answer_list_output(&alp); SGE_EXIT((void**)&ctx, 1); } ctx->set_exit_func(ctx, execd_exit_func); #if defined(SOLARIS) /* Init shared SMF libs if necessary */ if (sge_smf_used() == 1 && sge_smf_init_libs() != 0) { SGE_EXIT((void**)&ctx, 1); } #endif /* prepare daemonize */ if (!getenv("SGE_ND")) { sge_daemonize_prepare(ctx); } if ((ret=sge_occupy_first_three())>=0) { CRITICAL((SGE_EVENT, MSG_FILE_REDIRECTFD_I, ret)); SGE_EXIT((void**)&ctx, 1); } lInit(nmv); /* unset XAUTHORITY if set */ if (getenv("XAUTHORITY") != NULL) { sge_unsetenv("XAUTHORITY"); } parse_cmdline_execd(argv); /* exit if we can't get communication handle (bind port) */ max_enroll_tries = 30; while (cl_com_get_handle(prognames[EXECD],1) == NULL) { ctx->prepare_enroll(ctx); max_enroll_tries--; if (max_enroll_tries <= 0 || shut_me_down) { /* exit after 30 seconds */ if (printed_points != 0) { printf("\n"); } CRITICAL((SGE_EVENT, MSG_COM_ERROR)); SGE_EXIT((void**)&ctx, 1); } if (cl_com_get_handle(prognames[EXECD],1) == NULL) { /* sleep when prepare_enroll() failed */ sleep(1); if (max_enroll_tries < 27) { printf("."); printed_points++; fflush(stdout); } } } if (printed_points != 0) { printf("\n"); } /* * now the commlib up and running. Set execd application status function * ( commlib callback function for qping status information response * messages (SIRM) ) */ ret_val = cl_com_set_status_func(sge_execd_application_status); if (ret_val != CL_RETVAL_OK) { ERROR((SGE_EVENT, cl_get_error_text(ret_val)) ); } /* test connection */ { cl_com_SIRM_t* status = NULL; ret_val = cl_commlib_get_endpoint_status(ctx->get_com_handle(ctx), (char *)ctx->get_master(ctx, true), (char*)prognames[QMASTER], 1, &status); if (ret_val != CL_RETVAL_OK) { ERROR((SGE_EVENT, cl_get_error_text(ret_val))); ERROR((SGE_EVENT, MSG_CONF_NOCONFBG)); } cl_com_free_sirm_message(&status); } /* finalize daeamonize */ if (!getenv("SGE_ND")) { sge_daemonize_finalize(ctx); } /* daemonizes if qmaster is unreachable */ sge_setup_sge_execd(ctx, tmp_err_file_name); /* are we using qidle or not */ sge_ls_qidle(mconf_get_use_qidle()); sge_ls_gnu_ls(1); DPRINTF(("use_qidle: %d\n", mconf_get_use_qidle())); /* test load sensor (internal or external) */ { lList *report_list = sge_build_load_report(ctx->get_qualified_hostname(ctx), ctx->get_binary_path(ctx)); lFreeList(&report_list); } /* here we have to wait for qmaster registration */ while (sge_execd_register_at_qmaster(ctx, false) != 0) { if (sge_get_com_error_flag(EXECD, SGE_COM_ACCESS_DENIED, true)) { /* This is no error */ DPRINTF(("***** got SGE_COM_ACCESS_DENIED from qmaster *****\n")); } if (sge_get_com_error_flag(EXECD, SGE_COM_ENDPOINT_NOT_UNIQUE, false)) { execd_exit_state = SGE_COM_ENDPOINT_NOT_UNIQUE; break; } if (shut_me_down != 0) { break; } sleep(30); } /* * Terminate on SIGTERM or hard communication error */ if (execd_exit_state != 0 || shut_me_down != 0) { sge_shutdown((void**)&ctx, execd_exit_state); DRETURN(execd_exit_state); } /* * We write pid file when we are connected to qmaster. Otherwise an old * execd might overwrite our pidfile. */ sge_write_pid(EXECD_PID_FILE); /* * At this point we are sure we are the only sge_execd and we are connected * to the current qmaster. First we have to report any reaped children * that might exist. */ starting_up(); /* * Log a warning message if execd hasn't been started by a superuser */ if (!sge_is_start_user_superuser()) { WARNING((SGE_EVENT, MSG_SWITCH_USER_NOT_ROOT)); } #ifdef COMPILE_DC if (ptf_init()) { CRITICAL((SGE_EVENT, MSG_EXECD_NOSTARTPTF)); SGE_EXIT((void**)&ctx, 1); } INFO((SGE_EVENT, MSG_EXECD_STARTPDCANDPTF)); #endif master_job_list = object_type_get_master_list(SGE_TYPE_JOB); *master_job_list = lCreateList("Master_Job_List", JB_Type); job_list_read_from_disk(master_job_list, "Master_Job_List", 0, SPOOL_WITHIN_EXECD, job_initialize_job); /* clean up jobs hanging around (look in active_dir) */ clean_up_old_jobs(ctx, 1); execd_trash_load_report(); sge_set_flush_lr_flag(true); sge_sig_handler_in_main_loop = 1; if (thread_prof_active_by_id(pthread_self())) { prof_start(SGE_PROF_CUSTOM1, NULL); prof_start(SGE_PROF_CUSTOM2, NULL); prof_start(SGE_PROF_GDI_REQUEST, NULL); } else { prof_stop(SGE_PROF_CUSTOM1, NULL); prof_stop(SGE_PROF_CUSTOM2, NULL); prof_stop(SGE_PROF_GDI_REQUEST, NULL); } PROF_START_MEASUREMENT(SGE_PROF_CUSTOM1); /* Start dispatching */ execd_exit_state = sge_execd_process_messages(ctx); /* * This code is only reached when dispatcher terminates and execd goes down. */ /* log if we received SIGPIPE signal */ if (sge_sig_handler_sigpipe_received) { sge_sig_handler_sigpipe_received = 0; INFO((SGE_EVENT, "SIGPIPE received\n")); } #if defined(LINUX) free_procList(); #endif lFreeList(master_job_list); PROF_STOP_MEASUREMENT(SGE_PROF_CUSTOM1); if (prof_is_active(SGE_PROF_ALL)) { time_t now = (time_t)sge_get_gmt(); if (now > next_prof_output) { prof_output_info(SGE_PROF_ALL, false, "profiling summary:\n"); prof_reset(SGE_PROF_ALL,NULL); next_prof_output = now + 60; } } sge_prof_cleanup(); sge_shutdown((void**)&ctx, execd_exit_state); DRETURN(execd_exit_state); }