/****** uti/sge_log/sge_do_log() ***********************************************
*  NAME
*     sge_do_log() -- Write message to log file
*
*  SYNOPSIS
*     static void sge_do_log(u_long32 me, const char *progname,
*                            const char *unqualified_hostname,
*                            int aLevel, const char *aMessage)
*
*  FUNCTION
*     Appends one line of the form
*        <time>|<progname>|<hostname>|<level>|<message>
*     to the file named by log_state_get_log_file(). The file is opened
*     (and created if necessary) for this call and closed again afterwards.
*     Nothing is done unless "me" is QMASTER, EXECD, SCHEDD or SHADOWD.
*     If the line could not be written completely, a note is printed to
*     stderr because the regular logging path is the one that failed.
*
*  INPUTS
*     u_long32 me                      - id of the calling component
*     const char *progname             - program name (printed with "%6.6s")
*     const char *unqualified_hostname - host name put into the line
*     int aLevel                       - log level, printed as one character
*     const char *aMessage             - log message
*
*  RESULT
*     void - none
*
*  NOTES
*     MT-NOTE: sge_do_log() is MT safe.
*******************************************************************************/
static void sge_do_log(u_long32 me, const char* progname, const char* unqualified_hostname,
                       int aLevel, const char *aMessage)
{
   char msg2log[4*MAX_STRING_SIZE];
   dstring msg;
   size_t len;
   long written;
   int fd;

   if (me != QMASTER && me != EXECD && me != SCHEDD && me != SHADOWD) {
      return;
   }

   fd = SGE_OPEN3(log_state_get_log_file(), O_WRONLY | O_APPEND | O_CREAT, 0666);
   if (fd < 0) {
      return;
   }

   sge_dstring_init(&msg, msg2log, sizeof(msg2log));
   append_time((time_t)sge_get_gmt(), &msg, false);
   sge_dstring_sprintf_append(&msg, "|%6.6s|%s|%c|%s\n", progname, unqualified_hostname,
                              aLevel, aMessage);

   len = strlen(msg2log);
   written = (long)write(fd, msg2log, len);
   if (written < 0 || (size_t)written != len) {
      /*
       * Save errno before calling anything else: the argument list below
       * contains a function call that could otherwise overwrite it before
       * it is read (argument evaluation order is unspecified).
       */
      int write_errno = errno;
      const char *log_file = log_state_get_log_file();

      /* we are in error logging here - the only chance to log this problem
       * might be to write it to stderr */
      fprintf(stderr, "can't log to file %s: %s\n", log_file,
              sge_strerror(write_errno, &msg));
   }
   close(fd);
} /* sge_do_log() */
static void qevent_show_usage(void) { dstring ds; char buffer[256]; sge_dstring_init(&ds, buffer, sizeof(buffer)); fprintf(stdout, "%s\n", feature_get_product_name(FS_SHORT_VERSION, &ds)); fprintf(stdout, "%s\n", MSG_SRC_USAGE ); fprintf(stdout,"qevent [-h|-help] -ts|-testsuite\n"); fprintf(stdout,"qevent [-h|-help] -sm|-subscribe\n"); fprintf(stdout,"qevent [-h|-help] -trigger EVENT SCRIPT [ -trigger EVENT SCRIPT, ... ]\n\n"); fprintf(stdout," -h, -help show usage\n"); fprintf(stdout," -ts, -testsuite run in testsuite mode\n"); fprintf(stdout," -sm, -subscribe run in subscribe mode\n"); fprintf(stdout," -trigger EVENT SCRIPT start SCRIPT (executable) when EVENT occurs\n"); fprintf(stdout,"\n"); fprintf(stdout,"SCRIPT - path to a executable shell script\n"); fprintf(stdout," 1. command line argument: event name\n"); fprintf(stdout," 2. command line argument: jobid\n"); fprintf(stdout," 3. command line argument: taskid\n"); fprintf(stdout,"EVENT - One of the following event category:\n"); fprintf(stdout," %s - job end event\n", qevent_get_event_name(QEVENT_JB_END)); fprintf(stdout," %s - job task end event\n", qevent_get_event_name(QEVENT_JB_TASK_END)); }
/****** uti/sge_log/sge_do_log() ***********************************************
*  NAME
*     sge_do_log() -- Write message to log file
*
*  SYNOPSIS
*     static void sge_do_log(u_long32 me, const char *progname,
*                            const char *unqualified_hostname,
*                            int aLevel, const char *aMessage)
*
*  FUNCTION
*     Builds the line
*        <time>|<progname>|<hostname>|<level>|<message>
*     and appends it to the file named by log_state_get_log_file(), which
*     is opened (created if missing) and closed within this call.
*     Only QMASTER, EXECD, SCHEDD and SHADOWD write a log file; for any
*     other value of "me" the function does nothing.
*
*  INPUTS
*     u_long32 me                      - id of the calling component
*     const char *progname             - program name (printed with "%6.6s")
*     const char *unqualified_hostname - host name put into the line
*     int aLevel                       - log level, printed as one character
*     const char *aMessage             - log message
*
*  RESULT
*     void - none
*
*  NOTES
*     MT-NOTE: sge_do_log() is MT safe.
*     The result of write() is not checked; a failed or short write goes
*     unnoticed.
*******************************************************************************/
static void sge_do_log(u_long32 me, const char* progname, const char* unqualified_hostname,
                       int aLevel, const char *aMessage)
{
   char line_buffer[4*MAX_STRING_SIZE];
   dstring line;
   int log_fd;

   /* components other than these four do not own a log file */
   if (!(me == QMASTER || me == EXECD || me == SCHEDD || me == SHADOWD)) {
      return;
   }

   log_fd = SGE_OPEN3(log_state_get_log_file(), O_WRONLY | O_APPEND | O_CREAT, 0666);
   if (log_fd < 0) {
      return;
   }

   sge_dstring_init(&line, line_buffer, sizeof(line_buffer));
   append_time((time_t)sge_get_gmt(), &line, false);
   sge_dstring_sprintf_append(&line, "|%6.6s|%s|%c|%s\n", progname, unqualified_hostname,
                              aLevel, aMessage);

   write(log_fd, line_buffer, strlen(line_buffer));
   close(log_fd);
} /* sge_do_log() */
/*
 * Adds the attribute "name" to the XML element "xml_elem", using the
 * "%f" text representation of "value" as the attribute value.
 *
 * xml_elem - element passed through to xml_addAttribute()
 * name     - attribute name
 * value    - number to be formatted
 */
void xml_addAttributeD(lListElem *xml_elem, const char *name, double value)
{
   /*
    * "%f" never switches to exponent notation. The largest finite double
    * has 309 integer digits; add sign, decimal point, six fraction digits
    * and the terminating NUL. The former 20 byte buffer had room for only
    * 12 integer digits, so larger values could not be represented in full.
    */
   char buffer[512] = "";
   dstring string;

   sge_dstring_init(&string, buffer, sizeof(buffer));
   xml_addAttribute(xml_elem, name, sge_dstring_sprintf(&string, "%f", value));
}
/*
 * Test driver: runs check_all() and the performance tests first with a
 * dynamic dstring and, only if those checks passed, again with a static
 * dstring that is initialized with a size of STATIC_SIZE.
 * Returns EXIT_SUCCESS if all executed checks passed, else EXIT_FAILURE.
 */
int main(int argc, char *argv[])
{
   char static_buffer[MAX_STRING_SIZE];
   dstring static_dstring;
   dstring dynamic_dstring = DSTRING_INIT;
   bool all_passed;

   /* the static dstring is deliberately limited to STATIC_SIZE bytes */
   sge_dstring_init(&static_dstring, static_buffer, STATIC_SIZE);

   printf("running all checks with a dynamic dstring\n");
   all_passed = check_all(&dynamic_dstring);
   test_dstring_performance(&dynamic_dstring, 100000, "test_data");
   test_dstring_performance_dynamic(100000, "test_data");
   printf("%s\n", sge_dstring_get_string(&dynamic_dstring));

   if (all_passed) {
      printf("\n\nrunning all checks with a static dstring of length %d\n", STATIC_SIZE);
      all_passed = check_all(&static_dstring);
      test_dstring_performance(&static_dstring, 100000, "test_data");
      test_dstring_performance_static(100000, "test_data");
      printf("%s\n", sge_dstring_get_string(&static_dstring));
   }

   sge_dstring_free(&dynamic_dstring);

   if (!all_passed) {
      return EXIT_FAILURE;
   }
   return EXIT_SUCCESS;
}
/*--------------------------------------------------------------------- * parse_cmdline_shadowd *---------------------------------------------------------------------*/ static int parse_cmdline_shadowd( int argc, char **argv ) { dstring ds; char buffer[256]; DENTER(TOP_LAYER, "parse_cmdline_shadowd"); sge_dstring_init(&ds, buffer, sizeof(buffer)); /* ** -help */ if ((argc == 2) && !strcmp(argv[1],"-help")) { #define PRINTITD(o,d) print_option_syntax(stdout,o,d) fprintf(stdout, "%s\n", feature_get_product_name(FS_SHORT_VERSION, &ds)); fprintf(stdout, "%s sge_shadowd [options]\n", MSG_GDI_USAGE_USAGESTRING); PRINTITD(MSG_GDI_USAGE_help_OPT , MSG_GDI_UTEXT_help_OPT ); DRETURN(1); } DRETURN(0); }
/*-------------------------------------------------------------------------*/
/*
 * Prints the qmon version and the localized command line help to stdout.
 *
 * w - widget handed to XmtLocalize2() to look up the localized texts
 *
 * The localized texts are always printed through "%s": they are looked up
 * at run time and must never be interpreted as printf format strings.
 */
static void qmonUsage(Widget w)
{
   DENTER(GUI_LAYER, "qmonUsage");

   printf("%s %s\n", GE_SHORTNAME, GDI_VERSION);

   printf("%s", XmtLocalize2(w, "usage: qmon\n", "qmon_usage", "usageTitle"));

   printf(" [-cmap] ");
   printf("%s", XmtLocalize2(w, "use own colormap\n", "qmon_usage", "cmapOption"));

   printf(" [-help] ");
   printf("%s", XmtLocalize2(w, "show this information and exit\n", "qmon_usage", "helpOption"));

   printf(" [-fontFamily {big|medium|small}] ");
   printf("%s", XmtLocalize2(w, "use small/medium/big fonts\n", "qmon_usage", "fontFamilyOption"));

   printf(" [-nologo] ");
   printf("%s", XmtLocalize2(w, "startup without logo\n", "qmon_usage", "nologoOption"));

   printf("%s", XmtLocalize2(w, "Additionally the default X commandline switches can be used.\nFor further information see the manual page X(1)\n", "qmon_usage", "X11OptionInfo"));

   DEXIT;
}
/*
 * Recursively removes the temporary directory
 *    <dir>/<jobid>.<jataskid>.<queue_name>
 * The removal itself is done between sge_switch2start_user() and
 * sge_switch2admin_user().
 *
 * dir        - base directory; if NULL nothing is done
 * job_owner  - not used by this function
 * jobid      - job number, part of the directory name
 * jataskid   - array task number, part of the directory name
 * queue_name - queue name, part of the directory name
 *
 * Returns 0 on success (or if dir is NULL), -1 if the directory name does
 * not fit into the path buffer or if sge_rmdir() reports an error.
 */
int sge_remove_tmpdir(const char *dir, const char *job_owner, u_long32 jobid,
                      u_long32 jataskid, const char *queue_name)
{
   stringT tmpstr;
   char err_str_buffer[1024];
   dstring err_str;
   int len;

   DENTER(TOP_LAYER, "sge_remove_tmpdir");

   sge_dstring_init(&err_str, err_str_buffer, sizeof(err_str_buffer));

   if (!dir) {
      DRETURN(0);
   }

   /*
    * Bounded formatting: a truncated name must never be passed to a
    * recursive delete, so an overlong path is treated as an error.
    */
   len = snprintf(tmpstr, sizeof(tmpstr), "%s/"sge_u32"."sge_u32".%s",
                  dir, jobid, jataskid, queue_name);
   if (len < 0 || (size_t)len >= sizeof(tmpstr)) {
      ERROR((SGE_EVENT, MSG_FILE_RECURSIVERMDIR_SS, dir, "temporary directory path is too long"));
      DRETURN(-1);
   }

   DPRINTF(("recursively unlinking \"%s\"\n", tmpstr));

   sge_switch2start_user();
   if (sge_rmdir(tmpstr, &err_str)) {
      ERROR((SGE_EVENT, MSG_FILE_RECURSIVERMDIR_SS, tmpstr, err_str_buffer));
      sge_switch2admin_user();
      DRETURN(-1);
   }
   sge_switch2admin_user();

   DRETURN(0);
}
/*
 * Event callback that starts the configured trigger scripts:
 *    sgeE_JOB_DEL    -> QEVENT_JB_END
 *    sgeE_JATASK_DEL -> QEVENT_JB_TASK_END
 * All other events are ignored.
 *
 * type  - object type the callback was invoked for
 * event - the raw event; its ET_type field selects the action
 * (evc, object_base, action and clientdata are not used)
 *
 * Returns SGE_EMA_FAILURE for SGE_TYPE_GLOBAL_CONFIG (done on purpose to
 * exercise the error handling of the caller), SGE_EMA_OK otherwise.
 *
 * The former branches for sgeE_JATASK_MOD, sgeE_JOB_FINAL_USAGE and
 * sgeE_JOB_ADD had no effect and were removed; the sgeE_JATASK_MOD one
 * read JAT_status from the result of lFirst() without checking it.
 */
static sge_callback_result analyze_jatask_event(sge_evc_class_t *evc, object_description *object_base, sge_object_type type, sge_event_action action, lListElem *event, void *clientdata)
{
   if (lGetPosViaElem(event, ET_type, SGE_NO_ABORT) >= 0) {
      /* named differently from the "type" parameter, which is still needed below */
      u_long32 event_type = lGetUlong(event, ET_type);

      if (event_type == sgeE_JOB_DEL) {
         qevent_trigger_scripts(QEVENT_JB_END, qevent_get_option_struct(), event);
      }
      if (event_type == sgeE_JATASK_DEL) {
         qevent_trigger_scripts(QEVENT_JB_TASK_END, qevent_get_option_struct(), event);
      }
   }

   /* create a callback error to test error handling */
   if (type == SGE_TYPE_GLOBAL_CONFIG) {
      return SGE_EMA_FAILURE;
   }

   return SGE_EMA_OK;
}
/*
 * Last resort logging: appends the message "s" to the file
 * /tmp/shepherd.<pid>, prefixed with a time stamp, the real and effective
 * user id and the process id. If the file cannot be opened the message
 * is dropped silently.
 */
static void shepherd_panic(const char *s)
{
   char panic_file[255];
   char time_buffer[128];
   dstring time_string;
   FILE *panic_fp;

   sprintf(panic_file, "/tmp/shepherd." pid_t_fmt, getpid());

   panic_fp = fopen(panic_file, "a");
   if (panic_fp == NULL) {
      return;
   }

   sge_dstring_init(&time_string, time_buffer, sizeof(time_buffer));
   fprintf(panic_fp, "%s [" uid_t_fmt ":" uid_t_fmt " " pid_t_fmt "]: PANIC: %s\n",
           sge_ctime(0, &time_string), getuid(), geteuid(), getpid(), s);
   FCLOSE(panic_fp);

   /* label referenced by the FCLOSE() macro */
FCLOSE_ERROR:
   return;
}
/*
 * Measures how long it takes to set up a static (stack buffer backed)
 * dstring and format "<data>/<data>" into it "max" times, and prints the
 * elapsed wall clock time in seconds.
 */
static void test_dstring_performance_static(int max, const char *data)
{
   struct timeval started;
   struct timeval finished;
   double elapsed;
   int round = 0;

   gettimeofday(&started, NULL);

   while (round < max) {
      char ds_buffer[MAX_STRING_SIZE];
      dstring ds;

      sge_dstring_init(&ds, ds_buffer, sizeof(ds_buffer));
      sge_dstring_sprintf(&ds, "%s/%s", data, data);
      round++;
   }

   gettimeofday(&finished, NULL);

   /* microsecond part first, then combined with the difference in seconds */
   elapsed = finished.tv_usec - started.tv_usec;
   elapsed = finished.tv_sec - started.tv_sec + (elapsed/1000000);

   printf("%d static dstring creations took %.2fs\n", max, elapsed);
}
/*
 * Callback that shows the modeless "About Qmon" information dialog.
 * The dialog text is filled with the user name, the qualified host name,
 * the product name and version ("GE_LONGNAME GDI_VERSION"), the default
 * cell and the localized mailto text.
 *
 * w        - widget used for the dialog and for localization
 * cld, cad - callback data, not used
 */
void qmonAboutMsg(Widget w, XtPointer cld, XtPointer cad)
{
   char buffer[256];
   const char* username = ctx->get_username(ctx);
   const char* qualified_hostname = ctx->get_qualified_hostname(ctx);
   const char* default_cell = ctx->get_default_cell(ctx);

   DENTER(TOP_LAYER, "qmonAboutMsg");

   /* bounded, so the version text can never overrun the buffer */
   snprintf(buffer, sizeof(buffer), "%s %s", GE_LONGNAME, GDI_VERSION);

   XmtDisplayMessage(w, "about_msg", "Help", header, "About Qmon", NULL, None,
                     XmDIALOG_MODELESS, XmDIALOG_INFORMATION,
                     username, qualified_hostname, buffer, default_cell,
                     XmtLocalize(w, mailto, "mailto_msg"), SFLN_ELN);

   DEXIT;
}
/*
 * Event callback that prints the text representation of "event"
 * (as produced by event_text()) on a line of its own to stdout and
 * flushes stdout.
 *
 * Returns SGE_EMA_FAILURE for SGE_TYPE_GLOBAL_CONFIG (done on purpose to
 * exercise the error handling of the caller), SGE_EMA_OK otherwise.
 */
static sge_callback_result print_event(sge_evc_class_t *evc, object_description *object_base, sge_object_type type, sge_event_action action, lListElem *event, void *clientdata)
{
   sge_callback_result result = SGE_EMA_OK;
   char text_buffer[1024];
   dstring text;

   DENTER(TOP_LAYER, "print_event");

   sge_dstring_init(&text, text_buffer, sizeof(text_buffer));

   fprintf(stdout, "%s\n", event_text(event, &text));
   fflush(stdout);

   /* create a callback error to test error handling */
   if (type == SGE_TYPE_GLOBAL_CONFIG) {
      result = SGE_EMA_FAILURE;
   }

   DEXIT;
   return result;
}
/****** sge_gdi_packet/sge_gdi_packet_verify_version() ************************
*  NAME
*     sge_gdi_packet_verify_version() -- verify packet version
*
*  SYNOPSIS
*     bool sge_gdi_packet_verify_version(sge_gdi_packet_class_t *packet,
*                                        lList **alpp)
*
*  FUNCTION
*     This function is the replacement for verify_request_version(), which
*     was part of the source code before the packet structure was
*     introduced.
*
*     It compares the version stored in "packet" with the compiled-in
*     version number GRM_GDI_VERSION.
*
*     If the versions differ, the packet version is looked up in
*     GRM_GDI_VERSION_ARRAY. A warning is logged that names either the
*     release found there or, if the version is unknown, the raw version
*     number. The logged message is also added to the answer list with
*     status STATUS_EVERSION.
*
*  INPUTS
*     sge_gdi_packet_class_t *packet - packet
*     lList **alpp                   - answer list
*
*  RESULT
*     bool - error state
*        true  - same version
*        false - different version numbers
*
*  NOTES
*     MT-NOTE: sge_gdi_packet_verify_version() is not MT safe
******************************************************************************/
bool sge_gdi_packet_verify_version(sge_gdi_packet_class_t * packet, lList **alpp)
{
   u_long32 packet_version = packet->version;
   bool same_version = true;

   DENTER(TOP_LAYER, "sge_gdi_packet_verify_version");

   if (packet_version != GRM_GDI_VERSION) {
      const vdict_t *known_versions = GRM_GDI_VERSION_ARRAY;
      char *release_name = NULL;
      char product_buffer[256];
      dstring product;
      int idx;

      sge_dstring_init(&product, product_buffer, sizeof(product_buffer));

      /*
       * The table ends with an entry whose version is 0. The scan does
       * not stop at the first hit, so the last matching entry wins.
       */
      for (idx = 0; known_versions[idx].version; idx++) {
         if (known_versions[idx].version == packet_version) {
            release_name = known_versions[idx].release;
         }
      }

      if (release_name != NULL) {
         WARNING((SGE_EVENT, MSG_GDI_WRONG_GDI_SSISS,
                  packet->host, packet->commproc, (int)(packet->id),
                  release_name, feature_get_product_name(FS_VERSION, &product)));
      } else {
         WARNING((SGE_EVENT, MSG_GDI_WRONG_GDI_SSIUS,
                  packet->host, packet->commproc, (int)(packet->id),
                  sge_u32c(packet_version), feature_get_product_name(FS_VERSION, &product)));
      }

      answer_list_add(alpp, SGE_EVENT, STATUS_EVERSION, ANSWER_QUALITY_ERROR);
      same_version = false;
   }

   DRETURN(same_version);
}
/*-------------------------------------------------------------------------*/
/*
 * Puts a Cardinal time value into the input field "w".
 *
 * w       - input field widget
 * address - points to the value to display
 * type    - resource type; must be QmonQCardinal, otherwise a warning is
 *           issued and the widget is left untouched
 * size    - size of the value; anything but sizeof(Cardinal) is ignored
 *
 * A value of 0, or a value sge_at_time() cannot convert, clears the field.
 */
static void set_TimeInput(Widget w, XtPointer address, XrmQuark type, Cardinal size)
{
   char time_buffer[128];
   dstring time_string;
   int time_value;

   sge_dstring_init(&time_string, time_buffer, sizeof(time_buffer));

   if (type != QmonQCardinal) {
      XmtWarningMsg("XmtDialogSetDialogValues", "TimeInput",
                    "Type Mismatch: Widget '%s':\n\tCan't set widget values"
                    " from a resource of type '%s'",
                    XtName(w), XrmQuarkToString(type));
      return;
   }

   if (size != sizeof(Cardinal)) {
      return;
   }

   time_value = *(Cardinal*) address;

   /* sge_at_time() formats into time_string, which is backed by time_buffer */
   if (time_value != 0 && sge_at_time(time_value, &time_string) != NULL) {
      XmtInputFieldSetString(w, time_buffer);
   } else {
      XmtInputFieldSetString(w, "");
   }
}
static bool add_job(int job_id) { bool write_ok; lListElem *job; lList *answer_list = NULL; lList *master_job_list = *object_type_get_master_list(SGE_TYPE_JOB); const char *key; dstring key_dstring; char key_buffer[100]; sge_dstring_init(&key_dstring, key_buffer, sizeof(key_buffer)); job = lAddElemUlong(&master_job_list, JB_job_number, job_id, JB_Type); key = job_get_key(job_id, 0, NULL, &key_dstring); #if LOCAL_TRANSACTION spool_transaction(&answer_list, spool_get_default_context(), STC_begin); answer_list_output(&answer_list); #endif write_ok = spool_write_object(&answer_list, spool_get_default_context(), job, key, SGE_TYPE_JOB, false); answer_list_output(&answer_list); if (delay > 0) { sge_usleep(delay * 1000); } #if LOCAL_TRANSACTION spool_transaction(&answer_list, spool_get_default_context(), write_ok ? STC_commit : STC_rollback); answer_list_output(&answer_list); #endif return write_ok; }
static sge_callback_result print_jatask_event(sge_evc_class_t *evc, object_description *object_base, sge_object_type type, sge_event_action action, lListElem *event, void *clientdata) { char buffer[1024]; dstring buffer_wrapper; DENTER(TOP_LAYER, "print_jatask_event"); sge_dstring_init(&buffer_wrapper, buffer, sizeof(buffer)); DPRINTF(("%s\n", event_text(event, &buffer_wrapper))); if (lGetPosViaElem(event, ET_type, SGE_NO_ABORT) >= 0) { u_long32 type = lGetUlong(event, ET_type); u_long32 timestamp = lGetUlong(event, ET_timestamp); if (type == sgeE_JATASK_MOD) { lList *jat = lGetList(event,ET_new_version); u_long job_id = lGetUlong(event, ET_intkey); u_long task_id = lGetUlong(event, ET_intkey2); lListElem *ep = lFirst(jat); u_long job_status = lGetUlong(ep, JAT_status); int task_running = (job_status==JRUNNING || job_status==JTRANSFERING); if (task_running) { fprintf(stdout,"JOB_START (%ld.%ld:ECL_TIME="sge_U32CFormat")\n", job_id ,task_id,sge_u32c(timestamp)); fflush(stdout); Global_jobs_running++; } } if (type == sgeE_JOB_FINAL_USAGE) { /* lList *jat = lGetList(event,ET_new_version); */ u_long job_id = lGetUlong(event, ET_intkey); u_long task_id = lGetUlong(event, ET_intkey2); /* lWriteElemTo(event, stdout); */ fprintf(stdout,"JOB_FINISH (%ld.%ld:ECL_TIME="sge_U32CFormat")\n", job_id, task_id,sge_u32c(timestamp)); Global_jobs_running--; fflush(stdout); } if (type == sgeE_JOB_ADD) { lList *jat = lGetList(event,ET_new_version); u_long job_id = lGetUlong(event, ET_intkey); u_long task_id = lGetUlong(event, ET_intkey2); lListElem *ep = lFirst(jat); const char* job_project = lGetString(ep, JB_project); if (job_project == NULL) { job_project = "NONE"; } fprintf(stdout,"JOB_ADD (%ld.%ld:ECL_TIME="sge_U32CFormat":project=%s)\n", job_id, task_id, sge_u32c(timestamp),job_project); Global_jobs_registered++; fflush(stdout); } if (type == sgeE_JOB_DEL) { u_long job_id = lGetUlong(event, ET_intkey); u_long task_id = lGetUlong(event, ET_intkey2); fprintf(stdout,"JOB_DEL 
(%ld.%ld:ECL_TIME="sge_U32CFormat")\n", job_id, task_id,sge_u32c(timestamp)); Global_jobs_registered--; fflush(stdout); } } /* create a callback error to test error handling */ if(type == SGE_TYPE_GLOBAL_CONFIG) { DEXIT; return SGE_EMA_FAILURE; } DEXIT; return SGE_EMA_OK; }
/****** shepherd_trace ******************************************************** * NAME * shepherd_trace() -- Write line to trace file. * * SYNOPSIS * int shepherd_trace(const char *format, ...) * * FUNCTION * Writes a line to the trace file, preceding it with a date, time, uid * and pid stamp. * * INPUTS * format: The format string of the line to be written to the error file. * ...: The parameters to the format string. See printf(3c). * * RESULT * int - 0 if successful, 1 if an error occured. *******************************************************************************/ int shepherd_trace(const char *format, ...) { int ret = 1, old_cancelstate; struct stat statbuf; /* Protect the trace file pointer with a mutex */ pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &old_cancelstate); pthread_mutex_lock(&g_trace_mutex); /* File was closed (e.g. by an exec()) but fp was not set to NULL */ if (shepherd_trace_fp && fstat(fileno(shepherd_trace_fp), &statbuf) == -1 && errno == EBADF) { shepherd_trace_fp = NULL; } if (shepherd_trace_fp == NULL) { shepherd_trace_fp = shepherd_trace_init_intern(st_trace); } if (shepherd_trace_fp != NULL) { char buffer[128], header_str[256]; dstring ds, message = DSTRING_INIT; sge_dstring_init(&ds, buffer, sizeof(buffer)); sprintf(header_str, "%s ["uid_t_fmt":"pid_t_fmt"]: ", sge_ctime(0, &ds), geteuid(), getpid()); if (format != NULL) { va_list ap; va_start(ap, format); sge_dstring_vsprintf(&message, format, ap); va_end(ap); ret = sh_str2file(header_str, sge_dstring_get_string(&message), shepherd_trace_fp); if (foreground) { printf("%s%s\n", header_str, sge_dstring_get_string(&message)); fflush(stdout); } sge_dstring_free(&message); } /* There are cases where we have to open and close the files * for every write. */ if (!g_keep_files_open) { shepherd_trace_exit(); } ret=0; } pthread_mutex_unlock(&g_trace_mutex); pthread_setcancelstate(old_cancelstate, NULL); return ret; }
/****** shepherd_error ******************************************************** * NAME * shepherd_error() -- Write a line to the error file and exit program. * * SYNOPSIS * void shepherd_error(bool do_exit, const char *format, ...) * * FUNCTION * Writes a line to the error file, preceding it with a * date, time, uid and pid stamp, and exits the program. stops execution. * * INPUTS * do_exit: If true, this function calls exit(2). * format: The format string of the line to be written to the error file. * ...: The parameters to the format string. See printf(3c). * * RESULT * void - none *******************************************************************************/ void shepherd_error(int do_exit, const char *format, ...) { dstring ds; dstring message = DSTRING_INIT; char buffer[128]; char header_str[256]; struct stat statbuf; if (format != NULL) { va_list ap; va_start(ap, format); sge_dstring_vsprintf(&message, format, ap); va_end(ap); } shepherd_trace(sge_dstring_get_string(&message)); /* File was closed (e.g. by an exec()) but fp was not set to NULL */ if (shepherd_error_fp && fstat(fileno(shepherd_error_fp), &statbuf) == -1 && errno==EBADF) { shepherd_error_fp = NULL; } if (shepherd_error_fp == NULL) { shepherd_error_fp = shepherd_trace_init_intern(st_error); } if (shepherd_error_fp != NULL) { sge_dstring_init(&ds, buffer, sizeof(buffer)); sprintf(header_str, "%s ["uid_t_fmt":"pid_t_fmt"]: ", sge_ctime(0, &ds), geteuid(), getpid()); sh_str2file(header_str, sge_dstring_get_string(&message), shepherd_error_fp); } if (foreground) { fprintf(stderr, "%s%s\n", header_str, sge_dstring_get_string(&message)); } /* File was closed (e.g. 
by an exec()) but fp was not set to NULL */ if (shepherd_exit_status_fp && fstat(fileno(shepherd_exit_status_fp), &statbuf) == -1 && errno==EBADF ) { shepherd_exit_status_fp = NULL; } if (shepherd_exit_status_fp == NULL) { shepherd_exit_status_fp = shepherd_trace_init_intern(st_exit_status); } if (shepherd_exit_status_fp != NULL) { sprintf(header_str, "%d", shepherd_state); sh_str2file(header_str, NULL, shepherd_exit_status_fp); } if (coshepherd_pid > 0) { sge_switch2start_user(); kill(coshepherd_pid, SIGTERM); sge_switch2admin_user(); } if (g_new_interactive_job_support == false && search_conf_val("qrsh_control_port") != NULL) { char buffer[1024]; snprintf(buffer, sizeof(buffer), "1:%s", sge_dstring_get_string(&message)); write_to_qrsh(buffer); } sge_dstring_free(&message); if (do_exit) { /* close all trace files before exit */ shepherd_trace_exit(); exit(shepherd_state); } /* There are cases where we have to open and close the files * for every write. */ if (!g_keep_files_open) { shepherd_error_exit(); } }
/*-----------------------------------------------------------------*/
/*
 * Writes one line to "fp" and flushes it.
 *
 * header_str - first part of the line, may be NULL
 * str        - second part of the line, may be NULL
 * fp         - file to write to; if NULL nothing is done
 *
 * The line consists of header_str directly followed by str; if both are
 * NULL a note about the wrong call is written instead. When the real
 * user id is SGE_SUPERUSER_UID, the effective user id is set to the
 * superuser for the write and restored afterwards.
 *
 * If writing or flushing fails, the problem is reported through
 * shepherd_panic().
 *
 * Returns 0 if the line was written and flushed, 1 otherwise.
 */
static int sh_str2file(const char *header_str, const char *str, FILE* fp)
{
   int ret = 1;
   int ret_fp = -1;
   int ret_fl = EOF;
   int saved_errno = 0;   /* errno of the call that failed */
   dstring ds;
   char buffer[128];
   uid_t old_euid = SGE_SUPERUSER_UID;

   if (fp) {
      /*
       * Work around for CR 6293411:
       * See shepherd_trace_exit() for details.
       */
      if (getuid() == SGE_SUPERUSER_UID) {
         old_euid = geteuid();
         seteuid(SGE_SUPERUSER_UID);
      }

      if (!str && !header_str) {
         ret_fp = fprintf(fp, "function sh_str2file() called with "
                              "NULL arguments\n");
      } else if (!header_str && str) {
         ret_fp = fprintf(fp, "%s\n", str);
      } else if (header_str && !str) {
         ret_fp = fprintf(fp, "%s\n", header_str);
      } else {
         ret_fp = fprintf(fp, "%s%s\n", header_str, str);
      }

      /* remember errno right away, the calls below may change it */
      if (ret_fp < 0) {
         saved_errno = errno;
      } else {
         ret_fl = fflush(fp);
         if (ret_fl == 0) {
            ret = 0;
         } else {
            saved_errno = errno;
         }
      }

      /*
       * Switch back to admin user?
       */
      if (old_euid != SGE_SUPERUSER_UID) {
         seteuid(old_euid);
      }

      /*
       * PANIC! Can't write to trace/error/exit_status file!
       * Only the call that really failed is reported: after a failed
       * fprintf() no fflush() was attempted.
       */
      if (ret_fp < 0) {
         sge_dstring_init(&ds, buffer, sizeof(buffer));
         sge_dstring_sprintf(&ds, "fprintf(%p,%s,%s) failed: %s",
                             (void *)fp,
                             header_str ? header_str : "<null>",
                             str ? str : "<null>",
                             strerror(saved_errno));
         shepherd_panic(buffer);
      } else if (ret_fl != 0) {
         sge_dstring_init(&ds, buffer, sizeof(buffer));
         sge_dstring_sprintf(&ds, "fflush(%p) failed: %s",
                             (void *)fp, strerror(saved_errno));
         shepherd_panic(buffer);
      }
   }

   return ret;
}
/****** uti/monitor/sge_monitor_status() ***************************************
*  NAME
*     sge_monitor_status() -- generates the status for qping / commlib
*
*  SYNOPSIS
*     u_long32 sge_monitor_status(char **info_message, u_long32 monitor_time)
*
*  FUNCTION
*     This method creates the health monitoring output and returns the
*     monitoring info to the commlib. The output consists of
*       - one status entry per registered thread: 'R' if the thread is
*         within its warning timeout (or has none), 'W' if the warning
*         timeout is exceeded, 'E' if the error timeout is exceeded too,
*       - a summary line,
*       - on platforms where mallinfo is available, the malloc statistics,
*       - the monitoring output of every thread if monitor_time is not 0,
*         otherwise a note that monitoring is disabled.
*
*  INPUTS
*     char **info_message   - info_message pointer, has to point to a NULL
*                             string; receives a strdup()ed copy of the
*                             generated text
*     u_long32 monitor_time - the configured monitoring interval
*
*  RESULT
*     u_long32 - 0 : everything is okay
*                1 : warning
*                2 : error
*                3 : init problems
*
*  NOTES
*     MT-NOTE: sge_monitor_status() is MT safe
*
*******************************************************************************/
u_long32 sge_monitor_status(char **info_message, u_long32 monitor_time)
{
   u_long32 ret = 0;

   DENTER(GDI_LAYER, "sge_monitor_status");

   if (info_message == NULL) {
      DEXIT;
      return 3;
   }

   sge_mutex_lock("sge_monitor_status", SGE_FUNC, __LINE__, &global_mutex);

   sge_dstring_clear(&Info_Line);

   { /* this is the qping info section, it checks if each thread is still alive */
      int i;
      int error_count = 0;
      struct timeval now;
      double idle_time;

      gettimeofday(&now, NULL);

      for (i = 0; i < MAX_OUTPUT_LINES; i++) {
         sge_mutex_lock("sge_monitor_status", SGE_FUNC, __LINE__, &(Output[i].Output_Mutex));
         if (Output[i].name != NULL) {
            /*
             * The state is determined per thread. It must start at 'R' for
             * every entry, otherwise one stalled thread would make all
             * entries after it look stalled as well.
             */
            char state = 'R';

            /* seconds since the thread was last seen waiting */
            idle_time = now.tv_usec - Output[i].last_wait_time.tv_usec;
            idle_time = now.tv_sec - Output[i].last_wait_time.tv_sec + (idle_time / 1000000);

            if (Output[i].warning_timeout != NO_WARNING) {
               if (Output[i].warning_timeout < idle_time) {
                  if (Output[i].error_timeout < idle_time) {
                     state = 'E';
                  } else {
                     state = 'W';
                  }
                  error_count++;
               }
            }
            sge_dstring_sprintf_append(&Info_Line, MSG_UTI_MONITOR_INFO_SCF, Output[i].name, state, idle_time);
         }
         sge_mutex_unlock("sge_monitor_status", SGE_FUNC, __LINE__, &(Output[i].Output_Mutex));
      }

      /*
       * NOTE(review): the result depends on the number of threads beyond
       * their warning timeout (one -> warning, more -> error), not on
       * whether any of them is in state 'E' - confirm that this is intended.
       */
      if (error_count == 0) {
         sge_dstring_append(&Info_Line, MSG_UTI_MONITOR_OK);
      } else if (error_count == 1) {
         ret = 1;
         sge_dstring_append(&Info_Line, MSG_UTI_MONITOR_WARNING);
      } else {
         ret = 2;
         sge_dstring_append(&Info_Line, MSG_UTI_MONITOR_ERROR);
      }
      sge_dstring_append(&Info_Line, "\n");
   }

#if defined(LINUX) || defined(AIX43) || defined(AIX51) || defined(IRIX) || defined(SOLARIS) || defined(HP11)
   if (mallinfo_func_pointer != NULL) {
      struct mallinfo mallinfo_data = mallinfo_func_pointer();

      sge_dstring_sprintf_append(&Info_Line, MSG_UTI_MONITOR_SCHEXT_UUUUUUUUUU,
                                 mallinfo_data.arena,
                                 mallinfo_data.ordblks,
                                 mallinfo_data.smblks,
                                 mallinfo_data.hblks,
                                 mallinfo_data.hblkhd,
                                 mallinfo_data.usmblks,
                                 mallinfo_data.fsmblks,
                                 mallinfo_data.uordblks,
                                 mallinfo_data.fordblks,
                                 mallinfo_data.keepcost);
      sge_dstring_append(&Info_Line, "\n");
   }
#endif

   if (monitor_time != 0) { /* generates the output monitoring output data */
      int i;

      sge_dstring_append(&Info_Line, MSG_UTI_MONITOR_COLON);
      sge_dstring_append(&Info_Line, "\n");

      for (i = 0; i < MAX_OUTPUT_LINES; i++) {
         sge_mutex_lock("sge_monitor_status", SGE_FUNC, __LINE__, &(Output[i].Output_Mutex));
         if (Output[i].name != NULL) {
            append_time(Output[i].update_time, &Info_Line, false);
            sge_dstring_append(&Info_Line, " | ");
            sge_dstring_append_dstring(&Info_Line, Output[i].output);
            sge_dstring_append(&Info_Line, "\n");
         }
         sge_mutex_unlock("sge_monitor_status", SGE_FUNC, __LINE__, &(Output[i].Output_Mutex));
      }
   } else {
      sge_dstring_append(&Info_Line, MSG_UTI_MONITOR_DISABLED);
      sge_dstring_append(&Info_Line, "\n");
   }

   /* the caller gets its own copy of the text */
   *info_message = strdup(sge_dstring_get_string(&Info_Line));

   sge_mutex_unlock("sge_monitor_status", SGE_FUNC, __LINE__, &global_mutex);

   DEXIT;
   return ret;
}
/****** Eventmirror/pe_task/pe_task_update_master_list() *********************** * NAME * pe_task_update_master_list() -- update parallel tasks of an array task * * SYNOPSIS * bool * pe_task_update_master_list(sge_object_type type, sge_event_action action, * lListElem *event, void *clientdata) * * FUNCTION * Update the list of parallel tasks of an array task * based on an event. * The function is called from the event mirroring interface. * * The scaled usage list of a parallel task is not updated * by this function, as this data is maintained by a * separate event. * * INPUTS * sge_object_type type - event type * sge_event_action action - action to perform * lListElem *event - the raw event * void *clientdata - client data * * RESULT * bool - true, if update is successfull, else false * * NOTES * The function should only be called from the event mirror interface. * * SEE ALSO * Eventmirror/--Eventmirror * Eventmirror/sge_mirror_update_master_list() *******************************************************************************/ sge_callback_result pe_task_update_master_list(sge_evc_class_t *evc, object_description *object_base, sge_object_type type, sge_event_action action, lListElem *event, void *clientdata) { u_long32 job_id; lListElem *job = NULL; const char *pe_task_id = NULL; lListElem *pe_task = NULL; u_long32 ja_task_id; lListElem *ja_task = NULL; lList *pe_task_list = NULL; const lDescr *pe_task_descr = NULL; lList *usage = NULL; char id_buffer[MAX_STRING_SIZE]; dstring id_dstring; DENTER(TOP_LAYER, "pe_task_update_master_list"); sge_dstring_init(&id_dstring, id_buffer, MAX_STRING_SIZE); job_id = lGetUlong(event, ET_intkey); ja_task_id = lGetUlong(event, ET_intkey2); pe_task_id = lGetString(event, ET_strkey); job = job_list_locate(*sge_master_list(object_base, SGE_TYPE_JOB), job_id); if (job == NULL) { ERROR((SGE_EVENT, MSG_JOB_CANTFINDJOBFORUPDATEIN_SS, job_get_id_string(job_id, 0, NULL, &id_dstring), SGE_FUNC)); DEXIT; return SGE_EMA_FAILURE; } ja_task = 
job_search_task(job, NULL, ja_task_id); if (ja_task == NULL) { ERROR((SGE_EVENT, MSG_JOB_CANTFINDJATASKFORUPDATEIN_SS, job_get_id_string(job_id, ja_task_id, NULL, &id_dstring), SGE_FUNC)); DEXIT; return SGE_EMA_FAILURE; } pe_task = ja_task_search_pe_task(ja_task, pe_task_id); pe_task_list = lGetList(ja_task, JAT_task_list); pe_task_descr = lGetListDescr(lGetList(event, ET_new_version)); if (action == SGE_EMA_MOD) { /* modify event for pe_task. * we may not update * - PET_scaled_usage - it is maintained by JOB_USAGE events */ if (pe_task == NULL) { ERROR((SGE_EVENT, MSG_JOB_CANTFINDPETASKFORUPDATEIN_SS, job_get_id_string(job_id, ja_task_id, pe_task_id, &id_dstring), SGE_FUNC)); DEXIT; return SGE_EMA_FAILURE; } lXchgList(pe_task, PET_scaled_usage, &usage); } if (sge_mirror_update_master_list(&pe_task_list, pe_task_descr, pe_task, job_get_id_string(job_id, ja_task_id, pe_task_id, &id_dstring), action, event) != SGE_EM_OK) { lFreeList(&usage); DEXIT; return SGE_EMA_FAILURE; } /* restore pe_task list after modify event */ if (action == SGE_EMA_MOD) { pe_task = ja_task_search_pe_task(ja_task, pe_task_id); if (pe_task == NULL) { ERROR((SGE_EVENT, MSG_JOB_CANTFINDPETASKFORUPDATEIN_SS, job_get_id_string(job_id, ja_task_id, pe_task_id, &id_dstring), SGE_FUNC)); lFreeList(&usage); DEXIT; return SGE_EMA_FAILURE; } lXchgList(pe_task, PET_scaled_usage, &usage); lFreeList(&usage); } /* first petask add event could have created new pe_task list for job */ if (lGetList(ja_task, JAT_task_list) == NULL && pe_task_list != NULL) { lSetList(ja_task, JAT_task_list, pe_task_list); } DEXIT; return SGE_EMA_OK; }
/****** err_trace/shepherd_trace_init_intern() *******************************
*  NAME
*     shepherd_trace_init_intern() -- Open one of the shepherd's trace files.
*
*  SYNOPSIS
*     static FILE* shepherd_trace_init_intern(st_shepherd_file_t shepherd_file)
*
*  FUNCTION
*     Builds the absolute path "<cwd>/<g_shepherd_file_name[shepherd_file]>",
*     stores it in g_shepherd_file_path[shepherd_file], and opens that file in
*     append mode (creating it with mode 0644 if it does not exist yet).
*     The FD_CLOEXEC flag is set on the descriptor so it will be closed
*     automatically in an exec()-call, and a FILE* is created on top of it.
*
*     The working directory is captured only on the first call and reused on
*     all later calls, because the shepherd changes into the job's cwd later
*     and still needs an absolute path to its files.
*
*     Must be called with euid=admin user to work properly!
*
*  INPUTS
*     st_shepherd_file_t shepherd_file - index of the file to open; used to
*                                        index g_shepherd_file_name[] and
*                                        g_shepherd_file_path[].
*
*  RESULT
*     FILE* - If successfully opened, the file pointer of the shepherd file.
*           - Otherwise NULL. Failures are reported via shepherd_panic().
*
*  NOTES
*     On every failure path after the file was opened, the descriptor is
*     closed again before NULL is returned.
*******************************************************************************/
static FILE* shepherd_trace_init_intern(st_shepherd_file_t shepherd_file)
{
   static char path[SGE_PATH_MAX];
   static bool called = false;
   SGE_STRUCT_STAT statbuf;
   dstring ds;
   char buffer[SGE_PATH_MAX+128];
   char tmppath[SGE_PATH_MAX];
   int fd = -1;
   FILE *fp = NULL;
   int do_chown = 0;

   /*
    * after changing into the jobs cwd we need an
    * absolute path to the error/trace file
    */
   if (called == false) {
      if (getcwd(path, sizeof(path)) == NULL) {
         /*
          * The buffer contents are undefined after a failed getcwd().
          * Make them well defined and report the problem, but keep going
          * on a best-effort basis as before.
          */
         int saved_errno = errno;

         path[0] = '\0';
         sge_dstring_init(&ds, buffer, sizeof(buffer));
         sge_dstring_sprintf(&ds, "getcwd() failed: %s", strerror(saved_errno));
         shepherd_panic(buffer);
      }
      /*
       * Never retry: a later call may already run in the job's cwd and
       * would capture the wrong directory.
       */
      called = true;
   }

   snprintf(tmppath, SGE_PATH_MAX,"%s/%s",path, g_shepherd_file_name[shepherd_file]);
   sge_strlcpy(g_shepherd_file_path[shepherd_file], tmppath, SGE_PATH_MAX);

   /* If the file does not exist, create it. Otherwise just open it. */
   if (SGE_STAT(tmppath, &statbuf)) {
      fd = SGE_OPEN3(tmppath, O_RDWR | O_CREAT | O_APPEND, 0644);
      if (fd<0) {
         sge_dstring_init(&ds, buffer, sizeof(buffer));
         sge_dstring_sprintf(&ds, "creat(%s) failed: %s", tmppath, strerror(errno));
         shepherd_panic(buffer);
      }

      if (getuid() == SGE_SUPERUSER_UID) {
         /* We must give the file to the job owner later */
         do_chown = 1;
      } else {
         /* We are not root, so we have to own all files anyway. */
         do_chown = 0;
      }
   } else {
      /* The file already exists. We get here when
       * a) a exec() failed or
       * b) after the execution of prolog/job, when the job/epilog
       *    tries to init the error/exit status files.
       *
       * In a root system we can just open the file, because we are either
       * root or the job user who owns the file.
       * In a admin user system we must set our euid to root to open it, then
       * it is the same as the root system.
       * In a test user system we are the owner of the file and can open it.
       *
       * When we are root (masked or not), we gave this file to the
       * prolog user/job user right after its creation. But we can have
       * 3 different users for prolog, job and epilog, so we must give
       * the file here to the next user.
       * This must be done via shepherd_trace_chown() in the shepherd
       * before we switch to this user there.
       * It can't be done here because we don't know if we are in
       * case a) (exec failed) or case b) (after execution of prolog/job).
       */
      int old_euid = SGE_SUPERUSER_UID;

      /*
       * Work around for CR 6293411:
       * See shepherd_trace_exit() for details.
       */
      if (getuid() == SGE_SUPERUSER_UID) {
         old_euid = geteuid();
         seteuid(SGE_SUPERUSER_UID);
      }

      fd = SGE_OPEN2(tmppath, O_RDWR | O_APPEND);
      if (fd<0) {
         sge_dstring_init(&ds, buffer, sizeof(buffer));
         sge_dstring_sprintf(&ds, "open(%s) failed: %s", tmppath, strerror(errno));
         shepherd_panic(buffer);
      }
      do_chown = 0;

      /*
       * Switch back to admin user?
       * (done even if the open failed, so the euid is always restored)
       */
      if (old_euid != SGE_SUPERUSER_UID) {
         seteuid(old_euid);
      }
   }

   /* Something went wrong. */
   if (fd<0) {
      return NULL;
   }

   /* To avoid to block stdin, stdout or stderr, dup the fd until it is >= 3 */
   if (fd<3) {
      dup_fd(&fd);
   }

   /* Set FD_CLOEXEC flag to automatically close the file in an exec() */
   if (!set_cloexec(fd)) {
      shepherd_panic("set_cloexec() failed");
      close(fd);   /* do not leak the descriptor on the error path */
      return NULL;
   }

   /*
    * Now open a FILE* from the file descriptor, so we can use fprintf().
    */
   fp = fdopen(fd, "a");
   if (!fp) {
      sge_dstring_init(&ds, buffer, sizeof(buffer));
      sge_dstring_sprintf(&ds, "can't open %s file \"%s\": %s\n",
                          g_shepherd_file_name[shepherd_file], tmppath,
                          strerror(errno));
      shepherd_panic(buffer);
      close(fd);   /* fdopen() failed, so the descriptor is still ours */
      return NULL;
   }

   /* Hand a freshly created file over to the job owner when running as root. */
   if (do_chown && strlen(g_job_owner) > 0) {
      shepherd_trace_chown_intern(g_job_owner, fp, shepherd_file);
   }
   return fp;
}
/****** sge_select_queue/get_attribute() *************************************** * NAME * get_attribute() -- looks for an attribut, but only for one level (for host, global, or queue) * * SYNOPSIS * static lListElem* get_attribute(const char *attrname, lList *config_attr, * lList *actual_attr, lList *load_attr, lList *centry_list, lListElem * *queue, lListElem *rep, u_long32 layer, double lc_factor, dstring *reason) * * FUNCTION * Extracts the attribut specified with 'attrname' and finds the * more important one, if it is defined multiple times on the same * level. It only cares about one level. * If the attribute is a consumable, one can specify a point in time and a duration. * This will get the caller the min amount of that resource during the time frame. * * INPUTS * const char *attrname - attribute name one is looking for * lList *config_attr - user defined attributes (CE_Type) * lList *actual_attr - current usage of consumables (RUE_Type) * lList *load_attr - load attributes * lList *centry_list - the system wide attribute configuration * lListElem *queue - the current queue, or null, if one works on hosts * u_long32 layer - the current layer * double lc_factor - the load correction value * dstring *reason - space for error messages or NULL * bool zero_utilization - ??? 
* u_long32 start_time - begin of the time interval, one asks for the resource * u_long32 duration - the duration the interval * * RESULT * static lListElem* - the element one was looking for or NULL * *******************************************************************************/ lListElem* get_attribute(const char *attrname, lList *config_attr, lList *actual_attr, lList *load_attr, const lList *centry_list, lListElem *queue, u_long32 layer, double lc_factor, dstring *reason, bool zero_utilization, u_long32 start_time, u_long32 duration) { lListElem *actual_el=NULL; lListElem *load_el=NULL; lListElem *cplx_el=NULL; DENTER(BASIS_LAYER, "get_attribute"); /* resource_attr is a complex_entry (CE_Type) */ if (config_attr) { lListElem *temp = lGetElemStr(config_attr, CE_name, attrname); if (temp){ cplx_el = lCopyElem(lGetElemStr(centry_list, CE_name, attrname)); if(!cplx_el){ /* error */ DRETURN(NULL); } lSetUlong(cplx_el, CE_dominant, layer | DOMINANT_TYPE_FIXED); lSetUlong(cplx_el, CE_pj_dominant, DOMINANT_TYPE_VALUE); /* default, no value set */ lSetDouble(cplx_el, CE_doubleval, lGetDouble(temp,CE_doubleval) ); lSetString(cplx_el, CE_stringval, lGetString(temp,CE_stringval) ); } } if (cplx_el && lGetUlong(cplx_el, CE_consumable) != CONSUMABLE_NO) { lSetUlong(cplx_el, CE_pj_dominant, layer | DOMINANT_TYPE_CONSUMABLE); lSetUlong(cplx_el, CE_dominant, DOMINANT_TYPE_VALUE); /* treat also consumables as fixed attributes when assuming an empty queuing system */ if (sconf_get_qs_state() == QS_STATE_FULL) { if (actual_attr && (actual_el = lGetElemStr(actual_attr, RUE_name, attrname))){ dstring ds; char as_str[20]; double utilized = zero_utilization ? 
0 : utilization_max(actual_el, start_time, duration, false); switch (lGetUlong(cplx_el, CE_relop)) { case CMPLXGE_OP: case CMPLXGT_OP: lSetDouble(cplx_el, CE_pj_doubleval, utilized); break; case CMPLXEQ_OP: case CMPLXLT_OP: case CMPLXLE_OP: case CMPLXNE_OP: default: lSetDouble(cplx_el, CE_pj_doubleval, lGetDouble(cplx_el, CE_doubleval) - utilized); break; } sge_dstring_init(&ds, as_str, sizeof(as_str)); sge_dstring_sprintf(&ds, "%8.3f", (float)lGetDouble(cplx_el, CE_pj_doubleval)); lSetString(cplx_el,CE_pj_stringval, as_str); } else{ sge_dstring_sprintf(reason, MSG_ATTRIB_ACTUALELEMENTTOATTRIBXMISSING_S, attrname); lFreeElem(&cplx_el); DRETURN(NULL); } } else{ lSetDouble(cplx_el, CE_pj_doubleval, lGetDouble(cplx_el, CE_doubleval)); lSetString(cplx_el,CE_pj_stringval, lGetString(cplx_el, CE_stringval)); } } /** check for a load value */ if (load_attr && (load_el = lGetElemStr(load_attr, HL_name, attrname)) && (sconf_get_qs_state()==QS_STATE_FULL || lGetBool(load_el, HL_static)) && (!is_attr_prior(cplx_el, cplx_el))) { lListElem *ep_nproc=NULL; int nproc=1; if (!cplx_el){ cplx_el = lCopyElem(lGetElemStr(centry_list, CE_name, attrname)); if (!cplx_el){ /* error */ DRETURN(NULL); } lSetUlong(cplx_el, CE_dominant, DOMINANT_TYPE_VALUE); lSetUlong(cplx_el, CE_pj_dominant, DOMINANT_TYPE_VALUE); } if ((ep_nproc = lGetElemStr(load_attr, HL_name, LOAD_ATTR_NUM_PROC))) { const char *cp = lGetString(ep_nproc, HL_value); if (cp) nproc = MAX(1, atoi(lGetString(ep_nproc, HL_value))); } { const char *load_value=NULL; u_long32 type; double dval; load_value = lGetString(load_el, HL_value); /* are we working on string values? 
if though, than it is easy */ if ( (type = lGetUlong(cplx_el, CE_valtype)) == TYPE_STR || type == TYPE_CSTR || type == TYPE_HOST || type == TYPE_RESTR) { lSetString(cplx_el, CE_stringval, load_value); lSetUlong(cplx_el, CE_dominant, layer | DOMINANT_TYPE_LOAD); } else { /* working on numerical values */ lListElem *job_load; char err_str[256]; char sval[100]; u_long32 dom_type = DOMINANT_TYPE_LOAD; lList *load_adjustments = sconf_get_job_load_adjustments(); job_load=lGetElemStr(load_adjustments, CE_name, attrname); if (parse_ulong_val(&dval, NULL, type, load_value, NULL, 0)) { sge_strlcpy(sval, load_value, 100); /* -------------------------------- look for 'name' in our load_adjustments list */ if (job_load) { const char *s; double load_correction; s = lGetString(job_load, CE_stringval); if (!parse_ulong_val(&load_correction, NULL, type, s, err_str, 255)) { ERROR((SGE_EVENT, MSG_SCHEDD_LOADADJUSTMENTSVALUEXNOTNUMERIC_S , attrname)); } else if (lc_factor) { double old_dval; u_long32 relop; if (!strncmp(attrname, "np_", 3) && nproc != 1 ) { DPRINTF(("fillComplexFromHost: dividing lc_factor for \"%s\" with value %f by %d to %f\n", attrname, lc_factor, nproc, lc_factor / nproc)); lc_factor /= nproc; } load_correction *= lc_factor; /* it depends on relop in complex config whether load_correction is pos/neg */ if ( (relop = lGetUlong(cplx_el, CE_relop)) == CMPLXGE_OP || relop == CMPLXGT_OP){ old_dval = dval; dval += load_correction; } else{ old_dval = dval; dval -= load_correction; } sprintf(sval, "%8.3f", dval); DPRINTF(("%s: uc: %f c(%f): %f\n", attrname, old_dval, lc_factor, dval)); dom_type = DOMINANT_TYPE_CLOAD; } } /* we can have a user, who wants to override the incomming load value. This is no problem for consumables, but for fixed values. A custom fixed value is a per slot value (stored in CE_doubleval) and a load value is a per job value (stored in CE_pj_doubleval). This code changes a fixed custom value from a per slot to a per job value!! 
*/ if ( !(lGetUlong(cplx_el, CE_dominant) == DOMINANT_TYPE_VALUE) && (lGetUlong(cplx_el, CE_pj_dominant) == DOMINANT_TYPE_VALUE)){ lSetDouble(cplx_el, CE_pj_doubleval, lGetDouble(cplx_el, CE_doubleval)); lSetString(cplx_el, CE_pj_stringval, lGetString(cplx_el, CE_stringval)); lSetUlong(cplx_el, CE_dominant, DOMINANT_TYPE_VALUE); lSetUlong(cplx_el, CE_pj_dominant, layer | DOMINANT_TYPE_FIXED); } if (!is_attr_prior2(cplx_el, dval, CE_pj_doubleval, CE_pj_dominant)){ lSetString(cplx_el, CE_pj_stringval, load_value); lSetUlong(cplx_el, CE_pj_dominant, layer | dom_type); lSetDouble(cplx_el, CE_pj_doubleval, dval ); } } /* end numerical load value */ lFreeList(&load_adjustments); }/* end block */ } } /* we are working on queue level, so we have to check for queue resource values */ if (queue){ bool created=false; if(!cplx_el){ cplx_el = lCopyElem(lGetElemStr(centry_list, CE_name, attrname)); if(!cplx_el){ /* error */ DRETURN(NULL); } lSetUlong(cplx_el, CE_dominant, DOMINANT_TYPE_VALUE); lSetUlong(cplx_el, CE_pj_dominant, DOMINANT_TYPE_VALUE); created = true; } if (!get_queue_resource(cplx_el, queue, attrname) && created) { lFreeElem(&cplx_el); } } DRETURN(cplx_el); }
/**** **** qstat_usage (static) **** **** displays usage of qstat on file fp. **** Is what NULL, full usage will be displayed. **** **** Returns always 1. **** **** If what is a pointer to an option-string, **** only usage for that option will be displayed. **** ** not implemented yet! ** ****/ int qstat_usage(int qselect_mode, FILE *fp, char *what) { dstring ds; char buffer[256]; sge_dstring_init(&ds, buffer, sizeof(buffer)); fprintf(fp, "%s\n", feature_get_product_name(FS_SHORT_VERSION, &ds)); if(!what) { /* display full usage */ fprintf(fp, "%s %s [options]\n", MSG_SRC_USAGE ,qselect_mode?"qselect":"qstat"); if (!qselect_mode) { fprintf(fp, " [-ext] %s\n",MSG_QSTAT_USAGE_VIEWALSOSCHEDULINGATTRIBUTES); } if (!qselect_mode) { fprintf(fp, " [-explain a|c|A|E] %s\n",MSG_QSTAT_USAGE_EXPLAINOPT); } if (!qselect_mode) fprintf(fp, " [-f] %s\n",MSG_QSTAT_USAGE_FULLOUTPUT); if (!qselect_mode) fprintf(fp, " [-F [resource_attributes]] %s\n",MSG_QSTAT_USAGE_FULLOUTPUTANDSHOWRESOURCESOFQUEUES); if (!qselect_mode) { fprintf(fp, " [-g {c}] %s\n",MSG_QSTAT_USAGE_DISPLAYCQUEUESUMMARY); fprintf(fp, " [-g {d}] %s\n",MSG_QSTAT_USAGE_DISPLAYALLJOBARRAYTASKS); fprintf(fp, " [-g {t}] %s\n",MSG_QSTAT_USAGE_DISPLAYALLPARALLELJOBTASKS); } fprintf(fp, " [-help] %s\n",MSG_COMMON_help_OPT_USAGE); if (!qselect_mode) fprintf(fp, " [-j job_identifier_list ] %s\n",MSG_QSTAT_USAGE_SHOWSCHEDULERJOBINFO); fprintf(fp, " [-l resource_list] %s\n",MSG_QSTAT_USAGE_REQUESTTHEGIVENRESOURCES); if (!qselect_mode) fprintf(fp, " [-ne] %s\n",MSG_QSTAT_USAGE_HIDEEMPTYQUEUES); if (!qselect_mode) { fprintf(fp, " [-ncb] %s\n",MSG_QSTAT_USAGE_VIEWALSOBINDINGATTRIBUTES); } fprintf(fp, " [-pe pe_list] %s\n",MSG_QSTAT_USAGE_SELECTONLYQUEESWITHONOFTHESEPE); fprintf(fp, " [-q wc_queue_list] %s\n",MSG_QSTAT_USAGE_PRINTINFOONGIVENQUEUE); fprintf(fp, " [-qs {a|c|d|o|s|u|A|C|D|E|S}] %s\n",MSG_QSTAT_USAGE_PRINTINFOCQUEUESTATESEL); if (!qselect_mode) fprintf(fp, " [-r] %s\n",MSG_QSTAT_USAGE_SHOWREQUESTEDRESOURCESOFJOB); if 
(!qselect_mode) { fprintf(fp, " [-s {p|r|s|z|hu|ho|hs|hd|hj|ha|h|a}] %s\n",MSG_QSTAT_USAGE_SHOWPENDINGRUNNINGSUSPENDESZOMBIEJOBS); fprintf(fp, " %s\n",MSG_QSTAT_USAGE_JOBSWITHAUSEROPERATORSYSTEMHOLD); fprintf(fp, " %s\n",MSG_QSTAT_USAGE_JOBSWITHSTARTTIMEINFUTORE); fprintf(fp, " %s\n",MSG_QSTAT_USAGE_HISABBREVIATIONFORHUHOHSHJHA); fprintf(fp, " %s\n",MSG_QSTAT_USAGE_AISABBREVIATIONFOR); } if (!qselect_mode) fprintf(fp, " [-t] %s\n",MSG_QSTAT_USAGE_SHOWTASKINFO); if (!qselect_mode){ fprintf(fp, " [-u user_list] %s\n",MSG_QSTAT_USAGE_VIEWONLYJOBSOFTHISUSER); } fprintf(fp, " [-U user_list] %s\n",MSG_QSTAT_USAGE_SELECTQUEUESWHEREUSERXHAVEACCESS); if (!qselect_mode) { fprintf(fp, " [-urg] %s\n",MSG_QSTAT_URGENCYINFO ); fprintf(fp, " [-pri] %s\n",MSG_QSTAT_PRIORITYINFO ); fprintf(fp, " [-xml] %s\n", MSG_COMMON_xml_OPT_USAGE); } if (getenv("MORE_INFO")) { fprintf(fp, SFNMAX, MSG_QSTAT_USAGE_ADDITIONALDEBUGGINGOPTIONS); fprintf(fp, " [-dj] %s\n",MSG_QSTAT_USAGE_DUMPCOMPLETEJOBLISTTOSTDOUT); fprintf(fp, " [-dq] %s\n",MSG_QSTAT_USAGE_DUMPCOMPLETEQUEUELISTTOSTDOUT); } fprintf(fp, "\n"); fprintf(fp, "pe_list pe[,pe,...]\n"); fprintf(fp, "job_identifier_list [job_id|job_name|pattern]{, [job_id|job_name|pattern]}\n"); fprintf(fp, "resource_list resource[=value][,resource[=value],...]\n"); fprintf(fp, "user_list user|@group[,user|@group],...]\n"); fprintf(fp, "resource_attributes resource,resource,...\n"); fprintf(fp, "wc_cqueue %s\n", MSG_QSTAT_HELP_WCCQ); fprintf(fp, "wc_host %s\n", MSG_QSTAT_HELP_WCHOST); fprintf(fp, "wc_hostgroup %s\n", MSG_QSTAT_HELP_WCHG); fprintf(fp, "wc_qinstance wc_cqueue@wc_host\n"); fprintf(fp, "wc_qdomain wc_cqueue@wc_hostgroup\n"); fprintf(fp, "wc_queue wc_cqueue|wc_qdomain|wc_qinstance\n"); fprintf(fp, "wc_queue_list wc_queue[,wc_queue,...]\n"); } else { /* display option usage */ fprintf(fp, MSG_QDEL_not_available_OPT_USAGE_S,what); fprintf(fp, "\n"); } return 1; }
/****** Eventmirror/job/job_update_master_list() ***************************** * NAME * job_update_master_list() -- update the master list of jobs * * SYNOPSIS * bool job_update_master_list(sge_object_type type, * sge_event_action action, * lListElem *event, void *clientdata) * * FUNCTION * Update the global master list of jobs * based on an event. * The function is called from the event mirroring interface. * * A jobs array tasks are not updated by this function, * as they are maintained by separate events. * In addition, some scheduler specific attributes, that * are only used in scheduler, are not updated. * * INPUTS * sge_object_type type - event type * sge_event_action action - action to perform * lListElem *event - the raw event * void *clientdata - client data * * RESULT * bool - true, if update is successfull, else false * * NOTES * The function should only be called from the event mirror interface. * * SEE ALSO * Eventmirror/--Eventmirror * Eventmirror/sge_mirror_update_master_list() * Eventmirror/job/job_update_master_list_usage() *******************************************************************************/ sge_callback_result job_update_master_list(sge_evc_class_t *evc, object_description *object_base, sge_object_type type, sge_event_action action, lListElem *event, void *clientdata) { lList **list; const lDescr *list_descr; u_long32 job_id; lListElem *job = NULL; lList *ja_tasks = NULL; char id_buffer[MAX_STRING_SIZE]; dstring id_dstring; DENTER(TOP_LAYER, "job_update_master_list"); sge_dstring_init(&id_dstring, id_buffer, MAX_STRING_SIZE); list = sge_master_list(object_base, SGE_TYPE_JOB); list_descr = lGetListDescr(lGetList(event, ET_new_version)); job_id = lGetUlong(event, ET_intkey); job = job_list_locate(*list, job_id); if (action == SGE_EMA_MOD) { u_long32 event_type = lGetUlong(event, ET_type); if (job == NULL) { ERROR((SGE_EVENT, MSG_JOB_CANTFINDJOBFORUPDATEIN_SS, job_get_id_string(job_id, 0, NULL, &id_dstring), "job_update_master_list")); 
DRETURN(SGE_EMA_FAILURE); } if (event_type == sgeE_JOB_USAGE || event_type == sgeE_JOB_FINAL_USAGE ) { /* special handling needed for JOB_USAGE and JOB_FINAL_USAGE events. * they are sent for jobs, ja_tasks and pe_tasks and only contain * the usage list. * Preferable would probably be to send MOD events for the different * object types. */ bool ret = job_update_master_list_usage(*list, event); DRETURN(ret?SGE_EMA_OK:SGE_EMA_FAILURE); } else { /* this is the true modify event. * we may not update several fields: * - JB_ja_tasks is the task list - it is maintained by JATASK events * - JB_host and JB_category are scheduler internal attributes * they may not be overwritten. * Better would be to move them from JB_Type to some scheduler specific * object. */ lListElem *modified_job; modified_job = lFirst(lGetList(event, ET_new_version)); if(job != NULL && modified_job != NULL) { /* we want to preserve the old ja_tasks, since job update events to not contain them */ lXchgList(job, JB_ja_tasks, &ja_tasks); lSetHost(modified_job, JB_host, lGetHost(job, JB_host)); lSetRef(modified_job, JB_category, lGetRef(job, JB_category)); } } } if (sge_mirror_update_master_list(list, list_descr, job, job_get_id_string(job_id, 0, NULL, &id_dstring), action, event) != SGE_EM_OK) { lFreeList(&ja_tasks); DRETURN(SGE_EMA_FAILURE); } /* restore ja_task list after modify event */ if (action == SGE_EMA_MOD && ja_tasks != NULL) { /* we have to search the replaced job */ job = job_list_locate(*list, job_id); if(job == NULL) { ERROR((SGE_EVENT, MSG_JOB_CANTFINDJOBFORUPDATEIN_SS, job_get_id_string(job_id, 0, NULL, &id_dstring), "job_update_master_list")); lFreeList(&ja_tasks); DRETURN(SGE_EMA_FAILURE); } lXchgList(job, JB_ja_tasks, &ja_tasks); lFreeList(&ja_tasks); } DRETURN(SGE_EMA_OK); }
/****** qmaster/threads/sge_scheduler_main() ********************************** * NAME * sge_scheduler_main() -- main function of the scheduler thread * * SYNOPSIS * void * sge_scheduler_main(void *arg) * * FUNCTION * Main function of the scheduler thread, * * INPUTS * void *arg - pointer to the thread function (type cl_thread_settings_t*) * * RESULT * void * - always NULL * * NOTES * MT-NOTE: sge_scheduler_main() is MT safe * * MT-NOTE: this is a thread function. Do NOT use this function * MT-NOTE: in any other way! * * SEE ALSO * qmaster/threads/sge_scheduler_initialize() * qmaster/threads/sge_scheduler_cleanup_thread() * qmaster/threads/sge_scheduler_terminate() * qmaster/threads/sge_scheduler_main() *******************************************************************************/ void * sge_scheduler_main(void *arg) { time_t next_prof_output = 0; monitoring_t monitor; sge_gdi_ctx_class_t *ctx = NULL; sge_evc_class_t *evc = NULL; lList *alp = NULL; sge_where_what_t where_what; cl_thread_settings_t *thread_config = (cl_thread_settings_t*)arg; bool do_shutdown = false; bool do_endlessly = true; bool local_ret = true; DENTER(TOP_LAYER, "sge_scheduler_main"); memset(&where_what, 0, sizeof(where_what)); /* * startup */ if (local_ret) { /* initialize commlib thread */ cl_thread_func_startup(thread_config); /* initialize monitoring */ sge_monitor_init(&monitor, thread_config->thread_name, SCH_EXT, SCT_WARNING, SCT_ERROR); sge_qmaster_thread_init(&ctx, SCHEDD, SCHEDD_THREAD, true); /* register at profiling module */ set_thread_name(pthread_self(), "Scheduler Thread"); conf_update_thread_profiling("Scheduler Thread"); DPRINTF((SFN" started\n", thread_config->thread_name)); /* initialize schedd_runnlog logging */ schedd_set_schedd_log_file(ctx); } /* set profiling parameters */ prof_set_level_name(SGE_PROF_EVENTMASTER, NULL, NULL); prof_set_level_name(SGE_PROF_SPOOLING, NULL, NULL); prof_set_level_name(SGE_PROF_CUSTOM0, "scheduler", NULL); 
prof_set_level_name(SGE_PROF_CUSTOM1, "pending ticket calculation", NULL); prof_set_level_name(SGE_PROF_CUSTOM3, "job sorting", NULL); prof_set_level_name(SGE_PROF_CUSTOM4, "job dispatching", NULL); prof_set_level_name(SGE_PROF_CUSTOM5, "send orders", NULL); prof_set_level_name(SGE_PROF_CUSTOM6, "scheduler event loop", NULL); prof_set_level_name(SGE_PROF_CUSTOM7, "copy lists", NULL); prof_set_level_name(SGE_PROF_SCHEDLIB4, NULL, NULL); /* set-up needed for 'schedule' file */ serf_init(schedd_serf_record_func, schedd_serf_newline); schedd_set_serf_log_file(ctx); /* * prepare event client/mirror mechanism */ if (local_ret) { local_ret = sge_gdi2_evc_setup(&evc, ctx, EV_ID_SCHEDD, &alp, "scheduler"); DPRINTF(("prepared event client/mirror mechanism\n")); } /* * register as event mirror */ if (local_ret) { sge_mirror_initialize(evc, EV_ID_SCHEDD, "scheduler", false, &event_update_func, &sge_mod_event_client, &sge_add_event_client, &sge_remove_event_client, &sge_handle_event_ack); evc->ec_register(evc, false, NULL, &monitor); evc->ec_set_busy_handling(evc, EV_BUSY_UNTIL_RELEASED); DPRINTF(("registered at event mirror\n")); } /* * subscribe necessary data */ if (local_ret) { ensure_valid_what_and_where(&where_what); subscribe_scheduler(evc, &where_what); DPRINTF(("subscribed necessary data from event master\n")); } /* * schedulers main loop */ if (local_ret) { while (do_endlessly) { bool handled_events = false; lList *event_list = NULL; int execute = 0; double prof_copy = 0.0; double prof_total = 0.0; double prof_init = 0.0; double prof_free = 0.0; double prof_run = 0.0; lList *orders = NULL; if (sconf_get_profiling()) { prof_start(SGE_PROF_OTHER, NULL); prof_start(SGE_PROF_PACKING, NULL); prof_start(SGE_PROF_EVENTCLIENT, NULL); prof_start(SGE_PROF_MIRROR, NULL); prof_start(SGE_PROF_GDI, NULL); prof_start(SGE_PROF_HT_RESIZE, NULL); prof_start(SGE_PROF_CUSTOM0, NULL); prof_start(SGE_PROF_CUSTOM1, NULL); prof_start(SGE_PROF_CUSTOM3, NULL); prof_start(SGE_PROF_CUSTOM4, 
NULL); prof_start(SGE_PROF_CUSTOM5, NULL); prof_start(SGE_PROF_CUSTOM6, NULL); prof_start(SGE_PROF_CUSTOM7, NULL); prof_start(SGE_PROF_SCHEDLIB4, NULL); } else { prof_stop(SGE_PROF_OTHER, NULL); prof_stop(SGE_PROF_PACKING, NULL); prof_stop(SGE_PROF_EVENTCLIENT, NULL); prof_stop(SGE_PROF_MIRROR, NULL); prof_stop(SGE_PROF_GDI, NULL); prof_stop(SGE_PROF_HT_RESIZE, NULL); prof_stop(SGE_PROF_CUSTOM0, NULL); prof_stop(SGE_PROF_CUSTOM1, NULL); prof_stop(SGE_PROF_CUSTOM3, NULL); prof_stop(SGE_PROF_CUSTOM4, NULL); prof_stop(SGE_PROF_CUSTOM5, NULL); prof_stop(SGE_PROF_CUSTOM6, NULL); prof_stop(SGE_PROF_CUSTOM7, NULL); prof_stop(SGE_PROF_SCHEDLIB4, NULL); } /* * Wait for new events */ MONITOR_IDLE_TIME(sge_scheduler_wait_for_event(evc, &event_list), (&monitor), mconf_get_monitor_time(), mconf_is_monitor_message()); /* If we lost connection we have to register again */ if (evc->ec_need_new_registration(evc)) { lFreeList(&event_list); if (evc->ec_register(evc, false, NULL, &monitor) == true) { DPRINTF(("re-registered at event master!\n")); } } if (event_list != NULL) { /* check for shutdown */ do_shutdown = (lGetElemUlong(event_list, ET_type, sgeE_SHUTDOWN) != NULL) ? 
true : false; /* update mirror and free data */ if (do_shutdown == false && sge_mirror_process_event_list(evc, event_list) == SGE_EM_OK) { handled_events = true; DPRINTF(("events handled\n")); } else { DPRINTF(("events contain shutdown event - ignoring events\n")); } lFreeList(&event_list); } /* if we actually got events, start the scheduling run and further event processing */ if (handled_events == true) { lList *answer_list = NULL; scheduler_all_data_t copy; lList *master_cqueue_list = *(object_type_get_master_list(SGE_TYPE_CQUEUE)); lList *master_job_list = *object_type_get_master_list(SGE_TYPE_JOB); lList *master_userset_list = *object_type_get_master_list(SGE_TYPE_USERSET); lList *master_project_list = *object_type_get_master_list(SGE_TYPE_PROJECT); lList *master_exechost_list= *object_type_get_master_list(SGE_TYPE_EXECHOST); lList *master_rqs_list= *object_type_get_master_list(SGE_TYPE_RQS); lList *master_centry_list = *object_type_get_master_list(SGE_TYPE_CENTRY); lList *master_ckpt_list = *object_type_get_master_list(SGE_TYPE_CKPT); lList *master_user_list = *object_type_get_master_list(SGE_TYPE_USER); lList *master_ar_list = *object_type_get_master_list(SGE_TYPE_AR); lList *master_pe_list = *object_type_get_master_list(SGE_TYPE_PE); lList *master_hgrp_list = *object_type_get_master_list(SGE_TYPE_HGROUP); lList *master_sharetree_list = *object_type_get_master_list(SGE_TYPE_SHARETREE); /* delay scheduling for test purposes, see issue GE-3306 */ if (SGE_TEST_DELAY_SCHEDULING > 0) { sleep(SGE_TEST_DELAY_SCHEDULING); } PROF_START_MEASUREMENT(SGE_PROF_CUSTOM6); PROF_START_MEASUREMENT(SGE_PROF_CUSTOM7); if (__CONDITION(INFOPRINT)) { dstring ds; char buffer[128]; sge_dstring_init(&ds, buffer, sizeof(buffer)); DPRINTF(("================[SCHEDULING-EPOCH %s]==================\n", sge_at_time(0, &ds))); sge_dstring_free(&ds); } /* * If there were new events then * copy/filter data necessary for the scheduler run * and run the scheduler method */ memset(©, 0, 
sizeof(copy)); copy.dept_list = lSelect("", master_userset_list, where_what.where_dept, where_what.what_acldept); copy.acl_list = lSelect("", master_userset_list, where_what.where_acl, where_what.what_acldept); DPRINTF(("RAW CQ:%d, J:%d, H:%d, C:%d, A:%d, D:%d, P:%d, CKPT:%d," " US:%d, PR:%d, RQS:%d, AR:%d, S:nd:%d/lf:%d\n", lGetNumberOfElem(master_cqueue_list), lGetNumberOfElem(master_job_list), lGetNumberOfElem(master_exechost_list), lGetNumberOfElem(master_centry_list), lGetNumberOfElem(copy.acl_list), lGetNumberOfElem(copy.dept_list), lGetNumberOfElem(master_project_list), lGetNumberOfElem(master_ckpt_list), lGetNumberOfElem(master_user_list), lGetNumberOfElem(master_project_list), lGetNumberOfElem(master_rqs_list), lGetNumberOfElem(master_ar_list), lGetNumberOfNodes(NULL, master_sharetree_list, STN_children), lGetNumberOfLeafs(NULL, master_sharetree_list, STN_children) )); sge_rebuild_job_category(master_job_list, master_userset_list, master_project_list, master_rqs_list); PROF_STOP_MEASUREMENT(SGE_PROF_CUSTOM7); prof_init = prof_get_measurement_wallclock(SGE_PROF_CUSTOM7, true, NULL); PROF_START_MEASUREMENT(SGE_PROF_CUSTOM7); sge_before_dispatch(evc); /* prepare data for the scheduler itself */ copy.host_list = lCopyList("", master_exechost_list); /* * Within the scheduler we do only need QIs */ { lListElem *cqueue = NULL; lEnumeration *what_queue3 = NULL; for_each(cqueue, master_cqueue_list) { lList *qinstance_list = lGetList(cqueue, CQ_qinstances); lList *t; if (!qinstance_list) { continue; } /* all_queue_list contains all queue instances with state and full queue name only */ if (!what_queue3) { what_queue3 = lWhat("%T(%I%I)", lGetListDescr(qinstance_list), QU_full_name, QU_state); } t = lSelect("t", qinstance_list, NULL, what_queue3); if (t) { if (copy.all_queue_list == NULL) { copy.all_queue_list = lCreateList("all", lGetListDescr(t)); } lAppendList(copy.all_queue_list, t); lFreeList (&t); } t = lSelect("t", qinstance_list, where_what.where_queue, 
where_what.what_queue2); if (t) { if (copy.queue_list == NULL) { copy.queue_list = lCreateList("enabled", lGetListDescr(t)); } lAppendList(copy.queue_list, t); lFreeList (&t); } t = lSelect("t", qinstance_list, where_what.where_queue2, where_what.what_queue2); if (t) { if (copy.dis_queue_list == NULL) { copy.dis_queue_list = lCreateList("disabled", lGetListDescr(t)); } lAppendList(copy.dis_queue_list, t); lFreeList (&t); } } if (what_queue3) { lFreeWhat(&what_queue3); } } if (sconf_is_job_category_filtering()) { copy.job_list = sge_category_job_copy(copy.queue_list, &orders, evc->monitor_next_run); } else { copy.job_list = lCopyList("", master_job_list); } /* no need to copy these lists, they are read only used */ copy.centry_list = master_centry_list; copy.ckpt_list = master_ckpt_list; copy.hgrp_list = master_hgrp_list; /* these lists need to be copied because they are modified during scheduling run */ copy.share_tree = lCopyList("", master_sharetree_list); copy.pe_list = lCopyList("", master_pe_list); copy.user_list = lCopyList("", master_user_list); copy.project_list = lCopyList("", master_project_list); copy.rqs_list = lCopyList("", master_rqs_list); copy.ar_list = lCopyList("", master_ar_list); /* report number of reduced and raw (in brackets) lists */ DPRINTF(("Q:%d, AQ:%d J:%d(%d), H:%d(%d), C:%d, A:%d, D:%d, P:%d, CKPT:%d," " US:%d, PR:%d, RQS:%d, AR:%d, S:nd:%d/lf:%d \n", lGetNumberOfElem(copy.queue_list), lGetNumberOfElem(copy.all_queue_list), lGetNumberOfElem(copy.job_list), lGetNumberOfElem(master_job_list), lGetNumberOfElem(copy.host_list), lGetNumberOfElem(master_exechost_list), lGetNumberOfElem(copy.centry_list), lGetNumberOfElem(copy.acl_list), lGetNumberOfElem(copy.dept_list), lGetNumberOfElem(copy.pe_list), lGetNumberOfElem(copy.ckpt_list), lGetNumberOfElem(copy.user_list), lGetNumberOfElem(copy.project_list), lGetNumberOfElem(copy.rqs_list), lGetNumberOfElem(copy.ar_list), lGetNumberOfNodes(NULL, copy.share_tree, STN_children), 
lGetNumberOfLeafs(NULL, copy.share_tree, STN_children) )); if (getenv("SGE_ND")) { printf("Q:%d, AQ:%d J:%d(%d), H:%d(%d), C:%d, A:%d, D:%d, " "P:%d, CKPT:%d, US:%d, PR:%d, RQS:%d, AR:%d, S:nd:%d/lf:%d \n", lGetNumberOfElem(copy.queue_list), lGetNumberOfElem(copy.all_queue_list), lGetNumberOfElem(copy.job_list), lGetNumberOfElem(master_job_list), lGetNumberOfElem(copy.host_list), lGetNumberOfElem(master_exechost_list), lGetNumberOfElem(copy.centry_list), lGetNumberOfElem(copy.acl_list), lGetNumberOfElem(copy.dept_list), lGetNumberOfElem(copy.pe_list), lGetNumberOfElem(copy.ckpt_list), lGetNumberOfElem(copy.user_list), lGetNumberOfElem(copy.project_list), lGetNumberOfElem(copy.rqs_list), lGetNumberOfElem(copy.ar_list), lGetNumberOfNodes(NULL, copy.share_tree, STN_children), lGetNumberOfLeafs(NULL, copy.share_tree, STN_children) ); } else { schedd_log("-------------START-SCHEDULER-RUN-------------", NULL, evc->monitor_next_run); } PROF_STOP_MEASUREMENT(SGE_PROF_CUSTOM7); prof_copy = prof_get_measurement_wallclock(SGE_PROF_CUSTOM7, true, NULL); PROF_START_MEASUREMENT(SGE_PROF_CUSTOM7); scheduler_method(evc, &answer_list, ©, &orders); answer_list_output(&answer_list); PROF_STOP_MEASUREMENT(SGE_PROF_CUSTOM7); prof_run = prof_get_measurement_wallclock(SGE_PROF_CUSTOM7, true, NULL); PROF_START_MEASUREMENT(SGE_PROF_CUSTOM7); /* .. 
which gets deleted after using */ lFreeList(&(copy.host_list)); lFreeList(&(copy.queue_list)); lFreeList(&(copy.dis_queue_list)); lFreeList(&(copy.all_queue_list)); lFreeList(&(copy.job_list)); lFreeList(&(copy.acl_list)); lFreeList(&(copy.dept_list)); lFreeList(&(copy.pe_list)); lFreeList(&(copy.share_tree)); lFreeList(&(copy.user_list)); lFreeList(&(copy.project_list)); lFreeList(&(copy.rqs_list)); lFreeList(&(copy.ar_list)); PROF_STOP_MEASUREMENT(SGE_PROF_CUSTOM7); prof_free = prof_get_measurement_wallclock(SGE_PROF_CUSTOM7, true, NULL); /* * need to sync with event master thread * if schedd configuration changed then settings in evm can be adjusted */ if (sconf_is_new_config()) { /* set scheduler interval / event delivery interval */ u_long32 interval = sconf_get_schedule_interval(); if (evc->ec_get_edtime(evc) != interval) { evc->ec_set_edtime(evc, interval); } /* set job / ja_task event flushing */ set_job_flushing(evc); /* no need to ec_commit here - we do it when resetting the busy state */ /* now we handled the new schedd config - no need to do it twice */ sconf_reset_new_config(); } /* block till master handled all GDI orders */ sge_schedd_block_until_orders_processed(evc->get_gdi_ctx(evc), NULL); schedd_order_destroy(); /* * Stop profiling for "schedd run total" and the subcategories */ PROF_STOP_MEASUREMENT(SGE_PROF_CUSTOM6); prof_total = prof_get_measurement_wallclock(SGE_PROF_CUSTOM6, true, NULL); if (prof_is_active(SGE_PROF_CUSTOM6)) { PROFILING((SGE_EVENT, "PROF: schedd run took: %.3f s (init: %.3f s, copy: %.3f s, " "run:%.3f, free: %.3f s, jobs: %d, categories: %d/%d)", prof_total, prof_init, prof_copy, prof_run, prof_free, lGetNumberOfElem(*object_type_get_master_list(SGE_TYPE_JOB)), sge_category_count(), sge_cs_category_count() )); } if (getenv("SGE_ND") != NULL) { printf("--------------STOP-SCHEDULER-RUN-------------\n"); } else { schedd_log("--------------STOP-SCHEDULER-RUN-------------", NULL, evc->monitor_next_run); } 
thread_output_profiling("scheduler thread profiling summary:\n", &next_prof_output); sge_monitor_output(&monitor); } /* reset the busy state */ evc->ec_set_busy(evc, 0); evc->ec_commit(evc, NULL); /* stop logging into schedd_runlog (enabled via -tsm) */ evc->monitor_next_run = false; /* * pthread cancelation point * * sge_scheduler_cleanup_thread() is the last function which should * be called so it is pushed first */ pthread_cleanup_push(sge_scheduler_cleanup_thread, (void *) &ctx); pthread_cleanup_push((void (*)(void *))sge_scheduler_cleanup_monitor, (void *)&monitor); pthread_cleanup_push((void (*)(void *))sge_scheduler_cleanup_event_client, (void *)evc); cl_thread_func_testcancel(thread_config); pthread_cleanup_pop(execute); pthread_cleanup_pop(execute); pthread_cleanup_pop(execute); DPRINTF(("passed cancelation point\n")); }
/*----------------------------------------------------------------------------*/ int main(int argc, char **argv) { int heartbeat = 0; int last_heartbeat = 0; int latest_heartbeat = 0; int ret = 0; int delay = 0; time_t now, last; /* const char *cp; */ char err_str[MAX_STRING_SIZE]; char shadowd_pidfile[SGE_PATH_MAX]; dstring ds; char buffer[256]; pid_t shadowd_pid; #if 1 static int check_interval = CHECK_INTERVAL; static int get_active_interval = GET_ACTIVE_INTERVAL; static int delay_time = DELAY_TIME; static int sge_test_heartbeat = 0; char binpath[SGE_PATH_MAX]; char oldqmaster[SGE_PATH_MAX]; char shadow_err_file[SGE_PATH_MAX]; char qmaster_out_file[SGE_PATH_MAX]; #endif lList *alp = NULL; sge_gdi_ctx_class_t *ctx = NULL; DENTER_MAIN(TOP_LAYER, "sge_shadowd"); sge_dstring_init(&ds, buffer, sizeof(buffer)); /* initialize recovery control variables */ { char *s; int val; if ((s=getenv("SGE_CHECK_INTERVAL")) && sscanf(s, "%d", &val) == 1) check_interval = val; if ((s=getenv("SGE_GET_ACTIVE_INTERVAL")) && sscanf(s, "%d", &val) == 1) get_active_interval = val; if ((s=getenv("SGE_DELAY_TIME")) && sscanf(s, "%d", &val) == 1) delay_time = val; if ((s=getenv("SGE_TEST_HEARTBEAT_TIMEOUT")) && sscanf(s, "%d", &val) == 1) sge_test_heartbeat = val; } /* This needs a better solution */ umask(022); #ifdef __SGE_COMPILE_WITH_GETTEXT__ /* init language output for gettext() , it will use the right language */ sge_init_language_func((gettext_func_type) gettext, (setlocale_func_type) setlocale, (bindtextdomain_func_type) bindtextdomain, (textdomain_func_type) textdomain); sge_init_language(NULL,NULL); #endif /* __SGE_COMPILE_WITH_GETTEXT__ */ log_state_set_log_file(TMP_ERR_FILE_SHADOWD); if (sge_setup2(&ctx, SHADOWD, MAIN_THREAD, &alp, false) != AE_OK) { answer_list_output(&alp); SGE_EXIT((void**)&ctx, 1); } /* AA: TODO: change this */ ctx->set_exit_func(ctx, shadowd_exit_func); sge_setup_sig_handlers(SHADOWD); #if defined(SOLARIS) /* Init shared SMF libs if necessary */ if 
(sge_smf_used() == 1 && sge_smf_init_libs() != 0) { SGE_EXIT((void**)&ctx, 1); } #endif if (ctx->get_qmaster_spool_dir(ctx) != NULL) { char *shadowd_name = SGE_SHADOWD; /* is there a running shadowd on this host (with unqualified name) */ sprintf(shadowd_pidfile, "%s/"SHADOWD_PID_FILE, ctx->get_qmaster_spool_dir(ctx), ctx->get_unqualified_hostname(ctx)); DPRINTF(("pidfilename: %s\n", shadowd_pidfile)); if ((shadowd_pid = sge_readpid(shadowd_pidfile))) { DPRINTF(("shadowd_pid: "sge_U32CFormat"\n", sge_u32c(shadowd_pid))); if (!sge_checkprog(shadowd_pid, shadowd_name, PSCMD)) { CRITICAL((SGE_EVENT, MSG_SHADOWD_FOUNDRUNNINGSHADOWDWITHPIDXNOTSTARTING_I, (int) shadowd_pid)); SGE_EXIT((void**)&ctx, 1); } } ctx->prepare_enroll(ctx); /* is there a running shadowd on this host (with aliased name) */ sprintf(shadowd_pidfile, "%s/"SHADOWD_PID_FILE, ctx->get_qmaster_spool_dir(ctx), ctx->get_qualified_hostname(ctx)); DPRINTF(("pidfilename: %s\n", shadowd_pidfile)); if ((shadowd_pid = sge_readpid(shadowd_pidfile))) { DPRINTF(("shadowd_pid: "sge_U32CFormat"\n", sge_u32c(shadowd_pid))); if (!sge_checkprog(shadowd_pid, shadowd_name, PSCMD)) { CRITICAL((SGE_EVENT, MSG_SHADOWD_FOUNDRUNNINGSHADOWDWITHPIDXNOTSTARTING_I, (int) shadowd_pid)); SGE_EXIT((void**)&ctx, 1); } } } else { ctx->prepare_enroll(ctx); } if (parse_cmdline_shadowd(argc, argv) == 1) { SGE_EXIT((void**)&ctx, 0); } if (ctx->get_qmaster_spool_dir(ctx) == NULL) { CRITICAL((SGE_EVENT, MSG_SHADOWD_CANTREADQMASTERSPOOLDIRFROMX_S, ctx->get_bootstrap_file(ctx))); SGE_EXIT((void**)&ctx, 1); } if (chdir(ctx->get_qmaster_spool_dir(ctx))) { CRITICAL((SGE_EVENT, MSG_SHADOWD_CANTCHANGETOQMASTERSPOOLDIRX_S, ctx->get_qmaster_spool_dir(ctx))); SGE_EXIT((void**)&ctx, 1); } if (sge_set_admin_username(ctx->get_admin_user(ctx), err_str)) { CRITICAL((SGE_EVENT, SFNMAX, err_str)); SGE_EXIT((void**)&ctx, 1); } if (sge_switch2admin_user()) { CRITICAL((SGE_EVENT, SFNMAX, MSG_SHADOWD_CANTSWITCHTOADMIN_USER)); SGE_EXIT((void**)&ctx, 1); } 
sprintf(shadow_err_file, "messages_shadowd.%s", ctx->get_unqualified_hostname(ctx)); sprintf(qmaster_out_file, "messages_qmaster.%s", ctx->get_unqualified_hostname(ctx)); sge_copy_append(TMP_ERR_FILE_SHADOWD, shadow_err_file, SGE_MODE_APPEND); unlink(TMP_ERR_FILE_SHADOWD); log_state_set_log_as_admin_user(1); log_state_set_log_file(shadow_err_file); { int* tmp_fd_array = NULL; unsigned long tmp_fd_count = 0; if (cl_com_set_handle_fds(cl_com_get_handle(prognames[SHADOWD] ,0), &tmp_fd_array, &tmp_fd_count) == CL_RETVAL_OK) { sge_daemonize(tmp_fd_array, tmp_fd_count, ctx); if (tmp_fd_array != NULL) { sge_free(&tmp_fd_array); } } else { sge_daemonize(NULL, 0, ctx); } } /* shadowd pid file will contain aliased name */ sge_write_pid(shadowd_pidfile); starting_up(); sge_setup_sig_handlers(SHADOWD); last_heartbeat = get_qmaster_heartbeat(QMASTER_HEARTBEAT_FILE, 30); last = (time_t) sge_get_gmt(); /* set time of last check time */ delay = 0; while (!shut_me_down) { sleep(check_interval); /* get current heartbeat file content */ heartbeat = get_qmaster_heartbeat(QMASTER_HEARTBEAT_FILE, 30); now = (time_t) sge_get_gmt(); /* Only check when we could read the heartbeat file at least two times * (last_heartbeat and heartbeat) without error */ if (last_heartbeat > 0 && heartbeat > 0) { /* * OK we have to heartbeat entries to check. Check times ... * now = current time * last = last check time */ if ( (now - last) >= (get_active_interval + delay) ) { delay = 0; if (last_heartbeat == heartbeat) { DPRINTF(("heartbeat not changed since seconds: "sge_U32CFormat"\n", sge_u32c(now - last))); delay = delay_time; /* set delay time */ /* * check if we are a possible new qmaster host (lock file of qmaster active, etc.) 
*/ ret = check_if_valid_shadow(binpath, oldqmaster, ctx->get_act_qmaster_file(ctx), ctx->get_shadow_master_file(ctx), ctx->get_qualified_hostname(ctx), ctx->get_binary_path(ctx)); if (ret == 0) { /* we can start a qmaster on this host */ if (qmaster_lock(QMASTER_LOCK_FILE)) { ERROR((SGE_EVENT, SFNMAX, MSG_SHADOWD_FAILEDTOLOCKQMASTERSOMBODYWASFASTER)); } else { int out, err; /* still the old qmaster name in act_qmaster file and still the old heartbeat */ latest_heartbeat = get_qmaster_heartbeat( QMASTER_HEARTBEAT_FILE, 30); /* TODO: what do we when there is a timeout ??? */ DPRINTF(("old qmaster name in act_qmaster and old heartbeat\n")); if (!compare_qmaster_names(ctx->get_act_qmaster_file(ctx), oldqmaster) && !shadowd_is_old_master_enrolled(sge_test_heartbeat, sge_get_qmaster_port(NULL), oldqmaster) && (latest_heartbeat == heartbeat)) { char qmaster_name[256]; strcpy(qmaster_name, SGE_PREFIX); strcat(qmaster_name, prognames[QMASTER]); DPRINTF(("qmaster_name: "SFN"\n", qmaster_name)); /* * open logfile as admin user for initial qmaster/schedd * startup messages */ out = SGE_OPEN3(qmaster_out_file, O_CREAT|O_WRONLY|O_APPEND, 0644); err = out; if (out == -1) { /* * First priority is the master restart * => ignore this error */ out = 1; err = 2; } sge_switch2start_user(); ret = startprog(out, err, NULL, binpath, qmaster_name, NULL); sge_switch2admin_user(); if (ret) { ERROR((SGE_EVENT, SFNMAX, MSG_SHADOWD_CANTSTARTQMASTER)); } close(out); } else { qmaster_unlock(QMASTER_LOCK_FILE); } } } else { if (ret == -1) { /* just log the more important failures */ WARNING((SGE_EVENT, MSG_SHADOWD_DELAYINGSHADOWFUNCFORXSECONDS_U, sge_u32c(delay) )); } } } /* Begin a new interval, set timers and hearbeat to current values */ last = now; last_heartbeat = heartbeat; } } else { if (last_heartbeat < 0 || heartbeat < 0) { /* There was an error reading heartbeat or last_heartbeat */ DPRINTF(("can't read heartbeat file. 
last_heartbeat="sge_U32CFormat", heartbeat="sge_U32CFormat"\n", sge_u32c(last_heartbeat), sge_u32c(heartbeat))); } else { DPRINTF(("have to read the heartbeat file twice to check time differences\n")); } } } sge_shutdown((void**)&ctx, 0); DRETURN(EXIT_SUCCESS); }