/*----------------------------------------------------------------------
 * host_in_file
 * Look whether "host" is listed in "file".
 *
 * The file is read line by line; every token separated by blanks, tabs,
 * newlines or commas is resolved with cl_com_cached_gethostbyname() and
 * the resolved name is compared against "host" with sge_hostcmp().
 * Tokens that cannot be resolved are ignored.
 *
 * return
 *     0 if present
 *     1 if not
 *    -1 error occurred (file cannot be opened, read error, or the file
 *       could not be closed while no match had been found)
 *
 * NOTE: lines are read in chunks of sizeof(buf)-1 characters, so a
 *       token on a longer line may be split at the chunk boundary.
 *----------------------------------------------------------------------*/
static int host_in_file(const char *host, const char *file)
{
   FILE *fp;
   char buf[512], *cp;
   int result = 1;   /* "not present" until a matching entry is seen */

   DENTER(TOP_LAYER, "host_in_file");

   fp = fopen(file, "r");
   if (!fp) {
      DRETURN(-1);
   }

   while (result == 1 && fgets(buf, sizeof(buf), fp)) {
      for (cp = strtok(buf, " \t\n,"); cp; cp = strtok(NULL, " \t\n,")) {
         char *resolved_host = NULL;

         cl_com_cached_gethostbyname(cp, &resolved_host, NULL, NULL, NULL);
         if (resolved_host) {
            int differs = sge_hostcmp(host, resolved_host);

            /*
             * Release the resolved name before anything that may leave
             * the loop, so no exit path can leak it.
             */
            sge_free(&resolved_host);
            if (!differs) {
               result = 0;
               break;
            }
         }
      }
   }

   /* fgets() returns NULL on EOF and on error - tell them apart */
   if (result == 1 && ferror(fp)) {
      result = -1;
   }

   /* FCLOSE is expected to jump to FCLOSE_ERROR if fclose() fails */
   FCLOSE(fp);
   DRETURN(result);

FCLOSE_ERROR:
   /*
    * A match that was already established stays valid; everything else
    * is reported as an error instead of falsely claiming "present".
    */
   DRETURN(((result == 0) ? 0 : -1));
}
/****** qmaster/sge_qmaster_heartbeat/increment_heartbeat() *************************
*  NAME
*     increment_heartbeat() -- Event handler for heartbeat events
*
*  SYNOPSIS
*     void increment_heartbeat(sge_gdi_ctx_class_t *ctx, te_event_t anEvent,
*                              monitoring_t *monitor)
*
*  FUNCTION
*     Updates the qmaster heartbeat file through inc_qmaster_heartbeat().
*     If that call reports a non-zero result, or whenever the heartbeat
*     counter is a multiple of 20, the act_qmaster file is read and the host
*     name stored there is resolved and compared with the qualified host
*     name of this qmaster. If the names differ, a warning is logged and a
*     shutdown is requested via sge_qmaster_shutdown_via_signal_thread();
*     if that request fails, sge_shutdown() is called directly.
*
*  INPUTS
*     sge_gdi_ctx_class_t *ctx - context providing the act_qmaster file path
*                                and the qualified host name
*     te_event_t anEvent       - heartbeat event (not used in the body)
*     monitoring_t *monitor    - monitoring handle (not used in the body)
*
*  RESULT
*     void - none
*
*  NOTES
*     MT-NOTE: increment_hearbeat() is NOT MT safe. This function is only
*     MT-NOTE: invoked from within the event delivery thread.
*
*     We do assume that the system clock does NOT run backwards. However, we
*     do cope with a system clock which has been put back.
*
*******************************************************************************/
void increment_heartbeat(sge_gdi_ctx_class_t *ctx, te_event_t anEvent, monitoring_t *monitor)
{
   int inc_result = 0;
   int beat = 0;
   int must_check_file = 0;
   char file_host[CL_MAXHOSTLEN];
   char resolved_file_host[CL_MAXHOSTLEN];
   char err_str[SGE_PATH_MAX+128];
   const char *act_qmaster_file = ctx->get_act_qmaster_file(ctx);
   const char *qualified_hostname = ctx->get_qualified_hostname(ctx);

   DENTER(TOP_LAYER, "increment_heartbeat");

   inc_result = inc_qmaster_heartbeat(QMASTER_HEARTBEAT_FILE, 30, &beat);
   if (inc_result == 0) {
      DPRINTF(("(heartbeat) - incremented (or created) heartbeat file: %s(beat=%d)\n", QMASTER_HEARTBEAT_FILE, beat));
   } else {
      /* the heartbeat file could not be updated: verify we are still the active qmaster */
      DPRINTF(("(heartbeat) - inc_qmaster_heartbeat() returned %d !!! (beat=%d)\n", inc_result, beat));
      must_check_file = 1;
   }

   /* independent of the result above, look at the file on every 20th beat */
   if (beat % 20 == 0) {
      DPRINTF(("(heartbeat) - checking act_qmaster file this time\n"));
      must_check_file = 1;
   }

   if (must_check_file == 1) {
      err_str[0] = '\0';

      if (get_qm_name(file_host, act_qmaster_file, err_str) != 0) {
         /* the act_qmaster file could not be read */
         WARNING((SGE_EVENT, MSG_HEART_CANNOT_READ_FILE_S, err_str ));
      } else if (getuniquehostname(file_host, resolved_file_host, 0) == CL_RETVAL_OK &&
                 sge_hostcmp(resolved_file_host, qualified_hostname) != 0) {
         /* act_qmaster file has been changed */
         WARNING((SGE_EVENT, SFNMAX, MSG_HEART_ACT_QMASTER_FILE_CHANGED));
         if (sge_qmaster_shutdown_via_signal_thread(100) != 0) {
            ERROR((SGE_EVENT, SFNMAX, MSG_HEART_CANT_SIGNAL));
            /*
            ** TODO: here the ctx reference is not transported back
            ** event_handler functions should use &ctx instead
            */
            sge_shutdown((void**)&ctx, 1);
         }
      } else {
         DPRINTF(("(heartbeat) - act_qmaster file contains hostname "SFQ"\n", file_host));
      }
   }

   DEXIT;
   return;
} /* increment_heartbeat() */
/*----------------------------------------------------------------- * compare_qmaster_names * see if old qmaster name and current qmaster name are still the same *-----------------------------------------------------------------*/ static int compare_qmaster_names( const char *act_qmaster_file, const char *oldqmaster ) { char newqmaster[SGE_PATH_MAX]; int ret; DENTER(TOP_LAYER, "compare_qmaster_names"); if (get_qm_name(newqmaster, act_qmaster_file, NULL)) { WARNING((SGE_EVENT, MSG_SHADOWD_CANTREADACTQMASTERFILEX_S, act_qmaster_file)); DRETURN(-1); } ret = sge_hostcmp(newqmaster, oldqmaster); DPRINTF(("strcmp() of old and new qmaster returns: "sge_U32CFormat"\n", sge_u32c(ret))); DRETURN(ret); }
/*----------------------------------------------------------
 * MatchPattern
 *    Evaluate a pattern expression for one token.
 *
 *    Depending on token_p->type and on whether the pattern contains
 *    wildcards (token_p->has_patterns) the comparison is done with
 *       TYPE_STR, TYPE_RESTR : fnmatch()       / strcmp()
 *       TYPE_CSTR            : fnmatch()       / strcasecmp()
 *       TYPE_HOST            : sge_hostmatch() / sge_hostcmp()
 *
 * RETURNS
 *    -1 if "skip" is set or the token has no pattern
 *     0 if pattern and value match
 *     1 otherwise (including token types not listed above)
 *----------------------------------------------------------*/
static int MatchPattern(s_token *token_p, bool skip)
{
   int cmp;

   /* nothing to evaluate: caller asked to skip, or there is no pattern */
   if (skip == true || token_p->pattern == NULL) {
      return -1;
   }

   switch (token_p->type) {
      case TYPE_STR:
      case TYPE_RESTR:
         if (token_p->has_patterns) {
            cmp = fnmatch(token_p->pattern, token_p->value, 0);
         } else {
            /* optimized for non pattern stuff */
            cmp = strcmp(token_p->pattern, token_p->value);
         }
         break;
      case TYPE_CSTR:
         if (token_p->has_patterns) {
            cmp = fnmatch(token_p->pattern, token_p->value, 0);
         } else {
            cmp = strcasecmp(token_p->pattern, token_p->value);
         }
         break;
      case TYPE_HOST:
         if (token_p->has_patterns) {
            cmp = sge_hostmatch(token_p->pattern, token_p->value);
         } else {
            cmp = sge_hostcmp(token_p->pattern, token_p->value);
         }
         break;
      default:
         /* unsupported type never matches */
         cmp = -1;
         break;
   }

   if (cmp == 0) {
      return 0;
   }
   return 1;
}
/****** sge_c_report() ******************************************************* * NAME * sge_c_report() -- process execd load report * * SYNOPSIS * void sge_c_report(char *rhost, char *commproc, int id, lList *report_list) * * FUNCTION * * INPUTS * char *rhost * char *commproc * int id * lList *report_list * * RESULT * void - nothing * * NOTES * MT-NOTE: sge_c_report() is MT safe * ******************************************************************************/ void sge_c_report(sge_gdi_ctx_class_t *ctx, char *rhost, char *commproc, int id, lList *report_list, monitoring_t *monitor) { lListElem *hep = NULL; u_long32 rep_type; lListElem *report; int ret = 0; u_long32 this_seqno, last_seqno; u_long32 rversion; sge_pack_buffer pb; bool is_pb_used = false; bool send_tag_new_conf = false; DENTER(TOP_LAYER, "sge_c_report"); if (lGetNumberOfElem(report_list) == 0) { DPRINTF(("received empty report\n")); if (rhost != NULL) { WARNING((SGE_EVENT, MSG_QMASTER_RECEIVED_EMPTY_LOAD_REPORT_S, rhost)); } else { WARNING((SGE_EVENT, MSG_QMASTER_RECEIVED_EMPTY_LOAD_REPORT_S, "unknown")); } DRETURN_VOID; } /* accept reports only from execd's */ if (strcmp(prognames[EXECD], commproc)) { ERROR((SGE_EVENT, MSG_GOTSTATUSREPORTOFUNKNOWNCOMMPROC_S, commproc)); DRETURN_VOID; } /* do not process load reports from old execution daemons */ rversion = lGetUlong(lFirst(report_list), REP_version); if (verify_request_version(NULL, rversion, rhost, commproc, id)) { DRETURN_VOID; } this_seqno = lGetUlong(lFirst(report_list), REP_seqno); /* need exec host for all types of reports */ if (!(hep = host_list_locate(*object_type_get_master_list(SGE_TYPE_EXECHOST), rhost))) { ERROR((SGE_EVENT, MSG_GOTSTATUSREPORTOFUNKNOWNEXECHOST_S, rhost)); DRETURN_VOID; } /* prevent old reports being proceeded frequent loggings of outdated reports can be an indication of too high message traffic arriving at qmaster */ last_seqno = lGetUlong(hep, EH_report_seqno); if ((this_seqno < last_seqno && (last_seqno - this_seqno) <= 
9000) && !(last_seqno > 9990 && this_seqno < 10)) { /* this must be an old report, log and then ignore it */ INFO((SGE_EVENT, MSG_QMASTER_RECEIVED_OLD_LOAD_REPORT_UUS, sge_u32c(this_seqno), sge_u32c(last_seqno), rhost)); DRETURN_VOID; } lSetUlong(hep, EH_report_seqno, this_seqno); /* RU: */ /* tag all reschedule_unknown list entries we hope to hear about in that job report */ update_reschedule_unknown_list(ctx, hep); /* ** process the reports one after the other ** usually there will be a load report ** and a configuration version report */ for_each(report, report_list) { rep_type = lGetUlong(report, REP_type); switch (rep_type) { case NUM_REP_REPORT_LOAD: case NUM_REP_FULL_REPORT_LOAD: MONITOR_ELOAD(monitor); /* Now handle execds load reports */ if (lGetUlong(hep, EH_lt_heard_from) == 0 && rep_type != NUM_REP_FULL_REPORT_LOAD) { host_notify_about_full_load_report(ctx, hep); } else { if (!is_pb_used) { is_pb_used = true; init_packbuffer(&pb, 1024, 0); } sge_update_load_values(ctx, rhost, lGetList(report, REP_list)); if (mconf_get_simulate_execds()) { lList *master_exechost_list = *object_type_get_master_list(SGE_TYPE_EXECHOST); lListElem *shep; lListElem *simhostElem=NULL; for_each(shep, master_exechost_list) { simhostElem = lGetSubStr(shep, CE_name, "load_report_host", EH_consumable_config_list); if (simhostElem != NULL) { const char *real_host = lGetString(simhostElem, CE_stringval); if (real_host != NULL && sge_hostcmp(real_host, rhost) == 0) { const char* sim_host = lGetHost(shep, EH_name); lListElem *clp = NULL; DPRINTF(("Copy load values of %s to simulated host %s\n", rhost, sim_host)); for_each(clp, lGetList(report, REP_list)) { if (strcmp(lGetHost(clp, LR_host), SGE_GLOBAL_NAME) != 0) { lSetHost(clp, LR_host, sim_host); } } sge_update_load_values(ctx, sim_host, lGetList(report, REP_list)); } } } } pack_ack(&pb, ACK_LOAD_REPORT, this_seqno, 0, NULL); } break; case NUM_REP_REPORT_CONF: MONITOR_ECONF(monitor); if (sge_compare_configuration(hep, 
lGetList(report, REP_list)) != 0) { DPRINTF(("%s: configuration on host %s is not up to date\n", SGE_FUNC, rhost)); send_tag_new_conf = true; } break; case NUM_REP_REPORT_PROCESSORS: /* ** save number of processors */ MONITOR_EPROC(monitor); ret = update_license_data(ctx, hep, lGetList(report, REP_list)); if (ret) { ERROR((SGE_EVENT, MSG_LICENCE_ERRORXUPDATINGLICENSEDATA_I, ret)); } break; case NUM_REP_REPORT_JOB: MONITOR_EJOB(monitor); if (!is_pb_used) { is_pb_used = true; init_packbuffer(&pb, 1024, 0); } process_job_report(ctx, report, hep, rhost, commproc, &pb, monitor); break; default: DPRINTF(("received invalid report type %ld\n", (long) rep_type)); }
lList *cull_unparse_job_parameter( sge_gdi_ctx_class_t *ctx, lList **pcmdline, lListElem *job, int flags ) { const char *cp; u_long32 ul; lList *answer = NULL; char str[1024 + 1]; lList *lp; int ret; lListElem *ep_opt; const char *username = ctx->get_username(ctx); const char *qualified_hostname = ctx->get_qualified_hostname(ctx); DENTER(TOP_LAYER, "cull_unparse_job_parameter"); /* ** -a ** problem with submission time, but that is not a good ** default option anyway, is not unparsed */ /* ** -A */ if (sge_unparse_account_string(job, pcmdline, &answer) != 0) { DEXIT; return answer; } /* ** -c */ if (sge_unparse_checkpoint_option(job, pcmdline, &answer) != 0) { DEXIT; return answer; } /* * -ckpt */ if (sge_unparse_string_option(job, JB_checkpoint_name, "-ckpt", pcmdline, &answer) != 0) { DEXIT; return answer; } /* ** -cwd */ if (lGetString(job, JB_cwd)) { ep_opt = sge_add_noarg(pcmdline, cwd_OPT, "-cwd", NULL); } /* * -P */ if (sge_unparse_string_option(job, JB_project, "-P", pcmdline, &answer) != 0) { DEXIT; return answer; } #if 0 /* ** -C */ if (sge_unparse_string_option(job, JB_directive_prefix, "-C", pcmdline, &answer) != 0) { DEXIT; return answer; } #endif /* ** -e */ if (sge_unparse_path_list(job, JB_stderr_path_list, "-e", pcmdline, &answer) != 0) { DEXIT; return answer; } /* ** -h, here only user hold supported at the moment */ if ((ul = lGetUlong(lFirst(lGetList(job, JB_ja_tasks)), JAT_hold))) { ep_opt = sge_add_noarg(pcmdline, h_OPT, "-h", NULL); } /* ** -hold_jid */ if ((lp = lGetList(job, JB_jid_request_list))) { int fields[] = { JRE_job_name, 0 }; const char *delis[] = {NULL, ",", NULL}; ret = uni_print_list(NULL, str, sizeof(str) - 1, lp, fields, delis, 0); if (ret) { DPRINTF(("Error %d formatting jid_request_list as -hold_jid\n", ret)); sprintf(str, MSG_LIST_ERRORFORMATINGJIDPREDECESSORLISTASHOLDJID); answer_list_add(&answer, str, STATUS_ESYNTAX, ANSWER_QUALITY_ERROR); return answer; } ep_opt = sge_add_arg(pcmdline, hold_jid_OPT, lListT, "-hold_jid", 
str); lSetList(ep_opt, SPA_argval_lListT, lCopyList("hold_jid list", lp)); } /* ** -hold_jid_ad */ if ((lp = lGetList(job, JB_ja_ad_request_list))) { int fields[] = { JRE_job_name, 0 }; const char *delis[] = {NULL, ",", NULL}; ret = uni_print_list(NULL, str, sizeof(str) - 1, lp, fields, delis, 0); if (ret) { DPRINTF(("Error %d formatting ja_ad_request_list as -hold_jid_ad\n", ret)); sprintf(str, MSG_LIST_ERRORFORMATINGJIDPREDECESSORLISTASHOLDJIDAD); answer_list_add(&answer, str, STATUS_ESYNTAX, ANSWER_QUALITY_ERROR); return answer; } ep_opt = sge_add_arg(pcmdline, hold_jid_ad_OPT, lListT, "-hold_jid_ad", str); lSetList(ep_opt, SPA_argval_lListT, lCopyList("hold_jid_ad list", lp)); } /* ** -i */ if (sge_unparse_path_list(job, JB_stdin_path_list, "-i", pcmdline, &answer) != 0) { DEXIT; return answer; } /* ** -j */ if ((ul = lGetBool(job, JB_merge_stderr))) { ep_opt = sge_add_arg(pcmdline, j_OPT, lIntT, "-j", "y"); lSetInt(ep_opt, SPA_argval_lIntT, true); } /* ** -jid */ if ((lp = lGetList(job, JB_job_identifier_list))) { int fields[] = { JRE_job_number, 0}; const char *delis[] = {"", ",", NULL}; ret = uni_print_list(NULL, str, sizeof(str) - 1, lp, fields, delis, 0); if (ret) { DPRINTF(("Error %d formatting job_identifier_list as -jid\n", ret)); sprintf(str, MSG_LIST_ERRORFORMATINGJOBIDENTIFIERLISTASJID); answer_list_add(&answer, str, STATUS_ESYNTAX, ANSWER_QUALITY_ERROR); return answer; } ep_opt = sge_add_arg(pcmdline, jid_OPT, lListT, "-jid", str); lSetList(ep_opt, SPA_argval_lListT, lCopyList("jid list", lp)); } /* ** -js */ if ((ul = lGetUlong(job, JB_jobshare)) != 0) { sprintf(str, sge_u32, ul); ep_opt = sge_add_arg(pcmdline, js_OPT, lUlongT, "-js", str); lSetUlong(ep_opt, SPA_argval_lUlongT, ul); } /* ** -lj is in parsing but can't be unparsed here */ /* ** -l */ if (sge_unparse_resource_list(job, JB_hard_resource_list, pcmdline, &answer) != 0) { DEXIT; return answer; } if (sge_unparse_resource_list(job, JB_soft_resource_list, pcmdline, &answer) != 0) { DEXIT; 
return answer; } /* ** -m */ if ((ul = lGetUlong(job, JB_mail_options))) { cp = sge_unparse_mail_options(ul); if (!cp) { DPRINTF(("Error unparsing mail options\n")); sprintf(str, MSG_PARSE_ERRORUNPARSINGMAILOPTIONS); answer_list_add(&answer, str, STATUS_ESYNTAX, ANSWER_QUALITY_ERROR); return answer; } ep_opt = sge_add_arg(pcmdline, m_OPT, lIntT, "-m", cp); lSetInt(ep_opt, SPA_argval_lIntT, ul); } /* ** -M obviously a problem!!! ** not unparsed at the moment ** does it make sense as a default, after all? */ if ((lp = lGetList(job, JB_mail_list))) { lList *lp_new = NULL; lListElem *ep_new = NULL; lListElem *ep = NULL; const char *host; const char *user; /* ** or rather take all if there are more than one elements? */ for_each(ep, lp) { user = lGetString(ep, MR_user); host = lGetHost(ep, MR_host); if (sge_strnullcmp(user, username) || sge_hostcmp(host, qualified_hostname)) { lp_new = lCreateList("mail list", MR_Type); ep_new = lAddElemStr(&lp_new, MR_user, user, MR_Type); lSetHost(ep_new, MR_host, host); } }
/****** qmaster/sge_mod_configuration() ****************************************
*  NAME
*     sge_mod_configuration() -- modify cluster configuration
*
*  SYNOPSIS
*     int sge_mod_configuration(sge_gdi_ctx_class_t *ctx, lListElem *aConf,
*                               lList **anAnswer, char *aUser, char *aHost)
*
*  FUNCTION
*     Modify cluster configuration. 'aConf' is a pointer to a 'CONF_Type'
*     list element and does contain the modified configuration entry. If no
*     configuration exists yet for the resolved host name, the entry is
*     added instead; adding a new configuration entry is also viewed as a
*     modification.
*
*     If the change concerns the global configuration or the configuration
*     of the qmaster host itself, the qmaster re-merges its own
*     configuration, restarts the accounting flush event when the flush
*     time changed from zero to non-zero, and pushes the 'max_unheard' and
*     qmaster_params values to the commlib.
*
*  INPUTS
*     sge_gdi_ctx_class_t *ctx - context (cell root, qualified host name,
*                                program id)
*     lListElem *aConf         - CONF_Type element containing the modified conf
*     lList **anAnswer         - answer list
*     char *aUser              - target user
*     char *aHost              - target host
*
*  RESULT
*     int - STATUS_OK        success
*           STATUS_EUNKNOWN  NULL argument, missing CONF_name, host name not
*                            resolvable, or do_mod_config() failed
*           otherwise        the non-zero value returned by check_config()
*
*  NOTES
*     MT-NOTE: sge_mod_configuration() is MT safe
*
*******************************************************************************/
int sge_mod_configuration(sge_gdi_ctx_class_t *ctx, lListElem *aConf, lList **anAnswer, char *aUser, char *aHost)
{
   lListElem *existing_conf;
   const char *conf_host = NULL;
   char unique_name[CL_MAXHOSTLEN];
   int rc = -1;
   int is_global;
   const char *cell_root = ctx->get_cell_root(ctx);
   const char *qualified_hostname = ctx->get_qualified_hostname(ctx);
   u_long32 progid = ctx->get_who(ctx);

   DENTER(TOP_LAYER, "sge_mod_configuration");

   if (aConf == NULL || aUser == NULL || aHost == NULL) {
      CRITICAL((SGE_EVENT, MSG_SGETEXT_NULLPTRPASSED_S, SGE_FUNC));
      answer_list_add(anAnswer, SGE_EVENT, STATUS_EUNKNOWN, ANSWER_QUALITY_ERROR);
      DRETURN(STATUS_EUNKNOWN);
   }

   conf_host = lGetHost(aConf, CONF_name);
   if (conf_host == NULL) {
      CRITICAL((SGE_EVENT, MSG_SGETEXT_MISSINGCULLFIELD_SS, lNm2Str(CONF_name), SGE_FUNC));
      answer_list_add(anAnswer, SGE_EVENT, STATUS_EUNKNOWN, ANSWER_QUALITY_ERROR);
      DRETURN(STATUS_EUNKNOWN);
   }

   rc = sge_resolve_hostname(conf_host, unique_name, EH_name, sizeof(unique_name));
   if (rc != CL_RETVAL_OK) {
      DPRINTF(("%s: error %s resolving host %s\n", SGE_FUNC, cl_get_error_text(rc), conf_host));
      ERROR((SGE_EVENT, MSG_SGETEXT_CANTRESOLVEHOST_S, conf_host));
      answer_list_add(anAnswer, SGE_EVENT, STATUS_EUNKNOWN, ANSWER_QUALITY_ERROR);
      DRETURN(STATUS_EUNKNOWN);
   }

   rc = check_config(anAnswer, aConf);
   if (rc != 0) {
      DRETURN(rc);
   }

   existing_conf = sge_get_configuration_for_host(unique_name);
   if (existing_conf == NULL) {
      /* no configuration for this host yet: add it */
      do_add_config(ctx, unique_name, aConf, anAnswer);

      INFO((SGE_EVENT, MSG_SGETEXT_ADDEDTOLIST_SSSS, aUser, aHost, unique_name, MSG_OBJ_CONF));
      answer_list_add(anAnswer, SGE_EVENT, STATUS_OK, ANSWER_QUALITY_INFO);
   } else {
      rc = do_mod_config(ctx, unique_name, existing_conf, aConf, anAnswer);
      lFreeElem(&existing_conf);

      if (rc != 0) {
         DRETURN(STATUS_EUNKNOWN);
      }

      INFO((SGE_EVENT, MSG_SGETEXT_MODIFIEDINLIST_SSSS, aUser, aHost, unique_name, MSG_OBJ_CONF));
      answer_list_add(anAnswer, SGE_EVENT, STATUS_OK, ANSWER_QUALITY_INFO);
   }

   is_global = (strcmp(SGE_GLOBAL_NAME, unique_name) == 0);

   if (is_global) {
      sge_add_event(0, sgeE_GLOBAL_CONFIG, 0, 0, NULL, NULL, NULL, NULL);
   }

   /*
   ** is the configuration change relevant for the qmaster itsself?
   ** if so, initialise conf struct anew
   */
   if (is_global || sge_hostcmp(unique_name, qualified_hostname) == 0) {
      lListElem *host_conf = NULL;
      lListElem *global_conf = NULL;
      lList *merge_answers = NULL;
      char *qmaster_params = NULL;
      /* remember the flush time as it was before the merge */
      int flush_time_before = mconf_get_accounting_flush_time();

      host_conf = sge_get_configuration_for_host(qualified_hostname);
      if (host_conf == NULL) {
         WARNING((SGE_EVENT, MSG_CONF_NOLOCAL_S, qualified_hostname));
      }

      global_conf = sge_get_configuration_for_host(SGE_GLOBAL_NAME);
      if (global_conf == NULL) {
         ERROR((SGE_EVENT, SFNMAX, MSG_CONF_NOGLOBAL));
      }

      if (merge_configuration(&merge_answers, progid, cell_root, global_conf, host_conf, NULL) != 0) {
         ERROR((SGE_EVENT, MSG_CONF_CANTMERGECONFIGURATIONFORHOST_S, qualified_hostname));
      }
      answer_list_output(&merge_answers);

      /* Restart the accounting flush event if needed. */
      if (flush_time_before == 0 && mconf_get_accounting_flush_time() != 0) {
         te_event_t ev = te_new_event(time(NULL), TYPE_ACCOUNTING_TRIGGER, ONE_TIME_EVENT, 1, 0, NULL);

         te_add_event(ev);
         te_free_event(&ev);
      }

      lFreeElem(&host_conf);
      lFreeElem(&global_conf);

      sge_show_conf();

      /* 'max_unheard' may have changed */
      cl_commlib_set_connection_param(cl_com_get_handle("qmaster", 1), HEARD_FROM_TIMEOUT, mconf_get_max_unheard());

      /* fetching qmaster_params and begin to parse */
      qmaster_params = mconf_get_qmaster_params();

      /* updating the commlib paramterlist and gdi_timeout with new or changed parameters */
      cl_com_update_parameter_list(qmaster_params);
      sge_free(&qmaster_params);
   }

   /* invalidate configuration cache */
   mconf_set_new_config(true);

   DRETURN(STATUS_OK);
}