/****** uti/host/sge_gethostbyaddr() **************************************** * NAME * sge_gethostbyaddr() -- gethostbyaddr() wrapper * * SYNOPSIS * struct hostent *sge_gethostbyaddr(const struct in_addr *addr, int* system_error_retval) * * FUNCTION * Wraps gethostbyaddr() function calls, measures time spent * in gethostbyaddr() and logs when very much time has passed. * On error, return error code in *system_error_retval if that is non-null. * * return value must be released by function caller (don't forget the * char** array lists inside of struct hostent) * * If possible (libcomm linked) use cl_com_cached_gethostbyaddr() * from libcomm. This will return an sge aliased hostname. * * * NOTES * MT-NOTE: sge_gethostbyaddr() is MT safe * MT-NOTE: sge_gethostbyaddr() uses a mutex to guard access to the * MT-NOTE: gethostbyaddr() system call on all platforms other than Solaris, * MT-NOTE: Linux, and HP-UX. Therefore, except on the aforementioned * MT-NOTE: platforms, MT calls to gethostbyaddr() must go through * MT-NOTE: sge_gethostbyaddr() to be MT safe. *******************************************************************************/ struct hostent *sge_gethostbyaddr(const struct in_addr *addr, int* system_error_retval) { struct hostent *he = NULL; time_t now; time_t time; int l_errno; DENTER(TOP_LAYER, "sge_gethostbyaddr"); /* This method goes to great lengths to slip a reentrant gethostbyaddr into * the code without making changes to the rest of the source base. That * basically means that we have to make some redundant copies to * return to the caller. This method doesn't appear to be highly utilized, * so that's probably ok. If it's not ok, the interface can be changed * later. 
*/ gethostbyaddr_calls++; /* profiling */ now = (time_t)sge_get_gmt(); #ifdef GETHOSTBYADDR_R8 #define SGE_GETHOSTBYADDR_FOUND /* This is for Linux */ DPRINTF (("Getting host by addr - Linux\n")); { struct hostent re; char buffer[4096]; /* No need to malloc he because it will end up pointing to re. */ gethostbyaddr_r ((const char *)addr, 4, AF_INET, &re, buffer, 4096, &he, &l_errno); /* Since re contains pointers into buffer, and both re and the buffer go * away when we exit this code block, we make a deep copy to return. */ /* Yes, I do mean to check if he is NULL and then copy re! No, he * doesn't need to be freed first. */ if (he != NULL) { he = sge_copy_hostent (&re); } } #endif #ifdef GETHOSTBYADDR_R7 #define SGE_GETHOSTBYADDR_FOUND /* This is for Solaris */ DPRINTF(("Getting host by addr - Solaris\n")); { char buffer[4096]; struct hostent *help_he = NULL; he = (struct hostent *)malloc(sizeof(struct hostent)); if (he != NULL) { memset(he, 0, sizeof(struct hostent)); /* On Solaris, this function returns the pointer to my struct on success * and NULL on failure. */ help_he = gethostbyaddr_r((const char *)addr, 4, AF_INET, he, buffer, 4096, &l_errno); /* Since he contains pointers into buffer, and buffer goes away when we * exit this code block, we make a deep copy to return. */ if (help_he != NULL) { struct hostent *new_he = sge_copy_hostent(help_he); sge_free(&he); he = new_he; } else { sge_free(&he); } } } #endif #ifdef GETHOSTBYADDR_R5 #define SGE_GETHOSTBYADDR_FOUND /* This is for HPUX < 11 */ DPRINTF(("Getting host by addr - 3 arg\n")); { struct hostent_data he_data; memset(&he_data, 0, sizeof(he_data)); he = (struct hostent *)malloc (sizeof (struct hostent)); if (he != NULL) { memset(he, 0, sizeof(struct hostent)); if (gethostbyaddr_r ((const char *)addr, 4, AF_INET, he, &he_data) < 0) { /* If this function fails, free he so that we can test if it's NULL * later in the code. */ sge_free(&he); } /* The location of the error code is actually undefined. 
I'm just * assuming that it's in h_errno since that's where it is in the unsafe * version. * h_errno is, of course, not thread safe, but if there's an error we're * already screwed, so we won't worry to much about it. * An alternative would be to set errno to HOST_NOT_FOUND. */ l_errno = h_errno; /* Since he contains pointers into he_data, and he_data goes away when we * exit this code block, we make a deep copy to return. */ if (he != NULL) { struct hostent *new_he = sge_copy_hostent (he); sge_free(&he); he = new_he; } } } #endif #ifdef GETHOSTBYADDR #define SGE_GETHOSTBYADDR_FOUND /* This is for HPUX >= 11 */ DPRINTF(("Getting host by addr - Thread safe\n")); he = gethostbyaddr((const char *)addr, 4, AF_INET); /* * JG: TODO: shouldn't it be * he = gethostbyaddr((const char *)addr, sizeof(struct in_addr), AF_INET); */ /* The location of the error code is actually undefined. I'm just * assuming that it's in h_errno since that's where it is in the unsafe * version. * h_errno is, of course, not thread safe, but if there's an error we're * already screwed, so we won't worry too much about it. * An alternative would be to set errno to HOST_NOT_FOUND. */ l_errno = h_errno; if (he != NULL) { struct hostent *new_he = sge_copy_hostent(he); /* do not free he, there was no malloc() */ he = new_he; } #endif #ifdef GETHOSTBYADDR_M #define SGE_GETHOSTBYADDR_FOUND /* This is for everyone else. */ DPRINTF (("Getting host by addr - Mutex guarded\n")); sge_mutex_lock("hostbyaddr", SGE_FUNC, __LINE__, &hostbyaddr_mutex); /* JG: TODO: shouldn't it always be sizeof(struct in_addr)? */ he = gethostbyaddr((const char *)addr, 4, AF_INET); l_errno = h_errno; if (he != NULL) { struct hostent *new_he = sge_copy_hostent(he); /* do not free he, there was no malloc() */ he = new_he; } sge_mutex_unlock("hostbyaddr", SGE_FUNC, __LINE__, &hostbyaddr_mutex); #endif #ifndef SGE_GETHOSTBYADDR_FOUND #error "no sge_gethostbyaddr() definition for this architecture." 
#endif time = (time_t)sge_get_gmt() - now; gethostbyaddr_sec += time; /* profiling */ /* warn about blocking gethostbyaddr() calls */ if (time > MAX_RESOLVER_BLOCKING) { WARNING((SGE_EVENT, "gethostbyaddr() took %d seconds and returns %s", (int)time, he?"success": (l_errno == HOST_NOT_FOUND)?"HOST_NOT_FOUND": (l_errno == TRY_AGAIN)?"TRY_AGAIN": (l_errno == NO_RECOVERY)?"NO_RECOVERY": (l_errno == NO_DATA)?"NO_DATA": (l_errno == NO_ADDRESS)?"NO_ADDRESS":"<unknown error>")); } if (system_error_retval != NULL) { *system_error_retval = l_errno; } DEXIT; return he; }
/*
 * Entry point of the qsub client.
 *
 * Flow, as implemented below:
 *   1. Initialize JAPI and read the program/user/host settings from ctx
 *      (ctx is not declared in this function).
 *   2. Collect switches from the defaults files, the command line and, unless
 *      "-b" marks the job as binary, the script file; merge them.
 *   3. Strip all "-sync" switches (remembering whether waiting was requested)
 *      and build the job from the merged option list.
 *   4. If the job is immediate or "-sync y" was given, start the signal
 *      handling thread and enable job waiting in JAPI.
 *   5. Submit the job: array jobs through japi_run_bulk_jobs(), a single task
 *      through japi_run_job().
 *   6. Print the submit message and, if requested, wait for the job start
 *      (immediate jobs) and/or for the end of every task ("-sync y").
 *   7. Clean up under the Error label and leave via exit().
 *
 * Process exit status: 0 on success; the value returned by
 * answer_list_print_err_warn() when option handling reports a problem;
 * STATUS_NOTOK_DOAGAIN when submission fails with DRMAA_ERRNO_TRY_LATER;
 * 1 for the other failures handled here; with "-sync y" the first non-zero
 * value returned by report_exit_status().
 */
int main(int argc, char **argv)
{
   lList *opts_cmdline = NULL;
   lList *opts_defaults = NULL;
   lList *opts_scriptfile = NULL;
   lList *opts_all = NULL;
   lListElem *job = NULL;
   lList *alp = NULL;
   lListElem *ep;
   int exit_status = 0;
   int just_verify;
   int tmp_ret;
   int wait_for_job = 0, is_immediate = 0;
   dstring session_key_out = DSTRING_INIT;
   dstring diag = DSTRING_INIT;
   dstring jobid = DSTRING_INIT;
   u_long32 start, end, step;
   u_long32 num_tasks;
   int count, stat;
   char *jobid_string = NULL;
   bool has_terse = false;
   drmaa_attr_values_t *jobids = NULL;

   u_long32 prog_number = 0;
   u_long32 myuid = 0;
   const char *sge_root = NULL;
   const char *cell_root = NULL;
   const char *username = NULL;
   const char *qualified_hostname = NULL;
   const char *unqualified_hostname = NULL;
   const char *mastername = NULL;

   DENTER_MAIN(TOP_LAYER, "qsub");

   prof_mt_init();

   /* Set up the program information name */
   sge_setup_sig_handlers(QSUB);

   DPRINTF(("Initializing JAPI\n"));

   /* Without a JAPI session nothing else can be done: report and exit. */
   if (japi_init(NULL, NULL, NULL, QSUB, false, NULL, &diag) != DRMAA_ERRNO_SUCCESS) {
      fprintf(stderr, "\n");
      fprintf(stderr, MSG_QSUB_COULDNOTINITIALIZEENV_S, sge_dstring_get_string(&diag));
      fprintf(stderr, "\n");
      DEXIT;
      SGE_EXIT((void**)&ctx, 1);
   }

   /* Cache the settings that are needed further down. */
   prog_number = ctx->get_who(ctx);
   myuid = ctx->get_uid(ctx);
   sge_root = ctx->get_sge_root(ctx);
   cell_root = ctx->get_cell_root(ctx);
   username = ctx->get_username(ctx);
   qualified_hostname = ctx->get_qualified_hostname(ctx);
   unqualified_hostname = ctx->get_unqualified_hostname(ctx);
   mastername = ctx->get_master(ctx, false);

   /*
    * read switches from the various defaults files
    */
   opt_list_append_opts_from_default_files(prog_number, cell_root, username, &opts_defaults, &alp, environ);
   tmp_ret = answer_list_print_err_warn(&alp, NULL, NULL, MSG_WARNING);
   if (tmp_ret > 0) {
      DEXIT;
      SGE_EXIT((void**)&ctx, tmp_ret);
   }

   /*
    * append the commandline switches to the list
    * (argv[0], the program name, is skipped)
    */
   opt_list_append_opts_from_qsub_cmdline(prog_number, &opts_cmdline, &alp, argv + 1, environ);
   tmp_ret = answer_list_print_err_warn(&alp, NULL, "qsub: ", MSG_QSUB_WARNING_S);
   if (tmp_ret > 0) {
      DEXIT;
      SGE_EXIT((void**)&ctx, tmp_ret);
   }

   /*
    * show usage if -help was in commandline
    */
   if (opt_list_has_X(opts_cmdline, "-help")) {
      sge_usage(QSUB, stdout);
      DEXIT;
      SGE_EXIT((void**)&ctx, 0);
   }

   /*
    * We will only read commandline options from script file if the script
    * itself should not be handled as binary.
    * A "-b" on the command line takes precedence; the value from the defaults
    * files is only consulted when the command line has no "-b" at all.
    */
   if (opt_list_is_X_true(opts_cmdline, "-b") || (!opt_list_has_X(opts_cmdline, "-b") && opt_list_is_X_true(opts_defaults, "-b"))) {
      DPRINTF(("Skipping options from script due to -b option\n"));
   } else {
      opt_list_append_opts_from_script(prog_number, &opts_scriptfile, &alp, opts_cmdline, environ);
      tmp_ret = answer_list_print_err_warn(&alp, NULL, MSG_QSUB_COULDNOTREADSCRIPT_S, MSG_WARNING);
      if (tmp_ret > 0) {
         DEXIT;
         SGE_EXIT((void**)&ctx, tmp_ret);
      }
   }

   /*
    * Merge all commandline options and interpret them
    */
   opt_list_merge_command_lines(&opts_all, &opts_defaults, &opts_scriptfile, &opts_cmdline);

   /*
    * Check if -terse is requested
    */
   if (opt_list_has_X(opts_all, "-terse")) {
      has_terse = true;
   }

   /* If "-sync y" is set, wait for the job to end. */
   /* Remove all -sync switches since cull_parse_job_parameter()
    * doesn't know what to do with them. */
   while ((ep = lGetElemStr(opts_all, SPA_switch, "-sync"))) {
      if (lGetInt(ep, SPA_argval_lIntT) == TRUE) {
         wait_for_job = 1;
      }

      lRemoveElem(opts_all, &ep);
   }

   if (wait_for_job) {
      DPRINTF(("Wait for job end\n"));
   }

   /* Build the job object from the merged switches. */
   alp = cull_parse_job_parameter(myuid, username, cell_root, unqualified_hostname, qualified_hostname, opts_all, &job);

   tmp_ret = answer_list_print_err_warn(&alp, NULL, "qsub: ", MSG_WARNING);
   if (tmp_ret > 0) {
      DEXIT;
      SGE_EXIT((void**)&ctx, tmp_ret);
   }

   if (set_sec_cred(sge_root, mastername, job, &alp) != 0) {
      answer_list_output(&alp);
      DEXIT;
      SGE_EXIT((void**)&ctx, 1);
   }

   /* Check if job is immediate */
   is_immediate = (int)JOB_TYPE_IS_IMMEDIATE(lGetUlong(job, JB_type));
   DPRINTF(("Job is%s immediate\n", is_immediate ? "" : " not"));

   DPRINTF(("Everything ok\n"));

   /* JB_verify set: only show the job and exit, nothing is submitted. */
   if (lGetUlong(job, JB_verify)) {
      cull_show_job(job, 0, false);
      DEXIT;
      SGE_EXIT((void**)&ctx, 0);
   }

   /* Waiting for the job needs the signal thread and JAPI job waiting. */
   if (is_immediate || wait_for_job) {
      pthread_t sigt;

      qsub_setup_sig_handlers();

      if (pthread_create(&sigt, NULL, sig_thread, (void *)NULL) != 0) {
         fprintf(stderr, "\n");
         fprintf(stderr, MSG_QSUB_COULDNOTINITIALIZEENV_S, " error preparing signal handling thread");
         fprintf(stderr, "\n");

         exit_status = 1;
         goto Error;
      }

      /* Only a communication failure is treated as fatal here. */
      if (japi_enable_job_wait(username, unqualified_hostname, NULL, &session_key_out, error_handler, &diag) == DRMAA_ERRNO_DRM_COMMUNICATION_FAILURE) {
         const char *msg = sge_dstring_get_string(&diag);
         fprintf(stderr, "\n");
         fprintf(stderr, MSG_QSUB_COULDNOTINITIALIZEENV_S, msg?msg:" error starting event client thread");
         fprintf(stderr, "\n");

         exit_status = 1;
         goto Error;
      }
   }

   job_get_submit_task_ids(job, &start, &end, &step);
   /* NOTE(review): divides by step; assumes job_get_submit_task_ids() never
    * yields step == 0 - confirm. */
   num_tasks = (end - start) / step + 1;

   if (JOB_TYPE_IS_ARRAY(lGetUlong(job, JB_type))) {
      int error = japi_run_bulk_jobs(&jobids, &job, start, end, step, false, &diag);
      if (error != DRMAA_ERRNO_SUCCESS) {
         /* No active session here means that japi_enable_job_wait() was
          * interrupted by the signal handler, in which case we just break out
          * quietly. */
         if (error != DRMAA_ERRNO_NO_ACTIVE_SESSION) {
            fprintf(stderr, MSG_QSUB_COULDNOTRUNJOB_S, sge_dstring_get_string(&diag));
            fprintf(stderr, "\n");
         }

         /* BUGFIX: Issuezilla #1013
          * To quickly fix this issue, I'm mapping the JAPI/DRMAA error code
          * back into a GDI error code.  This is the easy solution.  The
          * correct solution would be to address issue #859, presumably by
          * having JAPI reuse the GDI error codes instead of the DRMAA error
          * codes. */
         if (error == DRMAA_ERRNO_TRY_LATER) {
            exit_status = STATUS_NOTOK_DOAGAIN;
         } else {
            exit_status = 1;
         }
         goto Error;
      }

      DPRINTF(("job id is: %ld\n", (long) jobids->it.ji.jobid));

      jobid_string = get_bulk_jobid_string((long)jobids->it.ji.jobid, start, end, step);
   } else if (num_tasks == 1) {
      int error = japi_run_job(&jobid, &job, false, &diag);

      if (error != DRMAA_ERRNO_SUCCESS) {
         /* Same quiet handling of "no active session" as for array jobs. */
         if (error != DRMAA_ERRNO_NO_ACTIVE_SESSION) {
            fprintf(stderr, MSG_QSUB_COULDNOTRUNJOB_S, sge_dstring_get_string(&diag));
            fprintf(stderr, "\n");
         }

         /* BUGFIX: Issuezilla #1013
          * To quickly fix this issue, I'm mapping the JAPI/DRMAA error code
          * back into a GDI error code.  This is the easy solution.  The
          * correct solution would be to address issue #859, presumably by
          * having JAPI reuse the GDI error codes instead of the DRMAA error
          * codes. */
         if (error == DRMAA_ERRNO_TRY_LATER) {
            exit_status = STATUS_NOTOK_DOAGAIN;
         } else {
            exit_status = 1;
         }
         goto Error;
      }

      /* Keep a private copy of the id; the dstring is released right away
       * and reused further down as output buffer of japi_wait(). */
      jobid_string = strdup(sge_dstring_get_string(&jobid));
      DPRINTF(("job id is: %s\n", jobid_string));

      sge_dstring_free(&jobid);
   } else {
      /* Not an array job, but more than one task was computed. */
      fprintf(stderr, MSG_QSUB_COULDNOTRUNJOB_S, "invalid task structure");
      fprintf(stderr, "\n");

      exit_status = 1;
      goto Error;
   }

   /* only success message is printed to stdout */

   just_verify = (lGetUlong(job, JB_verify_suitable_queues)==JUST_VERIFY || lGetUlong(job, JB_verify_suitable_queues)==POKE_VERIFY);
   /* NOTE(review): this trace is emitted unconditionally, i.e. also when
    * just_verify is false. */
   DPRINTF(("Just verifying job\n"));

   if (!just_verify) {
      const char *output = sge_dstring_get_string(&diag);

      /* print the tersed output */
      if (has_terse) {
         printf("%s", jobid_string);
      } else if (output != NULL) {
         printf("%s", output);
      } else {
         printf(MSG_QSUB_YOURJOBHASBEENSUBMITTED_SS, jobid_string, lGetString(job, JB_job_name));
      }
      printf("\n");
   } else {
      printf("%s\n", MSG_JOB_VERIFYFOUNDQ);
   }

   if ((wait_for_job || is_immediate) && !just_verify) {
      int event;

      if (is_immediate) {
         fprintf(stderr, "%s\n", MSG_QSUB_WAITINGFORIMMEDIATEJOBTOBESCHEDULED);

         /* We only need to wait for the first task to be scheduled to be able
          * to say that the job is running. */
         tmp_ret = japi_wait(DRMAA_JOB_IDS_SESSION_ANY, &jobid, &stat, DRMAA_TIMEOUT_WAIT_FOREVER, JAPI_JOB_START, &event, NULL, &diag);

         if ((tmp_ret == DRMAA_ERRNO_SUCCESS) && (event == JAPI_JOB_START)) {
            fprintf(stderr, "\n");
            fprintf(stderr, MSG_QSUB_YOURIMMEDIATEJOBXHASBEENSUCCESSFULLYSCHEDULED_S, jobid_string);
            fprintf(stderr, "\n");
         }
         /* A job finish event here means that the job was rejected. */
         else if ((tmp_ret == DRMAA_ERRNO_SUCCESS) && (event == JAPI_JOB_FINISH)) {
            fprintf(stderr, "\n%s\n", MSG_QSUB_YOURQSUBREQUESTCOULDNOTBESCHEDULEDDTRYLATER);

            exit_status = 1;
            goto Error;
         } else {
            /* Since we told japi_wait to wait forever, we know that if it gets
             * a timeout, it's because it's been interrupted to exit, in which
             * case we don't complain.  Same for no active session. */
            if ((tmp_ret != DRMAA_ERRNO_EXIT_TIMEOUT) && (tmp_ret != DRMAA_ERRNO_NO_ACTIVE_SESSION)) {
               fprintf(stderr, "\n");
               fprintf(stderr, MSG_QSUB_COULDNOTWAITFORJOB_S, sge_dstring_get_string(&diag));
               fprintf(stderr, "\n");
            }

            exit_status = 1;
            goto Error;
         }
      }

      if (wait_for_job) {
         /* Rather than using japi_synchronize on ALL for bulk jobs, we use
          * japi_wait on ANY num_tasks times because with synchronize, we would
          * have to wait for all the tasks to finish before we know if any
          * finished. */
         for (count = 0; count < num_tasks; count++) {
            /* Since there's only one running job in the session, we can just
             * wait for ANY. */
            if ((tmp_ret = japi_wait(DRMAA_JOB_IDS_SESSION_ANY, &jobid, &stat, DRMAA_TIMEOUT_WAIT_FOREVER, JAPI_JOB_FINISH, &event, NULL, &diag)) != DRMAA_ERRNO_SUCCESS) {
               /* Timeout and "no active session" are not reported: same
                * reasoning as for the immediate case above. */
               if ((tmp_ret != DRMAA_ERRNO_EXIT_TIMEOUT) && (tmp_ret != DRMAA_ERRNO_NO_ACTIVE_SESSION)) {
                  fprintf(stderr, "\n");
                  fprintf(stderr, MSG_QSUB_COULDNOTWAITFORJOB_S, sge_dstring_get_string(&diag));
                  fprintf(stderr, "\n");
               }

               exit_status = 1;
               goto Error;
            }

            /* report how job finished */
            /* If the job is an array job, use the first non-zero exit code as
             * the exit code for qsub. */
            if (exit_status == 0) {
               exit_status = report_exit_status(stat, sge_dstring_get_string(&jobid));
            }
            /* If we've already found a non-zero exit code, just print the exit
             * info for the task. */
            else {
               report_exit_status(stat, sge_dstring_get_string(&jobid));
            }
         }
      }
   }

Error:
   /* Common cleanup; also reached on the success path by falling through. */
   sge_free(&jobid_string);
   lFreeList(&alp);
   lFreeList(&opts_all);

   if ((tmp_ret = japi_exit(JAPI_EXIT_NO_FLAG, &diag)) != DRMAA_ERRNO_SUCCESS) {
      if (tmp_ret != DRMAA_ERRNO_NO_ACTIVE_SESSION) {
         fprintf(stderr, "\n");
         fprintf(stderr, MSG_QSUB_COULDNOTFINALIZEENV_S, sge_dstring_get_string(&diag));
         fprintf(stderr, "\n");
      } else {
         struct timespec ts;
         /* We know that if we get a DRMAA_ERRNO_NO_ACTIVE_SESSION here, it's
          * because the signal handler thread called japi_exit().  We know this
          * because if the call to japi_init() fails, we just exit directly.
          * If the call to japi_init() succeeds, then we have an active session,
          * so coming here because of an error would not result in the
          * DRMAA_ERRNO_NO_ACTIVE_SESSION error. */
         DPRINTF(("Sleeping for 15 seconds to wait for the exit to finish.\n"));

         sge_relative_timespec(15, &ts);

         /* Wait until the "exited" flag is set, but give up once the timed
          * wait reports ETIMEDOUT. */
         sge_mutex_lock("qsub_exit_mutex", SGE_FUNC, __LINE__, &exit_mutex);

         while (!exited) {
            if (pthread_cond_timedwait(&exit_cv, &exit_mutex, &ts) == ETIMEDOUT) {
               DPRINTF(("Exit has not finished after 15 seconds.  Exiting.\n"));
               break;
            }
         }

         sge_mutex_unlock("qsub_exit_mutex", SGE_FUNC, __LINE__, &exit_mutex);
      }
   }

   sge_prof_cleanup();

   /* This is an exit() instead of an SGE_EXIT() because when the qmaster is
    * suspended, SGE_EXIT() hangs. */
   exit(exit_status);
   /* Not reached: exit() does not return. */
   DEXIT;
   return exit_status;
}
/****** uti/log/log_context_destroy() ****************************************
*  NAME
*     log_context_destroy() -- Free thread local storage
*
*  SYNOPSIS
*     static void log_context_destroy(void* theContext)
*
*  FUNCTION
*     Free thread local storage.
*
*  INPUTS
*     void* theContext - Pointer to memory which should be freed.
*
*  RESULT
*     static void - none
*
*  NOTES
*     MT-NOTE: log_context_destroy() is MT safe.
*
*******************************************************************************/
static void log_context_destroy(void* theContext)
{
   /*
    * sge_free() is called with the address of the pointer variable, exactly
    * like every other sge_free() call in this file (e.g. sge_free(&he)).
    * Passing the memory block itself would make sge_free() interpret the
    * first bytes of the block as the pointer to release.
    * NOTE(review): assumes the sge_free(void *address_of_pointer) form that
    * the other callers rely on - confirm against the prototype.
    */
   sge_free(&theContext);
}
/****** shepherd_binding/do_core_binding() ************************************* * NAME * do_core_binding() -- Performs the core binding task for the Linux OS. * * SYNOPSIS * int do_core_binding(void) * * FUNCTION * Performs core binding on shepherd side. All information required for * the binding is communicated from execd to shepherd in the config * file value "binding". If there is "NULL" no core binding is done. * * This function is Linux specific. * * If there is any instruction the bookkeeping for these cores is already * done. In case of Solaris the processor set is already created by * execution daemon. Hence shepherd has just to add itself to it. * In case of Linux the whole binding is done by shepherd. In each case * the binding is inherited from shepherd to the job it starts. * * DG TODO change return value to bool * * RESULT * int - Returns 0 in case of success and a negative value in case of problems. * * NOTES * MT-NOTE: do_core_binding() is not MT safe * *******************************************************************************/ int do_core_binding(void) { /* Check if "binding" parameter in 'config' file * is available and not set to "binding=no_job_binding". * If so, we do an early abortion. 
*/ char *binding = get_conf_val("binding"); binding_type_t type; if (binding == NULL || strcasecmp("NULL", binding) == 0) { shepherd_trace("do_core_binding: \"binding\" parameter not found in config file"); return -1; } if (strcasecmp("no_job_binding", binding) == 0) { shepherd_trace("do_core_binding: skip binding - no core binding configured"); return -1; } /* get the binding type (set = 0 | env = 1 | pe = 2) where default is 0 */ type = binding_parse_type(binding); /* do a binding accorting the strategy */ if (strstr(binding, "linear") != NULL) { /* do a linear binding */ int amount; int socket; int core; shepherd_trace("do_core_binding: do linear"); /* get the amount of cores to bind on */ if ((amount = binding_linear_parse_amount(binding)) < 0) { shepherd_trace("do_core_binding: couldn't parse the amount of cores from config file"); return -1; } /* get the socket to begin binding with (choosen by execution daemon) */ if ((socket = binding_linear_parse_socket_offset(binding)) < 0) { shepherd_trace("do_core_binding: couldn't get the socket number from config file"); return -1; } /* get the core to begin binding with (choosen by execution daemon) */ if ((core = binding_linear_parse_core_offset(binding)) < 0) { shepherd_trace("do_core_binding: couldn't get the core number from config file"); return -1; } /* perform core binding on current process */ if (binding_set_linear_linux(socket, core, amount, 1, type) == false) { /* core binding was not successful */ if (type == BINDING_TYPE_SET) { shepherd_trace("do_core_binding: linear binding was not successful"); } else if (type == BINDING_TYPE_ENV) { shepherd_trace("do_core_binding: couldn't set SGE_BINDING environment variable"); } else if (type == BINDING_TYPE_PE) { shepherd_trace("do_core_binding: couldn't produce rankfile"); } } else { if (type == BINDING_TYPE_SET) { shepherd_trace("do_core_binding: job successfully bound"); } else if (type == BINDING_TYPE_ENV) { shepherd_trace("do_core_binding: SGE_BINDING 
environment variable created"); } else if (type == BINDING_TYPE_PE) { shepherd_trace("do_core_binding: rankefile produced"); } } } else if (strstr(binding, "striding") != NULL) { int amount = binding_striding_parse_amount(binding); int stepsize = binding_striding_parse_step_size(binding); /* these are the real start parameters */ int first_socket = 0, first_core = 0; shepherd_trace("do_core_binding: striding"); if (amount <= 0) { shepherd_trace("do_core_binding: error parsing <amount>"); return -1; } if (stepsize < 0) { shepherd_trace("do_core_binding: error parsing <stepsize>"); return -1; } first_socket = binding_striding_parse_first_socket(binding); if (first_socket < 0) { shepherd_trace("do_core_binding: error parsing <socket>"); return -1; } first_core = binding_striding_parse_first_core(binding); if (first_core < 0) { shepherd_trace("do_core_binding: error parsing <core>"); return -1; } /* last core has to be incremented because core 0 is first core to be used */ if (stepsize == 0) { /* stepsize must be >= 1 */ stepsize = 1; } shepherd_trace("do_core_binding: striding set binding: first_core: %d first_socket %d amount %d stepsize %d", first_core, first_socket, amount, stepsize); /* get the first core and first socket which is available for striding */ /* perform core binding on current process */ if (binding_set_striding_linux(first_socket, first_core, amount, 0, stepsize, type)) { shepherd_trace("do_core_binding: striding: binding done"); } else { shepherd_trace("do_core_binding: striding: binding not done"); } } else if (strstr(binding, "explicit") != NULL) { /* list with the sockets (first part of the <socket>,<core> tuples) */ int* sockets = NULL; /* length of sockets list */ int nr_of_sockets = 0; /* list with the cores to be bound on the sockets */ int* cores = NULL; /* length of cores list */ int nr_of_cores = 0; shepherd_trace("do_core_binding: explicit"); /* get <socket>,<core> pairs out of binding string */ if 
(binding_explicit_extract_sockets_cores(binding, &sockets, &nr_of_sockets, &cores, &nr_of_cores) == true) { if (nr_of_sockets == 0 && nr_of_cores == 0) { /* no cores and no sockets are found */ shepherd_trace("do_core_binding: explicit: no socket or no core was specified"); } else if (nr_of_sockets != nr_of_cores) { shepherd_trace("do_core_binding: explicit: unequal amount of specified sockets and cores"); } else { /* do core binding according the <socket>,<core> tuples */ if (binding_explicit(sockets, nr_of_sockets, cores, nr_of_cores, type) == true) { shepherd_trace("do_core_binding: explicit: binding done"); } else { shepherd_trace("do_core_binding: explicit: no core binding done"); } } sge_free(&sockets); sge_free(&cores); } else { sge_free(&sockets); sge_free(&cores); shepherd_trace("do_core_binding: explicit: couldn't extract <socket>,<core> pair"); } } else { if (binding != NULL) { shepherd_trace("do_core_binding: WARNING: unknown \"binding\" parameter: %s", binding); } else { shepherd_trace("do_core_binding: WARNING: binding was null!"); } } shepherd_trace("do_core_binding: finishing"); return 0; }
/****** shepherd_binding/binding_set_striding_linux() ************************************* * NAME * binding_set_striding_linux() -- Binds current process to cores. * * SYNOPSIS * bool binding_set_striding_linux(int first_socket, int first_core, int * amount_of_cores, int offset, int stepsize) * * FUNCTION * Performs a core binding for the calling process according to the * 'striding' strategy. The first core used is specified by first_socket * (beginning with 0) and first_core (beginning with 0). If first_core is * greater than available cores on first_socket, the next socket is examined * and first_core is reduced by the skipped cores. If the first_core could * not be found on system (because it was to high) no binding will be done. * * If the first core was choosen the next one is defined by the step size 'n' * which is incremented to the first core found. If the socket has not the * core (because it was the last core of the socket for example) the next * socket is examined. * * If the system is out of cores and there are still some cores to select * (because of the amount_of_cores parameter) no core binding will be performed. * * INPUTS * int first_socket - first socket to begin with * int first_core - first core to start with * int amount_of_cores - total amount of cores to be used * int offset - core offset for first core (increments first core used) * int stepsize - step size * int type - type of binding (set or env or pe) * * RESULT * bool - Returns true if the binding was performed, otherwise false. 
* * NOTES * MT-NOTE: binding_set_striding() is MT safe * *******************************************************************************/ bool binding_set_striding_linux(int first_socket, int first_core, int amount_of_cores, int offset, int stepsize, const binding_type_t type) { /* n := take every n-th core */ bool bound = false; dstring error = DSTRING_INIT; if (_has_core_binding(&error) == true) { sge_dstring_free(&error); /* bitmask for processors to turn on and off */ plpa_cpu_set_t cpuset; /* turn off all processors */ PLPA_CPU_ZERO(&cpuset); /* when library offers architecture: - get virtual processor ids in the following manner: * on socket "first_socket" choose core number "first_core + offset" * then add n: if core is not available go to next socket * ... */ if (_has_topology_information()) { /* amount of cores set in processor binding mask */ int cores_set = 0; /* next socket to use */ int next_socket = first_socket; /* next core to use */ int next_core = first_core + offset; /* all the processor ids selected for the mask */ int* proc_id = NULL; int proc_id_size = 0; /* maximal amount of sockets on this system */ int max_amount_of_sockets = get_amount_of_plpa_sockets(); /* check if we are already out of range */ if (next_socket >= max_amount_of_sockets) { shepherd_trace("binding_set_striding_linux: already out of sockets"); return false; } while (get_amount_of_plpa_cores(next_socket) <= next_core) { /* move on to next socket - could be that we have to deal only with cores instead of <socket><core> tuples */ next_core -= get_amount_of_plpa_cores(next_socket); next_socket++; if (next_socket >= max_amount_of_sockets) { /* we are out of sockets - we do nothing */ shepherd_trace("binding_set_striding_linux: first core: out of sockets"); return false; } } add_proc_ids_linux(next_socket, next_core, &proc_id, &proc_id_size); /* turn on processor id in mask */ /* collect the rest of the processor ids */ for (cores_set = 1; cores_set < amount_of_cores; cores_set++) 
{ /* calculate next_core number */ next_core += stepsize; /* check if we are already out of range */ if (next_socket >= max_amount_of_sockets) { shepherd_trace("binding_set_striding_linux: out of sockets"); sge_free(&proc_id); return false; } while (get_amount_of_plpa_cores(next_socket) <= next_core) { /* move on to next socket - could be that we have to deal only with cores instead of <socket><core> tuples */ next_core -= get_amount_of_plpa_cores(next_socket); next_socket++; if (next_socket >= max_amount_of_sockets) { /* we are out of sockets - we do nothing */ shepherd_trace("binding_set_striding_linux: out of sockets!"); sge_free(&proc_id); return false; } } /* add processor ids for core */ add_proc_ids_linux(next_socket, next_core, &proc_id, &proc_id_size); } /* collecting processor ids */ /* set the mask for all processor ids */ set_processor_binding_mask(&cpuset, proc_id, proc_id_size); if (type == BINDING_TYPE_PE) { /* rankfile is created: do nothing */ } else if (type == BINDING_TYPE_ENV) { /* set the environment variable */ if (create_binding_env_linux(proc_id, proc_id_size) == true) { shepherd_trace("binding_set_striding_linux: SGE_BINDING env var created"); } else { shepherd_trace("binding_set_striding_linux: problems while creating SGE_BINDING env"); } } else { /* bind process to mask */ if (bind_process_to_mask((pid_t) 0, cpuset) == true) { /* there was an error while binding */ bound = true; } } sge_free(&proc_id); } else { /* setting bitmask without topology information which could not be right? */ shepherd_trace("binding_set_striding_linux: bitmask without topology information"); return false; } } else { /* has no core binding feature */ sge_dstring_free(&error); return false; } return bound; }
/*
 * check_config() -- validate the entries of a configuration object
 *
 * Walks the CONF_entries list of 'conf' and checks the value of every
 * attribute this function knows about. On the first invalid entry an
 * answer is added to (or logged from) 'alpp' and STATUS_EEXIST is returned.
 *
 *    alpp - answer list receiving the error description
 *    conf - configuration element (CONF_name, CONF_entries)
 */
static int check_config(lList **alpp, lListElem *conf)
{
   lListElem *ep;
   const char *name, *value;
   const char *conf_name;

   DENTER(TOP_LAYER, "check_config");

   conf_name = lGetHost(conf, CONF_name);

   for_each(ep, lGetList(conf, CONF_entries)) {
      name = lGetString(ep, CF_name);
      value = lGetString(ep, CF_value);

      if (name == NULL) {
         ERROR((SGE_EVENT, MSG_CONF_NAMEISNULLINCONFIGURATIONLISTOFX_S, conf_name));
         answer_list_add(alpp, SGE_EVENT, STATUS_EEXIST, ANSWER_QUALITY_ERROR);
         DRETURN(STATUS_EEXIST);
      }
      if (value == NULL) {
         ERROR((SGE_EVENT, MSG_CONF_VALUEISNULLFORATTRXINCONFIGURATIONLISTOFY_SS,
                name, conf_name));
         answer_list_add(alpp, SGE_EVENT, STATUS_EEXIST, ANSWER_QUALITY_ERROR);
         DRETURN(STATUS_EEXIST);
      }

      if (!strcmp(name, "loglevel")) {
         u_long32 tmp_uval;
         if (sge_parse_loglevel_val(&tmp_uval, value) != 1) {
            ERROR((SGE_EVENT, MSG_CONF_GOTINVALIDVALUEXFORLOGLEVEL_S, value));
            answer_list_add(alpp, SGE_EVENT, STATUS_EEXIST, ANSWER_QUALITY_ERROR);
            DRETURN(STATUS_EEXIST);
         }
      } else if (strcmp(name, "jsv_url") == 0) {
         if (strcasecmp("none", value) != 0) {
            dstring input = DSTRING_INIT;
            dstring type = DSTRING_INIT;
            dstring user = DSTRING_INIT;
            dstring path = DSTRING_INIT;
            bool lret = true;

            /* only the parse result matters, the parts are discarded */
            sge_dstring_append(&input, value);
            lret = jsv_url_parse(&input, alpp, &type, &user, &path, false);
            sge_dstring_free(&input);
            sge_dstring_free(&type);
            sge_dstring_free(&user);
            sge_dstring_free(&path);
            if (!lret) {
               /* answer is written by jsv_url_parse */
               DRETURN(STATUS_EEXIST);
            }
         }
      } else if (!strcmp(name, "shell_start_mode")) {
         if ((strcasecmp("unix_behavior", value) != 0) &&
             (strcasecmp("posix_compliant", value) != 0) &&
             (strcasecmp("script_from_stdin", value) != 0) ) {
            ERROR((SGE_EVENT, MSG_CONF_GOTINVALIDVALUEXFORSHELLSTARTMODE_S, value));
            answer_list_add(alpp, SGE_EVENT, STATUS_EEXIST, ANSWER_QUALITY_ERROR);
            DRETURN(STATUS_EEXIST);
         }
      } else if (!strcmp(name, "shell")) {
         /*
          * The path to verify is the configured value; the attribute name
          * is only passed for the message (same as the other path checks
          * below).
          */
         if (!path_verify(value, alpp, "shell", true)) {
            ERROR((SGE_EVENT, MSG_CONF_GOTINVALIDVALUEXFORSHELL_S, value));
            answer_list_add(alpp, SGE_EVENT, STATUS_EEXIST, ANSWER_QUALITY_ERROR);
            DRETURN(STATUS_EEXIST);
         }
      } else if (!strcmp(name, "load_report_time")) {
         /* do not allow infinity entry for load_report_time */
         if (strcasecmp(value, "infinity") == 0) {
            ERROR((SGE_EVENT, MSG_CONF_INFNOTALLOWEDFORATTRXINCONFLISTOFY_SS,
                   name, conf_name));
            answer_list_add(alpp, SGE_EVENT, STATUS_EEXIST, ANSWER_QUALITY_ERROR);
            DRETURN(STATUS_EEXIST);
         }
      } else if (!strcmp(name, "max_unheard")) {
         /* do not allow infinity entry */
         if (strcasecmp(value,"infinity") == 0) {
            ERROR((SGE_EVENT, MSG_CONF_INFNOTALLOWEDFORATTRXINCONFLISTOFY_SS,
                   name, conf_name));
            answer_list_add(alpp, SGE_EVENT, STATUS_EEXIST, ANSWER_QUALITY_ERROR);
            DRETURN(STATUS_EEXIST);
         }
      } else if (!strcmp(name, "admin_user")) {
         struct passwd pw_struct;
         char *buffer;
         int size;

         size = get_pw_buffer_size();
         buffer = sge_malloc(size);
         /* anything but "none" has to be resolvable as a user */
         if (strcasecmp(value, "none") && !sge_getpwnam_r(value, &pw_struct, buffer, size)) {
            ERROR((SGE_EVENT, MSG_CONF_GOTINVALIDVALUEXASADMINUSER_S, value));
            answer_list_add(alpp, SGE_EVENT, STATUS_EEXIST, ANSWER_QUALITY_ERROR);
            sge_free(&buffer);
            DRETURN(STATUS_EEXIST);
         }
         sge_free(&buffer);
      } else if (!strcmp(name, "user_lists")||!strcmp(name, "xuser_lists")) {
         lList *tmp = NULL;
         int ok;

         /* parse just for .. */
         if (lString2ListNone(value, &tmp, US_Type, US_name, " \t,")) {
            ERROR((SGE_EVENT, MSG_CONF_FORMATERRORFORXINYCONFIG_SS, name, conf_name));
            answer_list_add(alpp, SGE_EVENT, STATUS_EEXIST, ANSWER_QUALITY_ERROR);
            DRETURN(STATUS_EEXIST);
         }

         /* .. checking userset names */
         ok = (userset_list_validate_acl_list(tmp, alpp) == STATUS_OK);
         lFreeList(&tmp);
         if (!ok) {
            DRETURN(STATUS_EEXIST);
         }
      } else if (!strcmp(name, "projects") || !strcmp(name, "xprojects")) {
         lList *tmp = NULL;
         int ok=1;

         /* parse just for .. */
         if (lString2ListNone(value, &tmp, PR_Type, PR_name, " \t,")) {
            ERROR((SGE_EVENT, MSG_CONF_FORMATERRORFORXINYCONFIG_SS, name, conf_name));
            answer_list_add(alpp, SGE_EVENT, STATUS_EEXIST, ANSWER_QUALITY_ERROR);
            DRETURN(STATUS_EEXIST);
         }

         /* .. checking project names */
         ok = (verify_project_list(alpp, tmp, *object_type_get_master_list(SGE_TYPE_PROJECT),
                    name, "configuration", conf_name)==STATUS_OK);
         lFreeList(&tmp);
         if (!ok) {
            DRETURN(STATUS_EEXIST);
         }
      } else if (!strcmp(name, "prolog") ||
                 !strcmp(name, "epilog") ||
                 !strcmp(name, "mailer")) {
         if (strcasecmp(value, "none")) {
            const char *t, *script = value;

            /* skip user name */
            if ((t = strpbrk(script, "@ ")) && *t == '@')
               script = &t[1];

            /* force use of absolute paths if string <> none */
            if (script[0] != '/' ) {
               ERROR((SGE_EVENT, MSG_CONF_THEPATHGIVENFORXMUSTSTARTWITHANY_S, name));
               answer_list_add(alpp, SGE_EVENT, STATUS_EUNKNOWN, ANSWER_QUALITY_ERROR);
               DRETURN(STATUS_EEXIST);
            }

            /* ensure that variables are valid */
            if (replace_params(script, NULL, 0, prolog_epilog_variables)) {
               ERROR((SGE_EVENT, MSG_CONF_PARAMETERXINCONFIGURATION_SS, name, err_msg));
               answer_list_add(alpp, SGE_EVENT, STATUS_EEXIST, ANSWER_QUALITY_ERROR);
               DRETURN(STATUS_EEXIST);
            }
         }
      } else if (!strcmp(name, "auto_user_oticket") ||
                 !strcmp(name, "auto_user_fshare")) {
         u_long32 uval = 0;
         if (!extended_parse_ulong_val(NULL, &uval, TYPE_INT, value, NULL, 0, 0, true)) {
            ERROR((SGE_EVENT, MSG_CONF_FORMATERRORFORXINYCONFIG_SS, name, value ? value : "(NULL)"));
            answer_list_add(alpp, SGE_EVENT, STATUS_EEXIST, ANSWER_QUALITY_ERROR);
            DRETURN(STATUS_EEXIST);
         }
      }

      /*
       * check paths, see also CR 6506580.
       * The following must be none or a valid absolute path:
       * - load_sensor
       * - set_token_cmd
       * - pag_cmd
       * - shepherd_cmd
       *
       * The following must be a valid absolute path:
       * - mailer
       * - xterm
       * - *_daemon, may also be "builtin"
       */
      else if (strcmp(name, "set_token_cmd") == 0 ||
          strcmp(name, "pag_cmd") == 0 ||
          strcmp(name, "shepherd_cmd") == 0) {
         if (strcasecmp(value, "none") != 0) {
            if (!path_verify(value, alpp, name, true)) {
               answer_list_log(alpp, false, false);
               DRETURN(STATUS_EEXIST);
            }
         }
      } else if (strcmp(name, "mailer") == 0 ||
          strcmp(name, "xterm") == 0) {
         /*
          * NOTE(review): "mailer" is already consumed by the
          * prolog/epilog/mailer branch above, so only "xterm" can
          * reach this check.
          */
         if (!path_verify(value, alpp, name, true)) {
            answer_list_log(alpp, false, false);
            DRETURN(STATUS_EEXIST);
         }
      } else if (strcmp(name, "qlogin_daemon") == 0 ||
          strcmp(name, "rlogin_daemon") == 0 ||
          strcmp(name, "rsh_daemon") == 0) {
         if (strcasecmp(value, "builtin") != 0) {
            if (!path_verify(value, alpp, name, true)) {
               answer_list_log(alpp, false, false);
               DRETURN(STATUS_EEXIST);
            }
         }
      }

      /* load_sensor is a comma separated list of scripts */
      else if (strcmp(name, "load_sensor") == 0 && strcasecmp(value, "none") != 0) {
         struct saved_vars_s *context = NULL;
         const char *path = sge_strtok_r(value, ",", &context);

         /*
          * A value made up of separators only yields no token at all;
          * never hand a NULL path to path_verify().
          */
         while (path != NULL) {
            if (!path_verify(path, alpp, name, true)) {
               answer_list_log(alpp, false, false);
               sge_free_saved_vars(context);
               DRETURN(STATUS_EEXIST);
            }
            path = sge_strtok_r(NULL, ",", &context);
         }
         sge_free_saved_vars(context);
      }
   }
/****** cull/db/lJoinSublist() ************************************************ * NAME * lJoinSublist() -- Join a list with one of its sublists * * SYNOPSIS * lList* lJoinSublist(const char *name, * int nm0, * const lList *lp, * const lCondition *cp0, * const lEnumeration *enp0, * const lDescr *sldp, * const lCondition *cp1, * const lEnumeration *enp1) * * FUNCTION * Joins a list and one of its sublists together. The other * parameters are equal to them from lJoin(). In the enumeration * 'enp0' the sublist field neither may be selected nor 'enp0' * may be NULL. * * INPUTS * const char *name - new list name * int nm0 - * const lList *lp - list * const lCondition *cp0 - selects rows within 'lp' * const lEnumeration *enp0 - selects columns within 'lp' * const lDescr *sldp - sublist descriptor pointer * const lCondition *cp1 - selects rows within 'sldp' * const lEnumeration *enp1 - selects columns within 'enp1' * * RESULT * lList* - Joined list ******************************************************************************/ lList *lJoinSublist(const char *name, int nm0, const lList *lp, const lCondition *cp0, const lEnumeration *enp0, const lDescr *sldp, const lCondition *cp1, const lEnumeration *enp1) { lList *dlp, *tlp, *joinedlist, *sublist; lListElem *ep; lDescr *dp; const lDescr *tdp; int i, pos; DENTER(CULL_LAYER, "lJoinSublist"); /* check different pointers */ if (!name || !lp || !enp0 || !sldp || !enp1) { LERROR(LENULLARGS); DEXIT; return NULL; } /* make sure that nm0 is a sublist field of lp */ if (!(tdp = lGetListDescr(lp))) { LERROR(LEDESCRNULL); DEXIT; return NULL; } if ((pos = lGetPosInDescr(tdp, nm0)) < 0) { LERROR(LENAMENOT); DEXIT; return NULL; } if (mt_get_type(tdp[pos].mt) != lListT) { LERROR(LEINCTYPE); DEXIT; return NULL; } /* is nm0 enumerated in enp0 ? 
*/ if (enp0[0].pos == WHAT_ALL) { LERROR(LEFALSEFIELD); DEXIT; return NULL; } for (i = 0; enp0[i].nm != NoName; i++) if (enp0[i].nm == nm0) { LERROR(LEFALSEFIELD); DEXIT; return NULL; } /* create destination list */ if (!(dp = lJoinDescr(lGetListDescr(lp), sldp, enp0, enp1))) { LERROR(LEJOINDESCR); DEXIT; return NULL; } if (!(dlp = lCreateList(name, dp))) { sge_free(&dp); LERROR(LECREATELIST); DEXIT; return NULL; } /* free dp it has been copied in lCreateList */ sge_free(&dp); /* create a temporary list to be used by lJoin */ if (!(tlp = lCreateList("lJoinSublist: tlp", lGetListDescr(lp)))) { lFreeList(&dlp); LERROR(LECREATELIST); DEXIT; return NULL; } for_each_where(ep, lp, cp0) { /* is there a sublist for the join */ if ((sublist = lGetList(ep, nm0)) != NULL) { /* put each element in the tlp to be used by lJoin */ if (lAppendElem(tlp, lCopyElem(ep)) == -1) { lFreeList(&tlp); lFreeList(&dlp); LERROR(LEAPPENDELEM); DEXIT; return NULL; } /* join the tlp with one element together with its sublist */ joinedlist = lJoin("lJoinSublist: joinedlist", nm0, tlp, NULL, enp0, NoName, sublist, cp1, enp1); if (!joinedlist) { lFreeList(&tlp); lFreeList(&dlp); LERROR(LEJOIN); DEXIT; return NULL; } /* joinedlist is freed in lAddList */ if (joinedlist && lAddList(dlp, &joinedlist) == -1) { LERROR(LEADDLIST); lFreeList(&tlp); lFreeList(&dlp); DEXIT; return NULL; } /* dechain the only element from tlp and free it (copy) */ lRemoveElem(tlp, &(tlp->first)); } } /* temporary list has to be freed */ lFreeList(&tlp); /* RETURN AN EMPTY LIST OR NULL THAT'S THE QUESTION */ if (lGetNumberOfElem(dlp) == 0) { lFreeList(&dlp); } DEXIT; return dlp; }
/*
 * loadvalue_update_load() -- sample thread states and update the load average
 *
 * Registers the "Thread State" and "ID Process" counter sets with 'query',
 * collects one sample, counts the processes that have at least one thread
 * in state 1 (ready) or 2 (running) and feeds that count, minus the idle
 * thread and the load sensor itself, into get_current_load().
 *
 *    loadvalue     - receives the updated load_avg
 *    query         - PDH query; initialized on first successful use
 *    counter_state - counter set for "Thread State"
 *    counter_pid   - counter set for "ID Process"
 *
 * Returns 0 on success, otherwise
 *    1..5   a step of the counter setup failed
 *    10     updating the query failed
 *    11     out of memory
 *    12/13  reading a counter value failed
 */
int
loadvalue_update_load(t_loadvalues *loadvalue, t_pdhquery *query,
                      t_pdhcounterset *counter_state,
                      t_pdhcounterset *counter_pid)
{
   static BOOL initialized = FALSE;
   int ret = 0;
   DWORD local_ret = 0;

   DENTER("loadvalue_update_load");

   if (!initialized) {
      local_ret = pdhquery_initialize(query);
      /* only remember a successful initialization, so that a failure is
       * retried by the next call instead of being silently ignored */
      if (local_ret == 0) {
         initialized = TRUE;
      }
   }

   /* set up both counter sets; each step has its own error code */
   if (local_ret != 0) {
      ret = 1;
   } else if ((local_ret = pdhcounterset_initialize(counter_state,
                              "Thread", "*", "Thread State")) != 0) {
      ret = 2;
   } else if ((local_ret = pdhquery_add_counterset(query, counter_state)) != 0) {
      ret = 3;
   } else if ((local_ret = pdhcounterset_initialize(counter_pid,
                              "Thread", "*", "ID Process")) != 0) {
      ret = 4;
   } else if ((local_ret = pdhquery_add_counterset(query, counter_pid)) != 0) {
      ret = 5;
   }

   if (ret != 0) {
      DEXIT;
      return ret;
   }

   /* We are here - no error occured during initialisation */
   local_ret = pdhquery_update(query);
   if (local_ret == 0) {
      DWORD state[8];
      DWORD size;
      BOOL *is_done;

      /*
       * State    Description
       * -------- -----------------------------------------------------------
       * 0        Initialized
       * 1        Ready
       *          Waiting for a processor
       * 2        Running
       *          Currently uses a processor
       * 3        Standby
       *          Will get a processor soon
       * 4        Terminated
       * 5        Waiting
       *          Waiting for a peripheral process or resource
       * 6        Transition
       *          Is waiting for a resource (swapspace ...)
       * 7        Unknown
       */
      memset(state, 0, sizeof(DWORD) * 8);

      /* is_done[i] marks threads whose process has already been counted */
      size = counter_state->number_of_counters * sizeof(BOOL);
      is_done = (BOOL*) malloc(size);
      if (is_done != NULL) {
         PDH_FMT_COUNTERVALUE state_id;
         PDH_FMT_COUNTERVALUE pid;
         DWORD j, k;
         DWORD count;

         /* clear the flags only after the allocation is known to be valid */
         memset(is_done, 0, size);

#if 0
         fprintf(stderr, "\n\n");
         fflush(stderr);
#endif

         for (j = 0; j < counter_state->number_of_counters; j++) {
            local_ret = PdhGetFormattedCounterValue(
                           counter_state->counter_handles[j],
                           PDH_FMT_LONG, NULL, &state_id);
            if (local_ret == 0) {
               /* NOTE(review): the result of this read is not checked
                * before pid.longValue is used below - confirm */
               local_ret = PdhGetFormattedCounterValue(
                              counter_pid->counter_handles[j],
                              PDH_FMT_LONG, NULL, &pid);

               if (state_id.longValue == 1 || state_id.longValue == 2) {
#if 0
                  fprintf(stderr, "%50s\t%d\t%ld\t%ld\n",
                          counter_state->pdh_name[j], j, state_id.longValue,
                          pid.longValue);
                  fflush(stderr);
#endif
                  if (is_done[j] == FALSE) {
                     state[state_id.longValue]++;

                     /* count every process once: flag all threads that
                      * belong to the same process id */
                     for (k = j; k < counter_state->number_of_counters; k++) {
                        PDH_FMT_COUNTERVALUE pid2;

                        local_ret = PdhGetFormattedCounterValue(
                                       counter_pid->counter_handles[k],
                                       PDH_FMT_LONG, NULL, &pid2);
                        if (local_ret == 0) {
                           if (pid2.longValue == pid.longValue) {
                              is_done[k] = TRUE;
                           }
                        } else {
                           if (pid2.CStatus == PDH_CSTATUS_NO_INSTANCE) {
                              /*
                               * It might be possible that the underlying
                               * instance was deleted meanwhile (no error!)
                               */
                              ;
                           } else {
                              // error handling
                              ret = 13;
                           }
                        }
                     }
#if 0
                     fprintf(stderr, "\tC\n");
                     fflush(stderr);
#endif
                  } else {
#if 0
                     fprintf(stderr, "\tR\n");
                     fflush(stderr);
#endif
                  }
               }
            } else {
               if (state_id.CStatus == PDH_CSTATUS_NO_INSTANCE) {
                  /*
                   * It might be possible that the underlying
                   * instance was deleted meanwhile (no error!)
                   */
                  ;
               } else {
                  // error handling
                  ret = 12;
               }
            }
         }
         sge_free(&is_done);

#if 0
         for (j = 0; j < 8; j++) {
            fprintf(stderr, "state %d: %d\n", j, state[j]);
            fflush(stderr);
         }
#endif

         /*
          * the idle thread and the loadsensor itself
          * have the state 2 if we collect data. These values
          * should not influence the loadaverage.
          */
         count = state[1] + state[2];
         if (count >= 2) {
            count -= 2;
         } else {
            count = 0;
         }

         local_ret = WaitForSingleObject(loadvalue_mutex, INFINITE);
         if (local_ret == WAIT_OBJECT_0) {
            get_current_load(loadvalue->load_avg, count);
            ReleaseMutex(loadvalue_mutex);
         }
      } else {
         // error handling
         ret = 11;
      }
   } else {
      // error handling
      ret = 10;
   }

   /* the counter sets are registered anew by every call */
   local_ret = pdhquery_remove_counterset(query, counter_state);
   local_ret = pdhquery_remove_counterset(query, counter_pid);

   DEXIT;
   return ret;
}
/****** sge_order/sge_free_cull_order_pos() ************************************
*  NAME
*     sge_free_cull_order_pos() -- frees a cull order struct
*
*  SYNOPSIS
*     void sge_free_cull_order_pos(order_pos_t **cull_order_pos)
*
*  FUNCTION
*     frees a cull order struct
*
*  INPUTS
*     order_pos_t **cull_order_pos - a double pointer to the struct. Will be
*                                    set to NULL
*
*  NOTES
*     MT-NOTE: sge_free_cull_order_pos() is MT safe
*
*******************************************************************************/
void
sge_free_cull_order_pos(order_pos_t **cull_order_pos)
{
   /* the caller's pointer variable is handed straight to sge_free() */
   sge_free(cull_order_pos);
}
/****** Interactive/qrsh/setEnvironment() *************************************** * * NAME * setEnvironment() -- set environment from file * * SYNOPSIS * static char *setEnvironment(const char *jobdir, char **wrapper); * * FUNCTION * Reads environment variables and their values from file <envFileName> * and sets them in the actual process environment. * The file format conforms to the sge environment file format: * Each line contains a tuple: * <name>=<value> * Special handling for variable PWD: tries to change to named * directory. * Special handling for variable QRSH_COMMAND: is the command to be executed * by qrsh_starter. The value of this variable will be returned as command, * or NULL, if an error occurs. * Special handling for variable QRSH_WRAPPER: this is a wrapper to be called * instead of a shell to execute the command. * If this variable is contained in the environment, it will be returned in * the parameter wrapper. Memory will be allocated to hold the variable, it * is in the responsibility of the caller to free this memory. * Special handling for variable DISPLAY: if it is already set, do not * overwrite it. Usually it is not set, but if ssh is used as transport * mechanism for qrsh, the ssh -X option can be used to enable * X11 forwarding. 
* * INPUTS * jobdir - the jobs spool directory * wrapper - buffer to take the path and name of a wrapper script * * RESULT * command, if all actions could be performed * NULL, if an error occured; possible errors are: * - the environment file cannot be opened * - a PWD entry is found, but changing to the named directory fails * - necessary memory cannot be allocated * - the variable QRSH_COMMAND is not found * **************************************************************************** */ static char *setEnvironment(const char *jobdir, char **wrapper) { char envFileName[SGE_PATH_MAX]; FILE *envFile = NULL; char *line = NULL; char *command = NULL; SGE_STRUCT_STAT statbuf; int size; bool set_display = true; *wrapper = NULL; /* don't set DISPLAY, if it is already set (e.g. by ssh) */ if (getenv("DISPLAY") != NULL) { set_display = false; } snprintf(envFileName, SGE_PATH_MAX, "%s/environment", jobdir); /* check if environment file exists and * retrieve file size. We will take file size as maximum possible line length */ if (SGE_STAT(envFileName, &statbuf) != 0) { qrsh_error(MSG_QRSH_STARTER_CANNOTOPENFILE_SS, envFileName, strerror(errno)); return NULL; } size = statbuf.st_size; line = (char *)malloc(size + 1); if (line == NULL) { qrsh_error(MSG_QRSH_STARTER_MALLOCFAILED_S, strerror(errno)); return NULL; } /* open sge environment file */ if ((envFile = fopen(envFileName, "r")) == NULL) { qrsh_error(MSG_QRSH_STARTER_CANNOTOPENFILE_SS, envFileName, strerror(errno)); sge_free(&line); return NULL; } /* set all environment variables, change to directory named by PWD */ while (fgets(line, size, envFile) != NULL) { /* clean trailing garbage (\n, \r, EOF ...) 
*/ char *c = &line[strlen(line)]; while (iscntrl(*(--c))) { *c = 0; } /* skip setting of display variable */ if (strncmp(line, "DISPLAY=", 8) == 0 && !set_display) { continue; } if (strncmp(line, "QRSH_COMMAND=", 13) == 0) { if ((command = (char *)malloc(strlen(line) - 13 + 1)) == NULL) { qrsh_error(MSG_QRSH_STARTER_MALLOCFAILED_S, strerror(errno)); sge_free(&line); FCLOSE(envFile); return NULL; } strcpy(command, line + 13); } else if (strncmp(line, "QRSH_WRAPPER=", 13) == 0) { if (*(line + 13) == 0) { fprintf(stderr, "%s\n", MSG_QRSH_STARTER_EMPTY_WRAPPER); } else { if ((*wrapper = (char *)malloc(strlen(line) - 13 + 1)) == NULL) { qrsh_error(MSG_QRSH_STARTER_MALLOCFAILED_S, strerror(errno)); sge_free(&line); FCLOSE(envFile); return NULL; } strcpy(*wrapper, line + 13); } } else { const char *new_line = sge_replace_substring(line, "\\n", "\n"); int put_ret; /* set variable */ if (new_line != NULL) { put_ret = sge_putenv(new_line); sge_free(&new_line); } else { put_ret = sge_putenv(line); } if (put_ret == 0) { sge_free(&line); FCLOSE(envFile); return NULL; } } } sge_free(&line); FCLOSE(envFile); /* * Use starter_method if it is supplied * and not overridden by QRSH_WRAPPER */ if (*wrapper == NULL) { char *starter_method = get_conf_val("starter_method"); if (starter_method != NULL && strcasecmp(starter_method, "none") != 0) { char buffer[128]; *wrapper = starter_method; snprintf(buffer, 128, "%s=%s", "SGE_STARTER_SHELL_PATH", ""); sge_putenv(buffer); snprintf(buffer, 128, "%s=%s", "SGE_STARTER_SHELL_START_MODE", "unix_behavior"); sge_putenv(buffer); snprintf(buffer, 128, "%s=%s", "SGE_STARTER_USE_LOGIN_SHELL", "false"); sge_putenv(buffer); } } return command; FCLOSE_ERROR: qrsh_error(MSG_FILE_ERRORCLOSEINGXY_SS, envFileName, strerror(errno)); return NULL; }
/****** uti/string/sge_strtok() ***********************************************
*  NAME
*     sge_strtok() -- Replacement for strtok()
*
*  SYNOPSIS
*     char* sge_strtok(const char *str, const char *delimitor)
*
*  FUNCTION
*     Replacement for strtok(). If no delimitor is given
*     isspace() is used. The input string is not modified; tokens are
*     cut out of an internal copy that is reused by later calls.
*
*  INPUTS
*     const char *str       - string which should be tokenized, or NULL
*                             to continue with the previous string
*     const char *delimitor - delimitor string
*
*  RESULT
*     char* - first/next token of str, NULL if there is no further token
*             or if the internal copy could not be allocated.
*
*  NOTES
*     MT-NOTE: sge_strtok() is not MT safe, use sge_strtok_r() instead
*
*  SEE ALSO
*     uti/string/sge_strtok_r()
******************************************************************************/
char *sge_strtok(const char *str, const char *delimitor)
{
   char *cp;
   char *saved_cp;
   static char *static_cp = NULL;         /* continuation point */
   static char *static_str = NULL;        /* private copy of the input */
   static unsigned int alloc_len = 0;     /* usable length of static_str */
   unsigned int n;

   DENTER(BASIS_LAYER, "sge_strtok");

   if (str) {
      n = strlen(str);
      if (static_str) {
         if (n > alloc_len) {
            /* need more memory */
            sge_free(&static_str);
            static_str = malloc(n + 1);
            alloc_len = n;
         }
      } else {
         static_str = malloc(n + 1);
         alloc_len = n;
      }
      if (static_str == NULL) {
         /* out of memory: forget the old state instead of copying into
          * a NULL pointer */
         alloc_len = 0;
         static_cp = NULL;
         DRETURN(NULL);
      }
      strcpy(static_str, str);
      saved_cp = static_str;
   } else {
      saved_cp = static_cp;
   }

   /* seek first character which is no '\0' and no delimitor; the end of
    * string test has to come first, it must never reach IS_DELIMITOR */
   while (saved_cp != NULL && *saved_cp != '\0' &&
          IS_DELIMITOR((int) saved_cp[0], delimitor)) {
      saved_cp++;
   }
   if (saved_cp == NULL || *saved_cp == '\0') {
      /* found end of string */
      DRETURN(NULL);
   }

   /* seek end of token given by '\0' or delimitor */
   for (cp = saved_cp; cp[0] != '\0'; cp++) {
      if (IS_DELIMITOR((int) cp[0], delimitor)) {
         /* terminate the token and continue behind it next time */
         cp[0] = '\0';
         static_cp = cp + 1;
         DRETURN(saved_cp);
      }
   }

   /* last token: the next call starts at the terminating '\0' */
   static_cp = cp;
   DRETURN(saved_cp);
}
/*
 * cl_host_list_setup() -- create a host list with its private list data
 *
 *    list_p               - receives the new raw list
 *    list_name            - name of the list
 *    method               - resolve method used for host compare
 *    host_alias_file      - optional path to a host alias file (copied)
 *    local_domain_name    - optional default domain (copied)
 *    entry_life_time      - 0: use SGE_COMMLIB_HOST_LIST_LIFE_TIME or default
 *    entry_update_time    - 0: use SGE_COMMLIB_HOST_LIST_UPDATE_TIME or default
 *    entry_reresolve_time - 0: use SGE_COMMLIB_HOST_LIST_RERESOLVE_TIME or default
 *    create_hash          - true: create a hash table for the list
 *
 * Returns CL_RETVAL_OK on success. On failure everything that was
 * allocated here is released again and an error code is returned
 * (CL_RETVAL_MALLOC, CL_RETVAL_PARAMS or the code of the failing call).
 */
int cl_host_list_setup(cl_raw_list_t** list_p,
                       char* list_name,
                       cl_host_resolve_method_t method,
                       char* host_alias_file,
                       char* local_domain_name,
                       unsigned long entry_life_time,
                       unsigned long entry_update_time,
                       unsigned long entry_reresolve_time,
                       bool create_hash) {
   int ret_val = CL_RETVAL_OK;
   cl_host_list_data_t* ldata = NULL;

   ldata = (cl_host_list_data_t*) malloc(sizeof(cl_host_list_data_t));
   if (ldata == NULL ) {
      return CL_RETVAL_MALLOC;
   }

   /* every pointer member starts out as NULL so that the common error
    * path below can tell what has to be released */
   ldata->host_alias_file      = NULL;
   ldata->local_domain_name    = NULL;
   ldata->ht                   = NULL;
   ldata->alias_file_changed   = 0;
   ldata->host_alias_list      = NULL;
   ldata->resolve_method       = method;
   ldata->entry_life_time      = entry_life_time;
   ldata->entry_update_time    = entry_update_time;
   ldata->entry_reresolve_time = entry_reresolve_time;
   ldata->last_refresh_time    = 0;

   if (local_domain_name == NULL && method == CL_LONG) {
      CL_LOG(CL_LOG_WARNING,"can't compare short host names without default domain when method is CL_LONG");
   }

   /* a time of 0 means: take the environment variable or the default */
   if (entry_life_time == 0) {
      unsigned long help_value = 0;

      help_value = cl_util_get_ulong_value(getenv("SGE_COMMLIB_HOST_LIST_LIFE_TIME"));
      if (help_value > 0) {
         CL_LOG(CL_LOG_INFO,"environment variable SGE_COMMLIB_HOST_LIST_LIFE_TIME is set");
         ldata->entry_life_time = help_value;
      } else {
         CL_LOG(CL_LOG_INFO,"using default value for entry_life_time");
         ldata->entry_life_time = CL_HOST_LIST_DEFAULT_LIFE_TIME;
      }
   }

   if (entry_update_time == 0) {
      unsigned long help_value = 0;

      help_value = cl_util_get_ulong_value(getenv("SGE_COMMLIB_HOST_LIST_UPDATE_TIME"));
      if (help_value > 0) {
         CL_LOG(CL_LOG_INFO,"environment variable SGE_COMMLIB_HOST_LIST_UPDATE_TIME is set");
         ldata->entry_update_time = help_value;
      } else {
         CL_LOG(CL_LOG_INFO,"using default value for entry_update_time");
         ldata->entry_update_time = CL_HOST_LIST_DEFAULT_UPDATE_TIME;
      }
   }

   if (entry_reresolve_time == 0) {
      unsigned long help_value = 0;

      help_value = cl_util_get_ulong_value(getenv("SGE_COMMLIB_HOST_LIST_RERESOLVE_TIME"));
      if (help_value > 0) {
         CL_LOG(CL_LOG_INFO,"environment variable SGE_COMMLIB_HOST_LIST_RERESOLVE_TIME is set");
         ldata->entry_reresolve_time = help_value;
      } else {
         CL_LOG(CL_LOG_INFO,"using default value for entry_reresolve_time");
         ldata->entry_reresolve_time = CL_HOST_LIST_DEFAULT_RERESOLVE_TIME;
      }
   }

   /* values above the maximum fall back to the default */
   if ( ldata->entry_life_time > CL_HOST_LIST_MAX_LIFE_TIME) {
      CL_LOG_INT(CL_LOG_WARNING,"entry_life_time exceeds maximum of",CL_HOST_LIST_MAX_LIFE_TIME);
      CL_LOG(CL_LOG_WARNING,"using default value for entry_life_time");
      ldata->entry_life_time = CL_HOST_LIST_DEFAULT_LIFE_TIME;
   }

   if ( ldata->entry_update_time > CL_HOST_LIST_MAX_UPDATE_TIME) {
      CL_LOG_INT(CL_LOG_WARNING,"entry_update_time exceeds maximum of",CL_HOST_LIST_MAX_UPDATE_TIME);
      CL_LOG(CL_LOG_WARNING,"using default value for entry_update_time");
      ldata->entry_update_time = CL_HOST_LIST_DEFAULT_UPDATE_TIME;
   }

   if ( ldata->entry_reresolve_time > CL_HOST_LIST_MAX_RERESOLVE_TIME) {
      CL_LOG_INT(CL_LOG_WARNING,"entry_reresolve_time exceeds maximum of",CL_HOST_LIST_MAX_RERESOLVE_TIME);
      CL_LOG(CL_LOG_WARNING,"using default value for entry_reresolve_time");
      ldata->entry_reresolve_time = CL_HOST_LIST_DEFAULT_RERESOLVE_TIME;
   }

   /*
    * NOTE(review): the checks below reject equal values although the
    * messages say ">=" - confirm which of the two is intended.
    */
   if (ldata->entry_life_time <= ldata->entry_update_time || ldata->entry_life_time <= ldata->entry_reresolve_time) {
      CL_LOG(CL_LOG_ERROR,"entry_life_time must be >= entry_update_time and >= entry_reresolve_time");
      cl_commlib_push_application_error(CL_LOG_ERROR, CL_RETVAL_PARAMS, "SGE_COMMLIB_HOST_LIST_LIFE_TIME must be >= SGE_COMMLIB_HOST_LIST_UPDATE_TIME and >= SGE_COMMLIB_HOST_LIST_RERESOLVE_TIME");
      ret_val = CL_RETVAL_PARAMS;
      goto error;
   }
   if (ldata->entry_update_time <= ldata->entry_reresolve_time) {
      CL_LOG(CL_LOG_ERROR,"entry_update_time must be >= entry_reresolve_time");
      cl_commlib_push_application_error(CL_LOG_ERROR, CL_RETVAL_PARAMS, "SGE_COMMLIB_HOST_LIST_UPDATE_TIME must be >= SGE_COMMLIB_HOST_LIST_RERESOLVE_TIME");
      ret_val = CL_RETVAL_PARAMS;
      goto error;
   }

   ret_val = cl_host_alias_list_setup(&(ldata->host_alias_list), "host alias list");
   if (ret_val != CL_RETVAL_OK) {
      CL_LOG(CL_LOG_ERROR,"error setting up host alias list");
      goto error;
   }

   if (host_alias_file != NULL) {
      ldata->host_alias_file = strdup(host_alias_file);
      ldata->alias_file_changed = 1;
      if (ldata->host_alias_file == NULL) {
         ret_val = CL_RETVAL_MALLOC;
         goto error;
      }
   }

   if (local_domain_name != NULL) {
      ldata->local_domain_name = strdup(local_domain_name);
      if (ldata->local_domain_name == NULL) {
         ret_val = CL_RETVAL_MALLOC;
         goto error;
      }
   }

   ret_val = cl_raw_list_setup(list_p,list_name, 1);
   if (ret_val != CL_RETVAL_OK) {
      goto error;
   }

   switch(ldata->resolve_method) {
      case CL_SHORT:
         CL_LOG(CL_LOG_INFO,"using short hostname for host compare operations");
         break;

      case CL_LONG:
         CL_LOG(CL_LOG_INFO,"using long hostname for host compare operations");
         break;

      default:
         CL_LOG(CL_LOG_WARNING,"undefined resolving method");
         break;
   }

   if (ldata->host_alias_file != NULL) {
      CL_LOG_STR(CL_LOG_INFO,"using host alias file:", ldata->host_alias_file);
   } else {
      CL_LOG(CL_LOG_INFO,"no host alias file specified");
   }
   if (ldata->local_domain_name != NULL) {
      CL_LOG_STR(CL_LOG_INFO,"using local domain name:", ldata->local_domain_name);
   } else {
      CL_LOG(CL_LOG_INFO,"no local domain specified");
   }

   /* create hashtable */
   if (create_hash == true) {
      ldata->ht = sge_htable_create(4, dup_func_string, hash_func_string, hash_compare_string);
      if (ldata->ht == NULL) {
         cl_raw_list_cleanup(list_p);
         ret_val = CL_RETVAL_MALLOC;
         goto error;
      }
      CL_LOG_INT(CL_LOG_INFO,"created hash table with size =", 4);
   } else {
      CL_LOG(CL_LOG_INFO,"created NO hash table!");
      ldata->ht = NULL;
   }

   /* set private list data */
   (*list_p)->list_data = ldata;

   CL_LOG_INT(CL_LOG_INFO,"entry_life_time is", (int)ldata->entry_life_time);
   CL_LOG_INT(CL_LOG_INFO,"entry_update_time is", (int)ldata->entry_update_time);
   CL_LOG_INT(CL_LOG_INFO,"entry_reresolve_time is", (int)ldata->entry_reresolve_time);

   return ret_val;

error:
   /*
    * Release whatever has been set up so far. Nothing has been added to
    * the alias list in this function, so it is torn down the same way as
    * the fresh host list in the hash table error case.
    * NOTE(review): assumes a freshly set up alias list carries no list
    * data of its own - confirm.
    */
   if (ldata->host_alias_list != NULL) {
      cl_raw_list_cleanup(&(ldata->host_alias_list));
   }
   if (ldata->host_alias_file != NULL) {
      sge_free(&(ldata->host_alias_file));
   }
   if (ldata->local_domain_name != NULL) {
      sge_free(&(ldata->local_domain_name));
   }
   sge_free(&ldata);
   return ret_val;
}
/*----------------------------------------------------------------------------*/ int main(int argc, char **argv) { int heartbeat = 0; int last_heartbeat = 0; int latest_heartbeat = 0; int ret = 0; int delay = 0; time_t now, last; /* const char *cp; */ char err_str[MAX_STRING_SIZE]; char shadowd_pidfile[SGE_PATH_MAX]; dstring ds; char buffer[256]; pid_t shadowd_pid; #if 1 static int check_interval = CHECK_INTERVAL; static int get_active_interval = GET_ACTIVE_INTERVAL; static int delay_time = DELAY_TIME; static int sge_test_heartbeat = 0; char binpath[SGE_PATH_MAX]; char oldqmaster[SGE_PATH_MAX]; char shadow_err_file[SGE_PATH_MAX]; char qmaster_out_file[SGE_PATH_MAX]; #endif lList *alp = NULL; sge_gdi_ctx_class_t *ctx = NULL; DENTER_MAIN(TOP_LAYER, "sge_shadowd"); sge_dstring_init(&ds, buffer, sizeof(buffer)); /* initialize recovery control variables */ { char *s; int val; if ((s=getenv("SGE_CHECK_INTERVAL")) && sscanf(s, "%d", &val) == 1) check_interval = val; if ((s=getenv("SGE_GET_ACTIVE_INTERVAL")) && sscanf(s, "%d", &val) == 1) get_active_interval = val; if ((s=getenv("SGE_DELAY_TIME")) && sscanf(s, "%d", &val) == 1) delay_time = val; if ((s=getenv("SGE_TEST_HEARTBEAT_TIMEOUT")) && sscanf(s, "%d", &val) == 1) sge_test_heartbeat = val; } /* This needs a better solution */ umask(022); #ifdef __SGE_COMPILE_WITH_GETTEXT__ /* init language output for gettext() , it will use the right language */ sge_init_language_func((gettext_func_type) gettext, (setlocale_func_type) setlocale, (bindtextdomain_func_type) bindtextdomain, (textdomain_func_type) textdomain); sge_init_language(NULL,NULL); #endif /* __SGE_COMPILE_WITH_GETTEXT__ */ log_state_set_log_file(TMP_ERR_FILE_SHADOWD); if (sge_setup2(&ctx, SHADOWD, MAIN_THREAD, &alp, false) != AE_OK) { answer_list_output(&alp); SGE_EXIT((void**)&ctx, 1); } /* AA: TODO: change this */ ctx->set_exit_func(ctx, shadowd_exit_func); sge_setup_sig_handlers(SHADOWD); #if defined(SOLARIS) /* Init shared SMF libs if necessary */ if 
(sge_smf_used() == 1 && sge_smf_init_libs() != 0) { SGE_EXIT((void**)&ctx, 1); } #endif if (ctx->get_qmaster_spool_dir(ctx) != NULL) { char *shadowd_name = SGE_SHADOWD; /* is there a running shadowd on this host (with unqualified name) */ sprintf(shadowd_pidfile, "%s/"SHADOWD_PID_FILE, ctx->get_qmaster_spool_dir(ctx), ctx->get_unqualified_hostname(ctx)); DPRINTF(("pidfilename: %s\n", shadowd_pidfile)); if ((shadowd_pid = sge_readpid(shadowd_pidfile))) { DPRINTF(("shadowd_pid: "sge_U32CFormat"\n", sge_u32c(shadowd_pid))); if (!sge_checkprog(shadowd_pid, shadowd_name, PSCMD)) { CRITICAL((SGE_EVENT, MSG_SHADOWD_FOUNDRUNNINGSHADOWDWITHPIDXNOTSTARTING_I, (int) shadowd_pid)); SGE_EXIT((void**)&ctx, 1); } } ctx->prepare_enroll(ctx); /* is there a running shadowd on this host (with aliased name) */ sprintf(shadowd_pidfile, "%s/"SHADOWD_PID_FILE, ctx->get_qmaster_spool_dir(ctx), ctx->get_qualified_hostname(ctx)); DPRINTF(("pidfilename: %s\n", shadowd_pidfile)); if ((shadowd_pid = sge_readpid(shadowd_pidfile))) { DPRINTF(("shadowd_pid: "sge_U32CFormat"\n", sge_u32c(shadowd_pid))); if (!sge_checkprog(shadowd_pid, shadowd_name, PSCMD)) { CRITICAL((SGE_EVENT, MSG_SHADOWD_FOUNDRUNNINGSHADOWDWITHPIDXNOTSTARTING_I, (int) shadowd_pid)); SGE_EXIT((void**)&ctx, 1); } } } else { ctx->prepare_enroll(ctx); } if (parse_cmdline_shadowd(argc, argv) == 1) { SGE_EXIT((void**)&ctx, 0); } if (ctx->get_qmaster_spool_dir(ctx) == NULL) { CRITICAL((SGE_EVENT, MSG_SHADOWD_CANTREADQMASTERSPOOLDIRFROMX_S, ctx->get_bootstrap_file(ctx))); SGE_EXIT((void**)&ctx, 1); } if (chdir(ctx->get_qmaster_spool_dir(ctx))) { CRITICAL((SGE_EVENT, MSG_SHADOWD_CANTCHANGETOQMASTERSPOOLDIRX_S, ctx->get_qmaster_spool_dir(ctx))); SGE_EXIT((void**)&ctx, 1); } if (sge_set_admin_username(ctx->get_admin_user(ctx), err_str)) { CRITICAL((SGE_EVENT, SFNMAX, err_str)); SGE_EXIT((void**)&ctx, 1); } if (sge_switch2admin_user()) { CRITICAL((SGE_EVENT, SFNMAX, MSG_SHADOWD_CANTSWITCHTOADMIN_USER)); SGE_EXIT((void**)&ctx, 1); } 
sprintf(shadow_err_file, "messages_shadowd.%s", ctx->get_unqualified_hostname(ctx)); sprintf(qmaster_out_file, "messages_qmaster.%s", ctx->get_unqualified_hostname(ctx)); sge_copy_append(TMP_ERR_FILE_SHADOWD, shadow_err_file, SGE_MODE_APPEND); unlink(TMP_ERR_FILE_SHADOWD); log_state_set_log_as_admin_user(1); log_state_set_log_file(shadow_err_file); { int* tmp_fd_array = NULL; unsigned long tmp_fd_count = 0; if (cl_com_set_handle_fds(cl_com_get_handle(prognames[SHADOWD] ,0), &tmp_fd_array, &tmp_fd_count) == CL_RETVAL_OK) { sge_daemonize(tmp_fd_array, tmp_fd_count, ctx); if (tmp_fd_array != NULL) { sge_free(&tmp_fd_array); } } else { sge_daemonize(NULL, 0, ctx); } } /* shadowd pid file will contain aliased name */ sge_write_pid(shadowd_pidfile); starting_up(); sge_setup_sig_handlers(SHADOWD); last_heartbeat = get_qmaster_heartbeat(QMASTER_HEARTBEAT_FILE, 30); last = (time_t) sge_get_gmt(); /* set time of last check time */ delay = 0; while (!shut_me_down) { sleep(check_interval); /* get current heartbeat file content */ heartbeat = get_qmaster_heartbeat(QMASTER_HEARTBEAT_FILE, 30); now = (time_t) sge_get_gmt(); /* Only check when we could read the heartbeat file at least two times * (last_heartbeat and heartbeat) without error */ if (last_heartbeat > 0 && heartbeat > 0) { /* * OK we have to heartbeat entries to check. Check times ... * now = current time * last = last check time */ if ( (now - last) >= (get_active_interval + delay) ) { delay = 0; if (last_heartbeat == heartbeat) { DPRINTF(("heartbeat not changed since seconds: "sge_U32CFormat"\n", sge_u32c(now - last))); delay = delay_time; /* set delay time */ /* * check if we are a possible new qmaster host (lock file of qmaster active, etc.) 
*/ ret = check_if_valid_shadow(binpath, oldqmaster, ctx->get_act_qmaster_file(ctx), ctx->get_shadow_master_file(ctx), ctx->get_qualified_hostname(ctx), ctx->get_binary_path(ctx)); if (ret == 0) { /* we can start a qmaster on this host */ if (qmaster_lock(QMASTER_LOCK_FILE)) { ERROR((SGE_EVENT, SFNMAX, MSG_SHADOWD_FAILEDTOLOCKQMASTERSOMBODYWASFASTER)); } else { int out, err; /* still the old qmaster name in act_qmaster file and still the old heartbeat */ latest_heartbeat = get_qmaster_heartbeat( QMASTER_HEARTBEAT_FILE, 30); /* TODO: what do we when there is a timeout ??? */ DPRINTF(("old qmaster name in act_qmaster and old heartbeat\n")); if (!compare_qmaster_names(ctx->get_act_qmaster_file(ctx), oldqmaster) && !shadowd_is_old_master_enrolled(sge_test_heartbeat, sge_get_qmaster_port(NULL), oldqmaster) && (latest_heartbeat == heartbeat)) { char qmaster_name[256]; strcpy(qmaster_name, SGE_PREFIX); strcat(qmaster_name, prognames[QMASTER]); DPRINTF(("qmaster_name: "SFN"\n", qmaster_name)); /* * open logfile as admin user for initial qmaster/schedd * startup messages */ out = SGE_OPEN3(qmaster_out_file, O_CREAT|O_WRONLY|O_APPEND, 0644); err = out; if (out == -1) { /* * First priority is the master restart * => ignore this error */ out = 1; err = 2; } sge_switch2start_user(); ret = startprog(out, err, NULL, binpath, qmaster_name, NULL); sge_switch2admin_user(); if (ret) { ERROR((SGE_EVENT, SFNMAX, MSG_SHADOWD_CANTSTARTQMASTER)); } close(out); } else { qmaster_unlock(QMASTER_LOCK_FILE); } } } else { if (ret == -1) { /* just log the more important failures */ WARNING((SGE_EVENT, MSG_SHADOWD_DELAYINGSHADOWFUNCFORXSECONDS_U, sge_u32c(delay) )); } } } /* Begin a new interval, set timers and hearbeat to current values */ last = now; last_heartbeat = heartbeat; } } else { if (last_heartbeat < 0 || heartbeat < 0) { /* There was an error reading heartbeat or last_heartbeat */ DPRINTF(("can't read heartbeat file. 
last_heartbeat="sge_U32CFormat", heartbeat="sge_U32CFormat"\n", sge_u32c(last_heartbeat), sge_u32c(heartbeat))); } else { DPRINTF(("have to read the heartbeat file twice to check time differences\n")); } } } sge_shutdown((void**)&ctx, 0); DRETURN(EXIT_SUCCESS); }
/*
 * sge_parse_qrsub() -- move parsed qrsub command line switches into an
 *                      advance reservation element
 *
 * ctx      - GDI context; here it is only handed to SGE_EXIT() for "-help"
 * pcmdline - list of parsed switches, looked up by SPA_switch; every switch
 *            that is handled is removed from this list
 * alpp     - answer list receiving the error messages
 * ar       - address of the element to fill; *ar is used as is and is never
 *            allocated here
 *
 * Returns true when nothing is left in pcmdline after all known switches were
 * consumed. Returns false when a name given with "-u" is not a known user or
 * when an element remains in pcmdline.
 */
bool sge_parse_qrsub(sge_gdi_ctx_class_t *ctx, lList *pcmdline, lList **alpp, lListElem **ar)
{
   lListElem *ep = NULL, *next_ep = NULL;
   lList *lp = NULL;
   DENTER(TOP_LAYER, "sge_parse_qrsub");

   /* -help: print the usage text and call SGE_EXIT() with status 0 */
   if ((ep = lGetElemStr(pcmdline, SPA_switch, "-help"))) {
      lRemoveElem(pcmdline, &ep);
      sge_usage(QRSUB, stdout);
      DEXIT;
      SGE_EXIT((void **)&ctx, 0);
   }

   /*
    * The scalar switches below all follow one pattern: every occurrence is
    * copied into *ar and then removed from pcmdline, so each further
    * occurrence overwrites the value stored by the previous one.
    */

   /* -a date_time: start time in [[CC]YY]MMDDhhmm[.SS] (SGE_ULONG) */
   while ((ep = lGetElemStr(pcmdline, SPA_switch, "-a"))) {
      lSetUlong(*ar, AR_start_time, lGetUlong(ep, SPA_argval_lUlongT));
      lRemoveElem(pcmdline, &ep);
   }

   /* -e date_time: end time in [[CC]YY]MMDDhhmm[.SS] (SGE_ULONG) */
   while ((ep = lGetElemStr(pcmdline, SPA_switch, "-e"))) {
      lSetUlong(*ar, AR_end_time, lGetUlong(ep, SPA_argval_lUlongT));
      lRemoveElem(pcmdline, &ep);
   }

   /* -d time: duration in TIME format (SGE_ULONG) */
   while ((ep = lGetElemStr(pcmdline, SPA_switch, "-d"))) {
      lSetUlong(*ar, AR_duration, lGetUlong(ep, SPA_argval_lUlongT));
      lRemoveElem(pcmdline, &ep);
   }

   /* -w e/v: validate availability of AR request, default e (SGE_ULONG) */
   while ((ep = lGetElemStr(pcmdline, SPA_switch, "-w"))) {
      lSetUlong(*ar, AR_verify, lGetInt(ep, SPA_argval_lIntT));
      lRemoveElem(pcmdline, &ep);
   }

   /* -N name: AR name (SGE_STRING) */
   while ((ep = lGetElemStr(pcmdline, SPA_switch, "-N"))) {
      lSetString(*ar, AR_name, lGetString(ep, SPA_argval_lStringT));
      lRemoveElem(pcmdline, &ep);
   }

   /* -A account_string: AR name in accounting record (SGE_STRING) */
   while ((ep = lGetElemStr(pcmdline, SPA_switch, "-A"))) {
      lSetString(*ar, AR_account, lGetString(ep, SPA_argval_lStringT));
      lRemoveElem(pcmdline, &ep);
   }

   /* -l resource_list: request the given resources (SGE_LIST) */
   parse_list_simple(pcmdline, "-l", *ar, AR_resource_list, 0, 0, FLG_LIST_APPEND);
   centry_list_remove_duplicates(lGetList(*ar, AR_resource_list));

   /* -u wc_user: access list (SGE_LIST) */
   /* -u ! wc_user TBD: Think about eval_expression support in compare allowed and excluded lists */
   parse_list_simple(pcmdline, "-u", *ar, AR_acl_list, ARA_name, 0, FLG_LIST_MERGE);

   /*
    * -u ! list separation: walk the access list once; names with a leading
    * '!' are re-created in AR_xacl_list and removed from AR_acl_list.
    */
   lp = lGetList(*ar, AR_acl_list);
   next_ep = lFirst(lp);
   while ((ep = next_ep)) {
      bool is_xacl = false;
      const char *name = lGetString(ep, ARA_name);

      /* fetch the successor first, ep may be removed from lp further down */
      next_ep = lNext(ep);
      if (name[0] == '!') { /* move this element to xacl_list */
         is_xacl = true;
         /* skip the '!' so that the checks below and the xacl entry use the bare name */
         name++;
      }

      /* names for which is_hgroup_name() is false must be known users */
      if (!is_hgroup_name(name)) {
         struct passwd *pw;
         struct passwd pw_struct;
         char *buffer;
         int size;
         stringT group;

         size = get_pw_buffer_size();
         buffer = sge_malloc(size);
         pw = sge_getpwnam_r(name, &pw_struct, buffer, size);

         if (pw == NULL) {
            answer_list_add_sprintf(alpp, STATUS_EUNKNOWN, ANSWER_QUALITY_ERROR, MSG_USER_XISNOKNOWNUSER_S, name);
            sge_free(&buffer);
            DRETURN(false);
         }
         /* store what sge_gid2group() yields for the user's pw_gid next to the name */
         sge_gid2group(pw->pw_gid, group, MAX_STRING_SIZE, MAX_NIS_RETRIES);
         lSetString(ep, ARA_group, group);
         sge_free(&buffer);
      }

      if (is_xacl) {
         /* the new element gets the name without '!' plus the group stored on ep */
         lListElem *new_ep = lAddSubStr(*ar, ARA_name, name, AR_xacl_list, ARA_Type);
         lSetString(new_ep, ARA_group, lGetString(ep, ARA_group));
         lRemoveElem(lp, &ep);
      }
   }

   /* -q wc_queue_list: reserve in queue(s) (SGE_LIST) */
   parse_list_simple(pcmdline, "-q", *ar, AR_queue_list, 0, 0, FLG_LIST_APPEND);

   /* -pe pe_name slot_range: reserve slot range for parallel jobs */
   while ((ep = lGetElemStr(pcmdline, SPA_switch, "-pe"))) {
      lSetString(*ar, AR_pe, lGetString(ep, SPA_argval_lStringT)); /* SGE_STRING, */
      lSwapList(*ar, AR_pe_range, ep, SPA_argval_lListT);          /* SGE_LIST */
      lRemoveElem(pcmdline, &ep);
   }

   /* AR_master_queue_list -masterq wc_queue_list (SGE_LIST): bind master task to queue(s) */
   parse_list_simple(pcmdline, "-masterq", *ar, AR_master_queue_list, 0, 0, FLG_LIST_APPEND);

   /* -ckpt ckpt-name: reserve in queue with ckpt method (SGE_STRING) */
   while ((ep = lGetElemStr(pcmdline, SPA_switch, "-ckpt"))) {
      lSetString(*ar, AR_checkpoint_name, lGetString(ep, SPA_argval_lStringT));
      lRemoveElem(pcmdline, &ep);
   }

   /* -m b/e/a/n: define mail notification events (SGE_ULONG) */
   while ((ep = lGetElemStr(pcmdline, SPA_switch, "-m"))) {
      u_long32 ul;
      u_long32 old_mail_opts;

      ul = lGetInt(ep, SPA_argval_lIntT);
      if ((ul & NO_MAIL)) {
         /* NO_MAIL clears everything collected so far */
         lSetUlong(*ar, AR_mail_options, 0);
      } else {
         /* otherwise the options of several -m switches are OR-ed together */
         old_mail_opts = lGetUlong(*ar, AR_mail_options);
         lSetUlong(*ar, AR_mail_options, ul | old_mail_opts);
      }
      lRemoveElem(pcmdline, &ep);
   }

   /* -M user[@host],...: notify these e-mail addresses (SGE_LIST) */
   parse_list_simple(pcmdline, "-M", *ar, AR_mail_list, MR_host, MR_user, FLG_LIST_MERGE);

   /* -he yes/no: hard error handling (SGE_ULONG) */
   while ((ep = lGetElemStr(pcmdline, SPA_switch, "-he"))) {
      lSetUlong(*ar, AR_error_handling, lGetUlong(ep, SPA_argval_lUlongT));
      lRemoveElem(pcmdline, &ep);
   }

   /* -now: reserve in queues with qtype interactive (SGE_ULONG) */
   while ((ep = lGetElemStr(pcmdline, SPA_switch, "-now"))) {
      u_long32 ar_now = lGetUlong(*ar, AR_type);
      if(lGetInt(ep, SPA_argval_lIntT)) {
         JOB_TYPE_SET_IMMEDIATE(ar_now);
      } else {
         JOB_TYPE_CLEAR_IMMEDIATE(ar_now);
      }
      lSetUlong(*ar, AR_type, ar_now);
      lRemoveElem(pcmdline, &ep);
   }

   /* Remove the script elements. They are not stored in the ar structure */
   if ((ep = lGetElemStr(pcmdline, SPA_switch, STR_PSEUDO_SCRIPT))) {
      lRemoveElem(pcmdline, &ep);
   }
   if ((ep = lGetElemStr(pcmdline, SPA_switch, STR_PSEUDO_SCRIPTLEN))) {
      lRemoveElem(pcmdline, &ep);
   }
   if ((ep = lGetElemStr(pcmdline, SPA_switch, STR_PSEUDO_SCRIPTPTR))) {
      lRemoveElem(pcmdline, &ep);
   }

   /* anything still left in pcmdline was not understood: report the first element and fail */
   ep = lFirst(pcmdline);
   if(ep) {
      const char *option = lGetString(ep,SPA_switch);
      /* as jobarg are stored no switch values, need to be filtered */
      if(sge_strnullcmp(option, "jobarg") != 0) {
         answer_list_add_sprintf(alpp, STATUS_ESEMANTIC, ANSWER_QUALITY_ERROR, MSG_PARSE_INVALIDOPTIONARGUMENTX_S, lGetString(ep,SPA_switch));
      } else {
         answer_list_add_sprintf(alpp, STATUS_ESEMANTIC, ANSWER_QUALITY_ERROR, MSG_PARSE_INVALIDOPTIONARGUMENT);
      }
      DRETURN(false);
   }

   /*
    * Complete the start/end/duration triple when exactly one of the three
    * values is still 0; all other combinations are left untouched.
    */
   if (lGetUlong(*ar, AR_start_time) == 0 && lGetUlong(*ar, AR_end_time) != 0 && lGetUlong(*ar, AR_duration) != 0) {
      /* start is missing: start = end - duration */
      lSetUlong(*ar, AR_start_time, lGetUlong(*ar, AR_end_time) - lGetUlong(*ar, AR_duration));
   } else if (lGetUlong(*ar, AR_start_time) != 0 && lGetUlong(*ar, AR_end_time) == 0 && lGetUlong(*ar, AR_duration) != 0) {
      /* end is missing: end comes from duration_add_offset(), duration is then re-derived from end - start */
      lSetUlong(*ar, AR_end_time, duration_add_offset(lGetUlong(*ar, AR_start_time), lGetUlong(*ar, AR_duration)));
      lSetUlong(*ar, AR_duration, lGetUlong(*ar, AR_end_time) - lGetUlong(*ar, AR_start_time));
   } else if (lGetUlong(*ar, AR_start_time) != 0 && lGetUlong(*ar, AR_end_time) != 0 && lGetUlong(*ar, AR_duration) == 0) {
      /* duration is missing: duration = end - start */
      lSetUlong(*ar, AR_duration, lGetUlong(*ar, AR_end_time) - lGetUlong(*ar, AR_start_time));
   }

   DRETURN(true);
}
/*
 * Destructor function that is called when a thread ends.
 * Hands the memory `state` points to over to sge_free().
 */
static void sge_err_destroy(void* state)
{
   void *thread_state = state;

   sge_free(&thread_state);
}
/****** sge_order/sge_create_cull_order_pos() **********************************
*  NAME
*     sge_create_cull_order_pos() -- generates a cull order position struct
*
*  SYNOPSIS
*     void sge_create_cull_order_pos(order_pos_t **cull_order_pos,
*                                    lListElem *jep, lListElem *jatp,
*                                    lListElem *joker, lListElem *joker_task)
*
*  FUNCTION
*     Allocates an order_pos_t and stores in it the field positions that
*     lGetPosViaElem() reports for the given elements. A struct that
*     *cull_order_pos already points to is released first.
*
*     Only the sub structs whose source element is not NULL are filled; the
*     others keep whatever malloc() returned. When the allocation fails,
*     *cull_order_pos is left NULL and nothing else is done.
*
*  INPUTS
*     order_pos_t **cull_order_pos - in/out: struct to init. If *cull_order_pos
*                                    is not NULL, the old struct is freed.
*                                    Must not be NULL itself.
*     lListElem *jep               - job structure (fills ->job)
*     lListElem *jatp              - ja task structure (fills ->ja_task)
*     lListElem *joker             - job order structure (fills ->order_job)
*     lListElem *joker_task        - ja task order structure
*                                    (fills ->order_ja_task)
*
*  NOTES
*     MT-NOTE: sge_create_cull_order_pos() is MT safe
*
*******************************************************************************/
void sge_create_cull_order_pos(order_pos_t **cull_order_pos, lListElem *jep, lListElem *jatp,
                               lListElem *joker, lListElem *joker_task)
{
   ja_task_pos_t *ja_pos;
   ja_task_pos_t *order_ja_pos;
   job_pos_t *job_pos;
   job_pos_t *order_job_pos;

   /*
    * sge_free() expects the address of the pointer that is to be released.
    * The old struct lives in *cull_order_pos, so that address is
    * cull_order_pos itself. Passing &cull_order_pos would release the
    * caller's variable instead of the struct.
    */
   if (*cull_order_pos != NULL) {
      sge_free(cull_order_pos);
   }

   *cull_order_pos = malloc(sizeof(order_pos_t));
   if (*cull_order_pos == NULL) {
      /* out of memory: leave *cull_order_pos NULL for the caller to detect */
      return;
   }

   ja_pos = &((*cull_order_pos)->ja_task);
   order_ja_pos = &((*cull_order_pos)->order_ja_task);
   job_pos = &((*cull_order_pos)->job);
   order_job_pos = &((*cull_order_pos)->order_job);

   /* positions inside the job element */
   if (jep != NULL) {
      job_pos->JB_version_pos = lGetPosViaElem(jep, JB_version, SGE_NO_ABORT);
      job_pos->JB_nppri_pos = lGetPosViaElem(jep, JB_nppri, SGE_NO_ABORT);
      job_pos->JB_nurg_pos = lGetPosViaElem(jep, JB_nurg, SGE_NO_ABORT);
      job_pos->JB_urg_pos = lGetPosViaElem(jep, JB_urg, SGE_NO_ABORT);
      job_pos->JB_rrcontr_pos = lGetPosViaElem(jep, JB_rrcontr, SGE_NO_ABORT);
      job_pos->JB_dlcontr_pos = lGetPosViaElem(jep, JB_dlcontr, SGE_NO_ABORT);
      job_pos->JB_wtcontr_pos = lGetPosViaElem(jep, JB_wtcontr, SGE_NO_ABORT);
   }

   /* positions inside the ja task element */
   if (jatp != NULL) {
      ja_pos->JAT_status_pos = lGetPosViaElem(jatp, JAT_status, SGE_NO_ABORT);
      ja_pos->JAT_tix_pos = lGetPosViaElem(jatp, JAT_tix, SGE_NO_ABORT);
      ja_pos->JAT_oticket_pos = lGetPosViaElem(jatp, JAT_oticket, SGE_NO_ABORT);
      ja_pos->JAT_fticket_pos = lGetPosViaElem(jatp, JAT_fticket, SGE_NO_ABORT);
      ja_pos->JAT_sticket_pos = lGetPosViaElem(jatp, JAT_sticket, SGE_NO_ABORT);
      ja_pos->JAT_share_pos = lGetPosViaElem(jatp, JAT_share, SGE_NO_ABORT);
      ja_pos->JAT_prio_pos = lGetPosViaElem(jatp, JAT_prio, SGE_NO_ABORT);
      ja_pos->JAT_ntix_pos = lGetPosViaElem(jatp, JAT_ntix, SGE_NO_ABORT);
   }

   /* positions inside the job order element; the version is not looked up there */
   if (joker != NULL) {
      order_job_pos->JB_version_pos = -1;
      order_job_pos->JB_nppri_pos = lGetPosViaElem(joker, JB_nppri, SGE_NO_ABORT);
      order_job_pos->JB_nurg_pos = lGetPosViaElem(joker, JB_nurg, SGE_NO_ABORT);
      order_job_pos->JB_urg_pos = lGetPosViaElem(joker, JB_urg, SGE_NO_ABORT);
      order_job_pos->JB_rrcontr_pos = lGetPosViaElem(joker, JB_rrcontr, SGE_NO_ABORT);
      order_job_pos->JB_dlcontr_pos = lGetPosViaElem(joker, JB_dlcontr, SGE_NO_ABORT);
      order_job_pos->JB_wtcontr_pos = lGetPosViaElem(joker, JB_wtcontr, SGE_NO_ABORT);
   }

   /* positions inside the ja task order element; status and tix are not looked up there */
   if (joker_task != NULL) {
      order_ja_pos->JAT_status_pos = -1;
      order_ja_pos->JAT_tix_pos = -1;
      order_ja_pos->JAT_oticket_pos = lGetPosViaElem(joker_task, JAT_oticket, SGE_NO_ABORT);
      order_ja_pos->JAT_fticket_pos = lGetPosViaElem(joker_task, JAT_fticket, SGE_NO_ABORT);
      order_ja_pos->JAT_sticket_pos = lGetPosViaElem(joker_task, JAT_sticket, SGE_NO_ABORT);
      order_ja_pos->JAT_share_pos = lGetPosViaElem(joker_task, JAT_share, SGE_NO_ABORT);
      order_ja_pos->JAT_prio_pos = lGetPosViaElem(joker_task, JAT_prio, SGE_NO_ABORT);
      order_ja_pos->JAT_ntix_pos = lGetPosViaElem(joker_task, JAT_ntix, SGE_NO_ABORT);
   }
}
/****** qmaster/sge_mod_configuration() ****************************************
*  NAME
*     sge_mod_configuration() -- modify cluster configuration
*
*  SYNOPSIS
*     int sge_mod_configuration(sge_gdi_ctx_class_t *ctx, lListElem *aConf,
*                               lList **anAnswer, char *aUser, char *aHost)
*
*  FUNCTION
*     Modify cluster configuration. 'aConf' is a pointer to a 'CONF_Type' list
*     element and does contain the modified configuration entry. Adding a new
*     configuration entry is also viewed as a modification.
*
*     When the entry belongs to the global configuration or to the host the
*     qmaster runs on, the merged configuration is rebuilt afterwards and the
*     dependent settings (accounting flush trigger, 'max_unheard', commlib
*     parameters) are refreshed.
*
*  INPUTS
*     sge_gdi_ctx_class_t *ctx - context providing cell root, qualified
*                                hostname and program id
*     lListElem *aConf         - CONF_Type element containing the modified conf
*     lList **anAnswer         - answer list
*     char *aUser              - target user
*     char *aHost              - target host
*
*  RESULT
*     int - STATUS_OK        success
*           STATUS_EUNKNOWN  NULL argument, missing CONF_name, unresolvable
*                            host or failed modification
*           otherwise        the non-zero value returned by check_config()
*
*  NOTES
*     MT-NOTE: sge_mod_configuration() is MT safe
*
*******************************************************************************/
int sge_mod_configuration(sge_gdi_ctx_class_t *ctx, lListElem *aConf, lList **anAnswer, char *aUser, char *aHost)
{
   lListElem *existing_conf;
   const char *conf_host = NULL;
   char unique_name[CL_MAXHOSTLEN];
   int rc = -1;
   int is_global;
   const char *cell_root = ctx->get_cell_root(ctx);
   const char *qualified_hostname = ctx->get_qualified_hostname(ctx);
   u_long32 progid = ctx->get_who(ctx);

   DENTER(TOP_LAYER, "sge_mod_configuration");

   /* --- argument validation --- */
   if (aConf == NULL || aUser == NULL || aHost == NULL) {
      CRITICAL((SGE_EVENT, MSG_SGETEXT_NULLPTRPASSED_S, SGE_FUNC));
      answer_list_add(anAnswer, SGE_EVENT, STATUS_EUNKNOWN, ANSWER_QUALITY_ERROR);
      DRETURN(STATUS_EUNKNOWN);
   }

   conf_host = lGetHost(aConf, CONF_name);
   if (conf_host == NULL) {
      CRITICAL((SGE_EVENT, MSG_SGETEXT_MISSINGCULLFIELD_SS, lNm2Str(CONF_name), SGE_FUNC));
      answer_list_add(anAnswer, SGE_EVENT, STATUS_EUNKNOWN, ANSWER_QUALITY_ERROR);
      DRETURN(STATUS_EUNKNOWN);
   }

   rc = sge_resolve_hostname(conf_host, unique_name, EH_name, sizeof(unique_name));
   if (rc != CL_RETVAL_OK) {
      DPRINTF(("%s: error %s resolving host %s\n", SGE_FUNC, cl_get_error_text(rc), conf_host));
      ERROR((SGE_EVENT, MSG_SGETEXT_CANTRESOLVEHOST_S, conf_host));
      answer_list_add(anAnswer, SGE_EVENT, STATUS_EUNKNOWN, ANSWER_QUALITY_ERROR);
      DRETURN(STATUS_EUNKNOWN);
   }

   rc = check_config(anAnswer, aConf);
   if (rc != 0) {
      DRETURN(rc);
   }

   /* --- add a new entry or modify the existing one --- */
   existing_conf = sge_get_configuration_for_host(unique_name);
   if (existing_conf == NULL) {
      do_add_config(ctx, unique_name, aConf, anAnswer);

      INFO((SGE_EVENT, MSG_SGETEXT_ADDEDTOLIST_SSSS, aUser, aHost, unique_name, MSG_OBJ_CONF));
      answer_list_add(anAnswer, SGE_EVENT, STATUS_OK, ANSWER_QUALITY_INFO);
   } else {
      rc = do_mod_config(ctx, unique_name, existing_conf, aConf, anAnswer);
      lFreeElem(&existing_conf);

      if (rc != 0) {
         DRETURN(STATUS_EUNKNOWN);
      }

      INFO((SGE_EVENT, MSG_SGETEXT_MODIFIEDINLIST_SSSS, aUser, aHost, unique_name, MSG_OBJ_CONF));
      answer_list_add(anAnswer, SGE_EVENT, STATUS_OK, ANSWER_QUALITY_INFO);
   }

   is_global = (strcmp(SGE_GLOBAL_NAME, unique_name) == 0);
   if (is_global) {
      sge_add_event(0, sgeE_GLOBAL_CONFIG, 0, 0, NULL, NULL, NULL, NULL);
   }

   /*
   ** is the configuration change relevant for the qmaster itself?
   ** if so, initialise conf struct anew
   */
   if (is_global || sge_hostcmp(unique_name, qualified_hostname) == 0) {
      lListElem *local_conf = NULL;
      lListElem *global_conf = NULL;
      lList *merge_answers = NULL;
      char *params = NULL;
      /* remembered before the merge so that a switch from 0 to non-zero can be detected */
      int previous_flush_time = mconf_get_accounting_flush_time();

      local_conf = sge_get_configuration_for_host(qualified_hostname);
      if (local_conf == NULL) {
         WARNING((SGE_EVENT, MSG_CONF_NOLOCAL_S, qualified_hostname));
      }

      global_conf = sge_get_configuration_for_host(SGE_GLOBAL_NAME);
      if (global_conf == NULL) {
         ERROR((SGE_EVENT, SFNMAX, MSG_CONF_NOGLOBAL));
      }

      if (merge_configuration(&merge_answers, progid, cell_root, global_conf, local_conf, NULL) != 0) {
         ERROR((SGE_EVENT, MSG_CONF_CANTMERGECONFIGURATIONFORHOST_S, qualified_hostname));
      }
      answer_list_output(&merge_answers);

      /* Restart the accounting flush event if needed. */
      if (previous_flush_time == 0 && mconf_get_accounting_flush_time() != 0) {
         te_event_t ev = te_new_event(time(NULL), TYPE_ACCOUNTING_TRIGGER, ONE_TIME_EVENT, 1, 0, NULL);
         te_add_event(ev);
         te_free_event(&ev);
      }

      lFreeElem(&local_conf);
      lFreeElem(&global_conf);

      sge_show_conf();

      /* 'max_unheard' may have changed */
      cl_commlib_set_connection_param(cl_com_get_handle("qmaster", 1), HEARD_FROM_TIMEOUT, mconf_get_max_unheard());

      /* update the commlib parameter list and gdi_timeout from the qmaster_params */
      params = mconf_get_qmaster_params();
      cl_com_update_parameter_list(params);
      sge_free(&params);
   }

   /* invalidate configuration cache */
   mconf_set_new_config(true);

   DRETURN(STATUS_OK);
}
/****** sge_var/var_list_parse_from_string() *******************************
*  NAME
*     var_list_parse_from_string() -- parse vars from string list
*
*  SYNOPSIS
*     int var_list_parse_from_string(lList **lpp,
*                                    const char *variable_str,
*                                    int check_environment)
*
*  FUNCTION
*     Parse a list of variables ("lpp") from a comma separated
*     string list ("variable_str"). The boolean "check_environment"
*     defines whether the value of a variable that is given without
*     "=value" is taken from the environment of the calling process.
*
*     New VA_Type elements are appended to *lpp; the list is created
*     when *lpp is NULL.
*
*  INPUTS
*     lList **lpp               - VA_Type list
*     const char *variable_str  - source string
*     int check_environment     - boolean
*
*  RESULT
*     int - error state
*         0 - OK
*         1 - lpp is NULL
*         2 - variable_str could not be duplicated (*lpp is set to NULL)
*         3 - no entry found in variable_str (*lpp is set to NULL)
*         4 - the list could not be created
*
*  NOTES
*     MT-NOTE: var_list_parse_from_string() is MT safe
*******************************************************************************/
int var_list_parse_from_string(lList **lpp, const char *variable_str,
                               int check_environment)
{
   char *variable;
   char *val_str;
   int var_len;
   char **str_str;
   char **pstr;
   lListElem *ep;
   char *va_string;

   DENTER(TOP_LAYER, "var_list_parse_from_string");

   if (!lpp) {
      DRETURN(1);
   }

   /* work on a private copy of the input */
   va_string = sge_strdup(NULL, variable_str);
   if (!va_string) {
      *lpp = NULL;
      DRETURN(2);
   }

   str_str = string_list(va_string, ",", NULL);
   if (!str_str || !*str_str) {
      *lpp = NULL;
      /* str_str may be an allocated but empty array: release it as well */
      sge_free(&str_str);
      sge_free(&va_string);
      DRETURN(3);
   }

   if (!*lpp) {
      *lpp = lCreateList("variable list", VA_Type);
      if (!*lpp) {
         sge_free(&va_string);
         sge_free(&str_str);
         DRETURN(4);
      }
   }

   for (pstr = str_str; *pstr; pstr++) {
      struct saved_vars_s *context;

      ep = lCreateElem(VA_Type);
      /* SGE_ASSERT(ep); */
      lAppendElem(*lpp, ep);

      /* the first token up to '=' is the variable name */
      context = NULL;
      variable = sge_strtok_r(*pstr, "=", &context);
      SGE_ASSERT((variable));
      var_len = strlen(variable);
      lSetString(ep, VA_variable, variable);
      val_str = *pstr;

      /*
       * The character at the end of the first token must be either '=' or '\0'.
       * If it's a '=' then we treat the following string as the value
       * If it's a '\0' and check_environment is set, then we get the value from
       * the environment variable value.
       * If it's a '\0' and check_environment is not set, then we set the value
       * to NULL.
       */
      if (val_str[var_len] == '=') {
         lSetString(ep, VA_value, &val_str[var_len+1]);
      } else if (check_environment) {
         lSetString(ep, VA_value, sge_getenv(variable));
      } else {
         lSetString(ep, VA_value, NULL);
      }
      sge_free_saved_vars(context);
   }

   sge_free(&va_string);
   sge_free(&str_str);
   DRETURN(0);
}
int main(int argc,char *argv[]) { struct hostent *he = NULL; char* resolved_name = NULL; int retval = CL_RETVAL_OK; char **tp,**tp2; int name_only = 0; int sge_aliasing = 0; int all_option = 0; int system_error = 0; if (argc < 1 ) { usage(); } if (argc >= 2) { if (!strcmp(argv[1], "-help")) { usage(); } if (!strcmp(argv[1], "-name")) { if (argc != 2) { usage(); } name_only = 1; } if (!strcmp(argv[1], "-aname")) { if (argc != 2) { usage(); } name_only = 1; sge_aliasing = 1; } if (!strcmp(argv[1], "-all")) { if (argc != 2) { usage(); } name_only = 0; sge_aliasing = 1; all_option = 1; } } if (name_only == 0 && argc != 1 && all_option == 0) { usage(); } retval = cl_com_setup_commlib(CL_NO_THREAD ,CL_LOG_OFF, NULL); if (retval != CL_RETVAL_OK) { fprintf(stderr,"%s\n",cl_get_error_text(retval)); exit(1); } if (sge_aliasing ) { const char *alias_path = sge_get_alias_path(); cl_com_set_alias_file(alias_path); sge_free(&alias_path); } retval = cl_com_gethostname(&resolved_name, NULL, &he, &system_error); if (retval != CL_RETVAL_OK) { char* err_text = cl_com_get_h_error_string(system_error); if (err_text == NULL) { err_text = strdup(strerror(system_error)); if (err_text == NULL) { err_text = strdup("unexpected error"); } } fprintf(stderr,"error resolving local host: %s (%s)\n",cl_get_error_text(retval), err_text); sge_free(&err_text); cl_com_cleanup_commlib(); exit(1); } if (name_only) { if (sge_aliasing) { if (resolved_name != NULL) { printf("%s\n",resolved_name); } else { printf("%s\n","unexpected error"); } } else { if (he != NULL) { printf("%s\n",he->h_name); } else { printf("%s\n","could not get hostent struct"); } } } else { if (he != NULL) { printf(MSG_SYSTEM_HOSTNAMEIS_S , he->h_name); printf("\n"); if (resolved_name != NULL && all_option) { printf("SGE name: %s\n",resolved_name); } printf("%s", MSG_SYSTEM_ALIASES); for (tp = he->h_aliases; *tp; tp++) { printf("%s ", *tp); } printf("\n"); printf("%s", MSG_SYSTEM_ADDRESSES); for (tp2 = he->h_addr_list; *tp2; tp2++) { 
printf("%s ", inet_ntoa(* (struct in_addr *) *tp2)); /* inet_ntoa() is not MT save */ } printf("\n"); } else { fprintf(stderr,"%s\n","could not get hostent struct"); } } sge_free(&resolved_name); sge_free_hostent(&he); retval = cl_com_cleanup_commlib(); if (retval != CL_RETVAL_OK) { fprintf(stderr,"%s\n",cl_get_error_text(retval)); exit(1); } return 0; }
/****** sge_binding/get_striding_first_socket_first_core_and_account() ********
*  NAME
*     get_striding_first_socket_first_core_and_account() -- Checks if and where
*                                                            striding would fit.
*
*  SYNOPSIS
*     bool get_striding_first_socket_first_core_and_account(const int amount,
*        const int stepsize, const int start_at_socket,
*        const int start_at_core, const bool automatic, int* first_socket,
*        int* first_core, char** accounted_topology,
*        int* accounted_topology_length)
*
*  FUNCTION
*     This operating system independent function checks (depending on
*     the underlying topology string and the topology string which
*     reflects already execution units in use) if it is possible to
*     bind the job in a striding manner to cores on the host.
*
*     The search starts at <start_at_socket, start_at_core> and walks the
*     topology string in use. When a fitting position is found, the global
*     topology string in use is updated with the accounted cores.
*
*     The topology string in use is fetched with get_topology() when it has
*     not been initialized yet.
*
*  INPUTS
*     const int amount          - Amount of cores to allocate.
*     const int stepsize        - Distance of the cores to allocate.
*     const int start_at_socket - First socket to begin the search with
*                                 (usually at 0).
*     const int start_at_core   - First core to begin the search with
*                                 (usually at 0).
*     const bool automatic      - When false, the search is given up as soon
*                                 as a core is passed without a fit.
*     int* first_socket         - out: First socket when striding is possible.
*     int* first_core           - out: First core when striding is possible.
*     char** accounted_topology - out: filled by
*                                 create_topology_used_per_job() when striding
*                                 is possible.
*     int* accounted_topology_length - out: length belonging to
*                                 accounted_topology.
*
*  RESULT
*     bool - if true striding is possible at <first_socket, first_core>
*
*  NOTES
*     MT-NOTE: get_striding_first_socket_first_core_and_account() is not MT
*     MT-NOTE: safe
*
*******************************************************************************/
bool get_striding_first_socket_first_core_and_account(const int amount, const int stepsize,
   const int start_at_socket, const int start_at_core, const bool automatic,
   int* first_socket, int* first_core, char** accounted_topology,
   int* accounted_topology_length)
{
   /* return value: if it is possible to fit the request on the host */
   bool possible = false;

   /* position in topology string */
   int i = 0;

   /* socket and core counter in order to find the first core and socket */
   int sc = -1;
   int cc = -1;

   /* cores and sockets passed since the start position */
   int found_cores = 0;
   int found_sockets = 0; /* first socket is given implicitly */

   /* temp topology string where accounting is done on */
   char* tmp_topo_busy;

   /* initialize socket and core where the striding will fit */
   *first_socket = 0;
   *first_core = 0;

   if (start_at_socket < 0 || start_at_core < 0) {
      /* wrong input parameter */
      return false;
   }

   if (logical_used_topology == NULL) {
      /* we have no topology string at the moment (should be initialized before) */
      if (!get_topology(&logical_used_topology, &logical_used_topology_length)) {
         /* couldn't even get the topology string */
         return false;
      }
   }

   /* temporary accounting string -> account on this and
      when eventually successful then copy this string back
      to global topo_busy string */
   tmp_topo_busy = (char *) calloc(logical_used_topology_length + 1, sizeof(char));
   if (tmp_topo_busy == NULL) {
      /* out of memory: nothing can be accounted */
      return false;
   }
   memcpy(tmp_topo_busy, logical_used_topology, logical_used_topology_length*sizeof(char));

   /* we have to go to the first position given by the arguments
      (start_at_socket and start_at_core) */
   for (i = 0; i < logical_used_topology_length; i++) {

      if (logical_used_topology[i] == 'C' || logical_used_topology[i] == 'c') {
         /* found core -> update core counter */
         cc++;
      } else if (logical_used_topology[i] == 'S' || logical_used_topology[i] == 's') {
         /* found socket -> update socket counter */
         sc++;
         /* we're changing socket -> no core found on this one yet */
         cc = -1;
      } else if (logical_used_topology[i] == '\0') {
         /* the string ended before the start socket and core were reached */
         sge_free(&tmp_topo_busy);
         return false;
      }

      if (sc == start_at_socket && cc == start_at_core) {
         /* we found our starting point (we remember 'i' for next loop!) */
         break;
      }
   }

   /* check if we found the socket and core we want to start searching */
   if (sc != start_at_socket || cc != start_at_core) {
      /* couldn't find the start socket and start core */
      sge_free(&tmp_topo_busy);
      return false;
   }

   /* check each position of the topology string */
   /* we reuse 'i' from last loop -> this is the position where we begin */
   for (; i < logical_used_topology_length && logical_used_topology[i] != '\0'; i++) {

      /* this could be optimized (with increasing i in case if it is not possible) */
      if (is_starting_point(logical_used_topology, logical_used_topology_length, i, amount, stepsize,
            &tmp_topo_busy)) {
         /* we can do striding with this as starting point */
         possible = true;
         /* update place where we can begin */
         *first_socket = start_at_socket + found_sockets;
         /*
          * found_cores restarts at 0 whenever a socket is passed, so on a
          * later socket it already is the core index; start_at_core only
          * applies while we are still on the start socket.
          */
         *first_core = (found_sockets > 0) ? found_cores : start_at_core + found_cores;
         /* return the accounted topology */
         create_topology_used_per_job(accounted_topology, accounted_topology_length,
            logical_used_topology, tmp_topo_busy, logical_used_topology_length);
         /* finally do execution host wide accounting */
         /* DG TODO mutex */
         memcpy(logical_used_topology, tmp_topo_busy, logical_used_topology_length*sizeof(char));

         break;
      } else {
         /* else retry and update socket and core number to start with */

         if (logical_used_topology[i] == 'C' || logical_used_topology[i] == 'c') {
            /* jumping over a core */
            found_cores++;
            /* a core is a valid starting point for binding in non-automatic case */
            /* if we have a fixed start socket and a start core we do not retry
               it with the next core available (when introducing T's this have to
               be added there too) */
            if (automatic == false) {
               possible = false;
               break;
            }

         } else if (logical_used_topology[i] == 'S' || logical_used_topology[i] == 's') {
            /* jumping over a socket */
            found_sockets++;
            /* we are at core 0 on the new socket */
            found_cores = 0;
         }
         /* at the moment we are not interested in threads or anything else */
      }

   } /* end go through the whole topology string */

   sge_free(&tmp_topo_busy);
   return possible;
}
/****** cull/db/lJoin() ******************************************************* * NAME * lJoin() -- Joins two lists together * * SYNOPSIS * lList* lJoin(const char *name, int nm0, const lList *lp0, * const lCondition *cp0, const lEnumeration *enp0, * int nm1, const lList *lp1, const lCondition *cp1, * const lEnumeration *enp1) * * FUNCTION * Returns a new list joining together the lists 'lp0' and 'lp1' * For the join only these 'lines' described in condition 'cp0' * and 'cp1' are used. * The new list gets only these members described in 'enp0' and * 'enp1'. NULL means every member of this list. * The list gets 'name' as listname. * * INPUTS * const char *name - name of new list * int nm0 - * const lList *lp0 - first list * const lCondition *cp0 - selects rows of first list * const lEnumeration *enp0 - selects column of first list * int nm1 - * const lList *lp1 - second list * const lCondition *cp1 - selects rows of second list * const lEnumeration *enp1 - selects column of seconf list * * RESULT * lList* - Joined list ******************************************************************************/ lList *lJoin(const char *name, int nm0, const lList *lp0, const lCondition *cp0, const lEnumeration *enp0, int nm1, const lList *lp1, const lCondition *cp1, const lEnumeration *enp1) { lListElem *ep0, *ep1; lListElem *ep; lList *dlp = NULL; lDescr *dp; int lp0_pos = 0, lp1_pos = 0; int i, j; int needed; DENTER(CULL_LAYER, "lJoin"); if (!lp0 || !lp1 || !name || !enp0 || !enp1) { LERROR(LENULLARGS); DEXIT; return NULL; } if (nm1 != NoName) { if ((lp0_pos = lGetPosInDescr(lGetListDescr(lp0), nm0)) < 0) { LERROR(LENAMENOT); DEXIT; return NULL; } if ((lp1_pos = lGetPosInDescr(lGetListDescr(lp1), nm1)) < 0) { LERROR(LENAMENOT); DEXIT; return NULL; } if (mt_get_type(lp0->descr[lp0_pos].mt) != mt_get_type(lp1->descr[lp1_pos].mt) || mt_get_type(lp0->descr[lp0_pos].mt) == lListT) { LERROR(LEDIFFDESCR); DEXIT; return NULL; } } /* the real join ?! 
*/ if (!(dp = lJoinDescr(lGetListDescr(lp0), lGetListDescr(lp1), enp0, enp1))) { LERROR(LEJOINDESCR); DEXIT; return NULL; } if (!(dlp = lCreateList(name, dp))) { LERROR(LECREATELIST); sge_free(&dp); DEXIT; return NULL; } /* free dp it has been copied by lCreateList */ sge_free(&dp); for (i = 0, ep0 = lp0->first; i < lp0->nelem; i++, ep0 = ep0->next) { if (!lCompare(ep0, cp0)) continue; for (j = 0, ep1 = lp1->first; j < lp1->nelem; j++, ep1 = ep1->next) { if (!lCompare(ep1, cp1)) continue; if (nm1 != NoName) { /* in this case take it always */ /* This is a comparison of the join fields nm0 , nm1 */ switch (mt_get_type(lp0->descr[lp0_pos].mt)) { case lIntT: needed = (ep0->cont[lp0_pos].i == ep1->cont[lp1_pos].i); break; case lUlongT: needed = (ep0->cont[lp0_pos].ul == ep1->cont[lp1_pos].ul); break; case lStringT: needed = !strcmp(ep0->cont[lp0_pos].str, ep1->cont[lp1_pos].str); break; case lHostT: needed = !strcmp(ep0->cont[lp0_pos].str, ep1->cont[lp1_pos].str); break; case lLongT: needed = (ep0->cont[lp0_pos].l == ep1->cont[lp1_pos].l); break; case lFloatT: needed = (ep0->cont[lp0_pos].fl == ep1->cont[lp1_pos].fl); break; case lDoubleT: needed = (ep0->cont[lp0_pos].db == ep1->cont[lp1_pos].db); break; case lCharT: needed = (ep0->cont[lp0_pos].c == ep1->cont[lp1_pos].c); break; case lBoolT: needed = (ep0->cont[lp0_pos].b == ep1->cont[lp1_pos].b); break; case lRefT: needed = (ep0->cont[lp0_pos].ref == ep1->cont[lp1_pos].ref); break; default: unknownType("lJoin"); DEXIT; return NULL; } if (!needed) continue; } if (!(ep = lJoinCopyElem(dlp->descr, ep0, enp0, ep1, enp1))) { LERROR(LEJOINCOPYELEM); lFreeList(&dlp); DEXIT; return NULL; } else { if (lAppendElem(dlp, ep) == -1) { LERROR(LEAPPENDELEM); lFreeList(&dlp); DEXIT; return NULL; } } } } /* RETURN AN EMPTY LIST OR NULL THAT'S THE QUESTION */ if (lGetNumberOfElem(dlp) == 0) { lFreeList(&dlp); } DEXIT; return dlp; }
/****** binding_support/get_topology() *********************************** * NAME * get_topology() -- Creates the topology string for the current host. * * SYNOPSIS * bool get_topology(char** topology, int* length) * * FUNCTION * Creates the topology string for the current host. When created, * it has to be freed from outside. * * INPUTS * char** topology - The topology string for the current host. * int* length - The length of the topology string. * * RESULT * bool - when true the topology string could be generated (and memory * is allocated otherwise false * * NOTES * MT-NOTE: get_topology() is MT safe * *******************************************************************************/ bool get_topology(char** topology, int* length) { bool success = false; if (HAVE_HWLOC) { /* initialize length of topology string */ (*length) = 0; /* check if topology is supported via hwloc */ if (has_topology_information()) { int num_sockets; /* topology string */ dstring d_topology = DSTRING_INIT; /* build the topology string */ if ((num_sockets = get_number_of_sockets())) { int num_cores, ctr_cores, ctr_sockets, ctr_threads; char* s = "S"; /* socket */ char* c = "C"; /* core */ char* t = "T"; /* thread */ for (ctr_sockets = 0; ctr_sockets < num_sockets; ctr_sockets++) { /* append new socket */ sge_dstring_append_char(&d_topology, *s); (*length)++; /* for each socket get the number of cores */ if ((num_cores = get_number_of_cores(ctr_sockets))) { /* for thread counting */ int* proc_ids = NULL; int number_of_threads = 0; /* check each core */ for (ctr_cores = 0; ctr_cores < num_cores; ctr_cores++) { sge_dstring_append_char(&d_topology, *c); (*length)++; /* check if the core has threads */ if (get_processor_ids(ctr_sockets, ctr_cores, &proc_ids, &number_of_threads) && number_of_threads > 1) { /* print the threads */ for (ctr_threads = 0; ctr_threads < number_of_threads; ctr_threads++) { sge_dstring_append_char(&d_topology, *t); (*length)++; } } sge_free(&proc_ids); } } } /* for 
each socket */ if ((*length) != 0) { /* convert d_topolgy into topology */ (*length)++; /* we need `\0` at the end */ /* copy element */ (*topology) = sge_strdup(NULL, sge_dstring_get_string(&d_topology)); success = true; } sge_dstring_free(&d_topology); } } } return success; }
/****** shepherd_binding/binding_set_linear_linux() *************************************** * NAME * binding_set_linear_linux() -- Bind current process linear to chunk of cores. * * SYNOPSIS * bool binding_set_linear(int first_socket, int first_core, int * amount_of_cores, int offset) * * FUNCTION * Binds current process (shepherd) to a set of cores. All processes * started by the current process are inheriting the core binding (Linux). * * The core binding is done in a linear manner, that means that * the process is bound to 'amount_of_cores' cores using one core * after another starting at socket 'first_socket' (usually 0) and * core = 'first_core' (usually 0) + 'offset'. If the core number * is higher than the number of cores which are provided by socket * 'first_socket' then the next socket is taken (the core number * defines how many cores are skiped). * * INPUTS * int first_socket - The first socket (starting at 0) to bind to. * int first_core - The first core to bind. * int amount_of_cores - The amount of cores to bind to. * int offset - The user specified core number offset. 
* binding_type_t type - The type of binding ONLY FOR EXECD ( set | env | pe ) * * RESULT * bool - true if binding for current process was done, false if not * * NOTES * MT-NOTE: binding_set_linear() is not MT safe * *******************************************************************************/ static bool binding_set_linear_linux(int first_socket, int first_core, int amount_of_cores, int offset, const binding_type_t type) { /* sets bitmask in a linear manner */ /* first core is on exclusive host 0 */ /* first core could be set from scheduler */ /* offset is the first core to start with (make sense only with exclusive host) */ dstring error = DSTRING_INIT; if (_has_core_binding(&error) == true) { sge_dstring_clear(&error); /* bitmask for processors to turn on and off */ plpa_cpu_set_t cpuset; /* turn off all processors */ PLPA_CPU_ZERO(&cpuset); sge_dstring_free(&error); if (_has_topology_information()) { /* amount of cores set in processor binding mask */ int cores_set; /* next socket to use */ int next_socket = first_socket; /* the amount of cores of the next socket */ int socket_amount_of_cores; /* next core to use */ int next_core = first_core + offset; /* all the processor ids selected for the mask */ int* proc_id = NULL; /* size of proc_id array */ int proc_id_size = 0; /* maximal amount of sockets on this system */ int max_amount_of_sockets = get_amount_of_plpa_sockets(); /* strategy: go to the first_socket and the first_core + offset and fill up socket and go to the next one. */ /* TODO maybe better to search for using a core exclusively? */ while (get_amount_of_plpa_cores(next_socket) <= next_core) { /* TODO which kind of warning when first socket does not offer this? 
*/ /* move on to next socket - could be that we have to deal only with cores instead of <socket><core> tuples */ next_core -= get_amount_of_plpa_cores(next_socket); next_socket++; if (next_socket >= max_amount_of_sockets) { /* we are out of sockets - we do nothing */ return false; } } add_proc_ids_linux(next_socket, next_core, &proc_id, &proc_id_size); /* collect the other processor ids with the strategy */ for (cores_set = 1; cores_set < amount_of_cores; cores_set++) { next_core++; /* jump to next socket when it is needed */ /* maybe the next socket could offer 0 cores (I can' see when, but just to be sure) */ while ((socket_amount_of_cores = get_amount_of_plpa_cores(next_socket)) <= next_core) { next_socket++; next_core = next_core - socket_amount_of_cores; if (next_socket >= max_amount_of_sockets) { /* we are out of sockets - we do nothing */ sge_free(&proc_id); return false; } } /* get processor ids */ add_proc_ids_linux(next_socket, next_core, &proc_id, &proc_id_size); } /* set the mask for all processor ids */ set_processor_binding_mask(&cpuset, proc_id, proc_id_size); /* check what to do with the processor ids (set, env or pe) */ if (type == BINDING_TYPE_PE) { /* is done outside */ } else if (type == BINDING_TYPE_ENV) { /* set the environment variable */ /* this does not show up in "environment" file !!! */ if (create_binding_env_linux(proc_id, proc_id_size) == true) { shepherd_trace("binding_set_linear_linux: SGE_BINDING env var created"); } else { shepherd_trace("binding_set_linear_linux: problems while creating SGE_BINDING env"); } } else { /* bind SET process to mask */ if (bind_process_to_mask((pid_t) 0, cpuset) == false) { /* there was an error while binding */ sge_free(&proc_id); return false; } } sge_free(&proc_id); } else { /* TODO DG strategy without topology information but with working library? 
*/ shepherd_trace("binding_set_linear_linux: no information about topology"); return false; } } else { shepherd_trace("binding_set_linear_linux: PLPA binding not supported: %s", sge_dstring_get_string(&error)); sge_dstring_free(&error); } return true; }
/****** sge_binding/binding_explicit_check_and_account() ***********************
*  NAME
*     binding_explicit_check_and_account() -- Checks if a job can be bound.
*
*  SYNOPSIS
*     bool binding_explicit_check_and_account(const int* list_of_sockets, const
*     int samount, const int* list_of_cores, const int score, char**
*     topo_used_by_job, int* topo_used_by_job_length)
*
*  FUNCTION
*     Checks if the job can bind to the cores given by the <socket>,<core>
*     pairs. If so these cores are marked as used and true is returned. Also a
*     topology string is returned where all cores consumed by the job are
*     marked with lower case letters.
*
*  INPUTS
*     const int* list_of_sockets   - List of sockets to be used
*     const int samount            - Size of list_of_sockets
*     const int* list_of_cores     - List of cores (on sockets) to be used
*     const int score              - Size of list_of_cores
*
*  OUTPUTS
*     char** topo_used_by_job      -  Topology with resources job consumes marked.
*     int* topo_used_by_job_length -  Topology string length.
*
*  RESULT
*     bool - True if the job can be bound to the topology, false if not.
*            When the request is rejected after the output string was
*            created, the string is freed and the length set to 0.
*
*  NOTES
*     MT-NOTE: reads the file-level logical_used_topology and passes it to
*     MT-NOTE: account_job_on_topology(); no locking is done in this function
*
*******************************************************************************/
bool binding_explicit_check_and_account(const int* list_of_sockets, const int samount,
   const int* list_of_cores, const int score, char** topo_used_by_job,
   int* topo_used_by_job_length)
{
   int i;

   /* position of <socket>,<core> in topology string */
   int pos;
   /* status if accounting was possible */
   bool possible = true;

   /* input parameter validation */
   if (samount != score || samount <= 0 || list_of_sockets == NULL
         || list_of_cores == NULL) {
      return false;
   }

   /* check if the topology which is used already is accessible */
   if (logical_used_topology == NULL) {
      /* we have no topology string at the moment (should be initialized before) */
      if (!get_topology(&logical_used_topology, &logical_used_topology_length)) {
         /* couldn't even get the topology string */
         return false;
      }
   }

   /* create output string; without it the marking below would write
      through a pointer that was never set */
   if (!get_topology(topo_used_by_job, topo_used_by_job_length)) {
      return false;
   }

   /* go through the <socket>,<core> pair list */
   for (i = 0; i < samount; i++) {

      /* get position in topology string */
      if ((pos = get_position_in_topology(list_of_sockets[i], list_of_cores[i],
        logical_used_topology, logical_used_topology_length)) < 0) {
         /* the <socket>,<core> does not exist */
         possible = false;
         break;
      }

      /* check if this core is available (DG TODO introduce threads) */
      if (logical_used_topology[pos] == 'C') {
         /* do temporarily account it */
         (*topo_used_by_job)[pos] = 'c';
         /* thread binding: account threads here */
         account_all_threads_after_core(topo_used_by_job, pos);
      } else {
         /* core not usable -> early abort */
         possible = false;
         break;
      }
   }

   /* do accounting if all cores can be used */
   if (possible) {
      if (account_job_on_topology(&logical_used_topology, logical_used_topology_length,
         *topo_used_by_job, *topo_used_by_job_length) == false) {
         possible = false;
      }
   }

   /* free memory when unsuccessful */
   if (possible == false) {
      sge_free(topo_used_by_job);
      *topo_used_by_job_length = 0;
   }

   return possible;
}
/****** shepherd_binding/binding_explicit() ***************************************** * NAME * binding_explicit() -- Binds current process to specified CPU cores. * * SYNOPSIS * bool binding_explicit(int* list_of_cores, int camount, int* * list_of_sockets, int samount) * * FUNCTION * Binds the current process to the cores specified by a <socket>,<core> * tuple. The tuple is given by a list of sockets and a list of cores. * The elements on the same position of these lists are reflecting * a tuple. Therefore the length of the lists must be the same. * * Binding is currently done on Linux hosts only where the machine topology * can be retrieved with PLPA library. It also does require this library. * * INPUTS * int* list_of_sockets - List of sockets in the same order as list of cores. * int samount - Length of the list of sockets. * int* list_of_cores - List of cores in the same order as list of sockets. * int camount - Length of the list of cores. * int type - Type of binding ( set | env | pe ). 
* * RESULT * bool - true when the current process was bound like specified with the * input parameter * * NOTES * MT-NOTE: binding_explicit() is not MT safe * *******************************************************************************/ static bool binding_explicit(const int* list_of_sockets, const int samount, const int* list_of_cores, const int camount, const binding_type_t type) { /* return value: successful bound or not */ bool bound = false; /* check if we have exactly the same amount of sockets as cores */ if (camount != samount) { shepherd_trace("binding_explicit: bug: amount of sockets != amount of cores"); return false; } if (list_of_sockets == NULL || list_of_cores == NULL) { shepherd_trace("binding_explicit: wrong input values"); } /* do only on linux when we have core binding feature in kernel */ if (has_core_binding() == true) { if (_has_topology_information()) { /* bitmask for processors to turn on and off */ plpa_cpu_set_t cpuset; /* turn off all processors */ PLPA_CPU_ZERO(&cpuset); /* the internal processor ids selected for the binding mask */ int* proc_id = NULL; int proc_id_size = 0; /* processor id counter */ int pr_id_ctr; /* Fetch for each socket,core tuple the processor id. If this is not possible for one do nothing and return false. 
*/ /* go through all socket,core tuples and get the processor id */ for (pr_id_ctr = 0; pr_id_ctr < camount; pr_id_ctr++) { /* get the processor id */ /* get the OS internal processor ids */ if (add_proc_ids_linux(list_of_sockets[pr_id_ctr], list_of_cores[pr_id_ctr], &proc_id, &proc_id_size) != true) { sge_free(&proc_id); return false; } } /* generate the core binding mask out of the processor id array */ set_processor_binding_mask(&cpuset, proc_id, proc_id_size); if (type == BINDING_TYPE_PE) { /* rankfile is created */ } else if (type == BINDING_TYPE_ENV) { /* set the environment variable */ if (create_binding_env_linux(proc_id, proc_id_size) == true) { shepherd_trace("binding_explicit: SGE_BINDING env var created"); } else { shepherd_trace("binding_explicit: problems while creating SGE_BINDING env"); } } else { /* do the core binding for the current process with the mask */ if (bind_process_to_mask((pid_t) 0, cpuset) == true) { /* there was an error while binding */ bound = true; } else { /* couldn't be bound return false */ shepherd_trace("binding_explicit: bind_process_to_mask was not successful"); } } sge_free(&proc_id); } else { /* has no topology information */ shepherd_trace("binding_explicit: Linux does not offer topology information"); } } else { /* has no core binding ability */ shepherd_trace("binding_explicit: host does not support core binding"); } return bound; }
/****** get_linear_automatic_socket_core_list_and_account() ********************
*  NAME
*     get_linear_automatic_socket_core_list_and_account() -- Selects and
*     accounts 'amount' cores on the host topology.
*
*  SYNOPSIS
*     bool get_linear_automatic_socket_core_list_and_account(const int amount,
*        int** list_of_sockets, int* samount, int** list_of_cores, int* camount,
*        char** topo_by_job, int* topo_by_job_length)
*
*  FUNCTION
*     Works on a temporary copy of the file-level logical_used_topology.
*     First the sockets reported by get_free_sockets() are used, then the
*     remaining cores are taken from the socket returned by
*     get_socket_with_most_free_cores() until the request is satisfied.
*     Only when the whole request fits is the temporary accounting copied
*     back into logical_used_topology and the per job topology created.
*
*  INPUTS
*     const int amount        - Number of cores requested.
*
*  OUTPUTS
*     int** list_of_sockets   - Passed on to account_cores_on_socket().
*     int* samount            - Passed on to account_cores_on_socket().
*     int** list_of_cores     - Passed on to account_cores_on_socket().
*     int* camount            - Passed on to account_cores_on_socket().
*     char** topo_by_job      - Topology used by the job (set on success).
*     int* topo_by_job_length - Length of topo_by_job (set on success).
*
*  RESULT
*     bool - true if the request fits on the host, false if not, if no
*            topology is available or if memory could not be allocated
*
*  NOTES
*     MT-NOTE: modifies the file-level logical_used_topology; no locking is
*     MT-NOTE: done in this function
*
*******************************************************************************/
bool get_linear_automatic_socket_core_list_and_account(const int amount,
      int** list_of_sockets, int* samount, int** list_of_cores, int* camount,
      char** topo_by_job, int* topo_by_job_length)
{
   /* return value: if it is possible to fit the request on the host */
   bool possible = true;

   /* temp topology string where accounting is done on */
   char* tmp_topo_busy = NULL;

   /* number of cores we could account already */
   int used_cores = 0;

   /* the numbers of the sockets which are completely free */
   int* sockets = NULL;
   int sockets_size = 0;

   /* tmp counter */
   int i;

   /* without an initialized topology there is nothing to copy or account on */
   if (logical_used_topology == NULL || logical_used_topology_length <= 0) {
      return false;
   }

   /* get the topology which could be used by the job */
   tmp_topo_busy = (char *) calloc(logical_used_topology_length, sizeof(char));
   if (tmp_topo_busy == NULL) {
      /* out of memory: do not copy into a NULL buffer */
      return false;
   }
   memcpy(tmp_topo_busy, logical_used_topology, logical_used_topology_length*sizeof(char));

   /* 1. Find all free sockets and try to fit the request on them */
   if (get_free_sockets(tmp_topo_busy, logical_used_topology_length, &sockets,
         &sockets_size) == true) {

      /* there are free sockets: use them */
      for (i = 0; i < sockets_size && used_cores < amount; i++) {
         int needed_cores = amount - used_cores;
         used_cores += account_cores_on_socket(&tmp_topo_busy, logical_used_topology_length,
                           sockets[i], needed_cores, list_of_sockets, samount,
                           list_of_cores, camount);
      }

      sge_free(&sockets);
   }

   /* 2. If not all cores fit there - fill up the rest of the sockets */
   if (used_cores < amount) {

      /* the socket which offers some cores */
      int socket_free = 0;
      /* the number of cores we still need */
      int needed_cores = amount - used_cores;

      while (needed_cores > 0) {
         /* get the socket with the most free cores */
         if (get_socket_with_most_free_cores(tmp_topo_busy, logical_used_topology_length,
               &socket_free) == true) {

            int accounted_cores = account_cores_on_socket(&tmp_topo_busy,
                                    logical_used_topology_length, socket_free,
                                    needed_cores, list_of_sockets, samount,
                                    list_of_cores, camount);

            if (accounted_cores < 1) {
               /* there must be a bug in one of the last two functions! */
               possible = false;
               break;
            }

            needed_cores -= accounted_cores;

         } else {
            /* we don't have free cores anymore */
            possible = false;
            break;
         }
      }
   }

   if (possible == true) {
      /* calculate the topology used by the job out of the difference
         between the original and the temporary accounting */
      create_topology_used_per_job(topo_by_job, topo_by_job_length,
         logical_used_topology, tmp_topo_busy, logical_used_topology_length);

      /* make the temporary accounting permanent */
      memcpy(logical_used_topology, tmp_topo_busy, logical_used_topology_length*sizeof(char));
   }

   sge_free(&tmp_topo_busy);

   return possible;
}
/****** uti/log/log_buffer_destroy() ****************************************
*  NAME
*     log_buffer_destroy() -- Free thread local storage
*
*  SYNOPSIS
*     static void log_buffer_destroy(void* theBuffer)
*
*  FUNCTION
*     Free thread local storage.
*
*  INPUTS
*     void* theBuffer - Pointer to memory which should be freed.
*
*  RESULT
*     static void - none
*
*  NOTES
*     MT-NOTE: log_buffer_destroy() is MT safe.
*
*******************************************************************************/
static void log_buffer_destroy(void* theBuffer)
{
   /* sge_free() is called with the ADDRESS of the pointer everywhere else
      in this file (sge_free(&ptr)); passing the buffer itself would make it
      interpret the buffer contents as the pointer to release */
   sge_free(&theBuffer);
}
/****** tty_to_commlib() ******************************************************* * NAME * tty_to_commlib() -- tty_to_commlib thread entry point and main loop * * SYNOPSIS * void* tty_to_commlib(void *t_conf) * * FUNCTION * Entry point and main loop of the tty_to_commlib thread. * Reads data from the tty and writes it to the commlib. * * INPUTS * void *t_conf - pointer to cl_thread_settings_t struct of the thread * * RESULT * void* - always NULL * * NOTES * MT-NOTE: tty_to_commlib is MT-safe ? * * SEE ALSO *******************************************************************************/ void* tty_to_commlib(void *t_conf) { char *pbuf; fd_set read_fds; struct timeval timeout; dstring err_msg = DSTRING_INIT; dstring dbuf = DSTRING_INIT; int do_exit = 0; int ret, nread = 0; DENTER(TOP_LAYER, "tty_to_commlib"); thread_func_startup(t_conf); /* * allocate working buffer */ pbuf = (char*)malloc(BUFSIZE); if (pbuf == NULL) { DPRINTF(("tty_to_commlib can't allocate working buffer: %s (%d)\n", strerror(errno), errno)); do_exit = 1; } while (do_exit == 0) { FD_ZERO(&read_fds); if (g_nostdin == 0) { /* wait for input on tty */ FD_SET(STDIN_FILENO, &read_fds); } timeout.tv_sec = 1; timeout.tv_usec = 0; if (received_signal == SIGCONT) { received_signal = 0; if (continue_handler (g_comm_handle, g_hostname) == 1) { do_exit = 1; continue; } if (g_raw_mode_state == 1) { /* restore raw-mode after SIGCONT */ if (terminal_enter_raw_mode () != 0) { DPRINTF(("tty_to_commlib: couldn't enter raw mode for pty\n")); do_exit = 1; continue; } } } DPRINTF(("tty_to_commlib: Waiting in select() for data\n")); ret = select(STDIN_FILENO+1, &read_fds, NULL, NULL, &timeout); thread_testcancel(t_conf); client_check_window_change(g_comm_handle); if (received_signal == SIGHUP || received_signal == SIGINT || received_signal == SIGQUIT || received_signal == SIGTERM) { /* If we receive one of these signals, we must terminate */ do_exit = 1; continue; } if (ret > 0) { if (g_nostdin == 1) { /* We should never 
get here if STDIN is closed */ DPRINTF(("tty_to_commlib: STDIN ready to read while it should be closed!!!\n")); } DPRINTF(("tty_to_commlib: trying to read() from stdin\n")); nread = read(STDIN_FILENO, pbuf, BUFSIZE-1); pbuf[nread] = '\0'; sge_dstring_append (&dbuf, pbuf); DPRINTF(("tty_to_commlib: nread = %d\n", nread)); if (nread < 0 && (errno == EINTR || errno == EAGAIN)) { DPRINTF(("tty_to_commlib: EINTR or EAGAIN\n")); /* do nothing */ } else if (nread <= 0) { do_exit = 1; } else { DPRINTF(("tty_to_commlib: writing to commlib: %d bytes\n", nread)); if (suspend_handler(g_comm_handle, g_hostname, g_is_rsh, g_suspend_remote, g_pid, &dbuf) == 1) { if (comm_write_message(g_comm_handle, g_hostname, COMM_CLIENT, 1, (unsigned char*)pbuf, (unsigned long)nread, STDIN_DATA_MSG, &err_msg) != nread) { DPRINTF(("tty_to_commlib: couldn't write all data\n")); } else { DPRINTF(("tty_to_commlib: data successfully written\n")); } } comm_flush_write_messages(g_comm_handle, &err_msg); } } else { /* * We got either a select timeout or a select error. In both cases, * it's a good chance to check if our client is still alive. */ DPRINTF(("tty_to_commlib: Checking if client is still alive\n")); if (comm_get_connection_count(g_comm_handle, &err_msg) == 0) { DPRINTF(("tty_to_commlib: Client is not alive! -> exiting.\n")); do_exit = 1; } else { DPRINTF(("tty_to_commlib: Client is still alive\n")); } } } /* while (do_exit == 0) */ /* Send STDIN_CLOSE_MSG to the shepherd. That causes the shepherd to close its filedescriptor, also. */ if (comm_write_message(g_comm_handle, g_hostname, COMM_CLIENT, 1, (unsigned char*)" ", 1, STDIN_CLOSE_MSG, &err_msg) != 1) { DPRINTF(("tty_to_commlib: couldn't write STDIN_CLOSE_MSG\n")); } else { DPRINTF(("tty_to_commlib: STDIN_CLOSE_MSG successfully written\n")); } /* clean up */ sge_dstring_free(&dbuf); sge_free(&pbuf); thread_func_cleanup(t_conf); sge_dstring_free(&err_msg); DPRINTF(("tty_to_commlib: exiting tty_to_commlib thread!\n")); DEXIT; return NULL; }
/****** uti/io/sge_bin2string() ***********************************************
*  NAME
*     sge_bin2string() -- Put binary stream into a string
*
*  SYNOPSIS
*     char* sge_bin2string(FILE *fp, int size)
*
*  FUNCTION
*     Read a binary stream from the file descriptor underlying 'fp' and
*     write it into a (dynamically) malloced buffer in "ASCII" format.
*
*     "ASCII" format means:
*           '\0' is written as '\\' '0'
*           '\\' is written as '\\' '\\'
*           End of buffer is written as '\0'
*
*  INPUTS
*     FILE *fp - stream to read from (accessed via fileno()/read())
*     int size - initial size of the destination buffer; values <= 0
*                select the default chunk size
*
*  RESULT
*     char* - malloced buffer, NULL on read or allocation failure
*
*  SEE ALSO
*     uti/io/sge_string2bin()
*
*  NOTES
*     MT-NOTE: sge_bin2string() is MT safe
******************************************************************************/
char *sge_bin2string(FILE *fp, int size)
{
   int i, fd;
   char inbuf[BUFFER], outbuf[2*BUFFER];   /* worst case: every byte doubles */
   char *inp, *outp;
   char *dstbuf;
   int len,             /* length of current tmp buffer */
       dstbuflen,       /* total length of destination buffer */
       chunksize,       /* chunks for realloc */
       lastpos,         /* last position in destination buffer */
       error;

   if ((fd = fileno(fp)) == -1) {
      return NULL;
   }

   chunksize = 20480;

   if (size <= 0) {     /* no idea about buffer, malloc in chunks */
      size = chunksize;
   }

   dstbuf = (char *) malloc(size+1);
   if (dstbuf == NULL) {
      /* nothing to copy into; the memcpy below must not see a NULL buffer */
      return NULL;
   }
   dstbuflen = size;
   lastpos = 0;

   error = false;

   while (!error) {
      i = read(fd, inbuf, BUFFER);
      if (i > 0) {
         /* escape the chunk into outbuf */
         inp = inbuf;
         outp = outbuf;
         while (inp < &inbuf[i]) {
            if (*inp == '\\') {
               *outp++ = '\\';
               *outp++ = '\\';
            } else if (*inp == '\0') {
               *outp++ = '\\';
               *outp++ = '0';
            } else {
               *outp++ = *inp;
            }
            inp++;
         }

         len = outp - outbuf;

         /* grow the destination when the escaped chunk does not fit */
         if (lastpos + len > dstbuflen) {
            if ((dstbuf = sge_realloc(dstbuf, lastpos + len + chunksize, 0)) == NULL) {
               error = true;
               break;
            }
            dstbuflen = lastpos + len + chunksize;
         }

         memcpy(&dstbuf[lastpos], outbuf, len);
         lastpos += len;

      } else if (i == 0) {
         /* end of input */
         break;
      } else {
         /* an interrupted read is retried, everything else is fatal */
         if (errno != EINTR) {
            error = true;
            break;
         }
      }
   }

   if (error) {
      sge_free(&dstbuf);
      return NULL;
   } else {
      /* shrink to the used size plus the terminating '\0' */
      if ((dstbuf = sge_realloc(dstbuf, lastpos + 1, 0)) == NULL) {
         return NULL;
      }
      dstbuf[lastpos] = '\0';
      return dstbuf;
   }
}
/****** uti/spool/sge_get_management_entry() *************************************
*  NAME
*     sge_get_management_entry() - Read management.properties file entries
*
*  SYNOPSIS
*     int sge_get_management_entry(const char *fname, int n, int nmissing,
*                                  bootstrap_entry_t name[],
*                                  char value[][SGE_PATH_MAX],
*                                  dstring *error_dstring)
*
*  FUNCTION
*     Reads in an array of configuration file entries of the form
*     "name=value". Lines whose first token starts with '#' are skipped.
*     Names are compared case-insensitively. For every matching name[i]
*     the value (or "" when there is none) is copied into value[i].
*
*  INPUTS
*     const char *fname        - file to read
*     int n                    - number of elements in name[] and value[]
*     int nmissing             - number of required entries expected
*     bootstrap_entry_t name[] - requested entries (name, is_required)
*     char value[][]           - receives the values
*     dstring *error_dstring   - receives error text; when NULL the error
*                                is logged with CRITICAL() instead
*
*  RESULT
*     int - 0 on success
*           n when the file could not be opened or memory is exhausted
*           otherwise the number of required entries still missing
*
*  BUGS
*     Function can not differ multiple similar named entries: the value of
*     the last occurrence wins.
*
*  NOTES
*     MT-NOTE: sge_get_management_entry() is MT safe
******************************************************************************/
int sge_get_management_entry(const char *fname, int n, int nmissing, bootstrap_entry_t name[],
                          char value[][SGE_PATH_MAX],
                          dstring *error_dstring)
{
   FILE *fp;
   char buf[SGE_PATH_MAX], *cp;
   int i;
   bool *is_found = NULL;

   DENTER(TOP_LAYER, "sge_get_management_entry");

   if (!(fp = fopen(fname, "r"))) {
      if (error_dstring == NULL){
         CRITICAL((SGE_EVENT, MSG_FILE_FOPENFAILED_SS, fname, strerror(errno)));
      } else {
         sge_dstring_sprintf(error_dstring, MSG_FILE_FOPENFAILED_SS,
                             fname, strerror(errno));
      }
      DEXIT;
      return n;
   }

   /* zero-initialized "seen" flags; allocate at least one element so that
      n <= 0 does not depend on the result of a zero sized allocation */
   is_found = calloc((n > 0) ? (size_t)n : 1, sizeof(bool));
   if (is_found == NULL) {
      fclose(fp);
      DEXIT;
      return n;
   }

   while (fgets(buf, sizeof(buf), fp)) {
      char *pos = NULL;
      char *nam = NULL;
      char *val = NULL;

      /* set chrptr to the first non blank character
       * If line is empty continue with next line
       */
      if (!(cp = strtok_r(buf, " \t\n", &pos))) {
         continue;
      }

      /* allow commentaries */
      if (cp[0] == '#') {
         continue;
      }

      /* Split "name=value" exactly once per line. Tokenizing inside the
       * search loop would restart on the already cut string and yield no
       * value from the second iteration on.
       */
      nam = strtok_r(cp, "=", &pos);
      val = strtok_r(NULL, "\n", &pos);
      if (nam == NULL) {
         continue;
      }

      /* search for all requested configuration values */
      for (i=0; i<n; i++) {
         if (strcasecmp(name[i].name, nam) == 0) {
            DPRINTF(("nam = %s\n", nam));
            if (val != NULL) {
               DPRINTF(("val = %s\n", val));
               sge_strlcpy(value[i], val, SGE_PATH_MAX);
            } else {
               sge_strlcpy(value[i], "", SGE_PATH_MAX);
            }
            /* count a required entry only the first time it is seen */
            if (name[i].is_required && !is_found[i]) {
               --nmissing;
            }
            is_found[i] = true;
            break;
         }
      }
   }

   if (nmissing != 0) {
      /* report the first required entry that was not found */
      for (i=0; i<n; i++) {
         if (!is_found[i] && name[i].is_required) {
            if (error_dstring == NULL){
               CRITICAL((SGE_EVENT, MSG_UTI_CANNOTLOCATEATTRIBUTEMAN_SS,
                         name[i].name, fname));
            } else {
               sge_dstring_sprintf(error_dstring, MSG_UTI_CANNOTLOCATEATTRIBUTEMAN_SS,
                                   name[i].name, fname);
            }
            break;
         }
      }
   }

   sge_free(&is_found);
   FCLOSE(fp);
   DEXIT;
   return nmissing;
FCLOSE_ERROR:
   /* a failing close of the read-only stream must not hide missing entries */
   DEXIT;
   return nmissing;
} /* sge_get_management_entry() */