/****** gdi/request_internal/sge_gdi_packet_get_pb_size() ******************** * NAME * sge_gdi_packet_get_pb_size() -- returns the needed puckbuffer size * * SYNOPSIS * u_long32 sge_gdi_packet_get_pb_size(sge_gdi_packet_class_t *packet) * * FUNCTION * This function checks how lage a packbuffer needs to be so that * the given "packet" can be packed completely. * * This operation is needed to avoid continuous reallocation * when the real buffer should be allocated. * * INPUTS * sge_gdi_packet_class_t *packet - packet pointer * * RESULT * u_long32 - size in byte * * NOTES * MT-NOTE: sge_gdi_packet_get_pb_size() is MT safe * * SEE ALSO * gdi/request_internal/sge_gdi_packet_pack_task() *******************************************************************************/ u_long32 sge_gdi_packet_get_pb_size(sge_gdi_packet_class_t *packet) { u_long32 ret = 0; DENTER(TOP_LAYER, "sge_gdi_packet_get_pb_size"); if (packet != NULL) { bool local_ret; lList *local_answer_list = NULL; sge_pack_buffer pb; init_packbuffer(&pb, 0, 1); local_ret = sge_gdi_packet_pack(packet, &local_answer_list, &pb); if (local_ret) { ret = pb_used(&pb); } clear_packbuffer(&pb); } DRETURN(ret); }
int sge_send_ack_to_qmaster(sge_gdi_ctx_class_t *ctx, u_long32 type, u_long32 ulong_val, u_long32 ulong_val_2, const char *str, lList **alpp) { int ret; sge_pack_buffer pb; const char* commproc = prognames[QMASTER]; const char* rhost = ctx->get_master(ctx, false); int id = 1; DENTER(TOP_LAYER, "sge_gdi2_send_ack_to_qmaster"); /* send an ack to the qmaster for the events */ if (init_packbuffer(&pb, 1024, 0) != PACK_SUCCESS) { DRETURN(CL_RETVAL_MALLOC); } pack_ack(&pb, type, ulong_val, ulong_val_2, str); ret = sge_gdi2_send_any_request(ctx, 0, NULL, rhost, commproc, id, &pb, TAG_ACK_REQUEST, 0, alpp); clear_packbuffer(&pb); answer_list_output (alpp); DRETURN(ret); }
void * sge_worker_main(void *arg) { bool do_endlessly = true; cl_thread_settings_t *thread_config = (cl_thread_settings_t*)arg; sge_gdi_ctx_class_t *ctx = NULL; monitoring_t monitor; monitoring_t *monitorp = &monitor; time_t next_prof_output = 0; DENTER(TOP_LAYER, "sge_worker_main"); DPRINTF(("started")); cl_thread_func_startup(thread_config); sge_monitor_init(&monitor, thread_config->thread_name, GDI_EXT, MT_WARNING, MT_ERROR); sge_qmaster_thread_init(&ctx, QMASTER, WORKER_THREAD, true); /* register at profiling module */ set_thread_name(pthread_self(), "Worker Thread"); conf_update_thread_profiling("Worker Thread"); while (do_endlessly) { sge_gdi_packet_class_t *packet = NULL; /* * Wait for packets. As long as packets are available cancelation * of this thread is ignored. The shutdown procedure in the main * thread takes care that packet producers will be terminated * before all worker threads so that this won't be a problem. */ MONITOR_IDLE_TIME( sge_tq_wait_for_task(Master_Task_Queue, 1, SGE_TQ_GDI_PACKET, (void *)&packet), &monitor, mconf_get_monitor_time(), mconf_is_monitor_message()); MONITOR_SET_QLEN((monitorp), sge_tq_get_task_count(Master_Task_Queue)); if (packet != NULL) { sge_gdi_task_class_t *task = packet->first_task; bool is_only_read_request = true; thread_start_stop_profiling(); #ifdef SEND_ANSWER_IN_LISTENER #else /* * prepare buffer for sending an answer */ if (packet->is_intern_request == false && packet->is_gdi_request == true) { init_packbuffer(&(packet->pb), 0, 0); } #endif MONITOR_MESSAGES((monitorp)); if (packet->is_gdi_request == true) { /* * test if a write lock is necessary */ task = packet->first_task; while (task != NULL) { u_long32 command = SGE_GDI_GET_OPERATION(task->command); if (command != SGE_GDI_GET) { is_only_read_request = false; break; } task = task->next; } } else { is_only_read_request = false; } /* * acquire the correct lock */ if (is_only_read_request) { MONITOR_WAIT_TIME(SGE_LOCK(LOCK_GLOBAL, LOCK_READ), monitorp); } else { MONITOR_WAIT_TIME(SGE_LOCK(LOCK_GLOBAL, LOCK_WRITE), monitorp); } if (packet->is_gdi_request == true) { /* * do the GDI request */ task = packet->first_task; while (task != NULL) { sge_c_gdi(ctx, packet, task, &(task->answer_list), &monitor); task = task->next; } } else { task = packet->first_task; sge_c_report(ctx, packet->host, packet->commproc, packet->commproc_id, task->data_list, &monitor); } /* * do unlock */ if (is_only_read_request) { SGE_UNLOCK(LOCK_GLOBAL, LOCK_READ) } else { SGE_UNLOCK(LOCK_GLOBAL, LOCK_WRITE) } if (packet->is_gdi_request == true) { #ifdef SEND_ANSWER_IN_LISTENER sge_gdi_packet_broadcast_that_handled(packet); #else /* * Send the answer to the client */ if (packet->is_intern_request == false) { MONITOR_MESSAGES_OUT(monitorp); sge_gdi2_send_any_request(ctx, 0, NULL, packet->host, packet->commproc, packet->commproc_id, &(packet->pb), TAG_GDI_REQUEST, packet->response_id, NULL); clear_packbuffer(&(packet->pb)); # ifdef BLOCK_LISTENER sge_gdi_packet_broadcast_that_handled(packet); # else sge_gdi_packet_free(&packet); # endif /* * Code only for TS: * * Following if-block will only be executed in testsuite if the qmaster * parameter __TEST_SLEEP_AFTER_REQUEST is defined. This will block the * worker thread if it handled a request. Only this makes sure that * other worker threads can handle incoming requests. Otherwise * it might be possible that one worker threads handles all requests * on fast qmaster hosts if testsuite is not fast enough to generate * gdi requests. */ if (mconf_get_enable_test_sleep_after_request() == true) { sleep(5); } } else { sge_gdi_packet_broadcast_that_handled(packet); /* this is an internal request, packet will get destroyed later, * where the caller waits for the answer * make sure it is no longer accessed here */ packet = NULL; } #endif } else { sge_gdi_packet_free(&packet); } thread_output_profiling("worker thread profiling summary:\n", &next_prof_output); sge_monitor_output(&monitor); } else {
/****** sge_c_report() ******************************************************* * NAME * sge_c_report() -- process execd load report * * SYNOPSIS * void sge_c_report(char *rhost, char *commproc, int id, lList *report_list) * * FUNCTION * * INPUTS * char *rhost * char *commproc * int id * lList *report_list * * RESULT * void - nothing * * NOTES * MT-NOTE: sge_c_report() is MT safe * ******************************************************************************/ void sge_c_report(sge_gdi_ctx_class_t *ctx, char *rhost, char *commproc, int id, lList *report_list, monitoring_t *monitor) { lListElem *hep = NULL; u_long32 rep_type; lListElem *report; int ret = 0; u_long32 this_seqno, last_seqno; u_long32 rversion; sge_pack_buffer pb; bool is_pb_used = false; bool send_tag_new_conf = false; DENTER(TOP_LAYER, "sge_c_report"); if (lGetNumberOfElem(report_list) == 0) { DPRINTF(("received empty report\n")); if (rhost != NULL) { WARNING((SGE_EVENT, MSG_QMASTER_RECEIVED_EMPTY_LOAD_REPORT_S, rhost)); } else { WARNING((SGE_EVENT, MSG_QMASTER_RECEIVED_EMPTY_LOAD_REPORT_S, "unknown")); } DRETURN_VOID; } /* accept reports only from execd's */ if (strcmp(prognames[EXECD], commproc)) { ERROR((SGE_EVENT, MSG_GOTSTATUSREPORTOFUNKNOWNCOMMPROC_S, commproc)); DRETURN_VOID; } /* do not process load reports from old execution daemons */ rversion = lGetUlong(lFirst(report_list), REP_version); if (verify_request_version(NULL, rversion, rhost, commproc, id)) { DRETURN_VOID; } this_seqno = lGetUlong(lFirst(report_list), REP_seqno); /* need exec host for all types of reports */ if (!(hep = host_list_locate(*object_type_get_master_list(SGE_TYPE_EXECHOST), rhost))) { ERROR((SGE_EVENT, MSG_GOTSTATUSREPORTOFUNKNOWNEXECHOST_S, rhost)); DRETURN_VOID; } /* prevent old reports being proceeded frequent loggings of outdated reports can be an indication of too high message traffic arriving at qmaster */ last_seqno = lGetUlong(hep, EH_report_seqno); if ((this_seqno < last_seqno && (last_seqno - this_seqno) <= 9000) && !(last_seqno > 9990 && this_seqno < 10)) { /* this must be an old report, log and then ignore it */ INFO((SGE_EVENT, MSG_QMASTER_RECEIVED_OLD_LOAD_REPORT_UUS, sge_u32c(this_seqno), sge_u32c(last_seqno), rhost)); DRETURN_VOID; } lSetUlong(hep, EH_report_seqno, this_seqno); /* RU: */ /* tag all reschedule_unknown list entries we hope to hear about in that job report */ update_reschedule_unknown_list(ctx, hep); /* ** process the reports one after the other ** usually there will be a load report ** and a configuration version report */ for_each(report, report_list) { rep_type = lGetUlong(report, REP_type); switch (rep_type) { case NUM_REP_REPORT_LOAD: case NUM_REP_FULL_REPORT_LOAD: MONITOR_ELOAD(monitor); /* Now handle execds load reports */ if (lGetUlong(hep, EH_lt_heard_from) == 0 && rep_type != NUM_REP_FULL_REPORT_LOAD) { host_notify_about_full_load_report(ctx, hep); } else { if (!is_pb_used) { is_pb_used = true; init_packbuffer(&pb, 1024, 0); } sge_update_load_values(ctx, rhost, lGetList(report, REP_list)); if (mconf_get_simulate_execds()) { lList *master_exechost_list = *object_type_get_master_list(SGE_TYPE_EXECHOST); lListElem *shep; lListElem *simhostElem=NULL; for_each(shep, master_exechost_list) { simhostElem = lGetSubStr(shep, CE_name, "load_report_host", EH_consumable_config_list); if (simhostElem != NULL) { const char *real_host = lGetString(simhostElem, CE_stringval); if (real_host != NULL && sge_hostcmp(real_host, rhost) == 0) { const char* sim_host = lGetHost(shep, EH_name); lListElem *clp = NULL; DPRINTF(("Copy load values of %s to simulated host %s\n", rhost, sim_host)); for_each(clp, lGetList(report, REP_list)) { if (strcmp(lGetHost(clp, LR_host), SGE_GLOBAL_NAME) != 0) { lSetHost(clp, LR_host, sim_host); } } sge_update_load_values(ctx, sim_host, lGetList(report, REP_list)); } } } } pack_ack(&pb, ACK_LOAD_REPORT, this_seqno, 0, NULL); } break; case NUM_REP_REPORT_CONF: MONITOR_ECONF(monitor); if (sge_compare_configuration(hep, lGetList(report, REP_list)) != 0) { DPRINTF(("%s: configuration on host %s is not up to date\n", SGE_FUNC, rhost)); send_tag_new_conf = true; } break; case NUM_REP_REPORT_PROCESSORS: /* ** save number of processors */ MONITOR_EPROC(monitor); ret = update_license_data(ctx, hep, lGetList(report, REP_list)); if (ret) { ERROR((SGE_EVENT, MSG_LICENCE_ERRORXUPDATINGLICENSEDATA_I, ret)); } break; case NUM_REP_REPORT_JOB: MONITOR_EJOB(monitor); if (!is_pb_used) { is_pb_used = true; init_packbuffer(&pb, 1024, 0); } process_job_report(ctx, report, hep, rhost, commproc, &pb, monitor); break; default: DPRINTF(("received invalid report type %ld\n", (long) rep_type)); }
/****** gdi/sge/sge_qexecve() ************************************************ * NAME * sge_qexecve() -- start a task in a tightly integrated par. job * * SYNOPSIS * sge_tid_t sge_qexecve(const char *hostname, const char *queuename, * const char *cwd, const lList *environment * const lList *path_aliases) * * FUNCTION * Starts a task in a tightly integrated job. * Builds a job object describing the task, * connects to the commd on the targeted execution host, * deliveres the job object and waits for an answer. * The answer from the execution daemon on the execution host * contains a task id that is returned to the caller of the function. * * INPUTS * const char *hostname - name of the host on which to start the task * const lList *environment - list containing environment variable * settings for the task that override the * default environment * const lList *path_aliases - optional a path alias list * * RESULT * sge_tid_t - the task id, if the task can be executed, * a value <= 0 indicates an error. * * NOTES * MT-NOTE: sge_qexecve() is not MT safe ******************************************************************************/ sge_tid_t sge_qexecve(sge_gdi_ctx_class_t *ctx, const char *hostname, const char *queuename, const char *cwd, const lList *environment, const lList *path_aliases) { char myname[256]; const char *s; int ret, uid; sge_tid_t tid = NULL; lListElem *petrep; lListElem *rt; sge_pack_buffer pb; u_long32 jobid, jataskid; u_long32 dummymid = 0; const char *env_var_name = "SGE_TASK_ID"; DENTER(TOP_LAYER, "sge_qexecve"); if (hostname == NULL) { sprintf(lasterror, MSG_GDI_INVALIDPARAMETER_SS, "sge_qexecve", "hostname"); DRETURN(NULL); } /* resolve user */ if (sge_uid2user((uid=getuid()), myname, sizeof(myname)-1, MAX_NIS_RETRIES)) { sprintf(lasterror, MSG_GDI_RESOLVINGUIDTOUSERNAMEFAILED_IS , uid, strerror(errno)); DRETURN(NULL); } if ((s=getenv("JOB_ID")) == NULL) { sprintf(lasterror, MSG_GDI_MISSINGINENVIRONMENT_S, "JOB_ID"); DRETURN(NULL); } if (sscanf(s, sge_u32, &jobid) != 1) { sprintf(lasterror, MSG_GDI_STRINGISINVALID_SS, s, "JOB_ID"); DRETURN(NULL); } if ((s=getenv(env_var_name)) != NULL) { if (strcmp(s, "undefined") == 0) { jataskid = 1; } else { if (sscanf(s, sge_u32, &jataskid) != 1) { sprintf(lasterror, MSG_GDI_STRINGISINVALID_SS, s, env_var_name); DRETURN(NULL); } } } else { sprintf(lasterror, MSG_GDI_MISSINGINENVIRONMENT_S, env_var_name); DRETURN(NULL); } /* ---- build up pe task request structure (see gdilib/sge_petaskL.h) */ petrep = lCreateElem(PETR_Type); lSetUlong(petrep, PETR_jobid, jobid); lSetUlong(petrep, PETR_jataskid, jataskid); lSetString(petrep, PETR_owner, myname); lSetUlong(petrep, PETR_submission_time, sge_get_gmt()); if (cwd != NULL) { lSetString(petrep, PETR_cwd, cwd); } if (environment != NULL) { lSetList(petrep, PETR_environment, lCopyList("environment", environment)); } if (path_aliases != NULL) { lSetList(petrep, PETR_path_aliases, lCopyList("path_aliases", path_aliases)); } if (queuename != NULL) { lSetString(petrep, PETR_queuename, queuename); } if (init_packbuffer(&pb, 1024, 0) != PACK_SUCCESS) { lFreeElem(&petrep); sprintf(lasterror, SFNMAX, MSG_GDI_OUTOFMEMORY); DRETURN(NULL); } pack_job_delivery(&pb, petrep); ret = gdi2_send_message_pb(ctx, 1, prognames[EXECD], 1, hostname, TAG_JOB_EXECUTION, &pb, &dummymid); clear_packbuffer(&pb); lFreeElem(&petrep); if (ret != CL_RETVAL_OK) { sprintf(lasterror, MSG_GDI_SENDTASKTOEXECDFAILED_SS, hostname, cl_get_error_text(ret)); DRETURN(NULL); } /* add list into our remote task list */ rt = lAddElemStr(&remote_task_list, RT_tid, "none", RT_Type); lSetHost(rt, RT_hostname, hostname); lSetUlong(rt, RT_state, RT_STATE_WAIT4ACK); rcv_from_execd(ctx, OPT_SYNCHRON, TAG_JOB_EXECUTION); tid = (sge_tid_t) lGetString(rt, RT_tid); if (strcmp(tid, "none") == 0) { tid = NULL; sprintf(lasterror, MSG_GDI_EXECDONHOSTDIDNTACCEPTTASK_S, hostname); } /* now close message to execd */ cl_commlib_shutdown_handle(cl_com_get_handle("execd_handle", 0), false); DRETURN(tid); }