Ejemplo n.º 1
0
/****** qmaster/sge_qmaster_main/main() ****************************************
*  NAME
*     main() -- qmaster entry point 
*
*  SYNOPSIS
*     int main(int argc, char* argv[]) 
*
*  FUNCTION
*     Qmaster entry point.
*
*     NOTE: The main thread must block all signals before any additional thread
*     is created. Failure to do so will ruin signal handling!
*
*  INPUTS
*     int argc     - number of commandline arguments 
*     char* argv[] - commandline arguments 
*
*  RESULT
*     0 - success 
*
*  NOTES
*     We check whether 'SGE_ROOT' is set before we daemonize. Once qmaster is
*     a daemon, we are no longer connected to a terminal and hence can not
*     output an error message to stdout or stderr.
*
*     We need to inovke 'prepare_enroll()' *before* the user id is switched via
*     'become_admin_user()'. This is because qmaster must be able to bind a so
*     called reserved port (requires root privileges) if configured to do so.
*
*******************************************************************************/
int main(int argc, char* argv[])
{
   int max_enroll_tries;
   int ret_val;
   int file_descriptor_settings_result = 0;
   bool has_daemonized = false;
   sge_gdi_ctx_class_t *ctx = NULL;
   u_long32 start_time = sge_get_gmt();
   monitoring_t monitor;

   DENTER_MAIN(TOP_LAYER, "qmaster");

   sge_monitor_init(&monitor, "MAIN", NONE_EXT, MT_WARNING, MT_ERROR);
   prof_mt_init();

   sge_get_root_dir(true, NULL, 0, true);
   
#ifdef __SGE_COMPILE_WITH_GETTEXT__  
   sge_init_language_func((gettext_func_type)gettext, (setlocale_func_type)setlocale, (bindtextdomain_func_type)bindtextdomain, (textdomain_func_type)textdomain);
   sge_init_language(NULL,NULL);   
#endif 

   /* 
    * qmaster doesn't support any commandline anymore,
    * but we should show version string and -help option 
    */
   if (argc != 1) {
      sigset_t sig_set;
      sigfillset(&sig_set);
      pthread_sigmask(SIG_SETMASK, &sig_set, NULL);
      sge_qmaster_thread_init(&ctx, QMASTER, MAIN_THREAD, true);
      sge_process_qmaster_cmdline(argv);
      SGE_EXIT((void**)&ctx, 1);
   }

   /*
    * daemonize qmaster
    * set file descriptor limits
    * and initialize libraries to be used in multi threaded environment
    * also take care that finished child processed of this process become
    * zombie jobs
    */
   has_daemonized = sge_daemonize_qmaster();
   file_descriptor_settings_result = set_file_descriptor_limit();
#if !defined(INTERIX) && !defined(CYGWIN)
   init_sig_action_and_mask();
#endif

   /* init qmaster threads without becomming admin user */
   sge_qmaster_thread_init(&ctx, QMASTER, MAIN_THREAD, false);

   ctx->set_daemonized(ctx, has_daemonized);

   /* this must be done as root user to be able to bind ports < 1024 */
   max_enroll_tries = 30;
   while (cl_com_get_handle(prognames[QMASTER],1) == NULL) {
      ctx->prepare_enroll(ctx);
      max_enroll_tries--;
      if (max_enroll_tries <= 0) {
         /* exit after 30 seconds */
         CRITICAL((SGE_EVENT, MSG_QMASTER_COMMUNICATION_ERRORS ));
         SGE_EXIT((void**)&ctx, 1);
      }
      if (cl_com_get_handle(prognames[QMASTER],1) == NULL) {
        /* sleep when prepare_enroll() failed */
        sleep(1);
      }
   }

   /*
    * now the commlib up and running. Set qmaster application status function 
    * (commlib callback function for qping status information response 
    *  messages (SIRM))
    */
   ret_val = cl_com_set_status_func(sge_qmaster_application_status);
   if (ret_val != CL_RETVAL_OK) {
      ERROR((SGE_EVENT, cl_get_error_text(ret_val)));
   }

   /* 
    * now we become admin user change into the correct root directory set the
    * the target for logging messages
    */
   sge_become_admin_user(ctx->get_admin_user(ctx));
   sge_chdir_exit(ctx->get_qmaster_spool_dir(ctx), 1);
   log_state_set_log_file(ERR_FILE);
   ctx->set_exit_func(ctx, sge_exit_func);

#if defined(SOLARIS)
   /* Init shared SMF libs if necessary */
   if (sge_smf_used() == 1 && sge_smf_init_libs() != 0) {
       SGE_EXIT((void**)&ctx, 1);
   }
#endif

   /*
    * We do increment the heartbeat manually here. This is the 'startup heartbeat'. 
    * The first time the hearbeat will be incremented through the heartbeat event 
    * handler is after about HEARTBEAT_INTERVAL seconds. The hardbeat event handler
    * is setup during the initialisazion of the timer thread.
    */
   inc_qmaster_heartbeat(QMASTER_HEARTBEAT_FILE, HEARTBEAT_INTERVAL, NULL);
     
   /*
    * Event master module has to be initialized already here because
    * sge_setup_qmaster() might already access it although event delivery
    * thread is not running.
    *
    * Corresponding shutdown is done in sge_event_master_terminate();
    *
    * EB: In my opinion the init function should called in
    * sge_event_master_initialize(). Is it possible to move that call?
    */ 
   sge_event_master_init();

   sge_setup_qmaster(ctx, argv);

#ifndef USE_POLL
   if (file_descriptor_settings_result == 1) {
      WARNING((SGE_EVENT, MSG_QMASTER_FD_SETSIZE_LARGER_THAN_LIMIT_U, sge_u32c(FD_SETSIZE)));
      WARNING((SGE_EVENT, MSG_QMASTER_FD_SETSIZE_COMPILE_MESSAGE1_U, sge_u32c(FD_SETSIZE - 20)));
      WARNING((SGE_EVENT, MSG_QMASTER_FD_SETSIZE_COMPILE_MESSAGE2));
      WARNING((SGE_EVENT, MSG_QMASTER_FD_SETSIZE_COMPILE_MESSAGE3));
   }
#endif

   /*
    * Setup all threads and initialize corresponding modules. 
    * Order is important!
    */
   sge_signaler_initialize(ctx);
   sge_event_master_initialize(ctx);
   sge_timer_initialize(ctx, &monitor);
   sge_worker_initialize(ctx);
#if 0
   sge_test_initialize(ctx);
#endif
   sge_listener_initialize(ctx);
   sge_scheduler_initialize(ctx, NULL);
#ifndef NO_JNI
   sge_jvm_initialize(ctx, NULL);
#endif

   INFO((SGE_EVENT, "qmaster startup took "sge_u32" seconds", sge_get_gmt() - start_time));

   /*
    * Block till signal from signal thread arrives us
    */
   sge_thread_wait_for_signal();

   /* 
    * Shutdown all threads and shutdown corresponding modules.
    * Order is important!
    */
#ifndef NO_JNI
   sge_jvm_terminate(ctx, NULL);
#endif
   sge_scheduler_terminate(ctx, NULL);
   sge_listener_terminate();
#if 0
   sge_test_terminate(ctx);
#endif
   sge_worker_terminate(ctx);
   sge_timer_terminate();
   sge_event_master_terminate();
   sge_signaler_terminate();

   /*
    * Remaining shutdown operations
    */
   sge_clean_lists();
   sge_monitor_free(&monitor);

   sge_shutdown((void**)&ctx, sge_qmaster_get_exit_state());
   sge_prof_cleanup();

   DEXIT;
   return 0;
} /* main() */
Ejemplo n.º 2
0
/****** gdi/sge/sge_qexecve() ************************************************
*  NAME
*     sge_qexecve() -- start a task in a tightly integrated par. job
*
*  SYNOPSIS
*     sge_tid_t sge_qexecve(const char *hostname, const char *queuename,
*                           const char *cwd, const lList *environment
*                           const lList *path_aliases)
*
*  FUNCTION
*     Starts a task in a tightly integrated job.
*     Builds a job object describing the task,
*     connects to the commd on the targeted execution host,
*     deliveres the job object and waits for an answer.
*     The answer from the execution daemon on the execution host
*     contains a task id that is returned to the caller of the function.
*
*  INPUTS
*     const char *hostname - name of the host on which to start the task
*     const lList *environment  - list containing environment variable
*                            settings for the task that override the
*                            default environment
*     const lList *path_aliases - optional a path alias list
*
*  RESULT
*     sge_tid_t - the task id, if the task can be executed,
*                 a value <= 0 indicates an error.
*
*  NOTES
*     MT-NOTE: sge_qexecve() is not MT safe
******************************************************************************/
sge_tid_t sge_qexecve(sge_gdi_ctx_class_t *ctx,
                      const char *hostname, const char *queuename,
                      const char *cwd, const lList *environment,
                      const lList *path_aliases)
{
    char myname[256];
    const char *s;
    int ret, uid;
    sge_tid_t tid = NULL;
    lListElem *petrep;
    lListElem *rt;
    sge_pack_buffer pb;
    u_long32 jobid, jataskid;
    u_long32 dummymid = 0;
    const char *env_var_name = "SGE_TASK_ID";

    DENTER(TOP_LAYER, "sge_qexecve");

    if (hostname == NULL) {
        sprintf(lasterror, MSG_GDI_INVALIDPARAMETER_SS, "sge_qexecve", "hostname");
        DRETURN(NULL);
    }

    /* resolve user */
    if (sge_uid2user((uid=getuid()), myname, sizeof(myname)-1, MAX_NIS_RETRIES)) {
        sprintf(lasterror, MSG_GDI_RESOLVINGUIDTOUSERNAMEFAILED_IS ,
                uid, strerror(errno));
        DRETURN(NULL);
    }

    if ((s=getenv("JOB_ID")) == NULL) {
        sprintf(lasterror, MSG_GDI_MISSINGINENVIRONMENT_S, "JOB_ID");
        DRETURN(NULL);
    }

    if (sscanf(s, sge_u32, &jobid) != 1) {
        sprintf(lasterror, MSG_GDI_STRINGISINVALID_SS, s, "JOB_ID");
        DRETURN(NULL);
    }

    if ((s=getenv(env_var_name)) != NULL) {
        if (strcmp(s, "undefined") == 0) {
            jataskid = 1;
        } else {
            if (sscanf(s, sge_u32, &jataskid) != 1) {
                sprintf(lasterror, MSG_GDI_STRINGISINVALID_SS, s, env_var_name);
                DRETURN(NULL);
            }
        }
    } else {
        sprintf(lasterror, MSG_GDI_MISSINGINENVIRONMENT_S, env_var_name);
        DRETURN(NULL);
    }

    /* ---- build up pe task request structure (see gdilib/sge_petaskL.h) */
    petrep = lCreateElem(PETR_Type);

    lSetUlong(petrep, PETR_jobid, jobid);
    lSetUlong(petrep, PETR_jataskid, jataskid);
    lSetString(petrep, PETR_owner, myname);
    lSetUlong(petrep, PETR_submission_time, sge_get_gmt());

    if (cwd != NULL) {
        lSetString(petrep, PETR_cwd, cwd);
    }

    if (environment != NULL) {
        lSetList(petrep, PETR_environment, lCopyList("environment", environment));
    }

    if (path_aliases != NULL) {
        lSetList(petrep, PETR_path_aliases, lCopyList("path_aliases", path_aliases));
    }


    if (queuename != NULL) {
        lSetString(petrep, PETR_queuename, queuename);
    }

    if (init_packbuffer(&pb, 1024, 0) != PACK_SUCCESS) {
        lFreeElem(&petrep);
        sprintf(lasterror, SFNMAX, MSG_GDI_OUTOFMEMORY);
        DRETURN(NULL);
    }

    pack_job_delivery(&pb, petrep);

    ret = gdi2_send_message_pb(ctx, 1, prognames[EXECD], 1, hostname,
                               TAG_JOB_EXECUTION, &pb, &dummymid);

    clear_packbuffer(&pb);

    lFreeElem(&petrep);

    if (ret != CL_RETVAL_OK) {
        sprintf(lasterror, MSG_GDI_SENDTASKTOEXECDFAILED_SS, hostname, cl_get_error_text(ret));
        DRETURN(NULL);
    }

    /* add list into our remote task list */
    rt = lAddElemStr(&remote_task_list, RT_tid, "none", RT_Type);
    lSetHost(rt, RT_hostname, hostname);
    lSetUlong(rt, RT_state, RT_STATE_WAIT4ACK);

    rcv_from_execd(ctx, OPT_SYNCHRON, TAG_JOB_EXECUTION);

    tid = (sge_tid_t) lGetString(rt, RT_tid);

    if (strcmp(tid, "none") == 0) {
        tid = NULL;
        sprintf(lasterror, MSG_GDI_EXECDONHOSTDIDNTACCEPTTASK_S, hostname);
    }

    /* now close message to execd */
    cl_commlib_shutdown_handle(cl_com_get_handle("execd_handle", 0), false);

    DRETURN(tid);
}
extern int main(int argc, char** argv)
{
  struct sigaction sa;


  cl_com_handle_t* handle = NULL; 
  cl_com_message_t* message = NULL;
  cl_com_endpoint_t* sender = NULL;
#if 0
  cl_com_endpoint_t* clients[10] = { NULL, NULL, NULL, NULL, NULL,
                                     NULL, NULL, NULL, NULL, NULL };
#endif
  int i;
  unsigned long max_connections;
  
  if (argc != 4) {
      printf("please enter  debug level, port and nr. of max connections\n");
      exit(1);
  }

  /* setup signalhandling */
  memset(&sa, 0, sizeof(sa));
  sa.sa_handler = sighandler_server;  /* one handler for all signals */
  sigemptyset(&sa.sa_mask);
  sigaction(SIGINT, &sa, NULL);
  sigaction(SIGTERM, &sa, NULL);
  sigaction(SIGHUP, &sa, NULL);
  sigaction(SIGPIPE, &sa, NULL);


  printf("commlib setup ...\n");
  cl_com_setup_commlib(CL_RW_THREAD, (cl_log_t)atoi(argv[1]), NULL);

  printf("setting up service on port %d\n", atoi(argv[2]) );
  handle=cl_com_create_handle(NULL,CL_CT_TCP,CL_CM_CT_MESSAGE , CL_TRUE, atoi(argv[2]) , CL_TCP_DEFAULT,"server", 1, 2, 0 );
  if (handle == NULL) {
     printf("could not get handle\n");
     exit(-1);
  }

  cl_com_get_service_port(handle,&i), 
  printf("server running on host \"%s\", port %d, component name is \"%s\", id is %ld\n", 
         handle->local->comp_host, 
         i, 
         handle->local->comp_name,  
         handle->local->comp_id);

  cl_com_set_max_connections(handle,atoi(argv[3]));
  cl_com_get_max_connections(handle,&max_connections);
  printf("max open connections is set to %lu\n", max_connections);

  printf("enable max connection close\n");
  cl_com_set_max_connection_close_mode(handle, CL_ON_MAX_COUNT_CLOSE_AUTOCLOSE_CLIENTS);

  while(do_shutdown != 1) {
     unsigned long mid;
     int ret_val;
     struct timeval now;
     

     CL_LOG(CL_LOG_INFO,"main()");

     gettimeofday(&now,NULL);
     cl_commlib_trigger(handle, 1);
     ret_val = cl_commlib_receive_message(handle,NULL, NULL, 0, CL_FALSE, 0, &message, &sender);
     if (message != NULL ) {
        ret_val = cl_commlib_send_message(handle, 
                                sender->comp_host, 
                                sender->comp_name, 
                                sender->comp_id, CL_MIH_MAT_NAK,  
                                &message->message, 
                                message->message_length, 
                                &mid, message->message_id,0, 
                                CL_FALSE, CL_FALSE);
        if (ret_val != CL_RETVAL_OK) {
/*
           printf("cl_commlib_send_message() returned: %s\n",cl_get_error_text(ret_val));
*/
        } 
        

/*        printf("received message from \"%s\": size of message: %ld\n", sender->comp_host, message->message_length); */

        cl_com_free_message(&message);
        cl_com_free_endpoint(&sender);
        message = NULL;
     } 
  }


  cl_com_ignore_timeouts(CL_TRUE); 
  cl_com_get_ignore_timeouts_flag();

  printf("shutting down server ...\n");
  handle = cl_com_get_handle( "server", 1 );
  if (handle == NULL) {
     printf("could not find handle\n");
     exit(1);
  } else {
     printf("found handle\n");
  }

  while ( cl_commlib_shutdown_handle(handle, CL_TRUE) == CL_RETVAL_MESSAGE_IN_BUFFER) {
     message = NULL;
     cl_commlib_receive_message(handle, NULL, NULL, 0, CL_FALSE, 0, &message, &sender);

     if (message != NULL) {
        printf("ignoring message from \"%s\": size of message: %ld\n", sender->comp_host, message->message_length); 
        cl_com_free_message(&message);
        cl_com_free_endpoint(&sender);
        message = NULL;
     } else {
        break;
     }
  }

  printf("commlib cleanup ...\n");
  cl_com_cleanup_commlib();
  
  printf("main done\n");
  return 0;
}
Ejemplo n.º 4
0
      lListElem *id;

      if (lGetNumberOfElem(ref_list) == 0){
         id = lAddElemStr(&ref_list, ID_str, "0", ID_Type);
         lSetList(id, ID_user_list, user_list);
      } else {
         for_each(id, ref_list){
            lSetList(id, ID_user_list, user_list);
         }
      }
   }

   /* TODO: remove this code from client, should be hidden in gdi layer 
   **       timeout value should be set in gdi_setup
   */
   handle=cl_com_get_handle(prognames[QDEL], 0);
   cl_com_set_synchron_receive_timeout(handle, 10*60);

   /* Are there jobs which should be deleted? */
   if (!ref_list) {
      sge_usage(QDEL, stderr);
      printf("%s\n", MSG_PARSE_NOOPTIONARGUMENT);
      goto error_exit;
   }

   /* Has the user the permission to use the the '-f' (forced) flag */
   have_master_privileges = false;
   if (force == 1) {
      have_master_privileges = sge_gdi2_check_permission(ctx, &alp, MANAGER_CHECK);
      lFreeList(&alp);
   }
/****** qmaster/sge_mod_configuration() ****************************************
*  NAME
*     sge_mod_configuration() -- modify cluster configuration
*
*  SYNOPSIS
*     int sge_mod_configuration(lListElem *aConf, lList **anAnswer, char *aUser,
*                               char *aHost)
*
*  FUNCTION
*     Modify cluster configuration. 'confp' is a pointer to a 'CONF_Type' list
*     element and does contain the modified configuration entry. Adding a new
*     configuration entry is also viewed as a modification.
*
*  INPUTS
*     lListElem *aConf  - CONF_Type element containing the modified conf
*     lList **anAnswer  - answer list
*     char *aUser       - target user
*     char *aHost       - target host
*
*  RESULT
*     int - 0 success
*          -1 error
*
*  NOTES
*     MT-NOTE: sge_mod_configuration() is MT safe 
*
*******************************************************************************/
int sge_mod_configuration(sge_gdi_ctx_class_t *ctx, lListElem *aConf, lList **anAnswer, char *aUser, char *aHost)
{
   lListElem *old_conf;
   const char *tmp_name = NULL;
   char unique_name[CL_MAXHOSTLEN];
   int ret = -1;
   const char *cell_root = ctx->get_cell_root(ctx);
   const char *qualified_hostname = ctx->get_qualified_hostname(ctx);
   u_long32 progid = ctx->get_who(ctx);

   DENTER(TOP_LAYER, "sge_mod_configuration");

   if (!aConf || !aUser || !aHost) {
      CRITICAL((SGE_EVENT, MSG_SGETEXT_NULLPTRPASSED_S, SGE_FUNC));
      answer_list_add(anAnswer, SGE_EVENT, STATUS_EUNKNOWN, ANSWER_QUALITY_ERROR);
      DRETURN(STATUS_EUNKNOWN);
   }

   if ((tmp_name = lGetHost(aConf, CONF_name)) == NULL) {
      CRITICAL((SGE_EVENT, MSG_SGETEXT_MISSINGCULLFIELD_SS, lNm2Str(CONF_name), SGE_FUNC));
      answer_list_add(anAnswer, SGE_EVENT, STATUS_EUNKNOWN, ANSWER_QUALITY_ERROR);
      DRETURN(STATUS_EUNKNOWN);
   }

   if ((ret = sge_resolve_hostname(tmp_name, unique_name, EH_name, sizeof(unique_name)))
       != CL_RETVAL_OK) {
      DPRINTF(("%s: error %s resolving host %s\n", SGE_FUNC, cl_get_error_text(ret), tmp_name));
      ERROR((SGE_EVENT, MSG_SGETEXT_CANTRESOLVEHOST_S, tmp_name));
      answer_list_add(anAnswer, SGE_EVENT, STATUS_EUNKNOWN, ANSWER_QUALITY_ERROR);
      DRETURN(STATUS_EUNKNOWN);
   }
   
   if ((ret = check_config(anAnswer, aConf))) {
      DRETURN(ret); 
   }

   if ((old_conf = sge_get_configuration_for_host(unique_name)) != NULL) {
      int ret = -1;
      
      ret = do_mod_config(ctx, unique_name, old_conf, aConf, anAnswer);
      
      lFreeElem(&old_conf);
      
      if (ret == 0) {    
         INFO((SGE_EVENT, MSG_SGETEXT_MODIFIEDINLIST_SSSS, aUser, aHost, unique_name, MSG_OBJ_CONF));
         answer_list_add(anAnswer, SGE_EVENT, STATUS_OK, ANSWER_QUALITY_INFO);
      } else {
         DRETURN(STATUS_EUNKNOWN);
      }
   } else {
      do_add_config(ctx, unique_name, aConf, anAnswer);
            
      INFO((SGE_EVENT, MSG_SGETEXT_ADDEDTOLIST_SSSS, aUser, aHost, unique_name, MSG_OBJ_CONF));            
      answer_list_add(anAnswer, SGE_EVENT, STATUS_OK, ANSWER_QUALITY_INFO);
   }
   
   if (strcmp(SGE_GLOBAL_NAME, unique_name) == 0) {
      sge_add_event(0, sgeE_GLOBAL_CONFIG, 0, 0, NULL, NULL, NULL, NULL);
   }

   /*
   ** is the configuration change relevant for the qmaster itsself?
   ** if so, initialise conf struct anew
   */
   if (strcmp(unique_name, SGE_GLOBAL_NAME) == 0 || sge_hostcmp(unique_name, qualified_hostname) == 0) {
      lListElem *local = NULL;
      lListElem *global = NULL;
      lList *answer_list = NULL;
      char* qmaster_params = NULL;
      int accounting_flush_time = mconf_get_accounting_flush_time();

      if ((local = sge_get_configuration_for_host(qualified_hostname)) == NULL) {
         WARNING((SGE_EVENT, MSG_CONF_NOLOCAL_S, qualified_hostname));
      }
      
      if ((global = sge_get_configuration_for_host(SGE_GLOBAL_NAME)) == NULL) {
         ERROR((SGE_EVENT, SFNMAX, MSG_CONF_NOGLOBAL));
      }
            
      if (merge_configuration(&answer_list, progid, cell_root, global, local, NULL) != 0) {
         ERROR((SGE_EVENT, MSG_CONF_CANTMERGECONFIGURATIONFORHOST_S, qualified_hostname));
      }
      answer_list_output(&answer_list);

      /* Restart the accounting flush event if needed. */
      if ((accounting_flush_time == 0) &&
          (mconf_get_accounting_flush_time() != 0)) {
         te_event_t ev = te_new_event(time(NULL), TYPE_ACCOUNTING_TRIGGER, ONE_TIME_EVENT, 1, 0, NULL);
         te_add_event(ev);
         te_free_event(&ev);
      }
      
      lFreeElem(&local);
      lFreeElem(&global);
      
      sge_show_conf();

      /* 'max_unheard' may have changed */
      cl_commlib_set_connection_param(cl_com_get_handle("qmaster", 1), HEARD_FROM_TIMEOUT, mconf_get_max_unheard());

      /* fetching qmaster_params and begin to parse */
      qmaster_params = mconf_get_qmaster_params();

      /* updating the commlib paramterlist and gdi_timeout with new or changed parameters */
      cl_com_update_parameter_list(qmaster_params);

      sge_free(&qmaster_params);
   }
    
   /* invalidate configuration cache */
   mconf_set_new_config(true);
   
   DRETURN(STATUS_OK);
}
Ejemplo n.º 6
0
/*----------------------------------------------------------------------------*/
int 
main(int argc, char **argv)
{
   int heartbeat        = 0;
   int last_heartbeat   = 0;
   int latest_heartbeat = 0;
   int ret              = 0;
   int delay            = 0;
   time_t now, last;
/*    const char *cp; */
   char err_str[MAX_STRING_SIZE];
   char shadowd_pidfile[SGE_PATH_MAX];
   dstring ds;
   char buffer[256];
   pid_t shadowd_pid;

#if 1

static int check_interval = CHECK_INTERVAL;
static int get_active_interval = GET_ACTIVE_INTERVAL;
static int delay_time = DELAY_TIME;
static int sge_test_heartbeat = 0;

char binpath[SGE_PATH_MAX];
char oldqmaster[SGE_PATH_MAX];

char shadow_err_file[SGE_PATH_MAX];
char qmaster_out_file[SGE_PATH_MAX];

#endif

   lList *alp = NULL;
   sge_gdi_ctx_class_t *ctx = NULL;

   DENTER_MAIN(TOP_LAYER, "sge_shadowd");
   
   sge_dstring_init(&ds, buffer, sizeof(buffer));
   /* initialize recovery control variables */
   {
      char *s;
      int val;
      if ((s=getenv("SGE_CHECK_INTERVAL")) &&
          sscanf(s, "%d", &val) == 1)
         check_interval = val;
      if ((s=getenv("SGE_GET_ACTIVE_INTERVAL")) &&
          sscanf(s, "%d", &val) == 1)
         get_active_interval = val;
      if ((s=getenv("SGE_DELAY_TIME")) &&
          sscanf(s, "%d", &val) == 1)
         delay_time = val;
      if ((s=getenv("SGE_TEST_HEARTBEAT_TIMEOUT")) &&
          sscanf(s, "%d", &val) == 1)
         sge_test_heartbeat = val;
   }
         
   /* This needs a better solution */
   umask(022);

#ifdef __SGE_COMPILE_WITH_GETTEXT__  
   /* init language output for gettext() , it will use the right language */
   sge_init_language_func((gettext_func_type)        gettext,
                         (setlocale_func_type)      setlocale,
                         (bindtextdomain_func_type) bindtextdomain,
                         (textdomain_func_type)     textdomain);
   sge_init_language(NULL,NULL);   
#endif /* __SGE_COMPILE_WITH_GETTEXT__  */

   log_state_set_log_file(TMP_ERR_FILE_SHADOWD);

   if (sge_setup2(&ctx, SHADOWD, MAIN_THREAD, &alp, false) != AE_OK) {
      answer_list_output(&alp);
      SGE_EXIT((void**)&ctx, 1);
   }

   /* AA: TODO: change this */
   ctx->set_exit_func(ctx, shadowd_exit_func);
   sge_setup_sig_handlers(SHADOWD);
   
#if defined(SOLARIS)
   /* Init shared SMF libs if necessary */
   if (sge_smf_used() == 1 && sge_smf_init_libs() != 0) {
       SGE_EXIT((void**)&ctx, 1);
   }
#endif

   if (ctx->get_qmaster_spool_dir(ctx) != NULL) {
      char *shadowd_name = SGE_SHADOWD;

      /* is there a running shadowd on this host (with unqualified name) */
      sprintf(shadowd_pidfile, "%s/"SHADOWD_PID_FILE, ctx->get_qmaster_spool_dir(ctx), 
              ctx->get_unqualified_hostname(ctx));

      DPRINTF(("pidfilename: %s\n", shadowd_pidfile));
      if ((shadowd_pid = sge_readpid(shadowd_pidfile))) {
         DPRINTF(("shadowd_pid: "sge_U32CFormat"\n", sge_u32c(shadowd_pid)));
         if (!sge_checkprog(shadowd_pid, shadowd_name, PSCMD)) {
            CRITICAL((SGE_EVENT, MSG_SHADOWD_FOUNDRUNNINGSHADOWDWITHPIDXNOTSTARTING_I, (int) shadowd_pid));
            SGE_EXIT((void**)&ctx, 1);
         }
      }

      ctx->prepare_enroll(ctx);

      /* is there a running shadowd on this host (with aliased name) */
      sprintf(shadowd_pidfile, "%s/"SHADOWD_PID_FILE, ctx->get_qmaster_spool_dir(ctx), 
              ctx->get_qualified_hostname(ctx));
      DPRINTF(("pidfilename: %s\n", shadowd_pidfile));
      if ((shadowd_pid = sge_readpid(shadowd_pidfile))) {
         DPRINTF(("shadowd_pid: "sge_U32CFormat"\n", sge_u32c(shadowd_pid)));
         if (!sge_checkprog(shadowd_pid, shadowd_name, PSCMD)) {
            CRITICAL((SGE_EVENT, MSG_SHADOWD_FOUNDRUNNINGSHADOWDWITHPIDXNOTSTARTING_I, (int) shadowd_pid));
            SGE_EXIT((void**)&ctx, 1);
         }
      }  
   } else {
      ctx->prepare_enroll(ctx);
   }

   if (parse_cmdline_shadowd(argc, argv) == 1) {
      SGE_EXIT((void**)&ctx, 0);
   }
   
   if (ctx->get_qmaster_spool_dir(ctx) == NULL) {
      CRITICAL((SGE_EVENT, MSG_SHADOWD_CANTREADQMASTERSPOOLDIRFROMX_S, ctx->get_bootstrap_file(ctx)));
      SGE_EXIT((void**)&ctx, 1);
   }

   if (chdir(ctx->get_qmaster_spool_dir(ctx))) {
      CRITICAL((SGE_EVENT, MSG_SHADOWD_CANTCHANGETOQMASTERSPOOLDIRX_S, ctx->get_qmaster_spool_dir(ctx)));
      SGE_EXIT((void**)&ctx, 1);
   }

   if (sge_set_admin_username(ctx->get_admin_user(ctx), err_str)) {
      CRITICAL((SGE_EVENT, SFNMAX, err_str));
      SGE_EXIT((void**)&ctx, 1);
   }

   if (sge_switch2admin_user()) {
      CRITICAL((SGE_EVENT, SFNMAX, MSG_SHADOWD_CANTSWITCHTOADMIN_USER));
      SGE_EXIT((void**)&ctx, 1);
   }

   sprintf(shadow_err_file, "messages_shadowd.%s", ctx->get_unqualified_hostname(ctx));
   sprintf(qmaster_out_file, "messages_qmaster.%s", ctx->get_unqualified_hostname(ctx));
   sge_copy_append(TMP_ERR_FILE_SHADOWD, shadow_err_file, SGE_MODE_APPEND);
   unlink(TMP_ERR_FILE_SHADOWD);
   log_state_set_log_as_admin_user(1);
   log_state_set_log_file(shadow_err_file);

   {
      int* tmp_fd_array = NULL;
      unsigned long tmp_fd_count = 0;

      if (cl_com_set_handle_fds(cl_com_get_handle(prognames[SHADOWD] ,0), &tmp_fd_array, &tmp_fd_count) == CL_RETVAL_OK) {
         sge_daemonize(tmp_fd_array, tmp_fd_count, ctx);
         if (tmp_fd_array != NULL) {
            sge_free(&tmp_fd_array);
         }
      } else {
         sge_daemonize(NULL, 0, ctx);
      }
   }

   /* shadowd pid file will contain aliased name */
   sge_write_pid(shadowd_pidfile);

   starting_up();
   
   sge_setup_sig_handlers(SHADOWD);

   last_heartbeat = get_qmaster_heartbeat(QMASTER_HEARTBEAT_FILE, 30);

   last = (time_t) sge_get_gmt(); /* set time of last check time */

   delay = 0;
   while (!shut_me_down) {
      sleep(check_interval);

      /* get current heartbeat file content */
      heartbeat = get_qmaster_heartbeat(QMASTER_HEARTBEAT_FILE, 30);

      now = (time_t) sge_get_gmt();


      /* Only check when we could read the heartbeat file at least two times
       * (last_heartbeat and heartbeat) without error 
       */
      if (last_heartbeat > 0 && heartbeat > 0) {

         /*
          * OK we have to heartbeat entries to check. Check times ...
          * now  = current time
          * last = last check time
          */
         if ( (now - last) >= (get_active_interval + delay) ) {

            delay = 0;
            if (last_heartbeat == heartbeat) {
               DPRINTF(("heartbeat not changed since seconds: "sge_U32CFormat"\n", sge_u32c(now - last)));
               delay = delay_time; /* set delay time */

               /*
                * check if we are a possible new qmaster host (lock file of qmaster active, etc.)
                */
               ret = check_if_valid_shadow(binpath, oldqmaster, 
                                           ctx->get_act_qmaster_file(ctx), 
                                           ctx->get_shadow_master_file(ctx), 
                                           ctx->get_qualified_hostname(ctx), 
                                           ctx->get_binary_path(ctx));

               if (ret == 0) {
                  /* we can start a qmaster on this host */
                  if (qmaster_lock(QMASTER_LOCK_FILE)) {
                     ERROR((SGE_EVENT, SFNMAX, MSG_SHADOWD_FAILEDTOLOCKQMASTERSOMBODYWASFASTER));
                  } else {
                     int out, err;

                     /* still the old qmaster name in act_qmaster file and still the old heartbeat */
                     latest_heartbeat = get_qmaster_heartbeat( QMASTER_HEARTBEAT_FILE, 30);
                     /* TODO: what do we when there is a timeout ??? */
                     DPRINTF(("old qmaster name in act_qmaster and old heartbeat\n"));
                     if (!compare_qmaster_names(ctx->get_act_qmaster_file(ctx), oldqmaster) &&
                         !shadowd_is_old_master_enrolled(sge_test_heartbeat, sge_get_qmaster_port(NULL), oldqmaster) && 
                         (latest_heartbeat == heartbeat)) {
                        char qmaster_name[256];

                        strcpy(qmaster_name, SGE_PREFIX);
                        strcat(qmaster_name, prognames[QMASTER]); 
                        DPRINTF(("qmaster_name: "SFN"\n", qmaster_name)); 

                        /*
                         * open logfile as admin user for initial qmaster/schedd 
                         * startup messages
                         */
                        out = SGE_OPEN3(qmaster_out_file, O_CREAT|O_WRONLY|O_APPEND, 
                                   0644);
                        err = out;
                        if (out == -1) {
                           /*
                            * First priority is the master restart
                            * => ignore this error
                            */
                           out = 1;
                           err = 2;
                        } 

                        sge_switch2start_user();
                        ret = startprog(out, err, NULL, binpath, qmaster_name, NULL);
                        sge_switch2admin_user();
                        if (ret) {
                           ERROR((SGE_EVENT, SFNMAX, MSG_SHADOWD_CANTSTARTQMASTER));
                        }
                        close(out);
                     } else {
                        qmaster_unlock(QMASTER_LOCK_FILE);
                     }
                  }      
               } else {
                  if (ret == -1) {
                     /* just log the more important failures */    
                     WARNING((SGE_EVENT, MSG_SHADOWD_DELAYINGSHADOWFUNCFORXSECONDS_U, sge_u32c(delay) ));
                  }
               } 
            }
            /* Begin a new interval, set timers and hearbeat to current values */
            last = now;
            last_heartbeat = heartbeat;
         }
      } else {
         if (last_heartbeat < 0 || heartbeat < 0) {
            /* There was an error reading heartbeat or last_heartbeat */
            DPRINTF(("can't read heartbeat file. last_heartbeat="sge_U32CFormat", heartbeat="sge_U32CFormat"\n",
                     sge_u32c(last_heartbeat), sge_u32c(heartbeat)));
         } else {
            DPRINTF(("have to read the heartbeat file twice to check time differences\n"));
         }
      }
   }

   sge_shutdown((void**)&ctx, 0);

   DRETURN(EXIT_SUCCESS);
}
Ejemplo n.º 7
0
/*-------------------------------------------------------------------------*/
int main(int argc, char **argv)
{
   int ret;
   int my_pid;
   int ret_val;
   int printed_points = 0;
   int max_enroll_tries;
   static char tmp_err_file_name[SGE_PATH_MAX];
   time_t next_prof_output = 0;
   int execd_exit_state = 0;
   lList **master_job_list = NULL;
   sge_gdi_ctx_class_t *ctx = NULL;
   lList *alp = NULL;

   DENTER_MAIN(TOP_LAYER, "execd");

#if defined(LINUX)
   gen_procList ();
#endif

   prof_mt_init();

   set_thread_name(pthread_self(),"Execd Thread");

   prof_set_level_name(SGE_PROF_CUSTOM1, "Execd Thread", NULL); 
   prof_set_level_name(SGE_PROF_CUSTOM2, "Execd Dispatch", NULL); 

#ifdef __SGE_COMPILE_WITH_GETTEXT__  
   /* init language output for gettext() , it will use the right language */
   sge_init_language_func((gettext_func_type)        gettext,
                         (setlocale_func_type)      setlocale,
                         (bindtextdomain_func_type) bindtextdomain,
                         (textdomain_func_type)     textdomain);
   sge_init_language(NULL,NULL);   
#endif /* __SGE_COMPILE_WITH_GETTEXT__  */

   /* This needs a better solution */
   umask(022);
      
   /* Initialize path for temporary logging until we chdir to spool */
   my_pid = getpid();
   sprintf(tmp_err_file_name,"%s."sge_U32CFormat"", TMP_ERR_FILE_EXECD, sge_u32c(my_pid));
   log_state_set_log_file(tmp_err_file_name);

   /* exit func for SGE_EXIT() */
   sge_sig_handler_in_main_loop = 0;
   sge_setup_sig_handlers(EXECD);

   if (sge_setup2(&ctx, EXECD, MAIN_THREAD, &alp, false) != AE_OK) {
      answer_list_output(&alp);
      SGE_EXIT((void**)&ctx, 1);
   }
   ctx->set_exit_func(ctx, execd_exit_func);
   
#if defined(SOLARIS)
   /* Init shared SMF libs if necessary */
   if (sge_smf_used() == 1 && sge_smf_init_libs() != 0) {
       SGE_EXIT((void**)&ctx, 1);
   }
#endif

   /* prepare daemonize */
   if (!getenv("SGE_ND")) {
      sge_daemonize_prepare(ctx);
   }

   if ((ret=sge_occupy_first_three())>=0) {
      CRITICAL((SGE_EVENT, MSG_FILE_REDIRECTFD_I, ret));
      SGE_EXIT((void**)&ctx, 1);
   }

   lInit(nmv);

   /* unset XAUTHORITY if set */
   if (getenv("XAUTHORITY") != NULL) {
      sge_unsetenv("XAUTHORITY");
   }

   parse_cmdline_execd(argv);   
   
   /* exit if we can't get communication handle (bind port) */
   max_enroll_tries = 30;
   while (cl_com_get_handle(prognames[EXECD],1) == NULL) {
      ctx->prepare_enroll(ctx);
      max_enroll_tries--;

      if (max_enroll_tries <= 0 || shut_me_down) {
         /* exit after 30 seconds */
         if (printed_points != 0) {
            printf("\n");
         }
         CRITICAL((SGE_EVENT, MSG_COM_ERROR));
         SGE_EXIT((void**)&ctx, 1);
      }
      if (cl_com_get_handle(prognames[EXECD],1) == NULL) {
        /* sleep when prepare_enroll() failed */
        sleep(1);
        if (max_enroll_tries < 27) {
           printf(".");
           printed_points++;
           fflush(stdout);
        }
      }
   }

   if (printed_points != 0) {
      printf("\n");
   }

   /*
    * now the commlib up and running. Set execd application status function 
    * ( commlib callback function for qping status information response 
    *   messages (SIRM) )
    */
   ret_val = cl_com_set_status_func(sge_execd_application_status);
   if (ret_val != CL_RETVAL_OK) {
      ERROR((SGE_EVENT, cl_get_error_text(ret_val)) );
   }

   /* test connection */
   {
      cl_com_SIRM_t* status = NULL;
      ret_val = cl_commlib_get_endpoint_status(ctx->get_com_handle(ctx),
                                               (char *)ctx->get_master(ctx, true),
                                               (char*)prognames[QMASTER], 1, &status);
      if (ret_val != CL_RETVAL_OK) {
         ERROR((SGE_EVENT, cl_get_error_text(ret_val)));
         ERROR((SGE_EVENT, MSG_CONF_NOCONFBG));
      }
      cl_com_free_sirm_message(&status);
   }
   
   /* finalize daeamonize */
   if (!getenv("SGE_ND")) {
      sge_daemonize_finalize(ctx);
   }

   /* daemonizes if qmaster is unreachable */   
   sge_setup_sge_execd(ctx, tmp_err_file_name);

   /* are we using qidle or not */
   sge_ls_qidle(mconf_get_use_qidle());
   sge_ls_gnu_ls(1);
   
   DPRINTF(("use_qidle: %d\n", mconf_get_use_qidle()));

   /* test load sensor (internal or external) */
   {
      lList *report_list = sge_build_load_report(ctx->get_qualified_hostname(ctx), ctx->get_binary_path(ctx));
      lFreeList(&report_list);
   }
   
   /* here we have to wait for qmaster registration */
   while (sge_execd_register_at_qmaster(ctx, false) != 0) {
      if (sge_get_com_error_flag(EXECD, SGE_COM_ACCESS_DENIED, true)) {
         /* This is no error */
         DPRINTF(("*****  got SGE_COM_ACCESS_DENIED from qmaster  *****\n"));
      }
      if (sge_get_com_error_flag(EXECD, SGE_COM_ENDPOINT_NOT_UNIQUE, false)) {
         execd_exit_state = SGE_COM_ENDPOINT_NOT_UNIQUE;
         break;
      }
      if (shut_me_down != 0) {
         break;
      }
      sleep(30);
   }

   /* 
    * Terminate on SIGTERM or hard communication error
    */
   if (execd_exit_state != 0 || shut_me_down != 0) {
      sge_shutdown((void**)&ctx, execd_exit_state);
      DRETURN(execd_exit_state);
   }

   /*
    * We write pid file when we are connected to qmaster. Otherwise an old
    * execd might overwrite our pidfile.
    */
   sge_write_pid(EXECD_PID_FILE);

   /*
    * At this point we are sure we are the only sge_execd and we are connected
    * to the current qmaster. First we have to report any reaped children
    * that might exist.
    */
   starting_up();

   /*
    * Log a warning message if execd hasn't been started by a superuser
    */
   if (!sge_is_start_user_superuser()) {
      WARNING((SGE_EVENT, MSG_SWITCH_USER_NOT_ROOT));
   }   

#ifdef COMPILE_DC
   if (ptf_init()) {
      CRITICAL((SGE_EVENT, MSG_EXECD_NOSTARTPTF));
      SGE_EXIT((void**)&ctx, 1);
   }
   INFO((SGE_EVENT, MSG_EXECD_STARTPDCANDPTF));
#endif

   master_job_list = object_type_get_master_list(SGE_TYPE_JOB);
   *master_job_list = lCreateList("Master_Job_List", JB_Type);
   job_list_read_from_disk(master_job_list, "Master_Job_List",
                           0, SPOOL_WITHIN_EXECD, 
                          job_initialize_job);
   

   /* clean up jobs hanging around (look in active_dir) */
   clean_up_old_jobs(ctx, 1);
   execd_trash_load_report();
   sge_set_flush_lr_flag(true);

   sge_sig_handler_in_main_loop = 1;

   if (thread_prof_active_by_id(pthread_self())) {
      prof_start(SGE_PROF_CUSTOM1, NULL);
      prof_start(SGE_PROF_CUSTOM2, NULL);
      prof_start(SGE_PROF_GDI_REQUEST, NULL);
   } else {
      prof_stop(SGE_PROF_CUSTOM1, NULL);
      prof_stop(SGE_PROF_CUSTOM2, NULL);
      prof_stop(SGE_PROF_GDI_REQUEST, NULL);
   }

   PROF_START_MEASUREMENT(SGE_PROF_CUSTOM1);

   /* Start dispatching */
   execd_exit_state = sge_execd_process_messages(ctx);


   /*
    * This code is only reached when dispatcher terminates and execd goes down.
    */

   /* log if we received SIGPIPE signal */
   if (sge_sig_handler_sigpipe_received) {
       sge_sig_handler_sigpipe_received = 0;
       INFO((SGE_EVENT, "SIGPIPE received\n"));
   }

#if defined(LINUX)
   free_procList();
#endif
   lFreeList(master_job_list);

   PROF_STOP_MEASUREMENT(SGE_PROF_CUSTOM1);
   if (prof_is_active(SGE_PROF_ALL)) {
     time_t now = (time_t)sge_get_gmt();

      if (now > next_prof_output) {
         prof_output_info(SGE_PROF_ALL, false, "profiling summary:\n");
         prof_reset(SGE_PROF_ALL,NULL);
         next_prof_output = now + 60;
      }
   }
   sge_prof_cleanup();

   sge_shutdown((void**)&ctx, execd_exit_state);
   DRETURN(execd_exit_state);
}