Exemple #1
0
int main(int ac, char **av)
{
    parse_args(ac, av);
    setup_sighandler(SIGINT, 0, on_sigint);
    setup_sighandler(SIGALRM, SA_RESTART, update_display);
    alarm(1);
    gl_env.last_update_time = time(NULL);
    gl_env.logtop = new_logtop(gl_env.history_size);
    if (!gl_env.quiet && !gl_env.line_by_line)
        curses_setup();
    run(gl_env.logtop);
    at_exit();
    return EXIT_SUCCESS;
}
static int lttng_live_open_trace_read(const char *path)
{
	int ret = 0;
	struct lttng_live_ctx *ctx;

	ctx = g_new0(struct lttng_live_ctx, 1);
	ctx->session = g_new0(struct lttng_live_session, 1);

	/* We need a pointer to the context from the packet_seek function. */
	ctx->session->ctx = ctx;

	/* HT to store the CTF traces. */
	ctx->session->ctf_traces = g_hash_table_new(g_direct_hash,
			g_direct_equal);
	ctx->port = -1;
	ctx->session_ids = g_array_new(FALSE, TRUE, sizeof(uint64_t));

	ret = parse_url(path, ctx);
	if (ret < 0) {
		goto end_free;
	}
#if 0
	ret = setup_sighandler();
	if (ret < 0) {
		goto end_free;
	}
#endif
	ret = lttng_live_connect_viewer(ctx);
	if (ret < 0) {
		goto end_free;
	}
	printf_verbose("LTTng-live connected to relayd\n");

	ret = lttng_live_establish_connection(ctx);
	if (ret < 0) {
		goto end_free;
	}

	printf_verbose("Listing sessions\n");
	ret = lttng_live_list_sessions(ctx, path);
	if (ret < 0) {
		goto end_free;
	}

	if (ctx->session_ids->len > 0) {
		ret = lttng_live_read(ctx);
	}

end_free:
	g_hash_table_destroy(ctx->session->ctf_traces);
	g_free(ctx->session);
	g_free(ctx->session->streams);
	g_free(ctx);

	if (lttng_live_should_quit()) {
		ret = 0;
	}
	return ret;
}
Exemple #3
0
int
main(int argc, char *argv[])
{
  char *domdot, *verstr, *vertmp;
  int ppid = 0;
  int error;
  char *progname = NULL;		/* "amd" */
  char hostname[MAXHOSTNAMELEN + 1] = "localhost"; /* Hostname */

  /*
   * Make sure some built-in assumptions are true before we start
   */
  assert(sizeof(nfscookie) >= sizeof(u_int));
  assert(sizeof(int) >= 4);

  /*
   * Set processing status.
   */
  amd_state = Start;

  /*
   * Determine program name
   */
  if (argv[0]) {
    progname = strrchr(argv[0], '/');
    if (progname && progname[1])
      progname++;
    else
      progname = argv[0];
  }
  if (!progname)
    progname = "amd";
  am_set_progname(progname);

  /*
   * Initialize process id.  This is kept
   * cached since it is used for generating
   * and using file handles.
   */
  am_set_mypid();

  /*
   * Get local machine name
   */
  if (gethostname(hostname, sizeof(hostname)) < 0) {
    plog(XLOG_FATAL, "gethostname: %m");
    going_down(1);
  }
  hostname[sizeof(hostname) - 1] = '\0';

  /*
   * Check it makes sense
   */
  if (!*hostname) {
    plog(XLOG_FATAL, "host name is not set");
    going_down(1);
  }

  /*
   * Initialize global options structure.
   */
  init_global_options();

  /*
   * Partially initialize hostd[].  This
   * is completed in get_args().
   */
  if ((domdot = strchr(hostname, '.'))) {
    /*
     * Hostname already contains domainname.
     * Split out hostname and domainname
     * components
     */
    *domdot++ = '\0';
    hostdomain = domdot;
  }
  xstrlcpy(hostd, hostname, sizeof(hostd));
  am_set_hostname(hostname);

  /*
   * Setup signal handlers
   */
  /* SIGINT: trap interrupts for shutdowns */
  setup_sighandler(SIGINT, sigterm);
  /* SIGTERM: trap terminate so we can shutdown cleanly (some chance) */
  setup_sighandler(SIGTERM, sigterm);
  /* SIGHUP: hangups tell us to reload the cache */
  setup_sighandler(SIGHUP, sighup);
  /*
   * SIGCHLD: trap Death-of-a-child.  These allow us to pick up the exit
   * status of backgrounded mounts.  See "sched.c".
   */
  setup_sighandler(SIGCHLD, sigchld);
#ifdef HAVE_SIGACTION
  /* construct global "masked_sigs" used in nfs_start.c */
  sigemptyset(&masked_sigs);
  sigaddset(&masked_sigs, SIGINT);
  sigaddset(&masked_sigs, SIGTERM);
  sigaddset(&masked_sigs, SIGHUP);
  sigaddset(&masked_sigs, SIGCHLD);
#endif /* HAVE_SIGACTION */

  /*
   * Fix-up any umask problems.  Most systems default
   * to 002 which is not too convenient for our purposes
   */
  orig_umask = umask(0);

  /*
   * Figure out primary network name
   */
  getwire(&PrimNetName, &PrimNetNum);

  /*
   * Determine command-line arguments
   */
  get_args(argc, argv);

  /*
   * Log version information.
   */
  vertmp = get_version_string();
  verstr = strtok(vertmp, "\n");
  plog(XLOG_INFO, "AM-UTILS VERSION INFORMATION:");
  while (verstr) {
    plog(XLOG_INFO, "%s", verstr);
    verstr = strtok(NULL, "\n");
  }
  XFREE(vertmp);

  /*
   * Get our own IP address so that we can mount the automounter.  We pass
   * localhost_address which could be used as the default localhost
   * name/address in amu_get_myaddress().
   */
  amu_get_myaddress(&myipaddr, gopt.localhost_address);
  plog(XLOG_INFO, "My ip addr is %s", inet_ntoa(myipaddr));

  /* avoid hanging on other NFS servers if started elsewhere */
  if (chdir("/") < 0)
    plog(XLOG_INFO, "cannot chdir to /: %m");

  /*
   * Now check we are root.
   */
  if (geteuid() != 0) {
    plog(XLOG_FATAL, "Must be root to mount filesystems (euid = %ld)", (long) geteuid());
    going_down(1);
  }

#ifdef HAVE_MAP_NIS
  /*
   * If the domain was specified then bind it here
   * to circumvent any default bindings that may
   * be done in the C library.
   */
  if (gopt.nis_domain && yp_bind(gopt.nis_domain)) {
    plog(XLOG_FATAL, "Can't bind to NIS domain \"%s\"", gopt.nis_domain);
    going_down(1);
  }
#endif /* HAVE_MAP_NIS */

  if (!amuDebug(D_DAEMON))
    ppid = daemon_mode();

  /*
   * Lock process text and data segment in memory.
   */
  if (gopt.flags & CFM_PROCESS_LOCK) {
    do_memory_locking();
  }

  do_mapc_reload = clocktime(NULL) + gopt.map_reload_interval;

  /*
   * Register automounter with system.
   */
  error = mount_automounter(ppid);
  if (error && ppid)
    kill(ppid, SIGALRM);

#ifdef HAVE_FS_AUTOFS
  /*
   * XXX this should be part of going_down(), but I can't move it there
   * because it would be calling non-library code from the library... ugh
   */
  if (amd_use_autofs)
    destroy_autofs_service();
#endif /* HAVE_FS_AUTOFS */

  going_down(error);

  abort();
  return 1; /* should never get here */
}
Exemple #4
0
int main (int argc, char**argv)
{
	int ret = 0;
	int heartbeat_usec = 50000; //20Hz is ok by default
	uint64_t last_beat = 0;

	Log_info ("cloudvpn starting");
	Log (0, "You are using CloudVPN, which is Free software.");
	Log (0, "For more information please see the GNU GPL license,");
	Log (0, "which you should have received along with this program.");

	setup_sighandler (kill_cloudvpn);

	/*
	 * initialization
	 */

	if (!config_parse (argc, argv) ) {
		Log_error ("failed to parse config, terminating.");
		ret = 1;
		goto failed_config;
	}

	if (!config_get_int ("heartbeat", heartbeat_usec) )
		heartbeat_usec = 50000;
	Log_info ("heartbeat is set to %d usec", heartbeat_usec);

	timestamp_update(); //get initial timestamp

	status_init();
	route_init();
	squeue_init();
	network_init();

	if (poll_init() ) {
		Log_fatal ("poll initialization failed");
		ret = 2;
		goto failed_poll;
	}

	if (do_memlock() ) {
		Log_fatal ("locking process to memory failed");
		ret = 3;
		goto failed_poll;
	}

	if (comm_load() ) {
		Log_fatal ("failed to load comm data");
		ret = 4;
		goto failed_poll;
	}

	if (comm_init() ) {
		Log_fatal ("communication initialization failed");
		ret = 5;
		goto failed_comm;
	}

	if (gate_init() ) {
		Log_fatal ("gate initialization failed");
		ret = 6;
		goto failed_gate;
	}
	if (do_chroot() ) {
		Log_fatal ("chrooting failed");
		ret = 7;
		goto failed_sec;
	}

	if (do_switch_user() ) {
		Log_fatal ("user switch failed");
		ret = 8;
		goto failed_sec;
	}


	/*
	 * main loop
	 */

	Log_info ("initialization complete, entering main loop");

	last_beat = 0; //update immediately.

	while (!g_terminate) {

		timestamp_update();

		if ( (timestamp() - last_beat)
		        < (unsigned int) heartbeat_usec) {
			//poll more stuff
			poll_wait_for_event (heartbeat_usec
			                     - timestamp()
			                     + last_beat);
			//send the results
			comm_flush_data();
			gate_flush_data();
			continue;
		}

		last_beat = timestamp();

		gate_periodic_update();
		comm_periodic_update();
		route_periodic_update();

		status_try_export();
	}

	/*
	 * deinitialization
	 */

	Log_info ("shutting down");

failed_sec:

	gate_shutdown();

failed_gate:

	comm_shutdown();

failed_comm:

	if (poll_deinit() )
		Log_warn ("poll_deinit somehow failed!");

failed_poll:
failed_config:
	if (!ret) Log_info ("cloudvpn exiting gracefully");
	else Log_error ("cloudvpn exiting with code %d", ret);
	return ret;
}
Exemple #5
0
int main(int argc, char **argv)
{
	int result;
	int error = FALSE;
	int display_license = FALSE;
	int display_help = FALSE;
	int c = 0;
	struct tm *tm, tm_s;
	time_t now;
	char datestring[256];
	nagios_macros *mac;
	const char *worker_socket = NULL;
	int i;

#ifdef HAVE_GETOPT_H
	int option_index = 0;
	static struct option long_options[] = {
		{"help", no_argument, 0, 'h'},
		{"version", no_argument, 0, 'V'},
		{"license", no_argument, 0, 'V'},
		{"verify-config", no_argument, 0, 'v'},
		{"daemon", no_argument, 0, 'd'},
		{"test-scheduling", no_argument, 0, 's'},
		{"precache-objects", no_argument, 0, 'p'},
		{"use-precached-objects", no_argument, 0, 'u'},
		{"enable-timing-point", no_argument, 0, 'T'},
		{"worker", required_argument, 0, 'W'},
		{0, 0, 0, 0}
	};
#define getopt(argc, argv, o) getopt_long(argc, argv, o, long_options, &option_index)
#endif

	memset(&loadctl, 0, sizeof(loadctl));
	mac = get_global_macros();

	/* make sure we have the correct number of command line arguments */
	if (argc < 2)
		error = TRUE;

	/* get all command line arguments */
	while (1) {
		c = getopt(argc, argv, "+hVvdspuxTW");

		if (c == -1 || c == EOF)
			break;

		switch (c) {

		case '?': /* usage */
		case 'h':
			display_help = TRUE;
			break;

		case 'V': /* version */
			display_license = TRUE;
			break;

		case 'v': /* verify */
			verify_config++;
			break;

		case 's': /* scheduling check */
			test_scheduling = TRUE;
			break;

		case 'd': /* daemon mode */
			daemon_mode = TRUE;
			break;

		case 'p': /* precache object config */
			precache_objects = TRUE;
			break;

		case 'u': /* use precached object config */
			use_precached_objects = TRUE;
			break;
		case 'T':
			enable_timing_point = TRUE;
			break;
		case 'W':
			worker_socket = optarg;
			break;

		case 'x':
			printf("Warning: -x is deprecated and will be removed\n");
			break;

		default:
			break;
		}

	}

	/* if we're a worker we can skip everything below */
	if (worker_socket) {
		exit(nagios_core_worker(worker_socket));
	}

	if (daemon_mode == FALSE) {
		printf("\nNaemon Core %s\n", PROGRAM_VERSION);
		printf("Copyright (c) 2013-present Naemon Core Development Team and Community Contributors\n");
		printf("Copyright (c) 2009-2013 Nagios Core Development Team and Community Contributors\n");
		printf("Copyright (c) 1999-2009 Ethan Galstad\n");
		printf("Last Modified: %s\n", PROGRAM_MODIFICATION_DATE);
		printf("License: GPL\n\n");
		printf("Website: http://www.naemon.org\n");
	}

	/* just display the license */
	if (display_license == TRUE) {

		printf("This program is free software; you can redistribute it and/or modify\n");
		printf("it under the terms of the GNU General Public License version 2 as\n");
		printf("published by the Free Software Foundation.\n\n");
		printf("This program is distributed in the hope that it will be useful,\n");
		printf("but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
		printf("MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n");
		printf("GNU General Public License for more details.\n\n");
		printf("You should have received a copy of the GNU General Public License\n");
		printf("along with this program; if not, write to the Free Software\n");
		printf("Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.\n\n");

		exit(OK);
	}

	/* make sure we got the main config file on the command line... */
	if (optind >= argc)
		error = TRUE;

	/* if there are no command line options (or if we encountered an error), print usage */
	if (error == TRUE || display_help == TRUE) {

		printf("Usage: %s [options] <main_config_file>\n", argv[0]);
		printf("\n");
		printf("Options:\n");
		printf("\n");
		printf("  -v, --verify-config          Verify all configuration data (-v -v for more info)\n");
		printf("  -s, --test-scheduling        Shows projected/recommended check scheduling and other\n");
		printf("                               diagnostic info based on the current configuration files.\n");
		printf("  -T, --enable-timing-point    Enable timed commentary on initialization\n");
		printf("  -x, --dont-verify-paths      Deprecated (Don't check for circular object paths)\n");
		printf("  -p, --precache-objects       Precache object configuration\n");
		printf("  -u, --use-precached-objects  Use precached object config file\n");
		printf("  -d, --daemon                 Starts Naemon in daemon mode, instead of as a foreground process\n");
		printf("  -W, --worker /path/to/socket Act as a worker for an already running daemon\n");
		printf("\n");
		printf("Visit the Naemon website at http://www.naemon.org/ for bug fixes, new\n");
		printf("releases, online documentation, FAQs and more...\n");
		printf("\n");

		exit(ERROR);
	}


	/*
	 * config file is last argument specified.
	 * Make sure it uses an absolute path
	 */
	config_file = nspath_absolute(argv[optind], NULL);
	if (config_file == NULL) {
		printf("Error allocating memory.\n");
		exit(ERROR);
	}

	config_file_dir = nspath_absolute_dirname(config_file, NULL);

	/*
	 * Set the signal handler for the SIGXFSZ signal here because
	 * we may encounter this signal before the other signal handlers
	 * are set.
	 */
	signal(SIGXFSZ, handle_sigxfsz);

	/*
	 * let's go to town. We'll be noisy if we're verifying config
	 * or running scheduling tests.
	 */
	if (verify_config || test_scheduling || precache_objects) {
		reset_variables();
		/*
		 * if we don't beef up our resource limits as much as
		 * we can, it's quite possible we'll run headlong into
		 * EAGAIN due to too many processes when we try to
		 * drop privileges later.
		 */
		set_loadctl_defaults();

		if (verify_config)
			printf("Reading configuration data...\n");

		/* read our config file */
		result = read_main_config_file(config_file);
		if (result != OK) {
			printf("   Error processing main config file!\n\n");
			exit(EXIT_FAILURE);
		}

		if (verify_config)
			printf("   Read main config file okay...\n");

		/* drop privileges */
		if ((result = drop_privileges(naemon_user, naemon_group)) == ERROR) {
			printf("   Failed to drop privileges.  Aborting.");
			exit(EXIT_FAILURE);
		}

		/*
		 * this must come after dropping privileges, so we make
		 * sure to test access permissions as the right user.
		 */
		if (test_configured_paths() == ERROR) {
			printf("   One or more path problems detected. Aborting.\n");
			exit(EXIT_FAILURE);
		}

		/* read object config files */
		result = read_all_object_data(config_file);
		if (result != OK) {
			printf("   Error processing object config files!\n\n");
			/* if the config filename looks fishy, warn the user */
			if (!strstr(config_file, "naemon.cfg")) {
				printf("\n***> The name of the main configuration file looks suspicious...\n");
				printf("\n");
				printf("     Make sure you are specifying the name of the MAIN configuration file on\n");
				printf("     the command line and not the name of another configuration file.  The\n");
				printf("     main configuration file is typically '%s'\n", DEFAULT_CONFIG_FILE);
			}

			printf("\n***> One or more problems was encountered while processing the config files...\n");
			printf("\n");
			printf("     Check your configuration file(s) to ensure that they contain valid\n");
			printf("     directives and data defintions.  If you are upgrading from a previous\n");
			printf("     version of Naemon, you should be aware that some variables/definitions\n");
			printf("     may have been removed or modified in this version.  Make sure to read\n");
			printf("     the HTML documentation regarding the config files, as well as the\n");
			printf("     'Whats New' section to find out what has changed.\n\n");
			exit(EXIT_FAILURE);
		}

		if (verify_config) {
			printf("   Read object config files okay...\n\n");
			printf("Running pre-flight check on configuration data...\n\n");
		}

		/* run the pre-flight check to make sure things look okay... */
		result = pre_flight_check();

		if (result != OK) {
			printf("\n***> One or more problems was encountered while running the pre-flight check...\n");
			printf("\n");
			printf("     Check your configuration file(s) to ensure that they contain valid\n");
			printf("     directives and data defintions.  If you are upgrading from a previous\n");
			printf("     version of Naemon, you should be aware that some variables/definitions\n");
			printf("     may have been removed or modified in this version.  Make sure to read\n");
			printf("     the HTML documentation regarding the config files, as well as the\n");
			printf("     'Whats New' section to find out what has changed.\n\n");
			exit(EXIT_FAILURE);
		}

		if (verify_config) {
			printf("\nThings look okay - No serious problems were detected during the pre-flight check\n");
		}

		/* scheduling tests need a bit more than config verifications */
		if (test_scheduling == TRUE) {

			/* we'll need the event queue here so we can time insertions */
			init_event_queue();
			timing_point("Done initializing event queue\n");

			/* read initial service and host state information */
			initialize_retention_data(config_file);
			read_initial_state_information();
			timing_point("Retention data and initial state parsed\n");

			/* initialize the event timing loop */
			init_timing_loop();
			timing_point("Timing loop initialized\n");

			/* display scheduling information */
			display_scheduling_info();
		}

		if (precache_objects) {
			result = fcache_objects(object_precache_file);
			timing_point("Done precaching objects\n");
			if (result == OK) {
				printf("Object precache file created:\n%s\n", object_precache_file);
			} else {
				printf("Failed to precache objects to '%s': %s\n", object_precache_file, strerror(errno));
			}
		}

		/* clean up after ourselves */
		cleanup();

		/* exit */
		timing_point("Exiting\n");

		/* make valgrind shut up about still reachable memory */
		neb_free_module_list();
		free(config_file_dir);
		free(config_file);

		exit(result);
	}


	/* start to monitor things... */

	/*
	 * if we're called with a relative path we must make
	 * it absolute so we can launch our workers.
	 * If not, we needn't bother, as we're using execvp()
	 */
	if (strchr(argv[0], '/'))
		naemon_binary_path = nspath_absolute(argv[0], NULL);
	else
		naemon_binary_path = strdup(argv[0]);

	if (!naemon_binary_path) {
		logit(NSLOG_RUNTIME_ERROR, TRUE, "Error: Unable to allocate memory for naemon_binary_path\n");
		exit(EXIT_FAILURE);
	}

	if (!(nagios_iobs = iobroker_create())) {
		logit(NSLOG_RUNTIME_ERROR, TRUE, "Error: Failed to create IO broker set: %s\n",
		      strerror(errno));
		exit(EXIT_FAILURE);
	}

	/* keep monitoring things until we get a shutdown command */
	do {
		/* reset internal book-keeping (in case we're restarting) */
		wproc_num_workers_spawned = wproc_num_workers_online = 0;
		caught_signal = sigshutdown = sigrestart = FALSE;
		sig_id = 0;

		/* reset program variables */
		reset_variables();
		timing_point("Variables reset\n");

		/* get PID */
		nagios_pid = (int)getpid();

		/* read in the configuration files (main and resource config files) */
		result = read_main_config_file(config_file);
		if (result != OK) {
			logit(NSLOG_CONFIG_ERROR, TRUE, "Error: Failed to process config file '%s'. Aborting\n", config_file);
			exit(EXIT_FAILURE);
		}
		timing_point("Main config file read\n");

		/* NOTE 11/06/07 EG moved to after we read config files, as user may have overridden timezone offset */
		/* get program (re)start time and save as macro */
		program_start = time(NULL);
		my_free(mac->x[MACRO_PROCESSSTARTTIME]);
		asprintf(&mac->x[MACRO_PROCESSSTARTTIME], "%lu", (unsigned long)program_start);

		/* drop privileges */
		if (drop_privileges(naemon_user, naemon_group) == ERROR) {

			logit(NSLOG_PROCESS_INFO | NSLOG_RUNTIME_ERROR | NSLOG_CONFIG_ERROR, TRUE, "Failed to drop privileges.  Aborting.");

			cleanup();
			exit(ERROR);
		}

		if (test_path_access(naemon_binary_path, X_OK)) {
			logit(NSLOG_RUNTIME_ERROR, TRUE, "Error: failed to access() %s: %s\n", naemon_binary_path, strerror(errno));
			logit(NSLOG_RUNTIME_ERROR, TRUE, "Error: Spawning workers will be impossible. Aborting.\n");
			exit(EXIT_FAILURE);
		}

		if (test_configured_paths() == ERROR) {
			/* error has already been logged */
			exit(EXIT_FAILURE);
		}
		/* enter daemon mode (unless we're restarting...) */
		if (daemon_mode == TRUE && sigrestart == FALSE) {

			result = daemon_init();

			/* we had an error daemonizing, so bail... */
			if (result == ERROR) {
				logit(NSLOG_PROCESS_INFO | NSLOG_RUNTIME_ERROR, TRUE, "Bailing out due to failure to daemonize. (PID=%d)", (int)getpid());
				cleanup();
				exit(EXIT_FAILURE);
			}

			/* get new PID */
			nagios_pid = (int)getpid();
		}

		/* this must be logged after we read config data, as user may have changed location of main log file */
		logit(NSLOG_PROCESS_INFO, TRUE, "Naemon %s starting... (PID=%d)\n", PROGRAM_VERSION, (int)getpid());

		/* log the local time - may be different than clock time due to timezone offset */
		now = time(NULL);
		tm = localtime_r(&now, &tm_s);
		strftime(datestring, sizeof(datestring), "%a %b %d %H:%M:%S %Z %Y", tm);
		logit(NSLOG_PROCESS_INFO, TRUE, "Local time is %s", datestring);

		/* write log version/info */
		write_log_file_info(NULL);

		/* open debug log now that we're the right user */
		open_debug_log();

#ifdef USE_EVENT_BROKER
		/* initialize modules */
		neb_init_modules();
		neb_init_callback_list();
#endif
		timing_point("NEB module API initialized\n");

		/* handle signals (interrupts) before we do any socket I/O */
		setup_sighandler();

		/*
		 * Initialize query handler and event subscription service.
		 * This must be done before modules are initialized, so
		 * the modules can use our in-core stuff properly
		 */
		if (qh_init(qh_socket_path ? qh_socket_path : DEFAULT_QUERY_SOCKET) != OK) {
			logit(NSLOG_RUNTIME_ERROR, TRUE, "Error: Failed to initialize query handler. Aborting\n");
			exit(EXIT_FAILURE);
		}
		timing_point("Query handler initialized\n");
		nerd_init();
		timing_point("NERD initialized\n");

		/* initialize check workers */
		if (init_workers(num_check_workers) < 0) {
			logit(NSLOG_RUNTIME_ERROR, TRUE, "Failed to spawn workers. Aborting\n");
			exit(EXIT_FAILURE);
		}
		timing_point("%u workers spawned\n", wproc_num_workers_spawned);
		i = 0;
		while (i < 50 && wproc_num_workers_online < wproc_num_workers_spawned) {
			iobroker_poll(nagios_iobs, 50);
			i++;
		}
		timing_point("%u workers connected\n", wproc_num_workers_online);

		/* now that workers have arrived we can set the defaults */
		set_loadctl_defaults();

		/* read in all object config data */
		if (result == OK)
			result = read_all_object_data(config_file);

		/*
		 * the queue has to be initialized befor loading the neb modules
		 * to give them the chance to register user events.
		 * (initializing event queue requires number of objects, so do
		 * this after parsing the objects)
		 */
		init_event_queue();
		timing_point("Event queue initialized\n");

#ifdef USE_EVENT_BROKER
		/* load modules */
		if (neb_load_all_modules() != OK) {
			logit(NSLOG_CONFIG_ERROR, ERROR, "Error: Module loading failed. Aborting.\n");
			/* if we're dumping core, we must remove all dl-files */
			if (daemon_dumps_core)
				neb_unload_all_modules(NEBMODULE_FORCE_UNLOAD, NEBMODULE_NEB_SHUTDOWN);
			exit(EXIT_FAILURE);
		}
		timing_point("Modules loaded\n");

		/* send program data to broker */
		broker_program_state(NEBTYPE_PROCESS_PRELAUNCH, NEBFLAG_NONE, NEBATTR_NONE, NULL);
		timing_point("First callback made\n");
#endif

		/* there was a problem reading the config files */
		if (result != OK)
			logit(NSLOG_PROCESS_INFO | NSLOG_RUNTIME_ERROR | NSLOG_CONFIG_ERROR, TRUE, "Bailing out due to one or more errors encountered in the configuration files. Run Naemon from the command line with the -v option to verify your config before restarting. (PID=%d)", (int)getpid());

		else {

			/* run the pre-flight check to make sure everything looks okay*/
			if ((result = pre_flight_check()) != OK)
				logit(NSLOG_PROCESS_INFO | NSLOG_RUNTIME_ERROR | NSLOG_VERIFICATION_ERROR, TRUE, "Bailing out due to errors encountered while running the pre-flight check.  Run Naemon from the command line with the -v option to verify your config before restarting. (PID=%d)\n", (int)getpid());
		}

		/* an error occurred that prevented us from (re)starting */
		if (result != OK) {

			/* if we were restarting, we need to cleanup from the previous run */
			if (sigrestart == TRUE) {

				/* clean up the status data */
				cleanup_status_data(TRUE);
			}

#ifdef USE_EVENT_BROKER
			/* send program data to broker */
			broker_program_state(NEBTYPE_PROCESS_SHUTDOWN, NEBFLAG_PROCESS_INITIATED, NEBATTR_SHUTDOWN_ABNORMAL, NULL);
#endif
			cleanup();
			exit(ERROR);
		}

		timing_point("Object configuration parsed and understood\n");

		/* write the objects.cache file */
		fcache_objects(object_cache_file);
		timing_point("Objects cached\n");

#ifdef USE_EVENT_BROKER
		/* send program data to broker */
		broker_program_state(NEBTYPE_PROCESS_START, NEBFLAG_NONE, NEBATTR_NONE, NULL);
#endif

		/* initialize status data unless we're starting */
		if (sigrestart == FALSE) {
			initialize_status_data(config_file);
			timing_point("Status data initialized\n");
		}

		/* initialize scheduled downtime data */
		initialize_downtime_data();
		timing_point("Downtime data initialized\n");

		/* read initial service and host state information  */
		initialize_retention_data(config_file);
		timing_point("Retention data initialized\n");
		read_initial_state_information();
		timing_point("Initial state information read\n");

		/* initialize comment data */
		initialize_comment_data();
		timing_point("Comment data initialized\n");

		/* initialize performance data */
		initialize_performance_data(config_file);
		timing_point("Performance data initialized\n");

		/* initialize the event timing loop */
		init_timing_loop();
		timing_point("Event timing loop initialized\n");

		/* initialize check statistics */
		init_check_stats();
		timing_point("check stats initialized\n");

		/* update all status data (with retained information) */
		update_all_status_data();
		timing_point("Status data updated\n");

		/* log initial host and service state */
		log_host_states(INITIAL_STATES, NULL);
		log_service_states(INITIAL_STATES, NULL);
		timing_point("Initial states logged\n");

		/* reset the restart flag */
		sigrestart = FALSE;

		/* fire up command file worker */
		launch_command_file_worker();
		timing_point("Command file worker launched\n");

#ifdef USE_EVENT_BROKER
		/* send program data to broker */
		broker_program_state(NEBTYPE_PROCESS_EVENTLOOPSTART, NEBFLAG_NONE, NEBATTR_NONE, NULL);
#endif

		/* get event start time and save as macro */
		event_start = time(NULL);
		my_free(mac->x[MACRO_EVENTSTARTTIME]);
		asprintf(&mac->x[MACRO_EVENTSTARTTIME], "%lu", (unsigned long)event_start);

		timing_point("Entering event execution loop\n");
		/***** start monitoring all services *****/
		/* (doesn't return until a restart or shutdown signal is encountered) */
		event_execution_loop();

		/*
		 * immediately deinitialize the query handler so it
		 * can remove modules that have stashed data with it
		 */
		qh_deinit(qh_socket_path ? qh_socket_path : DEFAULT_QUERY_SOCKET);

		/* 03/01/2007 EG Moved from sighandler() to prevent FUTEX locking problems under NPTL */
		/* 03/21/2007 EG SIGSEGV signals are still logged in sighandler() so we don't loose them */
		/* did we catch a signal? */
		if (caught_signal == TRUE) {

			if (sig_id == SIGHUP)
				logit(NSLOG_PROCESS_INFO, TRUE, "Caught SIGHUP, restarting...\n");

		}

#ifdef USE_EVENT_BROKER
		/* send program data to broker */
		broker_program_state(NEBTYPE_PROCESS_EVENTLOOPEND, NEBFLAG_NONE, NEBATTR_NONE, NULL);
		if (sigshutdown == TRUE)
			broker_program_state(NEBTYPE_PROCESS_SHUTDOWN, NEBFLAG_USER_INITIATED, NEBATTR_SHUTDOWN_NORMAL, NULL);
		else if (sigrestart == TRUE)
			broker_program_state(NEBTYPE_PROCESS_RESTART, NEBFLAG_USER_INITIATED, NEBATTR_RESTART_NORMAL, NULL);
#endif

		disconnect_command_file_worker();

		/* save service and host state information */
		save_state_information(FALSE);
		cleanup_retention_data();

		/* clean up performance data */
		cleanup_performance_data();

		/* clean up the scheduled downtime data */
		cleanup_downtime_data();

		/* clean up the status data unless we're restarting */
		if (sigrestart == FALSE) {
			cleanup_status_data(TRUE);
		}

		free_worker_memory(WPROC_FORCE);
		/* shutdown stuff... */
		if (sigshutdown == TRUE) {
			iobroker_destroy(nagios_iobs, IOBROKER_CLOSE_SOCKETS);
			nagios_iobs = NULL;

			/* log a shutdown message */
			logit(NSLOG_PROCESS_INFO, TRUE, "Successfully shutdown... (PID=%d)\n", (int)getpid());
		}

		/* clean up after ourselves */
		cleanup();

		/* close debug log */
		close_debug_log();

	} while (sigrestart == TRUE && sigshutdown == FALSE);

	if (daemon_mode == TRUE)
		unlink(lock_file);

	/* free misc memory */
	my_free(lock_file);
	my_free(config_file);
	my_free(config_file_dir);
	my_free(naemon_binary_path);

	return OK;
}
static int rte_init(void)
{
    int ret;
    char *error = NULL;
    char *contact_path, *jobfam_dir;
    orte_job_t *jdata;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_app_context_t *app;
    char **aliases, *aptr;

    /* run the prolog */
    if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
        error = "orte_ess_base_std_prolog";
        goto error;
    }

    /* setup callback for SIGPIPE */
    setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback);
    /** setup callbacks for abort signals - from this point
     * forward, we need to abort in a manner that allows us
     * to cleanup. However, we cannot directly use libevent
     * to trap these signals as otherwise we cannot respond
     * to them if we are stuck in an event! So instead use
     * the basic POSIX trap functions to handle the signal,
     * and then let that signal handler do some magic to
     * avoid the hang
     *
     * NOTE: posix traps don't allow us to do anything major
     * in them, so use a pipe tied to a libevent event to
     * reach a "safe" place where the termination event can
     * be created
     */
    pipe(term_pipe);
    /* setup an event to attempt normal termination on signal */
    opal_event_set(orte_event_base, &term_handler, term_pipe[0], OPAL_EV_READ, clean_abort, NULL);
    opal_event_set_priority(&term_handler, ORTE_ERROR_PRI);
    opal_event_add(&term_handler, NULL);

    /* Set both ends of this pipe to be close-on-exec so that no
       children inherit it */
    if (opal_fd_set_cloexec(term_pipe[0]) != OPAL_SUCCESS ||
        opal_fd_set_cloexec(term_pipe[1]) != OPAL_SUCCESS) {
        error = "unable to set the pipe to CLOEXEC";
        goto error;
    }

    /* point the signal trap to a function that will activate that event */
    signal(SIGTERM, abort_signal_callback);
    signal(SIGINT, abort_signal_callback);
    signal(SIGHUP, abort_signal_callback);

    /** setup callbacks for signals we should foward */
    setup_sighandler(SIGUSR1, &sigusr1_handler, signal_forward_callback);
    setup_sighandler(SIGUSR2, &sigusr2_handler, signal_forward_callback);
    setup_sighandler(SIGTSTP, &sigtstp_handler, signal_forward_callback);
    setup_sighandler(SIGCONT, &sigcont_handler, signal_forward_callback);
    signals_set = true;

#if OPAL_HAVE_HWLOC
    {
        hwloc_obj_t obj;
        unsigned i, j;

        /* get the local topology */
        if (NULL == opal_hwloc_topology) {
            if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
                error = "topology discovery";
                goto error;
            }
        }

        /* remove the hostname from the topology. Unfortunately, hwloc
         * decided to add the source hostname to the "topology", thus
         * rendering it unusable as a pure topological description. So
         * we remove that information here.
         */
        obj = hwloc_get_root_obj(opal_hwloc_topology);
        for (i=0; i < obj->infos_count; i++) {
            if (NULL == obj->infos[i].name ||
                NULL == obj->infos[i].value) {
                continue;
            }
            if (0 == strncmp(obj->infos[i].name, "HostName", strlen("HostName"))) {
                free(obj->infos[i].name);
                free(obj->infos[i].value);
                /* left justify the array */
                for (j=i; j < obj->infos_count-1; j++) {
                    obj->infos[j] = obj->infos[j+1];
                }
                obj->infos[obj->infos_count-1].name = NULL;
                obj->infos[obj->infos_count-1].value = NULL;
                obj->infos_count--;
                break;
            }
        }

        if (4 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) {
            opal_output(0, "%s Topology Info:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO);
        }
    }
#endif

    /* if we are using xml for output, put an mpirun start tag */
    if (orte_xml_output) {
        fprintf(orte_xml_fp, "<mpirun>\n");
        fflush(orte_xml_fp);
    }

    /* open and setup the opal_pstat framework so we can provide
     * process stats if requested
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_pstat_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "opal_pstat_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "opal_pstat_base_select";
        goto error;
    }
  
    /* open and setup the state machine */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_state_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_state_base_select";
        goto error;
    }

    /* open the errmgr */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_errmgr_base_open";
        goto error;
    }

    /* Since we are the HNP, then responsibility for
     * defining the name falls to the PLM component for our
     * respective environment - hence, we have to open the PLM
     * first and select that component.
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_plm_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_plm_base_open";
        goto error;
    }
    
    if (ORTE_SUCCESS != (ret = orte_plm_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_plm_base_select";
        goto error;
    }
    /* if we were spawned by a singleton, our jobid was given to us */
    if (NULL != orte_ess_base_jobid) {
        if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_jobid(&ORTE_PROC_MY_NAME->jobid, orte_ess_base_jobid))) {
            ORTE_ERROR_LOG(ret);
            error = "convert_string_to_jobid";
            goto error;
        }
        ORTE_PROC_MY_NAME->vpid = 0;
    } else {
        if (ORTE_SUCCESS != (ret = orte_plm.set_hnp_name())) {
            ORTE_ERROR_LOG(ret);
            error = "orte_plm_set_hnp_name";
            goto error;
        }
    }
    /* Setup the communication infrastructure */
    
    /*
     * OOB Layer
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_oob_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_oob_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_oob_base_select";
        goto error;
    }

    /*
     * Runtime Messaging Layer
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_select";
        goto error;
    }

    if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_errmgr_base_select";
        goto error;
    }
    
    /* setup the global job and node arrays */
    orte_job_data = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data,
                                                       1,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       1))) {
        ORTE_ERROR_LOG(ret);
        error = "setup job array";
        goto error;
    }
    
    orte_node_pool = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node array";
        goto error;
    }
    orte_node_topologies = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_topologies,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node topologies array";
        goto error;
    }

    /* init the nidmap - just so we register that verbosity */
    orte_util_nidmap_init(NULL);

    /* Setup the job data object for the daemons */        
    /* create and store the job data object */
    jdata = OBJ_NEW(orte_job_t);
    jdata->jobid = ORTE_PROC_MY_NAME->jobid;
    opal_pointer_array_set_item(orte_job_data, 0, jdata);
    /* mark that the daemons have reported as we are the
     * only ones in the system right now, and we definitely
     * are running!
     */
    jdata->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
   
    /* every job requires at least one app */
    app = OBJ_NEW(orte_app_context_t);
    opal_pointer_array_set_item(jdata->apps, 0, app);
    jdata->num_apps++;

    /* create and store a node object where we are */
    node = OBJ_NEW(orte_node_t);
    node->name = strdup(orte_process_info.nodename);
    node->index = opal_pointer_array_set_item(orte_node_pool, 0, node);
#if OPAL_HAVE_HWLOC
    /* add it to the array of known topologies */
    opal_pointer_array_add(orte_node_topologies, opal_hwloc_topology);
#endif

    /* create and store a proc object for us */
    proc = OBJ_NEW(orte_proc_t);
    proc->name.jobid = ORTE_PROC_MY_NAME->jobid;
    proc->name.vpid = ORTE_PROC_MY_NAME->vpid;
    
    proc->pid = orte_process_info.pid;
    proc->rml_uri = orte_rml.get_contact_info();
    proc->state = ORTE_PROC_STATE_RUNNING;
    OBJ_RETAIN(node);  /* keep accounting straight */
    proc->node = node;
    opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc);

    /* record that the daemon (i.e., us) is on this node 
     * NOTE: we do not add the proc object to the node's
     * proc array because we are not an application proc.
     * Instead, we record it in the daemon field of the
     * node object
     */
    OBJ_RETAIN(proc);   /* keep accounting straight */
    node->daemon = proc;
    ORTE_FLAG_SET(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED);
    node->state = ORTE_NODE_STATE_UP;
    
    /* if we are to retain aliases, get ours */
    if (orte_retain_aliases) {
        aliases = NULL;
        opal_ifgetaliases(&aliases);
        /* add our own local name to it */
        opal_argv_append_nosize(&aliases, orte_process_info.nodename);
        aptr = opal_argv_join(aliases, ',');
        opal_argv_free(aliases);
        orte_set_attribute(&node->attributes, ORTE_NODE_ALIAS, ORTE_ATTR_LOCAL, aptr, OPAL_STRING);
        free(aptr);
    }

    /* record that the daemon job is running */
    jdata->num_procs = 1;
    jdata->state = ORTE_JOB_STATE_RUNNING;
    /* obviously, we have "reported" */
    jdata->num_reported = 1;

    /*
     * Routed system
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_routed_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed_base_select";
        goto error;
    }
    
    /* datastore - ensure we don't pickup the pmi component, but
     * don't override anything set by user
     */
    if (NULL == getenv("OMPI_MCA_dstore")) {
        putenv("OMPI_MCA_dstore=^pmi");
    }
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_dstore_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "opal_dstore_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_dstore_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "opal_dstore_base_select";
        goto error;
    }
    /* create the handles */
    if (0 > (opal_dstore_peer = opal_dstore.open("PEER"))) {
        error = "opal dstore global";
        ret = ORTE_ERR_FATAL;
        goto error;
    }
    if (0 > (opal_dstore_internal = opal_dstore.open("INTERNAL"))) {
        error = "opal dstore internal";
        ret = ORTE_ERR_FATAL;
        goto error;
    }
    if (0 > (opal_dstore_nonpeer = opal_dstore.open("NONPEER"))) {
        error = "opal dstore nonpeer";
        ret = ORTE_ERR_FATAL;
        goto error;
    }

    /*
     * Group communications
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_select";
        goto error;
    }

    /* Now provide a chance for the PLM
     * to perform any module-specific init functions. This
     * needs to occur AFTER the communications are setup
     * as it may involve starting a non-blocking recv
     */
    if (ORTE_SUCCESS != (ret = orte_plm.init())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_plm_init";
        goto error;
    }

    /*
     * Setup the remaining resource
     * management and errmgr frameworks - application procs
     * and daemons do not open these frameworks as they only use
     * the hnp proxy support in the PLM framework.
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_ras_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_ras_base_open";
        goto error;
    }    
    if (ORTE_SUCCESS != (ret = orte_ras_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_ras_base_find_available";
        goto error;
    }
    
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rmaps_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rmaps_base_open";
        goto error;
    }    
    if (ORTE_SUCCESS != (ret = orte_rmaps_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rmaps_base_find_available";
        goto error;
    }
#if OPAL_HAVE_HWLOC
    {
        char *coprocessors, **sns;
        uint32_t h;
        int idx;

        /* if a topology file was given, then the rmaps framework open
         * will have reset our topology. Ensure we always get the right
         * one by setting our node topology afterwards
         */
        node->topology = opal_hwloc_topology;

        /* init the hash table, if necessary */
        if (NULL == orte_coprocessors) {
            orte_coprocessors = OBJ_NEW(opal_hash_table_t);
            opal_hash_table_init(orte_coprocessors, orte_process_info.num_procs);
        }
        /* detect and add any coprocessors */
        coprocessors = opal_hwloc_base_find_coprocessors(opal_hwloc_topology);
        if (NULL != coprocessors) {
            /* separate the serial numbers of the coprocessors
             * on this host
             */
            sns = opal_argv_split(coprocessors, ',');
            for (idx=0; NULL != sns[idx]; idx++) {
                /* compute the hash */
                OPAL_HASH_STR(sns[idx], h);
                /* mark that this coprocessor is hosted by this node */
                opal_hash_table_set_value_uint32(orte_coprocessors, h, (void*)&(ORTE_PROC_MY_NAME->vpid));
            }
            opal_argv_free(sns);
            free(coprocessors);
            orte_coprocessors_detected = true;
        }
        /* see if I am on a coprocessor */
        coprocessors = opal_hwloc_base_check_on_coprocessor();
        if (NULL != coprocessors) {
            orte_set_attribute(&node->attributes, ORTE_NODE_SERIAL_NUMBER, ORTE_ATTR_LOCAL, coprocessors, OPAL_STRING);
            free(coprocessors);
            orte_coprocessors_detected = true;
        }
    }
#endif

    /* Open/select the odls */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_odls_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_odls_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_select";
        goto error;
    }
    
    /* Open/select the rtc */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rtc_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rtc_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rtc_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rtc_base_select";
        goto error;
    }
    
    /* enable communication with the rml */
    if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml.enable_comm";
        goto error;
    }

    /* we are an hnp, so update the contact info field for later use */
    orte_process_info.my_hnp_uri = orte_rml.get_contact_info();
    proc->rml_uri = strdup(orte_process_info.my_hnp_uri);

    /* we are also officially a daemon, so better update that field too */
    orte_process_info.my_daemon_uri = strdup(orte_process_info.my_hnp_uri);
    
    /* setup the orte_show_help system to recv remote output */
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_SHOW_HELP,
                            ORTE_RML_PERSISTENT, orte_show_help_recv, NULL);

    /* setup my session directory */
    if (orte_create_session_dirs) {
        OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
                             "%s setting up session dir with\n\ttmpdir: %s\n\thost %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base,
                             orte_process_info.nodename));
        
        /* take a pass thru the session directory code to fillin the
         * tmpdir names - don't create anything yet
         */
        if (ORTE_SUCCESS != (ret = orte_session_dir(false,
                                                    orte_process_info.tmpdir_base,
                                                    orte_process_info.nodename, NULL,
                                                    ORTE_PROC_MY_NAME))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_session_dir define";
            goto error;
        }
        /* clear the session directory just in case there are
         * stale directories laying around
         */
        orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);

        /* now actually create the directory tree */
        if (ORTE_SUCCESS != (ret = orte_session_dir(true,
                                                    orte_process_info.tmpdir_base,
                                                    orte_process_info.nodename, NULL,
                                                    ORTE_PROC_MY_NAME))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_session_dir";
            goto error;
        }
        
        /* Once the session directory location has been established, set
           the opal_output hnp file location to be in the
           proc-specific session directory. */
        opal_output_set_output_file_info(orte_process_info.proc_session_dir,
                                         "output-", NULL, NULL);
        
        /* save my contact info in a file for others to find */
        jobfam_dir = opal_dirname(orte_process_info.job_session_dir);
        contact_path = opal_os_path(false, jobfam_dir, "contact.txt", NULL);
        free(jobfam_dir);
        
        OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
                             "%s writing contact file %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             contact_path));
        
        if (ORTE_SUCCESS != (ret = orte_write_hnp_contact_file(contact_path))) {
            OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
                                 "%s writing contact file failed with error %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_ERROR_NAME(ret)));
        } else {
            OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
                                 "%s wrote contact file",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        }
        free(contact_path);
    }

    /* setup the routed info - the selected routed component
     * will know what to do. 
     */
    if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed.init_routes";
        goto error;
    }
    
    /* setup I/O forwarding system - must come after we init routes */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_iof_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_iof_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_iof_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_iof_base_select";
        goto error;
    }
    
    /* setup the FileM */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_filem_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_filem_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_select";
        goto error;
    }

#if OPAL_ENABLE_FT_CR == 1
    /*
     * Setup the SnapC
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_snapc_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_snapc_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_sstore_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_sstore_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_snapc_base_select(ORTE_PROC_IS_HNP, ORTE_PROC_IS_APP))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_snapc_base_select";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_sstore_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_sstore_base_select";
        goto error;
    }

    /* For HNP, ORTE doesn't need the OPAL CR stuff */
    opal_cr_set_enabled(false);
#else
    opal_cr_set_enabled(false);
#endif

    /*
     * Initalize the CR setup
     * Note: Always do this, even in non-FT builds.
     * If we don't some user level tools may hang.
     */
    if (ORTE_SUCCESS != (ret = orte_cr_init())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_cr_init";
        goto error;
    }
    
    /* setup the dfs framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_dfs_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_select";
        goto error;
    }

    /* if a tool has launched us and is requesting event reports,
     * then set its contact info into the comm system
     */
    if (orte_report_events) {
        if (ORTE_SUCCESS != (ret = orte_util_comm_connect_tool(orte_report_events_uri))) {
            error = "could not connect to tool";
            goto error;
        }
    }

    /* We actually do *not* want an HNP to voluntarily yield() the
       processor more than necessary.  Orterun already blocks when
       it is doing nothing, so it doesn't use any more CPU cycles than
       it should; but when it *is* doing something, we do not want it
       to be unnecessarily delayed because it voluntarily yielded the
       processor in the middle of its work.
     
       For example: when a message arrives at orterun, we want the
       OS to wake us up in a timely fashion (which most OS's
       seem good about doing) and then we want orterun to process
       the message as fast as possible.  If orterun yields and lets
       aggressive MPI applications get the processor back, it may be a
       long time before the OS schedules orterun to run again
       (particularly if there is no IO event to wake it up).  Hence,
       routed OOB messages (for example) may be significantly delayed
       before being delivered to MPI processes, which can be
       problematic in some scenarios (e.g., COMM_SPAWN, BTL's that
       require OOB messages for wireup, etc.). */
    opal_progress_set_yield_when_idle(false);

    return ORTE_SUCCESS;

 error:
    if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) {
        orte_show_help("help-orte-runtime.txt",
                       "orte_init:startup:internal-failure",
                       true, error, ORTE_ERROR_NAME(ret), ret);
    }
    
    return ORTE_ERR_SILENT;
}
Exemple #7
0
int
lock_mtab(void)
{
  int tries = 100000, i;
  char *linktargetfile;
  size_t l;

  /*
   * Redhat's original code set a signal handler called "handler()" for all
   * non-ALRM signals.  The handler called unlock_mntlist(), plog'ed the
   * signal name, and then exit(1)!  Never, ever, exit() from inside a
   * utility function.  This messed up Amd's careful signal-handling code,
   * and caused Amd to abort uncleanly only any other "innocent" signal
   * (even simple SIGUSR1), leaving behind a hung Amd mnt point.  That code
   * should have at least restored the signal handlers' states upon a
   * successful mtab unlocking.  Anyway, that handler was unnecessary,
   * because will call unlock_mntlist() properly anyway on exit.
   */
  setup_sighandler(SIGALRM, setlkw_timeout);

  /* somewhat clumsy, but some ancient systems do not have snprintf() */
  /* use 20 as upper bound for the length of %d output */
  l = strlen(MOUNTLOCK_LINKTARGET) + 20;
  linktargetfile = xmalloc(l);
  xsnprintf(linktargetfile, l, MOUNTLOCK_LINKTARGET, getpid());

  i = open(linktargetfile, O_WRONLY|O_CREAT, 0);
  if (i < 0) {
    int errsv = errno;
    /*
     * linktargetfile does not exist (as a file) and we cannot create
     * it. Read-only filesystem?  Too many files open in the system?
     * Filesystem full?
     */
    plog(XLOG_ERROR, "can't create lock file %s: %s (use -n flag to override)",
	 linktargetfile, strerror(errsv));
  }
  close(i);


  /* Repeat until it was us who made the link */
  while (!we_created_lockfile) {
    struct flock flock;
    int errsv, j;

    j = link(linktargetfile, MOUNTED_LOCK);
    errsv = errno;

    if (j < 0 && errsv != EEXIST) {
      (void) unlink(linktargetfile);
      plog(XLOG_ERROR, "can't link lock file %s: %s ",
	   MOUNTED_LOCK, strerror(errsv));
      return 0;
    }

    lockfile_fd = open(MOUNTED_LOCK, O_WRONLY);
    if (lockfile_fd < 0) {
      int errsv = errno;
      /* Strange... Maybe the file was just deleted? */
      if (errno == ENOENT && tries-- > 0) {
	if (tries % 200 == 0)
	  usleep(30);
	continue;
      }
      (void) unlink(linktargetfile);
      plog(XLOG_ERROR,"can't open lock file %s: %s ",
	   MOUNTED_LOCK, strerror(errsv));
      return 0;
    }

    flock.l_type = F_WRLCK;
    flock.l_whence = SEEK_SET;
    flock.l_start = 0;
    flock.l_len = 0;

    if (j == 0) {
      /* We made the link. Now claim the lock. */
      if (fcntl(lockfile_fd, F_SETLK, &flock) == -1) {
	int errsv = errno;
	plog(XLOG_ERROR, "Can't lock lock file %s: %s",
	     MOUNTED_LOCK, strerror(errsv));
	/* proceed, since it was us who created the lockfile anyway */
      }
      we_created_lockfile = 1;
      (void) unlink(linktargetfile);
    } else {
      static int tries = 0;

      /* Someone else made the link. Wait. */
      alarm(LOCK_TIMEOUT);

      if (fcntl(lockfile_fd, F_SETLKW, &flock) == -1) {
	int errsv = errno;
	(void) unlink(linktargetfile);
	plog(XLOG_ERROR, "can't lock lock file %s: %s",
	     MOUNTED_LOCK, (errno == EINTR) ?
	     "timed out" : strerror(errsv));
	return 0;
      }
      alarm(0);
      /*
       * Limit the number of iterations - maybe there
       * still is some old /etc/mtab~
       */
      ++tries;
      if (tries % 200 == 0)
	usleep(30);
      if (tries > 100000) {
	(void) unlink(linktargetfile);
	close(lockfile_fd);
	plog(XLOG_ERROR,
	     "Cannot create link %s; Perhaps there is a stale lock file?",
	     MOUNTED_LOCK);
      }
      close(lockfile_fd);
    }
  }
  return 1;
}
Exemple #8
0
int main(int argc, char **argv, char **env) {
    int result;
    int error = FALSE;
    char *buffer = NULL;
    int display_license = FALSE;
    int display_help = FALSE;
    int c = 0;
    struct tm *tm, tm_s;
    time_t now;
    char datestring[256];
    nagios_macros *mac;

#ifdef HAVE_GETOPT_H
    int option_index = 0;
    static struct option long_options[] = {
        {"help", no_argument, 0, 'h'},
        {"version", no_argument, 0, 'V'},
        {"license", no_argument, 0, 'V'},
        {"verify-config", no_argument, 0, 'v'},
        {"daemon", no_argument, 0, 'd'},
        {"test-scheduling", no_argument, 0, 's'},
        {"precache-objects", no_argument, 0, 'p'},
        {"use-precached-objects", no_argument, 0, 'u'},
        {"enable-timing-point", no_argument, 0, 'T'},
        {0, 0, 0, 0}
    };
#define getopt(argc, argv, o) getopt_long(argc, argv, o, long_options, &option_index)
#endif

    mac = get_global_macros();

    /* make sure we have the correct number of command line arguments */
    if(argc < 2)
        error = TRUE;

    /* get all command line arguments */
    while(1) {
        c = getopt(argc, argv, "+hVvdspuxT");

        if(c == -1 || c == EOF)
            break;

        switch(c) {

        case '?': /* usage */
        case 'h':
            display_help = TRUE;
            break;

        case 'V': /* version */
            display_license = TRUE;
            break;

        case 'v': /* verify */
            verify_config++;
            break;

        case 's': /* scheduling check */
            test_scheduling = TRUE;
            break;

        case 'd': /* daemon mode */
            daemon_mode = TRUE;
            break;

        case 'p': /* precache object config */
            precache_objects = TRUE;
            break;

        case 'u': /* use precached object config */
            use_precached_objects = TRUE;
            break;
        case 'T':
            enable_timing_point = TRUE;
            break;

        case 'x':
            printf("Warning: -x is deprecated and will be removed\n");
            break;

        default:
            break;
        }

    }

#ifdef DEBUG_MEMORY
    mtrace();
#endif

    if(daemon_mode == FALSE) {
        printf("\nNagios Core %s\n", PROGRAM_VERSION);
        printf("Copyright (c) 2009-2011 Nagios Core Development Team and Community Contributors\n");
        printf("Copyright (c) 1999-2009 Ethan Galstad\n");
        printf("Last Modified: %s\n", PROGRAM_MODIFICATION_DATE);
        printf("License: GPL\n\n");
        printf("Website: http://www.nagios.org\n");
    }

    /* just display the license */
    if(display_license == TRUE) {

        printf("This program is free software; you can redistribute it and/or modify\n");
        printf("it under the terms of the GNU General Public License version 2 as\n");
        printf("published by the Free Software Foundation.\n\n");
        printf("This program is distributed in the hope that it will be useful,\n");
        printf("but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
        printf("MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n");
        printf("GNU General Public License for more details.\n\n");
        printf("You should have received a copy of the GNU General Public License\n");
        printf("along with this program; if not, write to the Free Software\n");
        printf("Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.\n\n");

        exit(OK);
    }

    /* make sure we got the main config file on the command line... */
    if(optind >= argc)
        error = TRUE;

    /* if there are no command line options (or if we encountered an error), print usage */
    if(error == TRUE || display_help == TRUE) {

        printf("Usage: %s [options] <main_config_file>\n", argv[0]);
        printf("\n");
        printf("Options:\n");
        printf("\n");
        printf("  -v, --verify-config          Verify all configuration data (-v -v for more info)\n");
        printf("  -s, --test-scheduling        Shows projected/recommended check scheduling and other\n");
        printf("                               diagnostic info based on the current configuration files.\n");
        printf("  -T, --enable-timing-point    Enable timed commentary on initialization\n");
        /*printf("  -o, --dont-verify-objects    Don't verify object relationships - USE WITH CAUTION!\n");*/
        printf("  -x, --dont-verify-paths      Don't check for circular object paths - USE WITH CAUTION!\n");
        printf("  -p, --precache-objects       Precache object configuration\n");
        printf("  -u, --use-precached-objects  Use precached object config file\n");
        printf("  -d, --daemon                 Starts Nagios in daemon mode, instead of as a foreground process\n");
        printf("\n");
        printf("Visit the Nagios website at http://www.nagios.org/ for bug fixes, new\n");
        printf("releases, online documentation, FAQs, information on subscribing to\n");
        printf("the mailing lists, and commercial support options for Nagios.\n");
        printf("\n");

        exit(ERROR);
    }


    /* config file is last argument specified */
    config_file = (char *)strdup(argv[optind]);
    if(config_file == NULL) {
        printf("Error allocating memory.\n");
        exit(ERROR);
    }

    /* make sure the config file uses an absolute path */
    if(config_file[0] != '/') {

        /* save the name of the config file */
        buffer = (char *)strdup(config_file);

        /* reallocate a larger chunk of memory */
        config_file = (char *)realloc(config_file, MAX_FILENAME_LENGTH);
        if(config_file == NULL) {
            printf("Error allocating memory.\n");
            exit(ERROR);
        }

        /* get absolute path of current working directory */
        getcwd(config_file, MAX_FILENAME_LENGTH);

        /* append a forward slash */
        strncat(config_file, "/", 1);
        config_file[MAX_FILENAME_LENGTH - 1] = '\x0';

        /* append the config file to the path */
        strncat(config_file, buffer, MAX_FILENAME_LENGTH - strlen(config_file) - 1);
        config_file[MAX_FILENAME_LENGTH - 1] = '\x0';

        my_free(buffer);
    }

    /*
     * let's go to town. We'll be noisy if we're verifying config
     * or running scheduling tests.
     */
    if(verify_config || test_scheduling || precache_objects) {
        reset_variables();

        if(verify_config)
            printf("Reading configuration data...\n");

        /* read our config file */
        result = read_main_config_file(config_file);
        if(result != OK) {
            printf("   Error processing main config file!\n\n");
            exit(EXIT_FAILURE);
        }

        if(verify_config)
            printf("   Read main config file okay...\n");

        /* drop privileges */
        if((result = drop_privileges(nagios_user, nagios_group)) == ERROR) {
            printf("   Failed to drop privileges.  Aborting.");
            exit(EXIT_FAILURE);
        }

        /* read object config files */
        result = read_all_object_data(config_file);
        if(result != OK) {
            printf("   Error processing object config files!\n\n");
            /* if the config filename looks fishy, warn the user */
            if(!strstr(config_file, "nagios.cfg")) {
                printf("\n***> The name of the main configuration file looks suspicious...\n");
                printf("\n");
                printf("     Make sure you are specifying the name of the MAIN configuration file on\n");
                printf("     the command line and not the name of another configuration file.  The\n");
                printf("     main configuration file is typically '/usr/local/nagios/etc/nagios.cfg'\n");
            }

            printf("\n***> One or more problems was encountered while processing the config files...\n");
            printf("\n");
            printf("     Check your configuration file(s) to ensure that they contain valid\n");
            printf("     directives and data defintions.  If you are upgrading from a previous\n");
            printf("     version of Nagios, you should be aware that some variables/definitions\n");
            printf("     may have been removed or modified in this version.  Make sure to read\n");
            printf("     the HTML documentation regarding the config files, as well as the\n");
            printf("     'Whats New' section to find out what has changed.\n\n");
            exit(EXIT_FAILURE);
        }

        if(verify_config) {
            printf("   Read object config files okay...\n\n");
            printf("Running pre-flight check on configuration data...\n\n");
        }

        /* run the pre-flight check to make sure things look okay... */
        result = pre_flight_check();

        if(result != OK) {
            printf("\n***> One or more problems was encountered while running the pre-flight check...\n");
            printf("\n");
            printf("     Check your configuration file(s) to ensure that they contain valid\n");
            printf("     directives and data defintions.  If you are upgrading from a previous\n");
            printf("     version of Nagios, you should be aware that some variables/definitions\n");
            printf("     may have been removed or modified in this version.  Make sure to read\n");
            printf("     the HTML documentation regarding the config files, as well as the\n");
            printf("     'Whats New' section to find out what has changed.\n\n");
            exit(EXIT_FAILURE);
        }

        if(verify_config) {
            printf("\nThings look okay - No serious problems were detected during the pre-flight check\n");
        }

        /* scheduling tests need a bit more than config verifications */
        if(test_scheduling == TRUE) {

            /* we'll need the event queue here so we can time insertions */
            init_event_queue();
            timing_point("Done initializing event queue\n");

            /* read initial service and host state information */
            initialize_retention_data(config_file);
            read_initial_state_information();
            timing_point("Retention data and initial state parsed\n");

            /* initialize the event timing loop */
            init_timing_loop();
            timing_point("Timing loop initialized\n");

            /* display scheduling information */
            display_scheduling_info();
        }

        if(precache_objects) {
            result = fcache_objects(object_precache_file);
            timing_point("Done precaching objects\n");
            if(result == OK) {
                printf("Object precache file created:\n%s\n", object_precache_file);
            }
            else {
                printf("Failed to precache objects to '%s': %s\n", object_precache_file, strerror(errno));
            }
        }

        /* clean up after ourselves */
        cleanup();

        /* exit */
        timing_point("Exiting\n");
        exit(result);
    }


    /* else start to monitor things... */
    else {

        nagios_iobs = iobroker_create();

        /* keep monitoring things until we get a shutdown command */
        do {

            init_event_queue();

            /* reset program variables */
            reset_variables();

            /* get PID */
            nagios_pid = (int)getpid();

            /* read in the configuration files (main and resource config files) */
            result = read_main_config_file(config_file);

            /* NOTE 11/06/07 EG moved to after we read config files, as user may have overridden timezone offset */
            /* get program (re)start time and save as macro */
            program_start = time(NULL);
            my_free(mac->x[MACRO_PROCESSSTARTTIME]);
            asprintf(&mac->x[MACRO_PROCESSSTARTTIME], "%lu", (unsigned long)program_start);

            /* open debug log */
            open_debug_log();

            /* drop privileges */
            if(drop_privileges(nagios_user, nagios_group) == ERROR) {

                logit(NSLOG_PROCESS_INFO | NSLOG_RUNTIME_ERROR | NSLOG_CONFIG_ERROR, TRUE, "Failed to drop privileges.  Aborting.");

                cleanup();
                exit(ERROR);
            }

#ifdef USE_EVENT_BROKER
            /* initialize modules */
            neb_init_modules();
            neb_init_callback_list();
#endif

            /* this must be logged after we read config data, as user may have changed location of main log file */
            logit(NSLOG_PROCESS_INFO, TRUE, "Nagios %s starting... (PID=%d)\n", PROGRAM_VERSION, (int)getpid());

            /* log the local time - may be different than clock time due to timezone offset */
            now = time(NULL);
            tm = localtime_r(&now, &tm_s);
            strftime(datestring, sizeof(datestring), "%a %b %d %H:%M:%S %Z %Y", tm);
            logit(NSLOG_PROCESS_INFO, TRUE, "Local time is %s", datestring);

            /* write log version/info */
            write_log_file_info(NULL);

#ifdef USE_EVENT_BROKER
            /* load modules */
            neb_load_all_modules();

            /* send program data to broker */
            broker_program_state(NEBTYPE_PROCESS_PRELAUNCH, NEBFLAG_NONE, NEBATTR_NONE, NULL);
#endif

            /* read in all object config data */
            if(result == OK)
                result = read_all_object_data(config_file);

            /* there was a problem reading the config files */
            if(result != OK)
                logit(NSLOG_PROCESS_INFO | NSLOG_RUNTIME_ERROR | NSLOG_CONFIG_ERROR, TRUE, "Bailing out due to one or more errors encountered in the configuration files. Run Nagios from the command line with the -v option to verify your config before restarting. (PID=%d)", (int)getpid());

            else {

                /* run the pre-flight check to make sure everything looks okay*/
                if((result = pre_flight_check()) != OK)
                    logit(NSLOG_PROCESS_INFO | NSLOG_RUNTIME_ERROR | NSLOG_VERIFICATION_ERROR, TRUE, "Bailing out due to errors encountered while running the pre-flight check.  Run Nagios from the command line with the -v option to verify your config before restarting. (PID=%d)\n", (int)getpid());
            }

            /* an error occurred that prevented us from (re)starting */
            if(result != OK) {

                /* if we were restarting, we need to cleanup from the previous run */
                if(sigrestart == TRUE) {

                    /* clean up the status data */
                    cleanup_status_data(config_file, TRUE);
                }

#ifdef USE_EVENT_BROKER
                /* send program data to broker */
                broker_program_state(NEBTYPE_PROCESS_SHUTDOWN, NEBFLAG_PROCESS_INITIATED, NEBATTR_SHUTDOWN_ABNORMAL, NULL);
#endif
                cleanup();
                exit(ERROR);
            }

            /* write the objects.cache file */
            fcache_objects(object_cache_file);

            /* handle signals (interrupts) */
            setup_sighandler();


#ifdef USE_EVENT_BROKER
            /* send program data to broker */
            broker_program_state(NEBTYPE_PROCESS_START, NEBFLAG_NONE, NEBATTR_NONE, NULL);
#endif

            /* enter daemon mode (unless we're restarting...) */
            if(daemon_mode == TRUE && sigrestart == FALSE) {

                result = daemon_init();

                /* we had an error daemonizing, so bail... */
                if(result == ERROR) {
                    logit(NSLOG_PROCESS_INFO | NSLOG_RUNTIME_ERROR, TRUE, "Bailing out due to failure to daemonize. (PID=%d)", (int)getpid());

#ifdef USE_EVENT_BROKER
                    /* send program data to broker */
                    broker_program_state(NEBTYPE_PROCESS_SHUTDOWN, NEBFLAG_PROCESS_INITIATED, NEBATTR_SHUTDOWN_ABNORMAL, NULL);
#endif
                    cleanup();
                    exit(ERROR);
                }

                asprintf(&buffer, "Finished daemonizing... (New PID=%d)\n", (int)getpid());
                write_to_all_logs(buffer, NSLOG_PROCESS_INFO);
                my_free(buffer);

                /* get new PID */
                nagios_pid = (int)getpid();
            }

            /* initialize status data unless we're starting */
            if(sigrestart == FALSE)
                initialize_status_data(config_file);

            /* read initial service and host state information  */
            initialize_retention_data(config_file);
            read_initial_state_information();

            /* initialize comment data */
            initialize_comment_data(config_file);

            /* initialize scheduled downtime data */
            initialize_downtime_data(config_file);

            /* initialize performance data */
            initialize_performance_data(config_file);

            /* initialize the event timing loop */
            init_timing_loop();

            /* initialize check statistics */
            init_check_stats();

            /* check for updates */
            check_for_nagios_updates(FALSE, TRUE);

            /* update all status data (with retained information) */
            update_all_status_data();

            /* log initial host and service state */
            log_host_states(INITIAL_STATES, NULL);
            log_service_states(INITIAL_STATES, NULL);

            /* reset the restart flag */
            sigrestart = FALSE;

            /* fire up command file worker */
            launch_command_file_worker();

            /* @TODO: get number of workers from config */
            init_workers(4);

#ifdef USE_EVENT_BROKER
            /* send program data to broker */
            broker_program_state(NEBTYPE_PROCESS_EVENTLOOPSTART, NEBFLAG_NONE, NEBATTR_NONE, NULL);
#endif

            /* get event start time and save as macro */
            event_start = time(NULL);
            my_free(mac->x[MACRO_EVENTSTARTTIME]);
            asprintf(&mac->x[MACRO_EVENTSTARTTIME], "%lu", (unsigned long)event_start);

            /***** start monitoring all services *****/
            /* (doesn't return until a restart or shutdown signal is encountered) */
            event_execution_loop();

            /* 03/01/2007 EG Moved from sighandler() to prevent FUTEX locking problems under NPTL */
            /* 03/21/2007 EG SIGSEGV signals are still logged in sighandler() so we don't loose them */
            /* did we catch a signal? */
            if(caught_signal == TRUE) {

                if(sig_id == SIGHUP)
                    logit(NSLOG_PROCESS_INFO, TRUE, "Caught SIGHUP, restarting...\n");

            }

#ifdef USE_EVENT_BROKER
            /* send program data to broker */
            broker_program_state(NEBTYPE_PROCESS_EVENTLOOPEND, NEBFLAG_NONE, NEBATTR_NONE, NULL);
            if(sigshutdown == TRUE)
                broker_program_state(NEBTYPE_PROCESS_SHUTDOWN, NEBFLAG_USER_INITIATED, NEBATTR_SHUTDOWN_NORMAL, NULL);
            else if(sigrestart == TRUE)
                broker_program_state(NEBTYPE_PROCESS_RESTART, NEBFLAG_USER_INITIATED, NEBATTR_RESTART_NORMAL, NULL);
#endif

            /* save service and host state information */
            save_state_information(FALSE);
            cleanup_retention_data(config_file);

            /* clean up performance data */
            cleanup_performance_data(config_file);

            /* clean up the scheduled downtime data */
            cleanup_downtime_data(config_file);

            /* clean up the status data unless we're restarting */
            if(sigrestart == FALSE) {
                cleanup_status_data(config_file, TRUE);
            }

            /* shutdown stuff... */
            if(sigshutdown == TRUE) {
                free_worker_memory(WPROC_FORCE);
                iobroker_destroy(nagios_iobs, IOBROKER_CLOSE_SOCKETS);

                /* make sure lock file has been removed - it may not have been if we received a shutdown command */
                if(daemon_mode == TRUE)
                    unlink(lock_file);

                /* log a shutdown message */
                logit(NSLOG_PROCESS_INFO, TRUE, "Successfully shutdown... (PID=%d)\n", (int)getpid());
            }

            /* clean up after ourselves */
            cleanup();

            /* close debug log */
            close_debug_log();

        }
        while(sigrestart == TRUE && sigshutdown == FALSE);

        /* free misc memory */
        my_free(config_file);
    }

    return OK;
}
int orte_ess_base_orted_setup(char **hosts)
{
    int ret = ORTE_ERROR;
    int fd;
    char log_file[PATH_MAX];
    char *jobidstring;
    char *error = NULL;
    orte_job_t *jdata;
    orte_proc_t *proc;
    orte_app_context_t *app;
    orte_node_t *node;
    char *param;

    plm_in_use = false;

    /* setup callback for SIGPIPE */
    setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback);
    /* Set signal handlers to catch kill signals so we can properly clean up
     * after ourselves. 
     */
    setup_sighandler(SIGTERM, &term_handler, shutdown_signal);
    setup_sighandler(SIGINT, &int_handler, shutdown_signal);
    
    /** setup callbacks for signals we should ignore */
    setup_sighandler(SIGUSR1, &sigusr1_handler, signal_callback);
    setup_sighandler(SIGUSR2, &sigusr2_handler, signal_callback);
    
    signals_set = true;
    
#if OPAL_HAVE_HWLOC
    {
        hwloc_obj_t obj;
        unsigned i, j;
        
        /* get the local topology */
        if (NULL == opal_hwloc_topology) {
            if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
                error = "topology discovery";
                goto error;
            }
        }
        
        /* remove the hostname from the topology. Unfortunately, hwloc
         * decided to add the source hostname to the "topology", thus
         * rendering it unusable as a pure topological description. So
         * we remove that information here.
         */
        obj = hwloc_get_root_obj(opal_hwloc_topology);
        for (i=0; i < obj->infos_count; i++) {
            if (NULL == obj->infos[i].name ||
                NULL == obj->infos[i].value) {
                continue;
            }
            if (0 == strncmp(obj->infos[i].name, "HostName", strlen("HostName"))) {
                free(obj->infos[i].name);
                free(obj->infos[i].value);
                /* left justify the array */
                for (j=i; j < obj->infos_count-1; j++) {
                    obj->infos[j] = obj->infos[j+1];
                }
                obj->infos[obj->infos_count-1].name = NULL;
                obj->infos[obj->infos_count-1].value = NULL;
                obj->infos_count--;
                break;
            }
        }
        
        if (4 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) {
            opal_output(0, "%s Topology Info:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO);
        }
    }
#endif
    
    /* setup the global nidmap/pidmap object */
    orte_nidmap.bytes = NULL;
    orte_nidmap.size = 0;
    orte_pidmap.bytes = NULL;
    orte_pidmap.size = 0;

    /* open and setup the opal_pstat framework so we can provide
     * process stats if requested
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_pstat_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "opal_pstat_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "opal_pstat_base_select";
        goto error;
    }
    
    /* open and setup the state machine */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_state_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_state_base_select";
        goto error;
    }
    
    /* open the errmgr */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_errmgr_base_open";
        goto error;
    }
    
    /* some environments allow remote launches - e.g., ssh - so
     * open and select something -only- if we are given
     * a specific module to use
     */
    (void) mca_base_var_env_name("plm", &param);
    
    plm_in_use = !!(getenv(param));
    free (param);

    if (plm_in_use)  {

        if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_plm_base_framework, 0))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_plm_base_open";
            goto error;
        }
        if (ORTE_SUCCESS != (ret = orte_plm_base_select())) {
            ORTE_ERROR_LOG(ret);
            error = "orte_plm_base_select";
            goto error;
        }
    }
    
    /* Setup the communication infrastructure */
        if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_oob_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_oob_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_oob_base_select";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_select";
        goto error;
    }
    
    /* select the errmgr */
    if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_errmgr_base_select";
        goto error;
    }
    
    /* Routed system */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_routed_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed_base_select";
        goto error;
    }
    
    /* database */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_db_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_db_base_open";
        goto error;
    }
    /* always restrict daemons to local database components */
    if (ORTE_SUCCESS != (ret = opal_db_base_select(true))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_db_base_select";
        goto error;
    }
    /* set our id */
    opal_db.set_id((opal_identifier_t*)ORTE_PROC_MY_NAME);

    /*
     * Group communications
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_select";
        goto error;
    }
    
    /* Open/select the odls */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_odls_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_odls_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_select";
        goto error;
    }
    
    /* enable communication with the rml */
    if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml.enable_comm";
        goto error;
    }
    
    /* initialize the nidmaps */
    if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_util_nidmap_init";
        goto error;
    }
#if ORTE_ENABLE_STATIC_PORTS
    /* if we are using static ports, then we need to setup
     * the daemon info so the RML can function properly
     * without requiring a wireup stage. This must be done
     * after we enable_comm as that function determines our
     * own port, which we need in order to construct the nidmap
     */
    if (orte_static_ports) {
        /* define the routing tree so we know the pattern
         * if we are trying to setup common or static ports
         */
        orte_routed.update_routing_plan();

        /* extract the node info from the environment and
         * build a nidmap from it
         */
        if (ORTE_SUCCESS != (ret = orte_util_build_daemon_nidmap(hosts))) {
            ORTE_ERROR_LOG(ret);
            error = "construct daemon map from static ports";
            goto error;
        }
    }
#endif
    /* be sure to update the routing tree so the initial "phone home"
     * to mpirun goes through the tree if static ports were enabled - still
     * need to do it anyway just to initialize things
     */
    orte_routed.update_routing_plan();
    
    /* Now provide a chance for the PLM
     * to perform any module-specific init functions. This
     * needs to occur AFTER the communications are setup
     * as it may involve starting a non-blocking recv
     * Do this only if a specific PLM was given to us - the
     * orted has no need of the proxy PLM at all
     */
    if (plm_in_use) {
        if (ORTE_SUCCESS != (ret = orte_plm.init())) {
            ORTE_ERROR_LOG(ret);
            error = "orte_plm_init";
            goto error;
        }
    }
    
    /* setup my session directory */
    if (orte_create_session_dirs) {
        OPAL_OUTPUT_VERBOSE((2, orte_ess_base_framework.framework_output,
                             "%s setting up session dir with\n\ttmpdir: %s\n\thost %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base,
                             orte_process_info.nodename));
        
        /* take a pass thru the session directory code to fillin the
         * tmpdir names - don't create anything yet
         */
        if (ORTE_SUCCESS != (ret = orte_session_dir(false,
                                                    orte_process_info.tmpdir_base,
                                                    orte_process_info.nodename, NULL,
                                                    ORTE_PROC_MY_NAME))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_session_dir define";
            goto error;
        }
        /* clear the session directory just in case there are
         * stale directories laying around
         */
        orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);

        /* now actually create the directory tree */
        if (ORTE_SUCCESS != (ret = orte_session_dir(true,
                                                    orte_process_info.tmpdir_base,
                                                    orte_process_info.nodename, NULL,
                                                    ORTE_PROC_MY_NAME))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_session_dir";
            goto error;
        }
        /* Once the session directory location has been established, set
           the opal_output env file location to be in the
           proc-specific session directory. */
        opal_output_set_output_file_info(orte_process_info.proc_session_dir,
                                         "output-", NULL, NULL);
        
        /* setup stdout/stderr */
        if (orte_debug_daemons_file_flag) {
            /* if we are debugging to a file, then send stdout/stderr to
             * the orted log file
             */
            
            /* get my jobid */
            if (ORTE_SUCCESS != (ret = orte_util_convert_jobid_to_string(&jobidstring,
                                                                         ORTE_PROC_MY_NAME->jobid))) {
                ORTE_ERROR_LOG(ret);
                error = "convert_jobid";
                goto error;
            }
            
            /* define a log file name in the session directory */
            snprintf(log_file, PATH_MAX, "output-orted-%s-%s.log",
                     jobidstring, orte_process_info.nodename);
            log_path = opal_os_path(false,
                                    orte_process_info.tmpdir_base,
                                    orte_process_info.top_session_dir,
                                    log_file,
                                    NULL);
            
            fd = open(log_path, O_RDWR|O_CREAT|O_TRUNC, 0640);
            if (fd < 0) {
                /* couldn't open the file for some reason, so
                 * just connect everything to /dev/null
                 */
                fd = open("/dev/null", O_RDWR|O_CREAT|O_TRUNC, 0666);
            } else {
                dup2(fd, STDOUT_FILENO);
                dup2(fd, STDERR_FILENO);
                if(fd != STDOUT_FILENO && fd != STDERR_FILENO) {
                    close(fd);
                }
            }
        }
    }
    
    /* setup the global job and node arrays */
    orte_job_data = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data,
                                                       1,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       1))) {
        ORTE_ERROR_LOG(ret);
        error = "setup job array";
        goto error;
    }
    
    orte_node_pool = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node array";
        goto error;
    }
    orte_node_topologies = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_topologies,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node topologies array";
        goto error;
    }

    /* Setup the job data object for the daemons */        
    /* create and store the job data object */
    jdata = OBJ_NEW(orte_job_t);
    jdata->jobid = ORTE_PROC_MY_NAME->jobid;
    opal_pointer_array_set_item(orte_job_data, 0, jdata);
    
    /* every job requires at least one app */
    app = OBJ_NEW(orte_app_context_t);
    opal_pointer_array_set_item(jdata->apps, 0, app);
    jdata->num_apps++;
    
    /* create and store a node object where we are */
    node = OBJ_NEW(orte_node_t);
    node->name = strdup(orte_process_info.nodename);
    node->index = opal_pointer_array_set_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid, node);
#if OPAL_HAVE_HWLOC
    /* point our topology to the one detected locally */
    node->topology = opal_hwloc_topology;
#endif

    /* create and store a proc object for us */
    proc = OBJ_NEW(orte_proc_t);
    proc->name.jobid = ORTE_PROC_MY_NAME->jobid;
    proc->name.vpid = ORTE_PROC_MY_NAME->vpid;
    
    proc->pid = orte_process_info.pid;
    proc->rml_uri = orte_rml.get_contact_info();
    proc->state = ORTE_PROC_STATE_RUNNING;
    opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc);
    
    /* record that the daemon (i.e., us) is on this node 
     * NOTE: we do not add the proc object to the node's
     * proc array because we are not an application proc.
     * Instead, we record it in the daemon field of the
     * node object
     */
    OBJ_RETAIN(proc);   /* keep accounting straight */
    node->daemon = proc;
    node->daemon_launched = true;
    node->state = ORTE_NODE_STATE_UP;
    
    /* now point our proc node field to the node */
    OBJ_RETAIN(node);   /* keep accounting straight */
    proc->node = node;

    /* record that the daemon job is running */
    jdata->num_procs = 1;
    jdata->state = ORTE_JOB_STATE_RUNNING;
    /* obviously, we have "reported" */
    jdata->num_reported = 1;
    
    /* setup the routed info - the selected routed component
     * will know what to do. 
     */
    if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed.init_routes";
        goto error;
    }
    
    /* setup I/O forwarding system - must come after we init routes */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_iof_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_iof_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_iof_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_iof_base_select";
        goto error;
    }
    
    /* setup the FileM */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_filem_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_filem_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_select";
        goto error;
    }
    
#if OPAL_ENABLE_FT_CR == 1
    /*
     * Setup the SnapC
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_snapc_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_snapc_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_sstore_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_sstore_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_snapc_base_select(!ORTE_PROC_IS_HNP, ORTE_PROC_IS_DAEMON))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_snapc_base_select";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_sstore_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_sstore_base_select";
        goto error;
    }
    
    /* For daemons, ORTE doesn't need the OPAL CR stuff */
    opal_cr_set_enabled(false);
#else
    opal_cr_set_enabled(false);
#endif
    
    /*
     * Initalize the CR setup
     * Note: Always do this, even in non-FT builds.
     * If we don't some user level tools may hang.
     */
    if (ORTE_SUCCESS != (ret = orte_cr_init())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_cr_init";
        goto error;
    }
    
    /* setup the DFS framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_dfs_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_select";
        goto error;
    }

    return ORTE_SUCCESS;
    
 error:
    orte_show_help("help-orte-runtime.txt",
                   "orte_init:startup:internal-failure",
                   true, error, ORTE_ERROR_NAME(ret), ret);
    
    return ORTE_ERR_SILENT;
}
Exemple #10
0
static int orcmd_init(void)
{
    int ret = ORTE_ERROR;
    char *error = NULL;
    opal_buffer_t buf, *clusterbuf, *uribuf;
    orte_job_t *jdata;
    orte_node_t *node;
    orte_proc_t *proc;
    opal_list_t config;
    orcm_scheduler_t *scheduler;
    orcm_node_t *mynode=NULL;
    int32_t n;

    if (initialized) {
        return ORCM_SUCCESS;
    }
    initialized = true;

    /* Initialize the ORTE data type support */
    if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
        error = "orte_std_prolog";
        goto error;
    }

    /* setup the global job and node arrays */
    orte_job_data = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data,
                                                       1,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       1))) {
        ORTE_ERROR_LOG(ret);
        error = "setup job array";
        goto error;
    }

    orte_node_pool = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node array";
        goto error;
    }
    orte_node_topologies = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_topologies,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node topologies array";
        goto error;
    }

    /* create a job tracker for the daemons */
    jdata = OBJ_NEW(orte_job_t);
    jdata->jobid = 0;
    ORTE_PROC_MY_NAME->jobid = 0;
    opal_pointer_array_set_item(orte_job_data, 0, jdata);

    /* read the site configuration */
    OBJ_CONSTRUCT(&config, opal_list_t);
    if (ORCM_SUCCESS != (ret = orcm_cfgi.read_config(&config))) {
        error = "getting config";
        goto error;
    }

    /* define the cluster and collect contact info for all
     * aggregators - we'll need to know how to talk to any
     * of them in case of failures
     */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    if (ORCM_SUCCESS != (ret = orcm_cfgi.define_system(&config,
                                                       &mynode,
                                                       &orte_process_info.num_procs,
                                                       &buf))) {
        OBJ_DESTRUCT(&buf);
        error = "define system";
        goto error;
    }

    /* if my name didn't get set, then we didn't find our node
     * in the config - report it and die
     */
    if (NULL == mynode) {
        orte_show_help("help-ess-orcm.txt", "node-not-found", true,
                       orcm_cfgi_base.config_file,
                       orte_process_info.nodename);
        OBJ_DESTRUCT(&buf);
        return ORTE_ERR_SILENT;
    }

    /* define a node and proc object for ourselves as some parts
     * of ORTE and ORCM require it */
    if (NULL == (node = OBJ_NEW(orte_node_t))) {
        ret = ORTE_ERR_OUT_OF_RESOURCE;
        error = "out of memory";
        goto error;
    }
    node->name = strdup(orte_process_info.nodename);
    opal_pointer_array_set_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid, node);
    if (NULL == (proc = OBJ_NEW(orte_proc_t))) {
        ret = ORTE_ERR_OUT_OF_RESOURCE;
        error = "out of memory";
        goto error;
    }
    proc->name.jobid = ORTE_PROC_MY_NAME->jobid;
    proc->name.vpid = ORTE_PROC_MY_NAME->vpid;
    OBJ_RETAIN(proc);
    node->daemon = proc;
    OBJ_RETAIN(node);
    proc->node = node;
    opal_pointer_array_set_item(jdata->procs, ORTE_PROC_MY_NAME->vpid, proc);

    /* For now, we only support a single scheduler daemon in the system.
     * This *may* change someday in the future */
    scheduler = (orcm_scheduler_t*)opal_list_get_first(orcm_schedulers);

    /* If we are in test mode, then we don't *require* that a scheduler
     * be defined in the system - otherwise, we do */
    if (NULL == scheduler) {
        if (mca_sst_orcmd_component.scheduler_reqd) {
            error = "no scheduler found";
            ret = ORTE_ERR_NOT_FOUND;
            goto error;
        }
    } else {
        ORTE_PROC_MY_SCHEDULER->jobid = scheduler->controller.daemon.jobid;
        ORTE_PROC_MY_SCHEDULER->vpid = scheduler->controller.daemon.vpid;
    }

    /* register the ORTE-level params at this time now that the
     * config has had a chance to push things into the environ
     */
    if (ORTE_SUCCESS != (ret = orte_register_params())) {
        OBJ_DESTRUCT(&buf);
        error = "orte_register_params";
        goto error;
    }

    /* setup callback for SIGPIPE */
    setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback);
    /* Set signal handlers to catch kill signals so we can properly clean up
     * after ourselves.
     */
    setup_sighandler(SIGTERM, &term_handler, shutdown_signal);
    setup_sighandler(SIGINT, &int_handler, shutdown_signal);

    /** setup callbacks for signals we should ignore */
    setup_sighandler(SIGUSR1, &sigusr1_handler, signal_callback);
    setup_sighandler(SIGUSR2, &sigusr2_handler, signal_callback);
    signals_set = true;

#if OPAL_HAVE_HWLOC
    {
        hwloc_obj_t obj;
        unsigned i, j;

        /* get the local topology */
        if (NULL == opal_hwloc_topology) {
            if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
                OBJ_DESTRUCT(&buf);
                error = "topology discovery";
                goto error;
            }
        }

        /* remove the hostname from the topology. Unfortunately, hwloc
         * decided to add the source hostname to the "topology", thus
         * rendering it unusable as a pure topological description. So
         * we remove that information here.
         */
        obj = hwloc_get_root_obj(opal_hwloc_topology);
        for (i=0; i < obj->infos_count; i++) {
            if (NULL == obj->infos[i].name ||
                NULL == obj->infos[i].value) {
                continue;
            }
            if (0 == strncmp(obj->infos[i].name, "HostName", strlen("HostName"))) {
                free(obj->infos[i].name);
                free(obj->infos[i].value);
                /* left justify the array */
                for (j=i; j < obj->infos_count-1; j++) {
                    obj->infos[j] = obj->infos[j+1];
                }
                obj->infos[obj->infos_count-1].name = NULL;
                obj->infos[obj->infos_count-1].value = NULL;
                obj->infos_count--;
                break;
            }
        }

        if (15 < opal_output_get_verbosity(orcm_sst_base_framework.framework_output)) {
            opal_output(0, "%s Topology Info:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO);
        }

        /* if we were asked to bind to specific core(s), do so now */
        if (NULL != orte_daemon_cores) {
            char **cores=NULL, tmp[128];
            hwloc_obj_t pu;
            hwloc_cpuset_t ours, pucpus, res;
            int core;

            /* could be a collection of comma-delimited ranges, so
             * use our handy utility to parse it
             */
            orte_util_parse_range_options(orte_daemon_cores, &cores);
            if (NULL != cores) {
                ours = hwloc_bitmap_alloc();
                hwloc_bitmap_zero(ours);
                pucpus = hwloc_bitmap_alloc();
                res = hwloc_bitmap_alloc();
                for (i=0; NULL != cores[i]; i++) {
                    core = strtoul(cores[i], NULL, 10);
                    if (NULL == (pu = opal_hwloc_base_get_pu(opal_hwloc_topology, core, OPAL_HWLOC_LOGICAL))) {
                        orte_show_help("help-orted.txt", "orted:cannot-bind",
                                       true, orte_process_info.nodename,
                                       orte_daemon_cores);
                        ret = ORTE_ERR_NOT_SUPPORTED;
                        OBJ_DESTRUCT(&buf);
                        error = "cannot bind";
                        goto error;
                    }
                    hwloc_bitmap_and(pucpus, pu->online_cpuset, pu->allowed_cpuset);
                    hwloc_bitmap_or(res, ours, pucpus);
                    hwloc_bitmap_copy(ours, res);
                }
                /* if the result is all zeros, then don't bind */
                if (!hwloc_bitmap_iszero(ours)) {
                    (void)hwloc_set_cpubind(opal_hwloc_topology, ours, 0);
                    if (opal_hwloc_report_bindings) {
                        opal_hwloc_base_cset2mapstr(tmp, sizeof(tmp), opal_hwloc_topology, ours);
                        opal_output(0, "Daemon %s is bound to cores %s",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp);
                    }
                }
                /* cleanup */
                hwloc_bitmap_free(ours);
                hwloc_bitmap_free(pucpus);
                hwloc_bitmap_free(res);
                opal_argv_free(cores);
            }
        }
    }
#endif

    /* open and select the pstat framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_pstat_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "opal_pstat_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "opal_pstat_base_select";
        goto error;
    }

    /* open and setup the state machine */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_state_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_state_base_select";
        goto error;
    }

    /* open the notifier */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_notifier_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_notifier_base_open";
        goto error;
    }

    /* open the errmgr */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_errmgr_base_open";
        goto error;
    }

    /* Setup the communication infrastructure */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_oob_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_oob_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_oob_base_select";
        goto error;
    }
    if (!opal_list_get_size(&orte_oob_base.actives)) {
        ret = ORTE_ERROR;
        error = "orte_oob: Found 0 active transports";
        goto error;
    }

    /* Runtime Messaging Layer */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_rml_base_select";
        goto error;
    }

    /* select the notifier*/
    if (ORTE_SUCCESS != (ret = orte_notifier_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_notifier_base_select";
        goto error;
    }

    /* select the errmgr */
    if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_errmgr_base_select";
        goto error;
    }

    /* Routed system */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_routed_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_routed_base_select";
        goto error;
    }

    /* database */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_db_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orcm_db_base_open";
        goto error;
    }
    /* always restrict daemons to local database components */
    if (ORTE_SUCCESS != (ret = orcm_db_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orcm_db_base_select";
        goto error;
    }

    /* datastore - ensure we don't pickup the pmi component, but
     * don't override anything set by user
     */
    if (NULL == getenv(OPAL_MCA_PREFIX"dstore")) {
        putenv(OPAL_MCA_PREFIX"dstore=^pmi");
    }
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_dstore_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "opal_dstore_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_dstore_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "opal_dstore_base_select";
        goto error;
    }
    /* create the handle */
    if (0 > (opal_dstore_internal = opal_dstore.open("INTERNAL", NULL, NULL))) {
        error = "opal dstore internal";
        ret = ORTE_ERR_FATAL;
        goto error;
    }

    /* extract the cluster description and setup the routed info - the orcm routed component
     * will know what to do. */
    n = 1;
    if (OPAL_SUCCESS != (ret = opal_dss.unpack(&buf, &clusterbuf, &n, OPAL_BUFFER))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "extract cluster buf";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, clusterbuf))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        OBJ_RELEASE(clusterbuf);
        error = "orte_routed.init_routes";
        goto error;
    }
    OBJ_RELEASE(clusterbuf);

    /* extract the uri buffer and load the hash tables */
    n = 1;
    if (OPAL_SUCCESS != (ret = opal_dss.unpack(&buf, &uribuf, &n, OPAL_BUFFER))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "extract uri buffer";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rml_base_update_contact_info(uribuf))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        OBJ_RELEASE(uribuf);
        error = "load hash tables";
        goto error;
    }
    OBJ_DESTRUCT(&buf);
    OBJ_RELEASE(uribuf);

    /*
     * Group communications
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_select";
        goto error;
    }

    /* Open/select the odls */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_odls_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_odls_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_select";
        goto error;
    }

    /* enable communication with the rml */
    if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml.enable_comm";
        goto error;
    }

    /* setup the FileM */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_filem_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_filem_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_select";
        goto error;
    }

    /*
     * Initalize the CR setup
     * Note: Always do this, even in non-FT builds.
     * If we don't some user level tools may hang.
     */
    opal_cr_set_enabled(false);
    if (ORTE_SUCCESS != (ret = orte_cr_init())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_cr_init";
        goto error;
    }

    /* setup the ANALYTICS framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_analytics_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_analytics_base_open";
        goto error;
    }

    /* setup the EVGEN framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_evgen_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_evgen_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orcm_evgen_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_evgen_select";
        goto error;
    }

    /* setup the SENSOR framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_sensor_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_sensor_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orcm_sensor_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_sensor_select";
        goto error;
    }
    /* start the local sensors */
    orcm_sensor.start(ORTE_PROC_MY_NAME->jobid);

    /* setup the PWRMGMT framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_pwrmgmt_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_pwrmgmt_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orcm_pwrmgmt_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_pwrmgmt_select";
        goto error;
    }

    /* setup the DFS framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_dfs_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_select";
        goto error;
    }

    /* open and setup the DIAG framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_diag_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_diag_base_open";
        goto error;
    }
    if (ORCM_SUCCESS != (ret = orcm_diag_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_diag_select";
        goto error;
    }

    return ORTE_SUCCESS;
    
 error:
    orte_show_help("help-orcm-runtime.txt",
                   "orcm_init:startup:internal-failure",
                   true, error, ORTE_ERROR_NAME(ret), ret);
    
    return ORTE_ERR_SILENT;
}
Exemple #11
0
static void on_sigint(int sig)
{
    setup_sighandler(SIGINT, 0, NULL);
    at_exit();
    kill(getpid(), sig);
}
Exemple #12
0
static int tool_init(void)
{
    int ret = ORTE_ERROR;
    char *error = NULL;
    opal_buffer_t buf, *clusterbuf, *uribuf;
    orte_job_t *jdata;
    orte_node_t *node;
    orte_proc_t *proc;
    opal_list_t config;
    orcm_scheduler_t *scheduler;
    orcm_node_t *mynode=NULL;
    int32_t n;
    
    if (initialized) {
        return ORCM_SUCCESS;
    }
    initialized = true;
    
    /* Initialize the ORTE data type support */
    if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
        error = "orte_std_prolog";
        goto error;
    }
    
    /* setup the global job and node arrays */
    orte_job_data = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data,
                                                       1,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       1))) {
        ORTE_ERROR_LOG(ret);
        error = "setup job array";
        goto error;
    }
    
    orte_node_pool = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node array";
        goto error;
    }
    orte_node_topologies = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_topologies,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node topologies array";
        goto error;
    }
    
    /* create a job tracker for the daemons */
    jdata = OBJ_NEW(orte_job_t);
    jdata->jobid = 0;
    ORTE_PROC_MY_NAME->jobid = 0;
    opal_pointer_array_set_item(orte_job_data, 0, jdata);
    
    /* read the site configuration */
    OBJ_CONSTRUCT(&config, opal_list_t);
    if (ORCM_SUCCESS != (ret = orcm_cfgi.read_config(&config))) {
        error = "getting config";
        goto error;
    }
    
    /* define the cluster and collect contact info for all
     * aggregators - we'll need to know how to talk to any
     * of them in case of failures
     */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    if (ORCM_SUCCESS != (ret = orcm_cfgi.define_system(&config,
                                                       &mynode,
                                                       &orte_process_info.num_procs,
                                                       &buf))) {
        OBJ_DESTRUCT(&buf);
        error = "define system";
        goto error;
    }
    
    /* define a name for myself */
    if (ORTE_SUCCESS != (ret = orte_plm_base_set_hnp_name())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_plm_base_set_hnp_name";
        goto error;
    }
    
    /* define a node and proc object for ourselves as some parts
     * of ORTE and ORCM require it */
    if (NULL == (node = OBJ_NEW(orte_node_t))) {
        ret = ORTE_ERR_OUT_OF_RESOURCE;
        error = "out of memory";
        goto error;
    }
    node->name = strdup(orte_process_info.nodename);
    opal_pointer_array_set_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid, node);
    if (NULL == (proc = OBJ_NEW(orte_proc_t))) {
        ret = ORTE_ERR_OUT_OF_RESOURCE;
        error = "out of memory";
        goto error;
    }
    proc->name.jobid = ORTE_PROC_MY_NAME->jobid;
    proc->name.vpid = ORTE_PROC_MY_NAME->vpid;
    OBJ_RETAIN(proc);
    node->daemon = proc;
    OBJ_RETAIN(node);
    proc->node = node;
    opal_pointer_array_set_item(jdata->procs, ORTE_PROC_MY_NAME->vpid, proc);
    
    /* For now, we only support a single scheduler daemon in the system.
     * This *may* change someday in the future */
    scheduler = (orcm_scheduler_t*)opal_list_get_first(orcm_schedulers);
    
    ORTE_PROC_MY_SCHEDULER->jobid = scheduler->controller.daemon.jobid;
    ORTE_PROC_MY_SCHEDULER->vpid = scheduler->controller.daemon.vpid;
    
    /* register the ORTE-level params at this time now that the
     * config has had a chance to push things into the environ
     */
    if (ORTE_SUCCESS != (ret = orte_register_params())) {
        OBJ_DESTRUCT(&buf);
        error = "orte_register_params";
        goto error;
    }
    
    /* setup callback for SIGPIPE */
    setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback);
    /* Set signal handlers to catch kill signals so we can properly clean up
     * after ourselves.
     */
    setup_sighandler(SIGTERM, &term_handler, shutdown_signal);
    setup_sighandler(SIGINT, &int_handler, shutdown_signal);
    
    /** setup callbacks for signals we should ignore */
    setup_sighandler(SIGUSR1, &sigusr1_handler, signal_callback);
    setup_sighandler(SIGUSR2, &sigusr2_handler, signal_callback);
    signals_set = true;
    
    /* open and select the pstat framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_pstat_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "opal_pstat_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "opal_pstat_base_select";
        goto error;
    }
    
    /* open and setup the state machine */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_state_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_state_base_select";
        goto error;
    }
    
    /* open the errmgr */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_errmgr_base_open";
        goto error;
    }
    
    /* Setup the communication infrastructure */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_oob_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_oob_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_oob_base_select";
        goto error;
    }
    
    /* Runtime Messaging Layer */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_rml_base_select";
        goto error;
    }
    
    /* select the errmgr */
    if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_errmgr_base_select";
        goto error;
    }
    
    /* Routed system */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_routed_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_routed_base_select";
        goto error;
    }
    
    /* database */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_db_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orcm_db_base_open";
        goto error;
    }
    /* always restrict daemons to local database components */
    if (ORTE_SUCCESS != (ret = orcm_db_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orcm_db_base_select";
        goto error;
    }
    
    /* datastore - ensure we don't pickup the pmi component, but
     * don't override anything set by user
     */
    if (NULL == getenv("OMPI_MCA_dstore")) {
        putenv("OMPI_MCA_dstore=^pmi");
    }
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_dstore_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "opal_dstore_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_dstore_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "opal_dstore_base_select";
        goto error;
    }
    /* create the handles */
    if (0 > (opal_dstore_peer = opal_dstore.open("PEER"))) {
        error = "opal dstore global";
        ret = ORTE_ERR_FATAL;
        goto error;
    }
    if (0 > (opal_dstore_internal = opal_dstore.open("INTERNAL"))) {
        error = "opal dstore internal";
        ret = ORTE_ERR_FATAL;
        goto error;
    }
    if (0 > (opal_dstore_nonpeer = opal_dstore.open("NONPEER"))) {
        error = "opal dstore nonpeer";
        ret = ORTE_ERR_FATAL;
        goto error;
    }
    
    /* initialize the nidmaps */
    if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_util_nidmap_init";
        goto error;
    }
    
    /* extract the cluster description and setup the routed info - the orcm routed component
     * will know what to do. */
    n = 1;
    if (OPAL_SUCCESS != (ret = opal_dss.unpack(&buf, &clusterbuf, &n, OPAL_BUFFER))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "extract cluster buf";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, clusterbuf))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        OBJ_RELEASE(clusterbuf);
        error = "orte_routed.init_routes";
        goto error;
    }
    OBJ_RELEASE(clusterbuf);
    
    /* extract the uri buffer and load the hash tables */
    n = 1;
    if (OPAL_SUCCESS != (ret = opal_dss.unpack(&buf, &uribuf, &n, OPAL_BUFFER))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "extract uri buffer";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rml_base_update_contact_info(uribuf))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        OBJ_RELEASE(uribuf);
        error = "load hash tables";
        goto error;
    }
    OBJ_DESTRUCT(&buf);
    OBJ_RELEASE(uribuf);

    /* construct the thread object */
    OBJ_CONSTRUCT(&progress_thread, opal_thread_t);
    /* fork off a thread to progress it */
    progress_thread.t_run = progress_thread_engine;
    progress_thread_running = true;
    if (OPAL_SUCCESS != (ret = opal_thread_start(&progress_thread))) {
        error = "progress thread start";
        progress_thread_running = false;
        goto error;
    }

    /*
     * Group communications
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_select";
        goto error;
    }
    
    /* Open/select the odls */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_odls_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_odls_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_select";
        goto error;
    }
    
    /* enable communication with the rml */
    if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml.enable_comm";
        goto error;
    }
    
    /* setup the FileM */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_filem_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_filem_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_select";
        goto error;
    }
    
    /*
     * Initalize the CR setup
     * Note: Always do this, even in non-FT builds.
     * If we don't some user level tools may hang.
     */
    opal_cr_set_enabled(false);
    if (ORTE_SUCCESS != (ret = orte_cr_init())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_cr_init";
        goto error;
    }
    
    /* setup the DFS framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_dfs_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_select";
        goto error;
    }
    
    return ORTE_SUCCESS;
    
error:
    orte_show_help("help-orte-runtime.txt",
                   "orte_init:startup:internal-failure",
                   true, error, ORTE_ERROR_NAME(ret), ret);
    
    return ORTE_ERR_SILENT;
}
Exemple #13
0
/* Following main() declaration required by older versions of Perl ut 5.00503 */
int main(int argc, char **argv, char **env) {
	int result;
	int error = FALSE;
	char *buffer = NULL;
	int display_license = FALSE;
	int display_help = FALSE;
	int c = 0;
	struct tm *tm, tm_s;
	time_t now;
	char datestring[256];
	icinga_macros *mac;

#ifdef HAVE_GETOPT_H
	int option_index = 0;
	static struct option long_options[] = {
		{"help", no_argument, 0, 'h'},
		{"version", no_argument, 0, 'V'},
		{"license", no_argument, 0, 'V'},
		{"verify-config", no_argument, 0, 'v'},
		{"daemon", no_argument, 0, 'd'},
		{"test-scheduling", no_argument, 0, 's'},
		{"show-scheduling", no_argument, 0, 'S'},
		{"dont-verify-objects", no_argument, 0, 'o'},
		{"dont-verify-paths", no_argument, 0, 'x'},
		{"precache-objects", no_argument, 0, 'p'},
		{"use-precached-objects", no_argument, 0, 'u'},
		{0, 0, 0, 0}
	};
#endif

	mac = get_global_macros();

	/* make sure we have the correct number of command line arguments */
	if (argc < 2)
		error = TRUE;


	/* get all command line arguments */
	while (1) {

#ifdef HAVE_GETOPT_H
		c = getopt_long(argc, argv, "+hVvdsoxpuS", long_options, &option_index);
#else
		c = getopt(argc, argv, "+hVvdsoxpuS");
#endif

		if (c == -1 || c == EOF)
			break;

		switch (c) {

		case '?': /* usage */
		case 'h':
			display_help = TRUE;
			break;

		case 'V': /* version */
			display_license = TRUE;
			break;

		case 'v': /* verify */
			verify_config++;
			break;

		case 's': /* scheduling check */
			test_scheduling = TRUE;
			break;

		case 'S': /* scheduling check and show queue*/
			show_schedule = TRUE;
			test_scheduling = TRUE;
			break;

		case 'd': /* daemon mode */
			daemon_mode = TRUE;
			break;

		case 'o': /* don't verify objects */
			/*
			verify_object_relationships=FALSE;
			*/
			break;

		case 'x': /* don't verify circular paths */
			verify_circular_paths = FALSE;
			break;

		case 'p': /* precache object config */
			precache_objects = TRUE;
			break;

		case 'u': /* use precached object config */
			use_precached_objects = TRUE;
			break;

		default:
			break;
		}

	}

	/* make sure we have the right combination of arguments */
	if (precache_objects == TRUE && (test_scheduling == FALSE && verify_config == 0)) {
		error = TRUE;
		display_help = TRUE;
	}

#ifdef DEBUG_MEMORY
	mtrace();
#endif

	if (daemon_mode == FALSE) {
		printf("\n%s %s\n", PROGRAM_NAME , PROGRAM_VERSION);
		printf("Copyright (c) 2009-present Icinga Development Team (http://www.icinga.org)\n");
		printf("Copyright (c) 2009-2013 Nagios Core Development Team and Community Contributors\n");
		printf("Copyright (c) 1999-2009 Ethan Galstad\n");
		printf("Last Modified: %s\n", PROGRAM_MODIFICATION_DATE);
		printf("License: GPL\n\n");
	}

	/* just display the license */
	if (display_license == TRUE) {

		printf("This program is free software; you can redistribute it and/or modify\n");
		printf("it under the terms of the GNU General Public License version 2 as\n");
		printf("published by the Free Software Foundation.\n\n");
		printf("This program is distributed in the hope that it will be useful,\n");
		printf("but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
		printf("MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n");
		printf("GNU General Public License for more details.\n\n");
		printf("You should have received a copy of the GNU General Public License\n");
		printf("along with this program; if not, write to the Free Software\n");
		printf("Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.\n\n");

		exit(OK);
	}

	/* make sure we got the main config file on the command line... */
	if (optind >= argc)
		error = TRUE;

	/* if there are no command line options (or if we encountered an error), print usage */
	if (error == TRUE || display_help == TRUE) {

		printf("Usage: %s [options] <main_config_file>\n", argv[0]);
		printf("\n");
		printf("Options:\n");
		printf("\n");
		printf("  -v, --verify-config          Verify all configuration data\n");
		printf("  -s, --test-scheduling        Shows projected/recommended check scheduling and other\n");
		printf("                               diagnostic info based on the current configuration files.\n");
		printf("  -S, --show-scheduling        Same as -s, but also show the scheduling queue\n");
		/*printf("  -o, --dont-verify-objects    Don't verify object relationships - USE WITH CAUTION!\n");*/
		printf("  -x, --dont-verify-paths      Don't check for circular object paths - USE WITH CAUTION!\n");
		printf("  -p, --precache-objects       Precache object configuration - use with -v or -s options\n");
		printf("  -u, --use-precached-objects  Use precached object config file\n");
		printf("  -d, --daemon                 Starts %s in daemon mode, instead of as a foreground process\n", PROGRAM_NAME);
		printf("\n");
		printf("Visit the %s website at http://www.icinga.org/ for bug fixes, new\n", PROGRAM_NAME);
		printf("releases, online documentation, FAQs, information on subscribing to\n");
		printf("the mailing lists, and commercial support options for %s.\n", PROGRAM_NAME);
		printf("\n");

		exit(ERROR);
	}


	/* config file is last argument specified */
	config_file = (char *)strdup(argv[optind]);
	if (config_file == NULL) {
		printf("Error allocating memory.\n");
		exit(ERROR);
	}

	/* make sure the config file uses an absolute path */
	if (config_file[0] != '/') {

		/* save the name of the config file */
		buffer = (char *)strdup(config_file);

		/* reallocate a larger chunk of memory */
		config_file = (char *)realloc(config_file, MAX_FILENAME_LENGTH);
		if (config_file == NULL) {
			printf("Error allocating memory.\n");
			exit(ERROR);
		}

		/* get absolute path of current working directory */
		getcwd(config_file, MAX_FILENAME_LENGTH);

		/* append a forward slash */
		strncat(config_file, "/", 1);
		config_file[MAX_FILENAME_LENGTH-1] = '\x0';

		/* append the config file to the path */
		strncat(config_file, buffer, MAX_FILENAME_LENGTH - strlen(config_file) - 1);
		config_file[MAX_FILENAME_LENGTH-1] = '\x0';

		my_free(buffer);
	}


	/* we're just verifying the configuration... */
	if (verify_config) {

		/* reset program variables */
		reset_variables();

		printf("Reading configuration data...\n");

		/* read in the configuration files (main config file, resource and object config files) */
		if ((result = read_main_config_file(config_file)) == OK) {

			printf("   Read main config file okay...\n");

			/* drop privileges */
			if ((result = drop_privileges(nagios_user, nagios_group)) == ERROR)
				printf("   Failed to drop privileges.  Aborting.");
			else {
				/* read object config files */
				if ((result = read_all_object_data(config_file)) == OK)
					printf("   Read object config files okay...\n");
				else
					printf("   Error processing object config files!\n");
			}
		} else
			printf("   Error processing main config file!\n\n");

		printf("\n");

		/* there was a problem reading the config files */
		if (result != OK) {

			/* if the config filename looks fishy, warn the user */
			if (!strstr(config_file, "nagios.cfg") && !strstr(config_file, "icinga.cfg")) {
				printf("\n***> The name of the main configuration file looks suspicious...\n");
				printf("\n");
				printf("     Make sure you are specifying the name of the MAIN configuration file on\n");
				printf("     the command line and not the name of another configuration file.  The\n");
				printf("     main configuration file is typically '/usr/local/icinga/etc/icinga.cfg'\n");
				printf("     or if using packages, most likely '/etc/icinga/icinga.cfg'\n");
			}

			printf("\n***> One or more problems was encountered while processing the config files...\n");
			printf("\n");
			printf("     Check your configuration file(s) to ensure that they contain valid\n");
			printf("     directives and data definitions.  If you are upgrading from a previous\n");
			printf("     version of %s, you should be aware that some variables/definitions\n", PROGRAM_NAME);
			printf("     may have been removed or modified in this version.  Make sure to read\n");
			printf("     the HTML documentation regarding the config files, as well as the\n");
			printf("     'Whats New' section and the Changelog CHANGES section as well to find\n");
			printf("     out what has changed.\n\n");
		}

		/* the config files were okay, so run the pre-flight check */
		else {

			printf("Running pre-flight check on configuration data...\n\n");

			/* run the pre-flight check to make sure things look okay... */
			result = pre_flight_check();

			if (result == OK)
				printf("\nThings look okay - No serious problems were detected during the pre-flight check\n");
			else {
				printf("\n***> One or more problems was encountered while running the pre-flight check...\n");
				printf("\n");
				printf("     Check your configuration file(s) to ensure that they contain valid\n");
				printf("     directives and data definitions.  If you are upgrading from a previous\n");
				printf("     version of %s, you should be aware that some variables/definitions\n", PROGRAM_NAME);
				printf("     may have been removed or modified in this version.  Make sure to read\n");
				printf("     the HTML documentation regarding the config files, as well as the\n");
				printf("     'Whats New' section to find out what has changed.\n\n");
			}
		}

		/* clean up after ourselves */
		cleanup();

		/* free config_file */
		my_free(config_file);

		/* exit */
		exit(result);
	}


	/* we're just testing scheduling... */
	else if (test_scheduling == TRUE) {

		/* reset program variables */
		reset_variables();

		/* read in the configuration files (main config file and all host config files) */
		result = read_main_config_file(config_file);

		/* drop privileges */
		if (result == OK)
			if ((result = drop_privileges(nagios_user, nagios_group)) == ERROR)
				printf("Failed to drop privileges.  Aborting.");

		/* read object config files */
		if (result == OK)
			result = read_all_object_data(config_file);

		/* read initial service and host state information */
		if (result == OK) {
			initialize_retention_data(config_file);
			read_initial_state_information();
		}

		if (result != OK)
			printf("***> One or more problems was encountered while reading configuration data...\n");

		/* run the pre-flight check to make sure everything looks okay */
		if (result == OK) {
			if ((result = pre_flight_check()) != OK)
				printf("***> One or more problems was encountered while running the pre-flight check...\n");
		}

		if (result == OK) {

			/* initialize the event timing loop */
			init_timing_loop();

			/* display schedule */
			if (show_schedule == TRUE)
				display_schedule();

			/* display scheduling information */
			display_scheduling_info();

			if (precache_objects == TRUE) {
				printf("\n");
				printf("OBJECT PRECACHING\n");
				printf("-----------------\n");
				printf("Object config files were precached.\n");
			}
		}

#undef TEST_TIMEPERIODS
#ifdef TEST_TIMEPERIODS
		/* DO SOME TIMEPERIOD TESTING - ADDED 08/11/2009 */
		time_t now, pref_time, valid_time;
		timeperiod *tp;
		tp = find_timeperiod("247_exclusion");
		time(&now);
		pref_time = now;
		get_next_valid_time(pref_time, &valid_time, tp);
		printf("=====\n");
		printf("CURRENT:   %lu = %s", (unsigned long)now, ctime(&now));
		printf("PREFERRED: %lu = %s", (unsigned long)pref_time, ctime(&pref_time));
		printf("NEXT:      %lu = %s", (unsigned long)valid_time, ctime(&valid_time));
		printf("=====\n");
#endif

		/* clean up after ourselves */
		cleanup();

		/* exit */
		exit(result);
	}


	/* else start to monitor things... */
	else {

		/* keep monitoring things until we get a shutdown command */
		do {

			/* reset program variables */
			reset_variables();

			/* get PID */
			nagios_pid = (int)getpid();

			/* read in the configuration files (main and resource config files) */
			if (read_main_config_file(config_file) == ERROR) {
				logit(NSLOG_PROCESS_INFO | NSLOG_RUNTIME_ERROR | NSLOG_CONFIG_ERROR, TRUE, "Failed to read main config file.  Aborting.");
				cleanup();
				exit(EXIT_FAILURE);
			}

			/* we need to read the modules in the first place as object configuration before neb modules are initialized/loaded */
			result = read_object_config_data(config_file, READ_MODULES, FALSE, FALSE);

			/* add modules to neb */
			result = add_module_objects_to_neb();

			/* NOTE 11/06/07 EG moved to after we read config files, as user may have overridden timezone offset */
			/* get program (re)start time and save as macro */
			program_start = time(NULL);
			my_free(mac->x[MACRO_PROCESSSTARTTIME]);
			asprintf(&mac->x[MACRO_PROCESSSTARTTIME], "%lu", (unsigned long)program_start);

			/* open debug log */
			open_debug_log();

			/* drop privileges */
			if (drop_privileges(nagios_user, nagios_group) == ERROR) {

				logit(NSLOG_PROCESS_INFO | NSLOG_RUNTIME_ERROR | NSLOG_CONFIG_ERROR, TRUE, "Failed to drop privileges.  Aborting.");

				cleanup();
				exit(ERROR);
			}

#ifdef USE_EVENT_BROKER
			/* initialize modules */
			neb_init_modules();
			neb_init_callback_list();
#endif

			/* this must be logged after we read config data, as user may have changed location of main log file */
			logit(NSLOG_PROCESS_INFO, TRUE, "%s %s starting... (PID=%d)\n", PROGRAM_NAME, PROGRAM_VERSION, (int)getpid());

			/* log the local time - may be different than clock time due to timezone offset */
			now = time(NULL);
			tm = localtime_r(&now, &tm_s);
			strftime(datestring, sizeof(datestring), "%a %b %d %H:%M:%S %Z %Y", tm);
			logit(NSLOG_PROCESS_INFO, TRUE, "Local time is %s", datestring);

			/* write log version/info */
			write_log_file_info(NULL);

#ifdef USE_EVENT_BROKER
			/* load modules */
			if (neb_load_all_modules() != OK) {
                logit(NSLOG_CONFIG_ERROR, ERROR, "Error: NEB module loading failed. Aborting.\n");

                if (daemon_dumps_core)
                    neb_unload_all_modules(NEBMODULE_FORCE_UNLOAD, NEBMODULE_NEB_SHUTDOWN);

                exit(EXIT_FAILURE);
            }

			/* forcibly send a program status update
			 * for later updates of PROCESS_* */
			broker_program_status(NEBTYPE_PROGRAMSTATUS_UPDATE, NEBFLAG_NONE, NEBATTR_NONE, NULL);

			/* send program data to broker */
			broker_program_state(NEBTYPE_PROCESS_PRELAUNCH, NEBFLAG_NONE, NEBATTR_NONE, NULL);
#endif

			/* read in all object config data */
			if (result == OK)
				result = read_all_object_data(config_file);

			/* there was a problem reading the config files */
			if (result != OK)
				logit(NSLOG_PROCESS_INFO | NSLOG_RUNTIME_ERROR | NSLOG_CONFIG_ERROR, TRUE, "Bailing out due to one or more errors encountered in the configuration files. Run %s from the command line with the -v option to verify your config before restarting. (PID=%d)", PROGRAM_NAME , (int)getpid());

			else {

				/* run the pre-flight check to make sure everything looks okay*/
				if ((result = pre_flight_check()) != OK)
					logit(NSLOG_PROCESS_INFO | NSLOG_RUNTIME_ERROR | NSLOG_VERIFICATION_ERROR, TRUE, "Bailing out due to errors encountered while running the pre-flight check.  Run %s from the command line with the -v option to verify your config before restarting. (PID=%d)\n", PROGRAM_NAME , (int)getpid());
			}

			/* an error occurred that prevented us from (re)starting */
			if (result != OK) {

				/* if we were restarting, we need to cleanup from the previous run */
				if (sigrestart == TRUE) {

					/* clean up the status data */
					cleanup_status_data(config_file, TRUE);

					/* shutdown the external command worker thread */
					shutdown_command_file_worker_thread();

					/* close and delete the external command file FIFO */
					close_command_file();

					/* cleanup embedded perl interpreter */
					if (embedded_perl_initialized == TRUE)
						deinit_embedded_perl();
				}

#ifdef USE_EVENT_BROKER
				/* send program data to broker */
				broker_program_state(NEBTYPE_PROCESS_SHUTDOWN, NEBFLAG_PROCESS_INITIATED, NEBATTR_SHUTDOWN_ABNORMAL, NULL);
#endif
				cleanup();
				exit(ERROR);
			}



			/* initialize embedded Perl interpreter */
			/* NOTE 02/15/08 embedded Perl must be initialized if compiled in, regardless of whether or not its enabled in the config file */
			/* It compiled it, but not initialized, Icinga will segfault in readdir() calls, as libperl takes this function over */
			if (embedded_perl_initialized == FALSE) {
				/*				if(enable_embedded_perl==TRUE){*/
#ifdef EMBEDDEDPERL
				init_embedded_perl(env);
#else
				init_embedded_perl(NULL);
#endif
				embedded_perl_initialized = TRUE;
				/*					}*/
			}

			/* handle signals (interrupts) */
			setup_sighandler();


#ifdef USE_EVENT_BROKER
			/* send program data to broker */
			broker_program_state(NEBTYPE_PROCESS_START, NEBFLAG_NONE, NEBATTR_NONE, NULL);
#endif

			/* enter daemon mode (unless we're restarting...) */
			if (daemon_mode == TRUE && sigrestart == FALSE) {

				result = daemon_init();

				/* we had an error daemonizing, so bail... */
				if (result == ERROR) {
					logit(NSLOG_PROCESS_INFO | NSLOG_RUNTIME_ERROR, TRUE, "Bailing out due to failure to daemonize. (PID=%d)", (int)getpid());

#ifdef USE_EVENT_BROKER
					/* send program data to broker */
					broker_program_state(NEBTYPE_PROCESS_SHUTDOWN, NEBFLAG_PROCESS_INITIATED, NEBATTR_SHUTDOWN_ABNORMAL, NULL);
#endif
					cleanup();
					exit(ERROR);
				}

				asprintf(&buffer, "Finished daemonizing... (New PID=%d)\n", (int)getpid());
				write_to_all_logs(buffer, NSLOG_PROCESS_INFO);
				my_free(buffer);

				/* get new PID */
				nagios_pid = (int)getpid();
			}

			/* open the command file (named pipe) for reading */
			result = open_command_file();
			if (result != OK) {

				logit(NSLOG_PROCESS_INFO | NSLOG_RUNTIME_ERROR, TRUE, "Bailing out due to errors encountered while trying to initialize the external command file... (PID=%d)\n", (int)getpid());

#ifdef USE_EVENT_BROKER
				/* send program data to broker */
				broker_program_state(NEBTYPE_PROCESS_SHUTDOWN, NEBFLAG_PROCESS_INITIATED, NEBATTR_SHUTDOWN_ABNORMAL, NULL);
#endif
				cleanup();
				exit(ERROR);
			}

			/* initialize status data unless we're starting */
			if (sigrestart == FALSE)
				initialize_status_data(config_file);

			/* read initial service and host state information  */
			initialize_retention_data(config_file);
			read_initial_state_information();
			sync_state_information();

			/* initialize comment data */
			initialize_comment_data(config_file);

			/* initialize scheduled downtime data */
			initialize_downtime_data(config_file);

			/* initialize performance data */
			initialize_performance_data(config_file);

			/* initialize the event timing loop */
			init_timing_loop();

			/* initialize check statistics */
			init_check_stats();

			/* update all status data (with retained information) */
			update_all_status_data();

			/* log initial host and service state */
			log_host_states(INITIAL_STATES, NULL);
			log_service_states(INITIAL_STATES, NULL);

			/* reset the restart flag */
			sigrestart = FALSE;

#ifdef USE_EVENT_BROKER
			/* send program data to broker */
			broker_program_state(NEBTYPE_PROCESS_EVENTLOOPSTART, NEBFLAG_NONE, NEBATTR_NONE, NULL);
#endif

			/* get event start time and save as macro */
			event_start = time(NULL);
			my_free(mac->x[MACRO_EVENTSTARTTIME]);
			asprintf(&mac->x[MACRO_EVENTSTARTTIME], "%lu", (unsigned long)event_start);

			/* print event loop start */
			asprintf(&buffer, "Event loop started...\n");
			write_to_all_logs(buffer, NSLOG_PROCESS_INFO);
			my_free(buffer);

			/***** start monitoring all services *****/
			/* (doesn't return until a restart or shutdown signal is encountered) */
			event_execution_loop();

			/* 03/01/2007 EG Moved from sighandler() to prevent FUTEX locking problems under NPTL */
			/* 03/21/2007 EG SIGSEGV signals are still logged in sighandler() so we don't loose them */
			/* did we catch a signal? */
			if (caught_signal == TRUE) {

				if (sig_id == SIGHUP)
					asprintf(&buffer, "Caught SIGHUP, restarting...\n");
				else if (sig_id != SIGSEGV)
					asprintf(&buffer, "Caught SIG%s, shutting down...\n", sigs[sig_id]);

				write_to_all_logs(buffer, NSLOG_PROCESS_INFO);
				my_free(buffer);
			}

#ifdef USE_EVENT_BROKER
			/* send program data to broker */
			broker_program_state(NEBTYPE_PROCESS_EVENTLOOPEND, NEBFLAG_NONE, NEBATTR_NONE, NULL);
			if (sigshutdown == TRUE)
				broker_program_state(NEBTYPE_PROCESS_SHUTDOWN, NEBFLAG_USER_INITIATED, NEBATTR_SHUTDOWN_NORMAL, NULL);
			else if (sigrestart == TRUE)
				broker_program_state(NEBTYPE_PROCESS_RESTART, NEBFLAG_USER_INITIATED, NEBATTR_RESTART_NORMAL, NULL);
#endif

			/* save service and host state information */
			save_state_information(FALSE);
			cleanup_retention_data(config_file);

			/* clean up performance data */
			cleanup_performance_data(config_file);

			/* clean up the scheduled downtime data */
			cleanup_downtime_data(config_file);

			/* clean up the comment data */
			cleanup_comment_data(config_file);

			/* clean up the status data unless we're restarting */
			if (sigrestart == FALSE)
				cleanup_status_data(config_file, TRUE);

			/* close and delete the external command file FIFO unless we're restarting */
			if (sigrestart == FALSE) {
				shutdown_command_file_worker_thread();
				close_command_file();
			}

			/* cleanup embedded perl interpreter */
			if (sigrestart == FALSE)
				deinit_embedded_perl();

			/* shutdown stuff... */
			if (sigshutdown == TRUE) {

				/* make sure lock file has been removed - it may not have been if we received a shutdown command */
				if (daemon_mode == TRUE)
					unlink(lock_file);

				/* log a shutdown message */
				logit(NSLOG_PROCESS_INFO, TRUE, "Successfully shutdown... (PID=%d)\n", (int)getpid());
			}

			/* clean up after ourselves */
			cleanup();

			/* close debug log */
			close_debug_log();

		} while (sigrestart == TRUE && sigshutdown == FALSE);

		/* free misc memory */
		my_free(config_file);
	}

	return OK;
}