Exemple #1
0
/*
 * Program entry point.
 *
 * Handles --help / --version when given as the first argument, parses the
 * remaining options (-f/--config, -v/--verbose, -H/--no-history), reads the
 * configuration file, connects to the local node and to the primary of the
 * cluster, and calls MonitorCheck() when the local node is a standby.  On a
 * primary node it only logs that there is nothing to do.
 *
 * Returns 0 after the connections and the logger have been shut down.
 * Invalid options, a missing node id, an unusable server version or a
 * missing primary terminate the process with exit(ERR_BAD_CONFIG).
 */
int
main(int argc, char **argv)
{
	static struct option long_options[] =
	{
		{"config", required_argument, NULL, 'f'},
		{"verbose", no_argument, NULL, 'v'},
		{"no-history", no_argument, NULL, 'H'},
		{NULL, 0, NULL, 0}
	};

	int			optindex;
	int			c;

	char standby_version[MAXVERSIONSTR];

	progname = get_progname(argv[0]);

	/* --help and --version are only honoured as the first argument */
	if (argc > 1)
	{
		if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
		{
			help(progname);
			exit(SUCCESS);
		}
		if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
		{
			printf("%s (PostgreSQL) " PG_VERSION "\n", progname);
			exit(SUCCESS);
		}
	}

	while ((c = getopt_long(argc, argv, "f:vH", long_options, &optindex)) != -1)
	{
		switch (c)
		{
		case 'f':
			config_file = optarg;
			break;
		case 'v':
			verbose = true;
			break;
		case 'H': /* no-history */
			only_one_entry_desired = true;
			break;
		default:
			usage();
			exit(ERR_BAD_CONFIG);
		}
	}

	setup_cancel_handler();

	/*
	 * Read the configuration file: repmgr.conf
	 */
	parse_config(config_file, &local_options);
	if (local_options.node == -1)
	{
		log_err("Node information is missing. "
		        "Check the configuration file, or provide one if you have not done so.\n");
		exit(ERR_BAD_CONFIG);
	}

	logger_init(progname, local_options.loglevel, local_options.logfacility);
	if (verbose)
		logger_min_verbose(LOG_INFO);

	snprintf(repmgr_schema, MAXLEN, "%s%s", DEFAULT_REPMGR_SCHEMA_PREFIX, local_options.cluster_name);

	log_info(_("%s Connecting to database '%s'\n"), progname, local_options.conninfo);
	myLocalConn = establishDBConnection(local_options.conninfo, true);

	/* should be v9 or better */
	log_info(_("%s Connected to database, checking its state\n"), progname);
	pg_version(myLocalConn, standby_version);
	if (strcmp(standby_version, "") == 0)
	{
		PQfinish(myLocalConn);
		log_err(_("%s needs standby to be PostgreSQL 9.0 or better\n"), progname);
		exit(ERR_BAD_CONFIG);
	}

	/*
	 * Set my server mode, establish a connection to primary
	 * and start monitor
	 */
	myLocalMode = is_standby(myLocalConn) ? STANDBY_MODE : PRIMARY_MODE;
	if (myLocalMode == PRIMARY_MODE)
	{
		/* the local node is the primary: share the one connection */
		primary_options.node = local_options.node;

		/*
		 * snprintf always NUL-terminates, unlike strncpy which leaves the
		 * destination unterminated when the source fills the buffer.
		 */
		snprintf(primary_options.conninfo, MAXLEN, "%s", local_options.conninfo);
		primaryConn = myLocalConn;
	}
	else
	{
		/* I need the id of the primary as well as a connection to it */
		log_info(_("%s Connecting to primary for cluster '%s'\n"),
		         progname, local_options.cluster_name);
		primaryConn = getMasterConnection(myLocalConn, local_options.node,
		                                  local_options.cluster_name,
		                                  &primary_options.node, NULL);
		if (primaryConn == NULL)
		{
			CloseConnections();
			exit(ERR_BAD_CONFIG);
		}
	}

	checkClusterConfiguration(myLocalConn, primaryConn);
	checkNodeConfiguration(local_options.conninfo);
	if (myLocalMode == STANDBY_MODE)
	{
		log_info(_("%s Starting continuous standby node monitoring\n"), progname);
		MonitorCheck();
	}
	else
	{
		log_info(_("%s This is a primary node, program not needed here; exiting\n"), progname);
	}

	/*
	 * Prevent a double-free: on a primary both globals refer to the same
	 * connection, so only one of them may be left set for the cleanup below.
	 */
	if (primaryConn == myLocalConn)
		myLocalConn = NULL;

	/* close the connection to the database and cleanup */
	CloseConnections();

	/* Shuts down logging system */
	logger_shutdown();

	return 0;
}
Exemple #2
0
/*
 * Insert monitor info, this is basically the time and xlog replayed,
 * applied on standby and current xlog location in primary.
 * Also do the math to see how far are we in bytes for being uptodate
 */
static void
MonitorExecute(void)
{
	PGresult *res;
	char monitor_standby_timestamp[MAXLEN];
	char last_wal_primary_location[MAXLEN];
	char last_wal_standby_received[MAXLEN];
	char last_wal_standby_applied[MAXLEN];

	unsigned long long int lsn_primary;
	unsigned long long int lsn_standby_received;
	unsigned long long int lsn_standby_applied;

	int	connection_retries;

	/*
	 * Check if the master is still available, if after 5 minutes of retries
	 * we cannot reconnect, try to get a new master.
	 */
	for (connection_retries = 0; connection_retries < 15; connection_retries++)
	{
		if (PQstatus(primaryConn) != CONNECTION_OK)
		{
			log_warning(_("Connection to master has been lost, trying to recover...\n"));
			/* wait 20 seconds between retries */
			sleep(20);

			PQreset(primaryConn);
		}
		else
		{
			if (connection_retries > 0)
			{
				log_notice(_("Connection to master has been restored, continue monitoring.\n"));
			}
			break;
		}
	}
	if (PQstatus(primaryConn) != CONNECTION_OK)
	{
		log_err(_("We couldn't reconnect to master. Now checking if another node has been promoted.\n"));
		for (connection_retries = 0; connection_retries < 6; connection_retries++)
		{
			primaryConn = getMasterConnection(myLocalConn, local_options.node,
			                                  local_options.cluster_name, &primary_options.node,NULL);
			if (PQstatus(primaryConn) == CONNECTION_OK)
			{
				/* Connected, we can continue the process so break the loop */
				log_err(_("Connected to node %d, continue monitoring.\n"), primary_options.node);
				break;
			}
			else
			{
				log_err(_("We haven't found a new master, waiting before retry...\n"));
				/* wait 5 minutes before retries, after 6 failures (30 minutes) we stop trying */
				sleep(300);
			}
		}
	}
	if (PQstatus(primaryConn) != CONNECTION_OK)
	{
		log_err(_("We couldn't reconnect for long enough, exiting...\n"));
		exit(ERR_DB_CON);
	}

	/* Check if we still are a standby, we could have been promoted */
	if (!is_standby(myLocalConn))
	{
		log_err(_("It seems like we have been promoted, so exit from monitoring...\n"));
		CloseConnections();
		exit(ERR_PROMOTED);
	}

	/*
	 * first check if there is a command being executed,
	 * and if that is the case, cancel the query so i can
	 * insert the current record
	 */
	if (PQisBusy(primaryConn) == 1)
		CancelQuery();

	/* Get local xlog info */
	sqlquery_snprintf(
	    sqlquery,
	    "SELECT CURRENT_TIMESTAMP, pg_last_xlog_receive_location(), "
	    "pg_last_xlog_replay_location()");

	res = PQexec(myLocalConn, sqlquery);
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		log_err("PQexec failed: %s\n", PQerrorMessage(myLocalConn));
		PQclear(res);
		/* if there is any error just let it be and retry in next loop */
		return;
	}

	strncpy(monitor_standby_timestamp, PQgetvalue(res, 0, 0), MAXLEN);
	strncpy(last_wal_standby_received , PQgetvalue(res, 0, 1), MAXLEN);
	strncpy(last_wal_standby_applied , PQgetvalue(res, 0, 2), MAXLEN);
	PQclear(res);

	/* Get primary xlog info */
	sqlquery_snprintf(sqlquery, "SELECT pg_current_xlog_location() ");

	res = PQexec(primaryConn, sqlquery);
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		log_err("PQexec failed: %s\n", PQerrorMessage(primaryConn));
		PQclear(res);
		return;
	}

	strncpy(last_wal_primary_location, PQgetvalue(res, 0, 0), MAXLEN);
	PQclear(res);

	/* Calculate the lag */
	lsn_primary = walLocationToBytes(last_wal_primary_location);
	lsn_standby_received = walLocationToBytes(last_wal_standby_received);
	lsn_standby_applied = walLocationToBytes(last_wal_standby_applied);

	if (only_one_entry && only_one_entry_desired)
	{
		sqlquery_snprintf(sqlquery,
		                  "UPDATE %s.repl_monitor "
		                  "VALUES(%d, %d, '%s'::timestamp with time zone, "
		                  " '%s', '%s', "
		                  " %lld, %lld)"
		                  "WHERE primary_node=%d AND secondary_node=%d", repmgr_schema,
		                  primary_options.node, local_options.node, monitor_standby_timestamp,
		                  last_wal_primary_location,
		                  last_wal_standby_received,
		                  (lsn_primary - lsn_standby_received),
		                  (lsn_standby_received - lsn_standby_applied));
		res = PQexec(primaryConn, sqlquery);
		if (PQresultStatus(res) != PGRES_TUPLES_OK)
		{
			log_err("PQexec failed: %s\n", PQerrorMessage(conn));
			PQclear(res);
			CloseConnections();
			exit(ERR_DB_QUERY);
		}
		if (PQntuples(res) != 1)
		{
			only_one_entry = false;
		}
		PQclear(res);
	}
	else
	{
		/*
		 * Build and send insert
		 */
		sqlquery_snprintf(sqlquery,
		                  "INSERT INTO %s.repl_monitor "
		                  "VALUES(%d, %d, '%s'::timestamp with time zone, "
		                  " '%s', '%s', "
		                  " %lld, %lld)", repmgr_schema,
		                  primary_options.node, local_options.node, monitor_standby_timestamp,
		                  last_wal_primary_location,
		                  last_wal_standby_received,
		                  (lsn_primary - lsn_standby_received),
		                  (lsn_standby_received - lsn_standby_applied));
		res = PQexec(primaryConn, sqlquery);
		if (PQresultStatus(res) != PGRES_TUPLES_OK)
		{
			log_err("PQexec failed: %s\n", PQerrorMessage(conn));
			PQclear(res);
			CloseConnections();
			exit(ERR_DB_QUERY);
		}
		PQclear(res);

		if (only_one_entry_desired)
		{
			/*
			 * Build the SQL to execute on primary
			 */
			sqlquery_snprintf(sqlquery,
			                  "DELETE FROM %s.repl_monitor "
			                  "WHERE primary_node=%d AND standby_node=%d AND last_monitor_time < '%s'::timestamp with time zone",
			                  repmgr_schema, primary_options.node, local_options.node, monitor_standby_timestamp);
			res = PQexec(primaryConn, sqlquery);
			if (PQresultStatus(res) != PGRES_TUPLES_OK)
			{
				log_err("PQexec failed: %s\n", PQerrorMessage(conn));
				PQclear(res);
				CloseConnections();
				exit(ERR_DB_QUERY);
			}
			PQclear(res);
			only_one_entry = true;
		}
	}
}
Exemple #3
0
/*
 * Clone a primary node into dest_dir so it can be started as a standby.
 *
 * Sequence:
 *   1. Prepare dest_dir (defaults to "."), creating it or fixing its
 *      permissions; a non-empty directory is only accepted when it looks
 *      like a PostgreSQL directory and --force was given.
 *   2. Connect to the master and verify version, role, wal_level,
 *      wal_keep_segments and archive_mode.
 *   3. Prepare a local directory for every tablespace location reported by
 *      the master.
 *   4. Call pg_start_backup(), copy pg_control, the data directory, the
 *      tablespaces and the configuration files with copy_remote_files().
 *   5. Always call pg_stop_backup() once the backup was started, then, if
 *      every copy succeeded, create pg_xlog and write the recovery file.
 *
 * Errors are reported on stderr and the function simply returns.
 */
static void
do_standby_clone(void)
{
	PGconn 		*conn;
	PGresult	*res;
	char 		sqlquery[QUERY_STR_LEN];

	int			r = 0;
	int			i;
	bool		pg_dir = false;
	char		master_data_directory[MAXLEN] = "";
	char		master_config_file[MAXLEN] = "";
	char		master_hba_file[MAXLEN] = "";
	char		master_ident_file[MAXLEN] = "";

	char		master_control_file[MAXLEN];
	char		local_control_file[MAXLEN];

	/*
	 * Private copies: the values come from a PGresult that is cleared long
	 * before they are printed.
	 */
	char		first_wal_segment[MAXLEN] = "";
	char		last_wal_segment[MAXLEN] = "";

	char	master_version[MAXVERSIONSTR];

	/* if dest_dir hasn't been provided, initialize to current directory */
	if (dest_dir == NULL)
	{
		dest_dir = malloc(5);
		if (dest_dir == NULL)
		{
			fprintf(stderr, _("%s: out of memory\n"), progname);
			return;
		}
		strcpy(dest_dir, ".");
	}

	/* Check this directory could be used as a PGDATA dir */
	switch (check_dir(dest_dir))
	{
	case 0:
		/* dest_dir not there, must create it */
		if (verbose)
			printf(_("creating directory %s ... "), dest_dir);
		fflush(stdout);

		if (!create_directory(dest_dir))
		{
			fprintf(stderr, _("%s: couldn't create directory %s ... "),
			        progname, dest_dir);
			return;
		}
		break;
	case 1:
		/* Present but empty, fix permissions and use it */
		if (verbose)
			printf(_("fixing permissions on existing directory %s ... "),
			       dest_dir);
		fflush(stdout);

		if (!set_directory_permissions(dest_dir))
		{
			fprintf(stderr, _("%s: could not change permissions of directory \"%s\": %s\n"),
			        progname, dest_dir, strerror(errno));
			return;
		}
		break;
	case 2:
		/* Present and not empty */
		fprintf(stderr,
		        _("%s: directory \"%s\" exists but is not empty\n"),
		        progname, dest_dir);

		pg_dir = is_pg_dir(dest_dir);
		if (!pg_dir)
			return;
		if (!force)
		{
			fprintf(stderr, _("\nThis looks like a PostgreSQL directroy.\n"
			                  "If you are sure you want to clone here, "
			                  "please check there is no PostgreSQL server "
			                  "running and use the --force option\n"));
			return;
		}
		/* a PostgreSQL directory and --force: let it continue */
		break;
	default:
		/* Trouble accessing directory: nothing can be cloned into it */
		fprintf(stderr, _("%s: could not access directory \"%s\": %s\n"),
		        progname, dest_dir, strerror(errno));
		return;
	}

	/* Connection parameters for master only */
	keywords[0] = "host";
	values[0] = host;
	keywords[1] = "port";
	values[1] = masterport;

	/*
	 * We need to connect to check configuration and start a backup.
	 * PQconnectdbParams() returns a non-NULL handle even when the connection
	 * attempt failed, so the status has to be checked.
	 */
	conn = PQconnectdbParams(keywords, values, true);
	if (PQstatus(conn) != CONNECTION_OK)
	{
		fprintf(stderr, _("%s: could not connect to master\n"),
		        progname);
		PQfinish(conn);
		return;
	}

	/* primary should be v9 or better */
	pg_version(conn, master_version);
	if (strcmp(master_version, "") == 0)
	{
		PQfinish(conn);
		fprintf(stderr, _("%s needs master to be PostgreSQL 9.0 or better\n"), progname);
		return;
	}

	/* Check we are cloning a primary node */
	if (is_standby(conn))
	{
		PQfinish(conn);
		fprintf(stderr, "\nThe command should clone a primary node\n");
		return;
	}

	/* And check if it is well configured */
	if (!guc_setted(conn, "wal_level", "=", "hot_standby"))
	{
		PQfinish(conn);
		fprintf(stderr, _("%s needs parameter 'wal_level' to be set to 'hot_standby'\n"), progname);
		return;
	}
	if (!guc_setted(conn, "wal_keep_segments", ">=", wal_keep_segments))
	{
		PQfinish(conn);
		/* program name first, then the required value, as the format expects */
		fprintf(stderr, _("%s needs parameter 'wal_keep_segments' to be set to %s or greater\n"), progname, wal_keep_segments);
		return;
	}
	if (!guc_setted(conn, "archive_mode", "=", "on"))
	{
		PQfinish(conn);
		fprintf(stderr, _("%s needs parameter 'archive_mode' to be set to 'on'\n"), progname);
		return;
	}

	if (verbose)
		printf(_("Succesfully connected to primary. Current installation size is %s\n"), get_cluster_size(conn));

	/* Check if the tablespace locations exists and that we can write to them */
	snprintf(sqlquery, sizeof(sqlquery), "select spclocation from pg_tablespace where spcname not in ('pg_default', 'pg_global')");
	res = PQexec(conn, sqlquery);
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		fprintf(stderr, "Can't get info about tablespaces: %s\n", PQerrorMessage(conn));
		PQclear(res);
		PQfinish(conn);
		return;
	}
	for (i = 0; i < PQntuples(res); i++)
	{
		char tblspc_dir[MAXLEN];

		snprintf(tblspc_dir, sizeof(tblspc_dir), "%s", PQgetvalue(res, i, 0));

		/* Check this directory could be used as a tablespace location */
		switch (check_dir(tblspc_dir))
		{
		case 0:
			/* tblspc_dir not there, must create it */
			if (verbose)
				printf(_("creating directory \"%s\"... "), tblspc_dir);
			fflush(stdout);

			if (!create_directory(tblspc_dir))
			{
				fprintf(stderr, _("%s: couldn't create directory \"%s\"... "),
				        progname, tblspc_dir);
				PQclear(res);
				PQfinish(conn);
				return;
			}
			break;
		case 1:
			/* Present but empty, fix permissions and use it */
			if (verbose)
				printf(_("fixing permissions on existing directory \"%s\"... "),
				       tblspc_dir);
			fflush(stdout);

			if (!set_directory_permissions(tblspc_dir))
			{
				fprintf(stderr, _("%s: could not change permissions of directory \"%s\": %s\n"),
				        progname, tblspc_dir, strerror(errno));
				PQclear(res);
				PQfinish(conn);
				return;
			}
			break;
		case 2:
			/* Present and not empty */
			if (!force)
			{
				fprintf(stderr,
				        _("%s: directory \"%s\" exists but is not empty\n"),
				        progname, tblspc_dir);
				PQclear(res);
				PQfinish(conn);
				return;
			}
			/* --force given: reuse the directory as it is */
			break;
		default:
			/* Trouble accessing directory */
			fprintf(stderr, _("%s: could not access directory \"%s\": %s\n"),
			        progname, tblspc_dir, strerror(errno));
			PQclear(res);
			PQfinish(conn);
			return;
		}
	}
	PQclear(res);

	fprintf(stderr, "Starting backup...\n");

	/* Get the data directory full path and the configuration files location */
	snprintf(sqlquery, sizeof(sqlquery), "SELECT name, setting "
	         "  FROM pg_settings "
	         " WHERE name IN ('data_directory', 'config_file', 'hba_file', 'ident_file')");
	res = PQexec(conn, sqlquery);
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		fprintf(stderr, "Can't get info about data directory and configuration files: %s\n", PQerrorMessage(conn));
		PQclear(res);
		PQfinish(conn);
		return;
	}
	for (i = 0; i < PQntuples(res); i++)
	{
		const char *setting_name = PQgetvalue(res, i, 0);
		const char *setting_value = PQgetvalue(res, i, 1);

		if (strcmp(setting_name, "data_directory") == 0)
			snprintf(master_data_directory, sizeof(master_data_directory), "%s", setting_value);
		else if (strcmp(setting_name, "config_file") == 0)
			snprintf(master_config_file, sizeof(master_config_file), "%s", setting_value);
		else if (strcmp(setting_name, "hba_file") == 0)
			snprintf(master_hba_file, sizeof(master_hba_file), "%s", setting_value);
		else if (strcmp(setting_name, "ident_file") == 0)
			snprintf(master_ident_file, sizeof(master_ident_file), "%s", setting_value);
		else
			fprintf(stderr, _("uknown parameter: %s"), setting_name);
	}
	PQclear(res);

	/*
	 * inform the master we will start a backup and get the first XLog filename
	 * so we can say to the user we need those files
	 */
	snprintf(sqlquery, sizeof(sqlquery),
	         "SELECT pg_xlogfile_name(pg_start_backup('repmgr_standby_clone_%ld'))",
	         (long) time(NULL));
	res = PQexec(conn, sqlquery);
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		fprintf(stderr, "Can't start backup: %s\n", PQerrorMessage(conn));
		PQclear(res);
		PQfinish(conn);
		return;
	}
	/* copy the name out before the result it lives in is released */
	snprintf(first_wal_segment, sizeof(first_wal_segment), "%s", PQgetvalue(res, 0, 0));
	PQclear(res);

	/*
	 * 1) first move global/pg_control
	 *
	 * 2) then move data_directory ommiting the files we have already moved and pg_xlog
	 *    content
	 *
	 * 3) finally We need to backup configuration files (that could be on other directories, debian
	 * like systems likes to do that), so look at config_file, hba_file and ident_file but we
	 * can omit external_pid_file ;)
	 *
	 * On error we need to return but before that execute pg_stop_backup()
	 */

	/* need to create the global sub directory */
	snprintf(master_control_file, sizeof(master_control_file), "%s/global/pg_control", master_data_directory);
	snprintf(local_control_file, sizeof(local_control_file), "%s/global", dest_dir);
	if (!create_directory(local_control_file))
	{
		fprintf(stderr, _("%s: couldn't create directory %s ... "),
		        progname, dest_dir);
		/* flag the failure so the clone is not finished as if it succeeded */
		r = -1;
		goto stop_backup;
	}

	r = copy_remote_files(host, remote_user, master_control_file, local_control_file, false);
	if (r != 0)
		goto stop_backup;

	r = copy_remote_files(host, remote_user, master_data_directory, dest_dir, true);
	if (r != 0)
		goto stop_backup;

	/*
	 * Copy tablespace locations, i'm doing this separately because i couldn't find and appropiate
	 * rsync option but besides we could someday make all these rsync happen concurrently
	 */
	snprintf(sqlquery, sizeof(sqlquery), "select spclocation from pg_tablespace where spcname not in ('pg_default', 'pg_global')");
	res = PQexec(conn, sqlquery);
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		fprintf(stderr, "Can't get info about tablespaces: %s\n", PQerrorMessage(conn));
		PQclear(res);
		r = -1;
		goto stop_backup;
	}
	for (i = 0; i < PQntuples(res); i++)
	{
		r = copy_remote_files(host, remote_user, PQgetvalue(res, i, 0), PQgetvalue(res, i, 0), true);
		if (r != 0)
			break;
	}
	/* release the result on both the success and the failure path */
	PQclear(res);
	if (r != 0)
		goto stop_backup;

	r = copy_remote_files(host, remote_user, master_config_file, dest_dir, false);
	if (r != 0)
		goto stop_backup;

	r = copy_remote_files(host, remote_user, master_hba_file, dest_dir, false);
	if (r != 0)
		goto stop_backup;

	r = copy_remote_files(host, remote_user, master_ident_file, dest_dir, false);
	if (r != 0)
		goto stop_backup;

stop_backup:
	/*
	 * inform the master that we have finished the backup; the previous
	 * handle is released first so that reconnecting does not leak it
	 */
	PQfinish(conn);
	conn = PQconnectdbParams(keywords, values, true);
	if (PQstatus(conn) != CONNECTION_OK)
	{
		fprintf(stderr, _("%s: could not connect to master\n"),
		        progname);
		PQfinish(conn);
		return;
	}

	fprintf(stderr, "Finishing backup...\n");

	snprintf(sqlquery, sizeof(sqlquery), "SELECT pg_xlogfile_name(pg_stop_backup())");
	res = PQexec(conn, sqlquery);
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		fprintf(stderr, "Can't stop backup: %s\n", PQerrorMessage(conn));
		PQclear(res);
		PQfinish(conn);
		return;
	}
	snprintf(last_wal_segment, sizeof(last_wal_segment), "%s", PQgetvalue(res, 0, 0));
	PQclear(res);
	PQfinish(conn);

	/* Now, if the rsync failed then exit */
	if (r != 0)
		return;

	if (verbose)
		printf(_("%s requires primary to keep WAL files %s until at least %s\n"),
		       progname, first_wal_segment, last_wal_segment);

	/* we need to create the pg_xlog sub directory too, i'm reusing a variable here */
	snprintf(local_control_file, sizeof(local_control_file), "%s/pg_xlog", dest_dir);
	if (!create_directory(local_control_file))
	{
		fprintf(stderr, _("%s: couldn't create directory %s, you will need to do it manually...\n"),
		        progname, dest_dir);
	}

	/* Finally, write the recovery.conf file */
	create_recovery_file(dest_dir);

	/* We don't start the service because we still may want to move the directory */
	return;
}
Exemple #4
0
/*
 * Promote the local standby node.
 *
 * Reads the configuration file, verifies that the local server is a standby
 * of a supported version and that getMasterConnection() finds no master in
 * the cluster, then renames the recovery file to its "done" name inside the
 * data directory and restarts the server with pg_ctl.
 *
 * Problems are reported on stderr and the function returns; a missing node
 * id in the configuration terminates the process with exit(1).
 */
static void
do_standby_promote(void)
{
	PGconn 		*conn;
	PGresult	*res;
	char 		sqlquery[QUERY_STR_LEN];
	char 		script[QUERY_STR_LEN];

	char    	myClusterName[MAXLEN];
	int     	myLocalId   = -1;
	char 		conninfo[MAXLEN];

	PGconn		*old_master_conn;
	int			old_master_id;

	int			r;
	char		data_dir[MAXLEN];
	char		recovery_file_path[MAXLEN];
	char		recovery_done_path[MAXLEN];

	char	standby_version[MAXVERSIONSTR];

	/*
	 * Read the configuration file: repmgr.conf
	 */
	parse_config(config_file, myClusterName, &myLocalId, conninfo);
	if (myLocalId == -1)
	{
		fprintf(stderr, "Node information is missing. "
		        "Check the configuration file.\n");
		exit(1);
	}

	/* We need to connect to check configuration */
	conn = establishDBConnection(conninfo, true);

	/* we need v9 or better */
	pg_version(conn, standby_version);
	if (strcmp(standby_version, "") == 0)
	{
		PQfinish(conn);
		fprintf(stderr, _("%s needs standby to be PostgreSQL 9.0 or better\n"), progname);
		return;
	}

	/* Check we are in a standby node */
	if (!is_standby(conn))
	{
		fprintf(stderr, "repmgr: The command should be executed in a standby node\n");
		PQfinish(conn);
		return;
	}

	/* we also need to check if there isn't any master already */
	old_master_conn = getMasterConnection(conn, myLocalId, myClusterName, &old_master_id);
	if (old_master_conn != NULL)
	{
		PQfinish(old_master_conn);
		PQfinish(conn);
		fprintf(stderr, "There is a master already in this cluster");
		return;
	}

	if (verbose)
		printf(_("\n%s: Promoting standby...\n"), progname);

	/* Get the data directory full path and the last subdirectory */
	snprintf(sqlquery, sizeof(sqlquery), "SELECT setting "
	         " FROM pg_settings WHERE name = 'data_directory'");
	res = PQexec(conn, sqlquery);
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		fprintf(stderr, "Can't get info about data directory: %s\n", PQerrorMessage(conn));
		PQclear(res);
		PQfinish(conn);
		return;
	}
	snprintf(data_dir, sizeof(data_dir), "%s", PQgetvalue(res, 0, 0));
	PQclear(res);
	PQfinish(conn);

	snprintf(recovery_file_path, sizeof(recovery_file_path), "%s/%s", data_dir, RECOVERY_FILE);
	snprintf(recovery_done_path, sizeof(recovery_done_path), "%s/%s", data_dir, RECOVERY_DONE_FILE);

	/*
	 * Do not restart the server when the recovery file could not be moved
	 * out of the way: the restart would be pointless without the rename.
	 */
	if (rename(recovery_file_path, recovery_done_path) != 0)
	{
		fprintf(stderr, _("%s: could not rename \"%s\" to \"%s\": %s\n"),
		        progname, recovery_file_path, recovery_done_path, strerror(errno));
		return;
	}

	/* We assume the pg_ctl script is in the PATH */
	snprintf(script, sizeof(script), "pg_ctl -D %s -m fast restart", data_dir);
	r = system(script);
	if (r != 0)
	{
		fprintf(stderr, "Can't restart service\n");
		return;
	}

	/* reconnect to check we got promoted */
	/*
	 * XXX i'm removing this because it gives an annoying message saying couldn't connect
	 * but is just the server starting up
	*    conn = establishDBConnection(conninfo, true);
	*    if (is_standby(conn))
	*    	fprintf(stderr, "\n%s: STANDBY PROMOTE failed, this is still a standby node.\n", progname);
	*    else
	*    	fprintf(stderr, "\n%s: you should REINDEX any hash indexes you have.\n", progname);
	*    PQfinish(conn);
	*/

	return;
}
Exemple #5
0
/*
 * Run a statement that returns no rows and release its result.
 * Returns true only when the server reported PGRES_COMMAND_OK; on failure
 * the error text remains available through PQerrorMessage(conn).
 */
static bool
master_register_run_command(PGconn *conn, const char *sql)
{
	PGresult   *res = PQexec(conn, sql);
	bool		ok = (PQresultStatus(res) == PGRES_COMMAND_OK);

	/* PQresultStatus() and PQclear() both accept a NULL result */
	PQclear(res);
	return ok;
}

/*
 * Register the local node as the master of the cluster.
 *
 * Reads the configuration file, checks that the local server is a supported
 * version and is not a standby, creates the repmgr_<cluster> schema with its
 * tables and view when it does not exist yet, and inserts the node into
 * repl_nodes.  When the schema already exists the command is refused unless
 * --force was given, in which case it is also refused if
 * getMasterConnection() finds a master; otherwise the node's previous
 * repl_nodes row is deleted before the insert.
 *
 * Problems are reported on stderr and the function returns; a missing node
 * id in the configuration terminates the process with exit(1).
 */
static void
do_master_register(void)
{
	PGconn 		*conn;
	PGresult	*res;
	char 		sqlquery[QUERY_STR_LEN];

	char    	myClusterName[MAXLEN];
	int     	myLocalId   = -1;
	char 		conninfo[MAXLEN];

	bool		schema_exists = false;
	char master_version[MAXVERSIONSTR];

	/*
	 * Read the configuration file: repmgr.conf
	 */
	parse_config(config_file, myClusterName, &myLocalId, conninfo);
	if (myLocalId == -1)
	{
		fprintf(stderr, "Node information is missing. "
		        "Check the configuration file.\n");
		exit(1);
	}

	conn = establishDBConnection(conninfo, true);

	/* master should be v9 or better */
	pg_version(conn, master_version);
	if (strcmp(master_version, "") == 0)
	{
		PQfinish(conn);
		fprintf(stderr, _("%s needs master to be PostgreSQL 9.0 or better\n"), progname);
		return;
	}

	/* Check we are a master */
	if (is_standby(conn))
	{
		fprintf(stderr, "repmgr: This node should be a master\n");
		PQfinish(conn);
		return;
	}

	/* Check if there is a schema for this cluster */
	snprintf(sqlquery, sizeof(sqlquery),
	         "SELECT 1 FROM pg_namespace WHERE nspname = 'repmgr_%s'", myClusterName);
	res = PQexec(conn, sqlquery);
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		fprintf(stderr, "Can't get info about schemas: %s\n", PQerrorMessage(conn));
		PQclear(res);
		PQfinish(conn);
		return;
	}

	if (PQntuples(res) > 0)			/* schema exists */
	{
		if (!force)					/* and we are not forcing so error */
		{
			fprintf(stderr, "Schema repmgr_%s already exists.", myClusterName);
			PQclear(res);
			PQfinish(conn);
			return;
		}
		schema_exists = true;
	}
	PQclear(res);

	if (!schema_exists)
	{
		/* ok, create the schema */
		snprintf(sqlquery, sizeof(sqlquery), "CREATE SCHEMA repmgr_%s", myClusterName);
		if (!master_register_run_command(conn, sqlquery))
		{
			fprintf(stderr, "Cannot create the schema repmgr_%s: %s\n",
			        myClusterName, PQerrorMessage(conn));
			PQfinish(conn);
			return;
		}

		/* ... the tables */
		snprintf(sqlquery, sizeof(sqlquery),
		         "CREATE TABLE repmgr_%s.repl_nodes (        "
		         "  id        integer primary key, "
		         "  cluster   text    not null,    "
		         "  conninfo  text    not null)", myClusterName);
		if (!master_register_run_command(conn, sqlquery))
		{
			fprintf(stderr, "Cannot create the table repmgr_%s.repl_nodes: %s\n",
			        myClusterName, PQerrorMessage(conn));
			PQfinish(conn);
			return;
		}

		snprintf(sqlquery, sizeof(sqlquery),
		         "CREATE TABLE repmgr_%s.repl_monitor ( "
		         "  primary_node                   INTEGER NOT NULL, "
		         "  standby_node                   INTEGER NOT NULL, "
		         "  last_monitor_time              TIMESTAMP WITH TIME ZONE NOT NULL, "
		         "  last_wal_primary_location      TEXT NOT NULL,   "
		         "  last_wal_standby_location      TEXT NOT NULL,   "
		         "  replication_lag                BIGINT NOT NULL, "
		         "  apply_lag                      BIGINT NOT NULL) ", myClusterName);
		if (!master_register_run_command(conn, sqlquery))
		{
			fprintf(stderr, "Cannot create the table repmgr_%s.repl_monitor: %s\n",
			        myClusterName, PQerrorMessage(conn));
			PQfinish(conn);
			return;
		}

		/* and the view */
		snprintf(sqlquery, sizeof(sqlquery),
		         "CREATE VIEW repmgr_%s.repl_status AS "
		         "  WITH monitor_info AS (SELECT *, ROW_NUMBER() OVER (PARTITION BY primary_node, standby_node "
		         " ORDER BY last_monitor_time desc) "
		         "  FROM repmgr_%s.repl_monitor) "
		         "  SELECT primary_node, standby_node, last_monitor_time, last_wal_primary_location, "
		         "         last_wal_standby_location, pg_size_pretty(replication_lag) replication_lag, "
		         "         pg_size_pretty(apply_lag) apply_lag, age(now(), last_monitor_time) AS time_lag "
		         "    FROM monitor_info a "
		         "   WHERE row_number = 1", myClusterName, myClusterName);
		if (!master_register_run_command(conn, sqlquery))
		{
			fprintf(stderr, "Cannot create the view repmgr_%s.repl_status: %s\n",
			        myClusterName, PQerrorMessage(conn));
			PQfinish(conn);
			return;
		}
	}
	else
	{
		PGconn *master_conn;
		int 	id;

		/* Ensure there isn't any other master already registered */
		master_conn = getMasterConnection(conn, myLocalId, myClusterName, &id);
		if (master_conn != NULL)
		{
			PQfinish(master_conn);
			PQfinish(conn);
			fprintf(stderr, "There is a master already in this cluster");
			return;
		}
	}

	/* Now register the master */
	if (force)
	{
		/* remove a previous registration of this node id, if any */
		snprintf(sqlquery, sizeof(sqlquery),
		         "DELETE FROM repmgr_%s.repl_nodes "
		         " WHERE id = %d",
		         myClusterName, myLocalId);

		if (!master_register_run_command(conn, sqlquery))
		{
			fprintf(stderr, "Cannot delete node details, %s\n",
			        PQerrorMessage(conn));
			PQfinish(conn);
			return;
		}
	}

	snprintf(sqlquery, sizeof(sqlquery),
	         "INSERT INTO repmgr_%s.repl_nodes "
	         "VALUES (%d, '%s', '%s')",
	         myClusterName, myLocalId, myClusterName, conninfo);

	if (!master_register_run_command(conn, sqlquery))
	{
		fprintf(stderr, "Cannot insert node details, %s\n",
		        PQerrorMessage(conn));
		PQfinish(conn);
		return;
	}

	PQfinish(conn);
	return;
}
Exemple #6
0
/*
 * Run a statement that returns no rows and release its result.
 * Returns true only when the server reported PGRES_COMMAND_OK; on failure
 * the error text remains available through PQerrorMessage(conn).
 */
static bool
standby_register_run_command(PGconn *conn, const char *sql)
{
	PGresult   *res = PQexec(conn, sql);
	bool		ok = (PQresultStatus(res) == PGRES_COMMAND_OK);

	/* PQresultStatus() and PQclear() both accept a NULL result */
	PQclear(res);
	return ok;
}

/*
 * Register the local node as a standby of the cluster.
 *
 * Reads the configuration file, checks that the local server is a standby of
 * a supported version and that the repmgr_<cluster> schema exists, obtains a
 * connection to the master through getMasterConnection(), requires both
 * servers to report the same version, and inserts the node into repl_nodes
 * on the master (deleting its previous row first when --force is given).
 *
 * Problems are reported on stderr and the function returns; a missing node
 * id in the configuration terminates the process with exit(1).
 */
static void
do_standby_register(void)
{
	PGconn 		*conn;
	PGconn		*master_conn;
	int			master_id;

	PGresult	*res;
	char 		sqlquery[QUERY_STR_LEN];

	char    	myClusterName[MAXLEN];
	int     	myLocalId   = -1;
	char 		conninfo[MAXLEN];

	char master_version[MAXVERSIONSTR];
	char standby_version[MAXVERSIONSTR];

	/*
	 * Read the configuration file: repmgr.conf
	 */
	parse_config(config_file, myClusterName, &myLocalId, conninfo);
	if (myLocalId == -1)
	{
		fprintf(stderr, "Node information is missing. "
		        "Check the configuration file.\n");
		exit(1);
	}

	conn = establishDBConnection(conninfo, true);

	/* should be v9 or better */
	pg_version(conn, standby_version);
	if (strcmp(standby_version, "") == 0)
	{
		PQfinish(conn);
		fprintf(stderr, _("%s needs standby to be PostgreSQL 9.0 or better\n"), progname);
		return;
	}

	/* Check we are a standby */
	if (!is_standby(conn))
	{
		fprintf(stderr, "repmgr: This node should be a standby\n");
		PQfinish(conn);
		return;
	}

	/* Check if there is a schema for this cluster */
	snprintf(sqlquery, sizeof(sqlquery),
	         "SELECT 1 FROM pg_namespace WHERE nspname = 'repmgr_%s'", myClusterName);
	res = PQexec(conn, sqlquery);
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		fprintf(stderr, "Can't get info about schemas: %s\n", PQerrorMessage(conn));
		PQclear(res);
		PQfinish(conn);
		return;
	}

	if (PQntuples(res) == 0)		/* schema doesn't exists */
	{
		fprintf(stderr, "Schema repmgr_%s doesn't exists.", myClusterName);
		PQclear(res);
		PQfinish(conn);
		return;
	}
	PQclear(res);

	/* check if there is a master in this cluster */
	master_conn = getMasterConnection(conn, myLocalId, myClusterName, &master_id);
	if (!master_conn)
	{
		/* nothing to register against; do not leak the local connection */
		PQfinish(conn);
		return;
	}

	/* master should be v9 or better */
	pg_version(master_conn, master_version);
	if (strcmp(master_version, "") == 0)
	{
		PQfinish(conn);
		PQfinish(master_conn);
		fprintf(stderr, _("%s needs master to be PostgreSQL 9.0 or better\n"), progname);
		return;
	}

	/* master and standby version should match */
	if (strcmp(master_version, standby_version) != 0)
	{
		PQfinish(conn);
		PQfinish(master_conn);
		fprintf(stderr, _("%s needs versions of both master (%s) and standby (%s) to match.\n"),
		        progname, master_version, standby_version);
		return;
	}


	/* Now register the standby */
	if (force)
	{
		/* remove a previous registration of this node id, if any */
		snprintf(sqlquery, sizeof(sqlquery),
		         "DELETE FROM repmgr_%s.repl_nodes "
		         " WHERE id = %d",
		         myClusterName, myLocalId);

		if (!standby_register_run_command(master_conn, sqlquery))
		{
			fprintf(stderr, "Cannot delete node details, %s\n",
			        PQerrorMessage(master_conn));
			PQfinish(master_conn);
			PQfinish(conn);
			return;
		}
	}

	snprintf(sqlquery, sizeof(sqlquery),
	         "INSERT INTO repmgr_%s.repl_nodes "
	         "VALUES (%d, '%s', '%s')",
	         myClusterName, myLocalId, myClusterName, conninfo);

	if (!standby_register_run_command(master_conn, sqlquery))
	{
		fprintf(stderr, "Cannot insert node details, %s\n",
		        PQerrorMessage(master_conn));
		PQfinish(master_conn);
		PQfinish(conn);
		return;
	}

	PQfinish(master_conn);
	PQfinish(conn);
	return;
}
Exemple #7
0
/*
 * standby_monitor()
 *
 * Run one monitoring pass on a standby node.
 *
 * The pass:
 *   1. checks the master and the local connection, terminating if the local
 *      node cannot be reached;
 *   2. if the master connection is not usable, either polls for a newly
 *      promoted master (MANUAL_FAILOVER) or calls do_failover() and returns
 *      (AUTOMATIC_FAILOVER);
 *   3. confirms that the local node is still a standby, terminating if it
 *      has been promoted;
 *   4. when monitoring_history is set, reads the xlog positions of both
 *      nodes, computes the lag in bytes and sends an INSERT into
 *      <repmgr_schema>.repl_monitor to the primary without waiting for the
 *      result.
 */
static void
standby_monitor(void)
{
	PGresult   *res;
	char		monitor_standby_timestamp[MAXLEN];
	char		last_wal_primary_location[MAXLEN];
	char		last_wal_standby_received[MAXLEN];
	char		last_wal_standby_applied[MAXLEN];
	char		last_wal_standby_applied_timestamp[MAXLEN];
	char		sqlquery[QUERY_STR_LEN];

	unsigned long long int lsn_primary;
	unsigned long long int lsn_standby_received;
	unsigned long long int lsn_standby_applied;

	int			connection_retries,
				ret;
	bool		did_retry = false;

	/*
	 * Check if the master is still available. According to the original
	 * note this can take up to local_options.reconnect_attempts *
	 * local_options.reconnect_intvl seconds (NOTE(review): field names
	 * reconstructed from a line-wrapped comment -- confirm).
	 *
	 * The return value is not used here; the state of the connection is
	 * examined with PQstatus() further down.
	 */
	check_connection(primary_conn, "master");

	if (!check_connection(my_local_conn, "standby"))
	{
		log_err("Failed to connect to local node, exiting!\n");
		terminate(1);
	}

	if (PQstatus(primary_conn) != CONNECTION_OK)
	{
		/* Drop the dead connection before looking for a replacement */
		PQfinish(primary_conn);
		primary_conn = NULL;

		if (local_options.failover == MANUAL_FAILOVER)
		{
			log_err(_("We couldn't reconnect to master. Now checking if another node has been promoted.\n"));
			for (connection_retries = 0; connection_retries < 6; connection_retries++)
			{
				primary_conn = get_master_connection(my_local_conn, repmgr_schema,
					local_options.cluster_name, &primary_options.node, NULL);
				if (PQstatus(primary_conn) == CONNECTION_OK)
				{
					/*
					 * Connected, we can continue the process so break the
					 * loop
					 */
					log_err(_("Connected to node %d, continue monitoring.\n"),
							primary_options.node);
					break;
				}
				else
				{
					log_err(_("We haven't found a new master, waiting before retry...\n"));

					/*
					 * Wait local_options.retry_promote_interval_secs seconds
					 * between attempts; after 6 failures (6 *
					 * local_options.retry_promote_interval_secs seconds) we
					 * stop trying.
					 */
					sleep(local_options.retry_promote_interval_secs);
				}
			}

			if (PQstatus(primary_conn) != CONNECTION_OK)
			{
				log_err(_("We couldn't reconnect for long enough, exiting...\n"));
				terminate(ERR_DB_CON);
			}
		}
		else if (local_options.failover == AUTOMATIC_FAILOVER)
		{
			/*
			 * When we return from this function we will have a new primary
			 * and a new primary_conn
			 */
			do_failover();
			return;
		}
	}

	/* Check if we still are a standby, we could have been promoted */
	do
	{
		ret = is_standby(my_local_conn);

		switch (ret)
		{
			case 0:
				log_err(_("It seems like we have been promoted, so exit from monitoring...\n"));
				terminate(1);
				break;

			case -1:
				log_err(_("Standby node disappeared, trying to reconnect...\n"));
				did_retry = true;

				if (!check_connection(my_local_conn, "standby"))
				{
					terminate(0);
				}

				break;

			default:
				/* still a standby: nothing to do */
				break;
		}
	} while (ret == -1);

	if (did_retry)
	{
		log_info(_("standby connection got back up again!\n"));
	}

	/* Fast path for the case where no history is requested */
	if (!monitoring_history)
		return;

	/*
	 * Cancel any query that is still being executed, so we can insert the
	 * current record
	 */
	if (!cancel_query(primary_conn, local_options.master_response_timeout))
		return;
	if (wait_connection_availability(primary_conn, local_options.master_response_timeout) != 1)
		return;

	/* Get local xlog info */
	sqlquery_snprintf(
					  sqlquery,
				"SELECT CURRENT_TIMESTAMP, pg_last_xlog_receive_location(), "
		  "pg_last_xlog_replay_location(), pg_last_xact_replay_timestamp()");

	res = PQexec(my_local_conn, sqlquery);
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		log_err(_("PQexec failed: %s\n"), PQerrorMessage(my_local_conn));
		PQclear(res);
		/* if there is any error just let it be and retry in next loop */
		return;
	}

	/*
	 * strncpy() does not write a terminator when the source fills the whole
	 * buffer, so terminate each copy explicitly before it is used as a
	 * string.
	 */
	strncpy(monitor_standby_timestamp, PQgetvalue(res, 0, 0), MAXLEN);
	monitor_standby_timestamp[MAXLEN - 1] = '\0';
	strncpy(last_wal_standby_received, PQgetvalue(res, 0, 1), MAXLEN);
	last_wal_standby_received[MAXLEN - 1] = '\0';
	strncpy(last_wal_standby_applied, PQgetvalue(res, 0, 2), MAXLEN);
	last_wal_standby_applied[MAXLEN - 1] = '\0';
	strncpy(last_wal_standby_applied_timestamp, PQgetvalue(res, 0, 3), MAXLEN);
	last_wal_standby_applied_timestamp[MAXLEN - 1] = '\0';
	PQclear(res);

	/* Get primary xlog info */
	sqlquery_snprintf(sqlquery, "SELECT pg_current_xlog_location() ");

	res = PQexec(primary_conn, sqlquery);
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		log_err(_("PQexec failed: %s\n"), PQerrorMessage(primary_conn));
		PQclear(res);
		return;
	}

	strncpy(last_wal_primary_location, PQgetvalue(res, 0, 0), MAXLEN);
	last_wal_primary_location[MAXLEN - 1] = '\0';
	PQclear(res);

	/* Calculate the lag */
	lsn_primary = wal_location_to_bytes(last_wal_primary_location);
	lsn_standby_received = wal_location_to_bytes(last_wal_standby_received);
	lsn_standby_applied = wal_location_to_bytes(last_wal_standby_applied);

	/*
	 * Build the SQL to execute on primary. The two byte differences are
	 * computed in unsigned arithmetic and cast to long long so that the
	 * argument types match the %lld conversions.
	 */
	sqlquery_snprintf(sqlquery,
					  "INSERT INTO %s.repl_monitor "
					  "VALUES(%d, %d, '%s'::timestamp with time zone, "
					  " '%s'::timestamp with time zone, '%s', '%s', "
					  " %lld, %lld)", repmgr_schema,
		 primary_options.node, local_options.node, monitor_standby_timestamp,
					  last_wal_standby_applied_timestamp,
					  last_wal_primary_location,
					  last_wal_standby_received,
					  (long long) (lsn_primary - lsn_standby_received),
					  (long long) (lsn_standby_received - lsn_standby_applied));

	/*
	 * Execute the query asynchronously, but don't check for a result. We will
	 * check the result next time we pause for a monitor step.
	 */
	log_debug("standby_monitor: %s\n", sqlquery);
	if (PQsendQuery(primary_conn, sqlquery) == 0)
		log_warning(_("Query could not be sent to primary. %s\n"),
					PQerrorMessage(primary_conn));
}
Exemple #8
0
static void
do_standby_follow(void)
{
	PGconn 		*conn;
	PGresult	*res;
	char 		sqlquery[QUERY_STR_LEN];
	char 		script[QUERY_STR_LEN];

	char    	myClusterName[MAXLEN];
	int     	myLocalId   = -1;
	char 		conninfo[MAXLEN];

	PGconn		*master_conn;
	int			master_id;

	int			r;
	char		data_dir[MAXLEN];

	char	master_version[MAXVERSIONSTR];
	char	standby_version[MAXVERSIONSTR];

	/*
	 * Read the configuration file: repmgr.conf
	 */
	parse_config(config_file, myClusterName, &myLocalId, conninfo);
	if (myLocalId == -1)
	{
		fprintf(stderr, "Node information is missing. "
		        "Check the configuration file.\n");
		exit(1);
	}

	/* We need to connect to check configuration */
	conn = establishDBConnection(conninfo, true);

	/* Check we are in a standby node */
	if (!is_standby(conn))
	{
		fprintf(stderr, "\n%s: The command should be executed in a standby node\n", progname);
		return;
	}

	/* should be v9 or better */
	pg_version(conn, standby_version);
	if (strcmp(standby_version, "") == 0)
	{
		PQfinish(conn);
		fprintf(stderr, _("\n%s needs standby to be PostgreSQL 9.0 or better\n"), progname);
		return;
	}

	/* we also need to check if there is any master in the cluster */
	master_conn = getMasterConnection(conn, myLocalId, myClusterName, &master_id);
	if (master_conn == NULL)
	{
		PQfinish(conn);
		fprintf(stderr, "There isn't a master to follow in this cluster");
		return;
	}

	/* Check we are going to point to a master */
	if (is_standby(master_conn))
	{
		PQfinish(conn);
		fprintf(stderr, "%s: The node to follow should be a master\n", progname);
		return;
	}

	/* should be v9 or better */
	pg_version(master_conn, master_version);
	if (strcmp(master_version, "") == 0)
	{
		PQfinish(conn);
		PQfinish(master_conn);
		fprintf(stderr, _("%s needs master to be PostgreSQL 9.0 or better\n"), progname);
		return;
	}

	/* master and standby version should match */
	if (strcmp(master_version, standby_version) != 0)
	{
		PQfinish(conn);
		PQfinish(master_conn);
		fprintf(stderr, _("%s needs versions of both master (%s) and standby (%s) to match.\n"),
		        progname, master_version, standby_version);
		return;
	}

	/*
	 * set the host and masterport variables with the master ones
	 * before closing the connection because we will need them to
	 * recreate the recovery.conf file
	 */
	host = malloc(20);
	masterport = malloc(10);
	strcpy(host, PQhost(master_conn));
	strcpy(masterport, PQport(master_conn));
	PQfinish(master_conn);

	if (verbose)
		printf(_("\n%s: Changing standby's master...\n"), progname);

	/* Get the data directory full path */
	sprintf(sqlquery, "SELECT setting "
	        " FROM pg_settings WHERE name = 'data_directory'");
	res = PQexec(conn, sqlquery);
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		fprintf(stderr, "Can't get info about data directory: %s\n", PQerrorMessage(conn));
		PQclear(res);
		PQfinish(conn);
		return;
	}
	strcpy(data_dir, PQgetvalue(res, 0, 0));
	PQclear(res);
	PQfinish(conn);

	/* write the recovery.conf file */
	if (!create_recovery_file(data_dir))
		return;

	/* Finally, restart the service */
	/* We assume the pg_ctl script is in the PATH */
	sprintf(script, "pg_ctl -D %s -m fast restart", data_dir);
	r = system(script);
	if (r != 0)
	{
		fprintf(stderr, "Can't restart service\n");
		return;
	}

	return;
}
Exemple #9
0
/*
 * repmgrd entry point.
 *
 * Parses the command line, optionally daemonizes and writes a pid file,
 * reads the configuration, initialises logging, connects to the local node
 * and then enters the monitoring loop. The outer loop runs once at startup
 * and once more after each failover: it works out whether the local node is
 * a witness, a standby or the primary and runs the matching inner loop until
 * failover_done is set.
 *
 * The outer loop has no exit condition, so the function only ends through
 * terminate()/exit(); the cleanup code after it is kept for completeness.
 */
int
main(int argc, char **argv)
{
	static struct option long_options[] =
	{
		{"config-file", required_argument, NULL, 'f'},
		{"verbose", no_argument, NULL, 'v'},
		{"monitoring-history", no_argument, NULL, 'm'},
		{"daemonize", no_argument, NULL, 'd'},
		{"pid-file", required_argument, NULL, 'p'},
		{NULL, 0, NULL, 0}
	};

	int			optindex;
	int			c,
				ret;
	bool		daemonize = false;
	FILE	   *fd;

	char		standby_version[MAXVERSIONSTR],
			   *ret_ver;

	progname = get_progname(argv[0]);

	/* --help and --version are only recognised as the first argument */
	if (argc > 1)
	{
		if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
		{
			help(progname);
			exit(SUCCESS);
		}
		if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
		{
			printf("%s %s (PostgreSQL %s)\n", progname, REPMGR_VERSION, PG_VERSION);
			exit(SUCCESS);
		}
	}

	/*
	 * Only -f and -p take an argument. "v" must not be followed by ':' in
	 * the option string: --verbose is declared no_argument above and the
	 * handler never reads optarg, so "v:" would make -v swallow the next
	 * command line argument.
	 */
	while ((c = getopt_long(argc, argv, "f:vmdp:", long_options, &optindex)) != -1)
	{
		switch (c)
		{
			case 'f':
				config_file = optarg;
				break;
			case 'v':
				verbose = true;
				break;
			case 'm':
				monitoring_history = true;
				break;
			case 'd':
				daemonize = true;
				break;
			case 'p':
				pid_file = optarg;
				break;
			default:
				usage();
				exit(ERR_BAD_CONFIG);
		}
	}

	if (daemonize)
	{
		do_daemonize();
	}

	if (pid_file)
	{
		check_and_create_pid_file(pid_file);
	}

#ifndef WIN32
	setup_event_handlers();
#endif

	/*
	 * Read the configuration file: repmgr.conf
	 */
	parse_config(config_file, &local_options);
	if (local_options.node == -1)
	{
		log_err(_("Node information is missing. "
				  "Check the configuration file, or provide one if you have not done so.\n"));
		terminate(ERR_BAD_CONFIG);
	}

	/* Detach stdin and stdout; failures are reported but not fatal */
	fd = freopen("/dev/null", "r", stdin);
	if (fd == NULL)
	{
		fprintf(stderr, "error reopening stdin to '/dev/null': %s",
				strerror(errno));
	}

	fd = freopen("/dev/null", "w", stdout);
	if (fd == NULL)
	{
		fprintf(stderr, "error reopening stdout to '/dev/null': %s",
				strerror(errno));
	}

	logger_init(&local_options, progname, local_options.loglevel,
				local_options.logfacility);
	if (verbose)
		logger_min_verbose(LOG_INFO);

	/* stderr is only discarded when logging goes to syslog */
	if (log_type == REPMGR_SYSLOG)
	{
		fd = freopen("/dev/null", "w", stderr);

		if (fd == NULL)
		{
			fprintf(stderr, "error reopening stderr to '/dev/null': %s",
					strerror(errno));
		}
	}

	xsnprintf(repmgr_schema, MAXLEN, "%s%s", DEFAULT_REPMGR_SCHEMA_PREFIX,
			 local_options.cluster_name);

	log_info(_("%s Connecting to database '%s'\n"), progname,
			 local_options.conninfo);
	my_local_conn = establish_db_connection(local_options.conninfo, true);

	/* should be v9 or better */
	log_info(_("%s Connected to database, checking its state\n"), progname);
	ret_ver = pg_version(my_local_conn, standby_version);
	if (ret_ver == NULL || strcmp(standby_version, "") == 0)
	{
		/* the version message is only logged when pg_version() returned non-NULL */
		if (ret_ver != NULL)
			log_err(_("%s needs standby to be PostgreSQL 9.0 or better\n"),
					progname);
		terminate(ERR_BAD_CONFIG);
	}


	/*
	 * MAIN LOOP. This loop cycles once per failover and at startup.
	 * Requisites: - my_local_conn needs to be already set with an active
	 * connection - no master connection
	 */
	do
	{
		/*
		 * Set my server mode, establish a connection to primary and start
		 * monitor
		 */
		ret = is_witness(my_local_conn, repmgr_schema,
						 local_options.cluster_name, local_options.node);

		if (ret == 1)
			my_local_mode = WITNESS_MODE;
		else if (ret == 0)
		{
			ret = is_standby(my_local_conn);

			if (ret == 1)
				my_local_mode = STANDBY_MODE;
			else if (ret == 0)	/* is the master */
				my_local_mode = PRIMARY_MODE;
		}

		/*
		 * XXX we did this before changing is_standby() to return int; we
		 * should not exit at this point, but for now we do until we have a
		 * better strategy
		 */
		if (ret == -1)
			terminate(1);

		switch (my_local_mode)
		{
			case PRIMARY_MODE:
				primary_options.node = local_options.node;

				/*
				 * NOTE(review): strncpy() leaves the destination
				 * unterminated if the source is MAXLEN bytes or longer; the
				 * size of primary_options.conninfo is not visible here.
				 */
				strncpy(primary_options.conninfo, local_options.conninfo,
						MAXLEN);
				primary_conn = my_local_conn;

				check_cluster_configuration(my_local_conn);
				check_node_configuration();

				if (reload_config(config_file, &local_options))
				{
					PQfinish(my_local_conn);
					my_local_conn = establish_db_connection(local_options.conninfo, true);
					primary_conn = my_local_conn;
					update_registration();
				}

				log_info(_("%s Starting continuous primary connection check\n"),
						 progname);

				/*
				 * Check that primary is still alive, and standbys are
				 * sending info
				 */

				/*
				 * Every local_options.monitor_interval_secs seconds, do
				 * master checks XXX Check that standbys are sending info
				 */
				do
				{
					if (check_connection(primary_conn, "master"))
					{
						/*
						 * CheckActiveStandbiesConnections();
						 * CheckInactiveStandbies();
						 */
						sleep(local_options.monitor_interval_secs);
					}
					else
					{
						/*
						 * XXX May we do something more verbose ?
						 */
						terminate(1);
					}

					if (got_SIGHUP)
					{
						/*
						 * if we can reload, then could need to change
						 * my_local_conn
						 */
						if (reload_config(config_file, &local_options))
						{
							PQfinish(my_local_conn);
							my_local_conn = establish_db_connection(local_options.conninfo, true);
							primary_conn = my_local_conn;

							/* reopen the log file named in the reloaded configuration */
							if (*local_options.logfile)
							{
								fd = freopen(local_options.logfile, "a", stderr);
								if (fd == NULL)
								{
									fprintf(stderr, "error reopening stderr to '%s': %s",
									 local_options.logfile, strerror(errno));
								}

							}

							update_registration();
						}
						got_SIGHUP = false;
					}
				} while (!failover_done);
				break;

			case WITNESS_MODE:
			case STANDBY_MODE:
				/* I need the id of the primary as well as a connection to it */
				log_info(_("%s Connecting to primary for cluster '%s'\n"),
						 progname, local_options.cluster_name);
				primary_conn = get_master_connection(my_local_conn, repmgr_schema,
												  local_options.cluster_name,
												&primary_options.node, NULL);
				if (primary_conn == NULL)
				{
					terminate(ERR_BAD_CONFIG);
				}

				check_cluster_configuration(my_local_conn);
				check_node_configuration();

				if (reload_config(config_file, &local_options))
				{
					PQfinish(my_local_conn);
					my_local_conn = establish_db_connection(local_options.conninfo, true);
					update_registration();
				}

				/*
				 * Every local_options.monitor_interval_secs seconds, do
				 * checks
				 */
				if (my_local_mode == WITNESS_MODE)
				{
					log_info(_("%s Starting continuous witness node monitoring\n"),
							 progname);
				}
				else if (my_local_mode == STANDBY_MODE)
				{
					log_info(_("%s Starting continuous standby node monitoring\n"),
							 progname);
				}

				do
				{
					if (my_local_mode == WITNESS_MODE)
						witness_monitor();
					else if (my_local_mode == STANDBY_MODE)
						standby_monitor();
					sleep(local_options.monitor_interval_secs);

					if (got_SIGHUP)
					{
						/*
						 * if we can reload, then could need to change
						 * my_local_conn
						 */
						if (reload_config(config_file, &local_options))
						{
							PQfinish(my_local_conn);
							my_local_conn = establish_db_connection(local_options.conninfo, true);
							update_registration();
						}
						got_SIGHUP = false;
					}
				} while (!failover_done);
				break;
			default:
				log_err(_("%s: Unrecognized mode for node %d\n"), progname,
						local_options.node);
		}

		/* re-arm the flag so the next pass re-detects the node's role */
		failover_done = false;

	} while (true);

	/* close the connection to the database and cleanup */
	close_connections();

	/* Shuts down logging system */
	logger_shutdown();

	return 0;
}
Exemple #10
0
/*
 * StandbyMonitor()
 *
 * Run one monitoring pass on a standby node: make sure a usable primary
 * connection exists (polling for a promoted node or calling do_failover(),
 * depending on local_options.failover), confirm the local node is still a
 * standby, then read the xlog positions of both nodes, compute the lag in
 * bytes and send an INSERT into <repmgr_schema>.repl_monitor to the primary
 * without waiting for the result.
 *
 * Exits the process if no primary can be reached in manual failover mode or
 * if the local node turns out to have been promoted.
 */
static void
StandbyMonitor(void)
{
	PGresult *res;
	char monitor_standby_timestamp[MAXLEN];
	char last_wal_primary_location[MAXLEN];
	char last_wal_standby_received[MAXLEN];
	char last_wal_standby_applied[MAXLEN];

	unsigned long long int lsn_primary;
	unsigned long long int lsn_standby_received;
	unsigned long long int lsn_standby_applied;

	int	connection_retries;

	/*
	 * Check if the master is still available; according to the original
	 * note this takes up to NUM_RETRY * SLEEP_RETRY seconds. If we cannot
	 * reconnect, try to get a new master.
	 */
	CheckPrimaryConnection();

	if (PQstatus(primaryConn) != CONNECTION_OK)
	{
		if (local_options.failover == MANUAL_FAILOVER)
		{
			log_err(_("We couldn't reconnect to master. Now checking if another node has been promoted.\n"));

			/*
			 * NOTE(review): primaryConn is overwritten below without a
			 * PQfinish() of the previous connection -- confirm whether
			 * CheckPrimaryConnection() already released it.
			 */
			for (connection_retries = 0; connection_retries < 6; connection_retries++)
			{
				primaryConn = getMasterConnection(myLocalConn, repmgr_schema, local_options.node,
				                                  local_options.cluster_name, &primary_options.node, NULL);
				if (PQstatus(primaryConn) == CONNECTION_OK)
				{
					/* Connected, we can continue the process so break the loop */
					log_err(_("Connected to node %d, continue monitoring.\n"), primary_options.node);
					break;
				}
				else
				{
					log_err(_("We haven't found a new master, waiting before retry...\n"));
					/* wait 5 minutes before retries, after 6 failures (30 minutes) we stop trying */
					sleep(300);
				}
			}

			if (PQstatus(primaryConn) != CONNECTION_OK)
			{
				log_err(_("We couldn't reconnect for long enough, exiting...\n"));
				exit(ERR_DB_CON);
			}
		}
		else if (local_options.failover == AUTOMATIC_FAILOVER)
		{
			/*
			 * When we return from this function we will have a new primary and
			 * a new primaryConn
			 */
			do_failover();
		}
	}

	/* Check if we still are a standby, we could have been promoted */
	if (!is_standby(myLocalConn))
	{
		log_err(_("It seems like we have been promoted, so exit from monitoring...\n"));
		CloseConnections();
		exit(ERR_PROMOTED);
	}

	/*
	 * first check if there is a command being executed,
	 * and if that is the case, cancel the query so we can
	 * insert the current record
	 */
	if (PQisBusy(primaryConn) == 1)
		CancelQuery();

	/* Get local xlog info */
	sqlquery_snprintf(
	    sqlquery,
	    "SELECT CURRENT_TIMESTAMP, pg_last_xlog_receive_location(), "
	    "pg_last_xlog_replay_location()");

	res = PQexec(myLocalConn, sqlquery);
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		log_err(_("PQexec failed: %s\n"), PQerrorMessage(myLocalConn));
		PQclear(res);
		/* if there is any error just let it be and retry in next loop */
		return;
	}

	/*
	 * strncpy() does not write a terminator when the source fills the whole
	 * buffer, so terminate each copy explicitly before it is used as a
	 * string.
	 */
	strncpy(monitor_standby_timestamp, PQgetvalue(res, 0, 0), MAXLEN);
	monitor_standby_timestamp[MAXLEN - 1] = '\0';
	strncpy(last_wal_standby_received, PQgetvalue(res, 0, 1), MAXLEN);
	last_wal_standby_received[MAXLEN - 1] = '\0';
	strncpy(last_wal_standby_applied, PQgetvalue(res, 0, 2), MAXLEN);
	last_wal_standby_applied[MAXLEN - 1] = '\0';
	PQclear(res);

	/* Get primary xlog info */
	sqlquery_snprintf(sqlquery, "SELECT pg_current_xlog_location() ");

	res = PQexec(primaryConn, sqlquery);
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		log_err(_("PQexec failed: %s\n"), PQerrorMessage(primaryConn));
		PQclear(res);
		return;
	}

	strncpy(last_wal_primary_location, PQgetvalue(res, 0, 0), MAXLEN);
	last_wal_primary_location[MAXLEN - 1] = '\0';
	PQclear(res);

	/* Calculate the lag */
	lsn_primary = walLocationToBytes(last_wal_primary_location);
	lsn_standby_received = walLocationToBytes(last_wal_standby_received);
	lsn_standby_applied = walLocationToBytes(last_wal_standby_applied);

	/*
	 * Build the SQL to execute on primary. The two byte differences are
	 * computed in unsigned arithmetic and cast to long long so that the
	 * argument types match the %lld conversions.
	 */
	sqlquery_snprintf(sqlquery,
	                  "INSERT INTO %s.repl_monitor "
	                  "VALUES(%d, %d, '%s'::timestamp with time zone, "
	                  " '%s', '%s', "
	                  " %lld, %lld)", repmgr_schema,
	                  primary_options.node, local_options.node, monitor_standby_timestamp,
	                  last_wal_primary_location,
	                  last_wal_standby_received,
	                  (long long) (lsn_primary - lsn_standby_received),
	                  (long long) (lsn_standby_received - lsn_standby_applied));

	/*
	 * Execute the query asynchronously, but don't check for a result. We
	 * will check the result next time we pause for a monitor step.
	 */
	if (PQsendQuery(primaryConn, sqlquery) == 0)
		log_warning(_("Query could not be sent to primary. %s\n"),
		            PQerrorMessage(primaryConn));
}
Exemple #11
0
int
main(int argc, char **argv)
{
	static struct option long_options[] =
	{
		{"config", required_argument, NULL, 'f'},
		{"verbose", no_argument, NULL, 'v'},
		{NULL, 0, NULL, 0}
	};

	int			optindex;
	int			c;

	char standby_version[MAXVERSIONSTR];

	progname = get_progname(argv[0]);

	if (argc > 1)
	{
		if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
		{
			help(progname);
			exit(SUCCESS);
		}
		if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
		{
			printf("%s (PostgreSQL) " PG_VERSION "\n", progname);
			exit(SUCCESS);
		}
	}

	while ((c = getopt_long(argc, argv, "f:v", long_options, &optindex)) != -1)
	{
		switch (c)
		{
		case 'f':
			config_file = optarg;
			break;
		case 'v':
			verbose = true;
			break;
		default:
			usage();
			exit(ERR_BAD_CONFIG);
		}
	}

	setup_event_handlers();

	/*
	 * Read the configuration file: repmgr.conf
	 */
	parse_config(config_file, &local_options);
	if (local_options.node == -1)
	{
		log_err(_("Node information is missing. "
		          "Check the configuration file, or provide one if you have not done so.\n"));
		exit(ERR_BAD_CONFIG);
	}

	logger_init(progname, local_options.loglevel, local_options.logfacility);
	if (verbose)
		logger_min_verbose(LOG_INFO);

	snprintf(repmgr_schema, MAXLEN, "%s%s", DEFAULT_REPMGR_SCHEMA_PREFIX, local_options.cluster_name);

	log_info(_("%s Connecting to database '%s'\n"), progname, local_options.conninfo);
	myLocalConn = establishDBConnection(local_options.conninfo, true);

	/* should be v9 or better */
	log_info(_("%s Connected to database, checking its state\n"), progname);
	pg_version(myLocalConn, standby_version);
	if (strcmp(standby_version, "") == 0)
	{
		log_err(_("%s needs standby to be PostgreSQL 9.0 or better\n"), progname);
		PQfinish(myLocalConn);
		exit(ERR_BAD_CONFIG);
	}

	/*
	 * Set my server mode, establish a connection to primary
	 * and start monitor
	 */
	if (is_witness(myLocalConn, repmgr_schema, local_options.cluster_name, local_options.node))
		myLocalMode = WITNESS_MODE;
	else if (is_standby(myLocalConn))
		myLocalMode = STANDBY_MODE;
	else /* is the master */
		myLocalMode = PRIMARY_MODE;

	switch (myLocalMode)
	{
	case PRIMARY_MODE:
		primary_options.node = local_options.node;
		strncpy(primary_options.conninfo, local_options.conninfo, MAXLEN);
		primaryConn = myLocalConn;

		checkClusterConfiguration(myLocalConn, primaryConn);
		checkNodeConfiguration(local_options.conninfo);

		if (reload_configuration(config_file, &local_options))
		{
			PQfinish(myLocalConn);
			myLocalConn = establishDBConnection(local_options.conninfo, true);
			primaryConn = myLocalConn;
			update_registration();
		}

		log_info(_("%s Starting continuous primary connection check\n"), progname);
		/* Check that primary is still alive, and standbies are sending info */
		/*
		 * Every SLEEP_MONITOR seconds, do master checks
		 * XXX
		 * Check that standbies are sending info
		*/
		for (;;)
		{
			if (CheckPrimaryConnection())
			{
				/*
									CheckActiveStandbiesConnections();
									CheckInactiveStandbies();
				*/
				sleep(SLEEP_MONITOR);
			}
			else
			{
				/* XXX
				 * May we do something more verbose ?
				 */
				exit (1);
			}

			if (got_SIGHUP)
			{
				/* if we can reload, then could need to change myLocalConn */
				if (reload_configuration(config_file, &local_options))
				{
					PQfinish(myLocalConn);
					myLocalConn = establishDBConnection(local_options.conninfo, true);
					primaryConn = myLocalConn;
					update_registration();
				}
				got_SIGHUP = false;
			}
		}
		break;
	case WITNESS_MODE:
	case STANDBY_MODE:
		/* I need the id of the primary as well as a connection to it */
		log_info(_("%s Connecting to primary for cluster '%s'\n"),
		         progname, local_options.cluster_name);
		primaryConn = getMasterConnection(myLocalConn, repmgr_schema, local_options.node,
		                                  local_options.cluster_name,
		                                  &primary_options.node, NULL);
		if (primaryConn == NULL)
		{
			CloseConnections();
			exit(ERR_BAD_CONFIG);
		}

		checkClusterConfiguration(myLocalConn, primaryConn);
		checkNodeConfiguration(local_options.conninfo);

		if (reload_configuration(config_file, &local_options))
		{
			PQfinish(myLocalConn);
			myLocalConn = establishDBConnection(local_options.conninfo, true);
			update_registration();
		}

		/*
		 * Every SLEEP_MONITOR seconds, do checks
		 */
		if (myLocalMode == WITNESS_MODE)
		{
			log_info(_("%s Starting continuous witness node monitoring\n"), progname);
		}
		else if (myLocalMode == STANDBY_MODE)
		{
			log_info(_("%s Starting continuous standby node monitoring\n"), progname);
		}

		for (;;)
		{
			if (myLocalMode == WITNESS_MODE)
				WitnessMonitor();
			else if (myLocalMode == STANDBY_MODE)
				StandbyMonitor();
			sleep(SLEEP_MONITOR);

			if (got_SIGHUP)
			{
				/* if we can reload, then could need to change myLocalConn */
				if (reload_configuration(config_file, &local_options))
				{
					PQfinish(myLocalConn);
					myLocalConn = establishDBConnection(local_options.conninfo, true);
					update_registration();
				}
				got_SIGHUP = false;
			}
		}
		break;
	default:
		log_err(_("%s: Unrecognized mode for node %d\n"), progname, local_options.node);
	}

	/* Prevent a double-free */
	if (primaryConn == myLocalConn)
		myLocalConn = NULL;

	/* close the connection to the database and cleanup */
	CloseConnections();

	/* Shuts down logging system */
	logger_shutdown();

	return 0;
}
Exemple #12
0
/*
 * Daemon entry point: parse the command line, read the configuration
 * (defaulting to ./CONFIG_FILE when -f/--config is not given), connect to
 * the local node, determine whether it is a standby or the primary, obtain
 * a primary connection, run the configuration checks and, on a standby,
 * start MonitorCheck().
 *
 * Returns 0 after closing the connections; exits with status 1 on any
 * setup failure.
 */
int
main(int argc, char **argv)
{
	static struct option long_options[] =
	{
		{"config", required_argument, NULL, 'f'},
		{"verbose", no_argument, NULL, 'v'},
		{NULL, 0, NULL, 0}
	};

	int			optindex;
	int			c;

	char conninfo[MAXLEN];
	char standby_version[MAXVERSIONSTR];

	progname = get_progname(argv[0]);

	/* --help and --version are only recognised as the first argument */
	if (argc > 1)
	{
		if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
		{
			help(progname);
			exit(0);
		}
		if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
		{
			printf("%s (PostgreSQL) " PG_VERSION "\n", progname);
			exit(0);
		}
	}


	while ((c = getopt_long(argc, argv, "f:v", long_options, &optindex)) != -1)
	{
		switch (c)
		{
		case 'f':
			config_file = optarg;
			break;
		case 'v':
			verbose = true;
			break;
		default:
			fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
			exit(1);
		}
	}

	setup_cancel_handler();

	/* No -f given: fall back to CONFIG_FILE in the current directory */
	if (config_file == NULL)
	{
		/* room for "./", the file name and its terminator, plus slack */
		config_file = malloc(5 + sizeof(CONFIG_FILE));
		if (config_file == NULL)
		{
			/* sprintf() into a NULL buffer would crash: fail cleanly instead */
			fprintf(stderr, _("%s: out of memory\n"), progname);
			exit(1);
		}
		sprintf(config_file, "./%s", CONFIG_FILE);
	}

	/*
	 * Read the configuration file: repmgr.conf
	 */
	parse_config(config_file, myClusterName, &myLocalId, conninfo);
	if (myLocalId == -1)
	{
		fprintf(stderr, "Node information is missing. "
		        "Check the configuration file.\n");
		exit(1);
	}

	myLocalConn = establishDBConnection(conninfo, true);

	/* should be v9 or better */
	pg_version(myLocalConn, standby_version);
	if (strcmp(standby_version, "") == 0)
	{
		PQfinish(myLocalConn);
		fprintf(stderr, _("%s needs standby to be PostgreSQL 9.0 or better\n"), progname);
		exit(1);
	}

	/*
	 * Set my server mode, establish a connection to primary
	 * and start monitor
	 */
	myLocalMode = is_standby(myLocalConn) ? STANDBY_MODE : PRIMARY_MODE;
	if (myLocalMode == PRIMARY_MODE)
	{
		primaryId = myLocalId;

		/*
		 * NOTE(review): unbounded copy; the size of primaryConninfo is not
		 * visible here -- confirm it is at least MAXLEN bytes.
		 */
		strcpy(primaryConninfo, conninfo);

		/*
		 * NOTE(review): both globals now name the same connection; confirm
		 * that CloseConnections() does not close it twice.
		 */
		primaryConn = myLocalConn;
	}
	else
	{
		/* I need the id of the primary as well as a connection to it */
		primaryConn = getMasterConnection(myLocalConn, myLocalId, myClusterName, &primaryId);
		if (primaryConn == NULL)
			exit(1);
	}

	checkClusterConfiguration();
	checkNodeConfiguration(conninfo);
	if (myLocalMode == STANDBY_MODE)
	{
		MonitorCheck();
	}

	/* close the connection to the database and cleanup */
	CloseConnections();

	return 0;
}
Exemple #13
0
/*
 * Insert one monitoring record on the primary.
 *
 * The record holds the standby's current timestamp, the primary's current
 * xlog location, the last xlog location received by the standby, and two
 * lag figures in bytes (primary vs. received, received vs. applied).
 *
 * Before gathering anything, the function makes sure the connection to the
 * primary is usable:
 *   - up to 15 attempts, 20 seconds apart (about 5 minutes), to reset the
 *     existing connection;
 *   - failing that, up to 6 attempts, 5 minutes apart (about 30 minutes),
 *     to locate a master through getMasterConnection();
 *   - failing that, the process exits with status 1.
 *
 * The process also exits with status 1 if the local node is no longer a
 * standby.  A failure of either SELECT is reported on stderr and the
 * function simply returns, so the caller can retry on its next cycle.
 *
 * The final INSERT is sent asynchronously; its result is not read here.
 */
static void
MonitorExecute(void)
{
	PGresult *res;
	char monitor_standby_timestamp[MAXLEN];
	char last_wal_primary_location[MAXLEN];
	char last_wal_standby_received[MAXLEN];
	char last_wal_standby_applied[MAXLEN];

	unsigned long long int lsn_primary;
	unsigned long long int lsn_standby_received;
	unsigned long long int lsn_standby_applied;

	int	connection_retries;

	/*
	 * Check if the master is still available, if after 5 minutes of retries
	 * we cannot reconnect, try to get a new master.
	 */
	for (connection_retries = 0; connection_retries < 15; connection_retries++)
	{
		if (PQstatus(primaryConn) == CONNECTION_OK)
		{
			/*
			 * Only announce a restored connection when it had actually been
			 * lost; on a healthy connection the very first iteration lands
			 * here and must stay silent.
			 */
			if (connection_retries > 0)
				fprintf(stderr, "\n%s: Connection to master has been restored, continue monitoring.\n", progname);
			break;
		}

		fprintf(stderr, "\n%s: Connection to master has been lost, trying to recover...\n", progname);
		/* wait 20 seconds between retries */
		sleep(20);

		PQreset(primaryConn);
	}

	if (PQstatus(primaryConn) != CONNECTION_OK)
	{
		/* Emit the notice as one coherent line with a single prefix */
		fprintf(stderr, "\n%s: We couldn't reconnect to master, checking if another node has been promoted.\n", progname);
		for (connection_retries = 0; connection_retries < 6; connection_retries++)
		{
			primaryConn = getMasterConnection(myLocalConn, myLocalId, myClusterName, &primaryId);
			if (PQstatus(primaryConn) == CONNECTION_OK)
			{
				/* Connected, we can continue the process so break the loop */
				fprintf(stderr, "\n%s: Connected to node %d, continue monitoring.\n", progname, primaryId);
				break;
			}
			else
			{
				fprintf(stderr, "\n%s: We haven't found a new master, waiting before retry...\n", progname);
				/* wait 5 minutes before retries, after 6 failures (30 minutes) we stop trying */
				sleep(300);
			}
		}
	}
	if (PQstatus(primaryConn) != CONNECTION_OK)
	{
		fprintf(stderr, "\n%s: We couldn't reconnect for long enough, exiting...\n", progname);
		exit(1);
	}

	/* Check if we still are a standby, we could have been promoted */
	if (!is_standby(myLocalConn))
	{
		fprintf(stderr, "\n%s: seems like we have been promoted, so exit from monitoring...\n",
		        progname);
		CloseConnections();
		exit(1);
	}

	/*
	 * First check if there is a command still being executed on the primary
	 * connection, and if so cancel it so the current record can be inserted.
	 */
	if (PQisBusy(primaryConn) == 1)
		CancelQuery();

	/*
	 * Get local xlog info.
	 *
	 * NOTE(review): sqlquery is declared outside this function and its
	 * capacity is not visible here, so the sprintf() calls that fill it are
	 * left unbounded as in the original -- confirm the buffer is large enough.
	 */
	sprintf(sqlquery,
	        "SELECT CURRENT_TIMESTAMP, pg_last_xlog_receive_location(), "
	        "pg_last_xlog_replay_location()");

	res = PQexec(myLocalConn, sqlquery);
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		fprintf(stderr, "PQexec failed: %s\n", PQerrorMessage(myLocalConn));
		PQclear(res);
		/* if there is any error just let it be and retry in next loop */
		return;
	}

	/* Bounded copies: never write past the fixed-size local buffers */
	snprintf(monitor_standby_timestamp, sizeof(monitor_standby_timestamp),
	         "%s", PQgetvalue(res, 0, 0));
	snprintf(last_wal_standby_received, sizeof(last_wal_standby_received),
	         "%s", PQgetvalue(res, 0, 1));
	snprintf(last_wal_standby_applied, sizeof(last_wal_standby_applied),
	         "%s", PQgetvalue(res, 0, 2));
	PQclear(res);

	/* Get primary xlog info */
	sprintf(sqlquery, "SELECT pg_current_xlog_location() ");

	res = PQexec(primaryConn, sqlquery);
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		fprintf(stderr, "PQexec failed: %s\n", PQerrorMessage(primaryConn));
		PQclear(res);
		return;
	}

	snprintf(last_wal_primary_location, sizeof(last_wal_primary_location),
	         "%s", PQgetvalue(res, 0, 0));
	PQclear(res);

	/* Calculate the lag */
	lsn_primary = walLocationToBytes(last_wal_primary_location);
	lsn_standby_received = walLocationToBytes(last_wal_standby_received);
	lsn_standby_applied = walLocationToBytes(last_wal_standby_applied);

	/*
	 * Build the SQL to execute on primary.  The differences are computed in
	 * unsigned arithmetic and then explicitly converted to long long so the
	 * arguments match the %lld conversions.
	 */
	sprintf(sqlquery,
	        "INSERT INTO repmgr_%s.repl_monitor "
	        "VALUES(%d, %d, '%s'::timestamp with time zone, "
	        " '%s', '%s', "
	        " %lld, %lld)", myClusterName,
	        primaryId, myLocalId, monitor_standby_timestamp,
	        last_wal_primary_location,
	        last_wal_standby_received,
	        (long long) (lsn_primary - lsn_standby_received),
	        (long long) (lsn_standby_received - lsn_standby_applied));

	/*
	 * Execute the query asynchronously, but don't check for a result. We
	 * will check the result next time we pause for a monitor step.
	 */
	if (PQsendQuery(primaryConn, sqlquery) == 0)
		fprintf(stderr, "Query could not be sent to primary. %s\n",
		        PQerrorMessage(primaryConn));
}