Beispiel #1
0
/*
 * Get a connection to the master of a cluster.
 *
 * Reads the non-witness rows of <schema>.repl_nodes for the given cluster
 * through standby_conn, connects to each listed node (one at a time) and
 * asks it pg_is_in_recovery().  The first node answering "f" is the master.
 *
 * standby_conn         open connection used to read repl_nodes
 * schema               schema name; it is quoted as an identifier here
 * cluster              cluster name compared against repl_nodes.cluster
 * master_id            out: set to the id of each node as it is probed and
 *                      reset to -1 when that node turns out to be a standby.
 *                      Only meaningful when a connection is returned.
 * master_conninfo_out  may be NULL.  If it is non-NULL, it is assumed to
 *                      point to allocated memory of MAXCONNINFO in length,
 *                      and the connection string of the node being probed
 *                      (the master's, on success) is placed there.
 *
 * Returns the open connection to the master, which the caller must close
 * with PQfinish(), or NULL when no reachable node reports being a master.
 *
 * If the schema name cannot be quoted or the node list cannot be read,
 * standby_conn is closed and the process exits with ERR_DB_QUERY.
 */
PGconn *
getMasterConnection(PGconn *standby_conn, char *schema, char *cluster,
                    int *master_id, char *master_conninfo_out)
{
	PGconn		*master_conn	 = NULL;
	PGresult	*res1;
	PGresult	*res2;
	char		 sqlquery[QUERY_STR_LEN];
	char		 master_conninfo_stack[MAXCONNINFO];
	char		*master_conninfo = master_conninfo_stack;
	char		 schema_quoted[MAXLEN];
	char		*identifier;

	int		 i;

	/*
	 * If the caller wanted to get a copy of the connection info string, sub
	 * out the local stack pointer for the pointer passed by the caller.
	 */
	if (master_conninfo_out != NULL)
		master_conninfo = master_conninfo_out;

	/*
	 * XXX: This is copied in at least two other procedures
	 *
	 * Assemble the quoted schema name.  PQescapeIdentifier() returns NULL
	 * on failure, which must not reach a "%s" conversion.
	 */
	identifier = PQescapeIdentifier(standby_conn, schema, strlen(schema));
	if (identifier == NULL)
	{
		log_err(_("Can't get nodes info: %s\n"),
		        PQerrorMessage(standby_conn));
		PQfinish(standby_conn);
		exit(ERR_DB_QUERY);
	}
	maxlen_snprintf(schema_quoted, "%s", identifier);
	PQfreemem(identifier);

	/* find all nodes belonging to this cluster */
	log_info(_("finding node list for cluster '%s'\n"),
	         cluster);

	sqlquery_snprintf(sqlquery, "SELECT id, conninfo FROM %s.repl_nodes "
	                  " WHERE cluster = '%s' and not witness",
	                  schema_quoted, cluster);

	res1 = PQexec(standby_conn, sqlquery);
	if (PQresultStatus(res1) != PGRES_TUPLES_OK)
	{
		log_err(_("Can't get nodes info: %s\n"),
		        PQerrorMessage(standby_conn));
		PQclear(res1);
		PQfinish(standby_conn);
		exit(ERR_DB_QUERY);
	}

	for (i = 0; i < PQntuples(res1); i++)
	{
		/* initialize with the values of the current node being processed */
		*master_id = atoi(PQgetvalue(res1, i, 0));

		/* strncpy() does not terminate on truncation, so do it by hand */
		strncpy(master_conninfo, PQgetvalue(res1, i, 1), MAXCONNINFO - 1);
		master_conninfo[MAXCONNINFO - 1] = '\0';

		log_info(_("checking role of cluster node '%s'\n"),
		         master_conninfo);
		master_conn = establishDBConnection(master_conninfo, false);

		if (PQstatus(master_conn) != CONNECTION_OK)
		{
			/* a failed connection attempt still owns a PGconn object */
			PQfinish(master_conn);
			continue;
		}

		/*
		 * Can't use the is_standby() function here because on error that
		 * function closes the connection passed and exits.  This still
		 * needs to close master_conn first.
		 */
		res2 = PQexec(master_conn, "SELECT pg_is_in_recovery()");

		if (PQresultStatus(res2) != PGRES_TUPLES_OK)
		{
			log_err(_("Can't get recovery state from this node: %s\n"),
			        PQerrorMessage(master_conn));
			PQclear(res2);
			PQfinish(master_conn);
			continue;
		}

		/* if false, this is the master */
		if (strcmp(PQgetvalue(res2, 0, 0), "f") == 0)
		{
			PQclear(res2);
			PQclear(res1);
			return master_conn;
		}
		else
		{
			/* if it is a standby clear info */
			PQclear(res2);
			PQfinish(master_conn);
			*master_id = -1;
		}
	}

	/*
	 * If we finish this loop without finding a master then we don't have
	 * the info or the master has failed (or we reached max_connections or
	 * superuser_reserved_connections, anything else I'm missing?).
	 *
	 * Probably we will need to check the error to know if we need
	 * to start failover procedure or just fix some situation on the
	 * standby.
	 */
	PQclear(res1);
	return NULL;
}
Beispiel #2
0
/*
 * STANDBY PROMOTE: turn the local standby into a master.
 *
 * Reads repmgr.conf, connects to the local node, checks that it is a
 * standby and that getMasterConnection() finds no master in the cluster,
 * then renames the recovery file to its "done" name inside the data
 * directory and restarts the server with pg_ctl (assumed to be in the PATH).
 *
 * Problems are reported on stderr and the function returns; a missing node
 * id in the configuration terminates the process with exit(1).
 */
static void
do_standby_promote(void)
{
	PGconn 		*conn;
	PGresult	*res;
	char 		sqlquery[QUERY_STR_LEN];
	char 		script[QUERY_STR_LEN];

	char    	myClusterName[MAXLEN];
	int     	myLocalId   = -1;
	char 		conninfo[MAXLEN];

	PGconn		*old_master_conn;
	int			old_master_id;

	int			r;
	char		data_dir[MAXLEN];
	char		recovery_file_path[MAXLEN];
	char		recovery_done_path[MAXLEN];

	char	standby_version[MAXVERSIONSTR];

	/*
	 * Read the configuration file: repmgr.conf
	 */
	parse_config(config_file, myClusterName, &myLocalId, conninfo);
	if (myLocalId == -1)
	{
		fprintf(stderr, "Node information is missing. "
		        "Check the configuration file.\n");
		exit(1);
	}

	/* We need to connect to check configuration */
	conn = establishDBConnection(conninfo, true);

	/* we need v9 or better; an empty version string is treated as "too old" */
	pg_version(conn, standby_version);
	if (strcmp(standby_version, "") == 0)
	{
		PQfinish(conn);
		fprintf(stderr, _("%s needs standby to be PostgreSQL 9.0 or better\n"), progname);
		return;
	}

	/* Check we are in a standby node */
	if (!is_standby(conn))
	{
		fprintf(stderr, "repmgr: The command should be executed in a standby node\n");
		PQfinish(conn);
		return;
	}

	/* we also need to check if there isn't any master already */
	old_master_conn = getMasterConnection(conn, myLocalId, myClusterName, &old_master_id);
	if (old_master_conn != NULL)
	{
		PQfinish(old_master_conn);
		PQfinish(conn);
		fprintf(stderr, "There is a master already in this cluster");
		return;
	}

	if (verbose)
		printf(_("\n%s: Promoting standby...\n"), progname);

	/* Get the data directory full path and the last subdirectory */
	snprintf(sqlquery, sizeof(sqlquery), "SELECT setting "
	         " FROM pg_settings WHERE name = 'data_directory'");
	res = PQexec(conn, sqlquery);

	/* an empty result would make PQgetvalue() below return NULL */
	if (PQresultStatus(res) != PGRES_TUPLES_OK || PQntuples(res) < 1)
	{
		fprintf(stderr, "Can't get info about data directory: %s\n", PQerrorMessage(conn));
		PQclear(res);
		PQfinish(conn);
		return;
	}
	r = snprintf(data_dir, sizeof(data_dir), "%s", PQgetvalue(res, 0, 0));
	PQclear(res);
	PQfinish(conn);
	if (r < 0 || (size_t) r >= sizeof(data_dir))
	{
		fprintf(stderr, "Data directory path is too long\n");
		return;
	}

	/* Build both recovery file names, refusing to work with truncated paths */
	r = snprintf(recovery_file_path, sizeof(recovery_file_path),
	             "%s/%s", data_dir, RECOVERY_FILE);
	if (r < 0 || (size_t) r >= sizeof(recovery_file_path))
	{
		fprintf(stderr, "Recovery file path is too long\n");
		return;
	}
	r = snprintf(recovery_done_path, sizeof(recovery_done_path),
	             "%s/%s", data_dir, RECOVERY_DONE_FILE);
	if (r < 0 || (size_t) r >= sizeof(recovery_done_path))
	{
		fprintf(stderr, "Recovery file path is too long\n");
		return;
	}

	/*
	 * Move the recovery file out of the way.  If that fails a restart
	 * would only bounce the server, so stop here.
	 */
	if (rename(recovery_file_path, recovery_done_path) != 0)
	{
		fprintf(stderr, "Can't rename %s to %s\n",
		        recovery_file_path, recovery_done_path);
		return;
	}

	/* We assume the pg_ctl script is in the PATH */
	r = snprintf(script, sizeof(script), "pg_ctl -D %s -m fast restart", data_dir);
	if (r < 0 || (size_t) r >= sizeof(script))
	{
		fprintf(stderr, "Can't restart service\n");
		return;
	}
	r = system(script);
	if (r != 0)
	{
		fprintf(stderr, "Can't restart service\n");
		return;
	}

	/*
	 * XXX The reconnect that verified the promotion was removed because it
	 * printed an annoying "couldn't connect" message while the server was
	 * merely still starting up.
	 */

	return;
}
Beispiel #3
0
/*
 * repmgrd entry point.
 *
 * Parses the command line (-f/--config, -v/--verbose, -H/--no-history),
 * reads repmgr.conf, connects to the local node and decides whether it is
 * a standby or a primary.  On a standby it obtains a connection to the
 * primary and runs MonitorCheck(); on a primary it only logs that the
 * program is not needed.  Connections and logging are shut down before
 * returning 0.
 *
 * Configuration problems terminate the process with ERR_BAD_CONFIG.
 */
int
main(int argc, char **argv)
{
	static struct option long_options[] =
	{
		{"config", required_argument, NULL, 'f'},
		{"verbose", no_argument, NULL, 'v'},
		{"no-history", no_argument, NULL, 'H'},
		{NULL, 0, NULL, 0}
	};

	int			optindex;
	int			c;

	char standby_version[MAXVERSIONSTR];

	progname = get_progname(argv[0]);

	/* --help and --version are only recognised as the first argument */
	if (argc > 1)
	{
		if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
		{
			help(progname);
			exit(SUCCESS);
		}
		if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
		{
			printf("%s (PostgreSQL) " PG_VERSION "\n", progname);
			exit(SUCCESS);
		}
	}

	while ((c = getopt_long(argc, argv, "f:vH", long_options, &optindex)) != -1)
	{
		switch (c)
		{
		case 'f':
			config_file = optarg;
			break;
		case 'v':
			verbose = true;
			break;
		case 'H': /* no-history */
			only_one_entry_desired = true;
			break;
		default:
			usage();
			exit(ERR_BAD_CONFIG);
		}
	}

	setup_cancel_handler();

	/*
	 * Read the configuration file: repmgr.conf
	 */
	parse_config(config_file, &local_options);
	if (local_options.node == -1)
	{
		log_err("Node information is missing. "
		        "Check the configuration file, or provide one if you have not done so.\n");
		exit(ERR_BAD_CONFIG);
	}

	logger_init(progname, local_options.loglevel, local_options.logfacility);
	if (verbose)
		logger_min_verbose(LOG_INFO);

	snprintf(repmgr_schema, MAXLEN, "%s%s", DEFAULT_REPMGR_SCHEMA_PREFIX, local_options.cluster_name);

	log_info(_("%s Connecting to database '%s'\n"), progname, local_options.conninfo);
	myLocalConn = establishDBConnection(local_options.conninfo, true);

	/* should be v9 or better; an empty version string is treated as "too old" */
	log_info(_("%s Connected to database, checking its state\n"), progname);
	pg_version(myLocalConn, standby_version);
	if (strcmp(standby_version, "") == 0)
	{
		PQfinish(myLocalConn);
		log_err(_("%s needs standby to be PostgreSQL 9.0 or better\n"), progname);
		exit(ERR_BAD_CONFIG);
	}

	/*
	 * Set my server mode, establish a connection to primary
	 * and start monitor
	 */
	myLocalMode = is_standby(myLocalConn) ? STANDBY_MODE : PRIMARY_MODE;
	if (myLocalMode == PRIMARY_MODE)
	{
		primary_options.node = local_options.node;

		/* strncpy() does not terminate on truncation, so do it by hand */
		strncpy(primary_options.conninfo, local_options.conninfo, MAXLEN - 1);
		primary_options.conninfo[MAXLEN - 1] = '\0';

		/* the local node is the primary: both handles share one connection */
		primaryConn = myLocalConn;
	}
	else
	{
		/* I need the id of the primary as well as a connection to it */
		log_info(_("%s Connecting to primary for cluster '%s'\n"),
		         progname, local_options.cluster_name);
		primaryConn = getMasterConnection(myLocalConn, local_options.node,
		                                  local_options.cluster_name,
		                                  &primary_options.node,NULL);
		if (primaryConn == NULL)
		{
			CloseConnections();
			exit(ERR_BAD_CONFIG);
		}
	}

	checkClusterConfiguration(myLocalConn,primaryConn);
	checkNodeConfiguration(local_options.conninfo);
	if (myLocalMode == STANDBY_MODE)
	{
		log_info(_("%s Starting continuous standby node monitoring\n"), progname);
		MonitorCheck();
	}
	else
	{
		log_info(_("%s This is a primary node, program not needed here; exiting'\n"), progname);
	}

	/* Prevent a double-free */
	if (primaryConn == myLocalConn)
		myLocalConn = NULL;

	/* close the connection to the database and cleanup */
	CloseConnections();

	/* Shuts down logging system */
	logger_shutdown();

	return 0;
}
Beispiel #4
0
static void
do_master_register(void)
{
	PGconn 		*conn;
	PGresult	*res;
	char 		sqlquery[QUERY_STR_LEN];

	char    	myClusterName[MAXLEN];
	int     	myLocalId   = -1;
	char 		conninfo[MAXLEN];

	bool		schema_exists = false;
	char master_version[MAXVERSIONSTR];

	/*
	 * Read the configuration file: repmgr.conf
	 */
	parse_config(config_file, myClusterName, &myLocalId, conninfo);
	if (myLocalId == -1)
	{
		fprintf(stderr, "Node information is missing. "
		        "Check the configuration file.\n");
		exit(1);
	}

	conn = establishDBConnection(conninfo, true);

	/* master should be v9 or better */
	pg_version(conn, master_version);
	if (strcmp(master_version, "") == 0)
	{
		PQfinish(conn);
		fprintf(stderr, _("%s needs master to be PostgreSQL 9.0 or better\n"), progname);
		return;
	}

	/* Check we are a master */
	if (is_standby(conn))
	{
		fprintf(stderr, "repmgr: This node should be a master\n");
		PQfinish(conn);
		return;
	}

	/* Check if there is a schema for this cluster */
	sprintf(sqlquery, "SELECT 1 FROM pg_namespace WHERE nspname = 'repmgr_%s'", myClusterName);
	res = PQexec(conn, sqlquery);
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		fprintf(stderr, "Can't get info about schemas: %s\n", PQerrorMessage(conn));
		PQclear(res);
		PQfinish(conn);
		return;
	}

	if (PQntuples(res) > 0)			/* schema exists */
	{
		if (!force)					/* and we are not forcing so error */
		{
			fprintf(stderr, "Schema repmgr_%s already exists.", myClusterName);
			PQclear(res);
			PQfinish(conn);
			return;
		}
		schema_exists = true;
	}
	PQclear(res);

	if (!schema_exists)
	{
		/* ok, create the schema */
		sprintf(sqlquery, "CREATE SCHEMA repmgr_%s", myClusterName);
		if (!PQexec(conn, sqlquery))
		{
			fprintf(stderr, "Cannot create the schema repmgr_%s: %s\n",
			        myClusterName, PQerrorMessage(conn));
			PQfinish(conn);
			return;
		}

		/* ... the tables */
		sprintf(sqlquery, "CREATE TABLE repmgr_%s.repl_nodes (        "
		        "  id        integer primary key, "
		        "  cluster   text    not null,    "
		        "  conninfo  text    not null)", myClusterName);
		if (!PQexec(conn, sqlquery))
		{
			fprintf(stderr, "Cannot create the table repmgr_%s.repl_nodes: %s\n",
			        myClusterName, PQerrorMessage(conn));
			PQfinish(conn);
			return;
		}

		sprintf(sqlquery, "CREATE TABLE repmgr_%s.repl_monitor ( "
		        "  primary_node                   INTEGER NOT NULL, "
		        "  standby_node                   INTEGER NOT NULL, "
		        "  last_monitor_time              TIMESTAMP WITH TIME ZONE NOT NULL, "
		        "  last_wal_primary_location      TEXT NOT NULL,   "
		        "  last_wal_standby_location      TEXT NOT NULL,   "
		        "  replication_lag                BIGINT NOT NULL, "
		        "  apply_lag                      BIGINT NOT NULL) ", myClusterName);
		if (!PQexec(conn, sqlquery))
		{
			fprintf(stderr, "Cannot create the table repmgr_%s.repl_monitor: %s\n",
			        myClusterName, PQerrorMessage(conn));
			PQfinish(conn);
			return;
		}

		/* and the view */
		sprintf(sqlquery, "CREATE VIEW repmgr_%s.repl_status AS "
		        "  WITH monitor_info AS (SELECT *, ROW_NUMBER() OVER (PARTITION BY primary_node, standby_node "
		        " ORDER BY last_monitor_time desc) "
		        "  FROM repmgr_%s.repl_monitor) "
		        "  SELECT primary_node, standby_node, last_monitor_time, last_wal_primary_location, "
		        "         last_wal_standby_location, pg_size_pretty(replication_lag) replication_lag, "
		        "         pg_size_pretty(apply_lag) apply_lag, age(now(), last_monitor_time) AS time_lag "
		        "    FROM monitor_info a "
		        "   WHERE row_number = 1", myClusterName, myClusterName);
		if (!PQexec(conn, sqlquery))
		{
			fprintf(stderr, "Cannot create the view repmgr_%s.repl_status: %s\n",
			        myClusterName, PQerrorMessage(conn));
			PQfinish(conn);
			return;
		}
	}
	else
	{
		PGconn *master_conn;
		int 	id;

		/* Ensure there isn't any other master already registered */
		master_conn = getMasterConnection(conn, myLocalId, myClusterName, &id);
		if (master_conn != NULL)
		{
			PQfinish(master_conn);
			fprintf(stderr, "There is a master already in this cluster");
			return;
		}
	}

	/* Now register the master */
	if (force)
	{
		sprintf(sqlquery, "DELETE FROM repmgr_%s.repl_nodes "
		        " WHERE id = %d",
		        myClusterName, myLocalId);

		if (!PQexec(conn, sqlquery))
		{
			fprintf(stderr, "Cannot delete node details, %s\n",
			        PQerrorMessage(conn));
			PQfinish(conn);
			return;
		}
	}

	sprintf(sqlquery, "INSERT INTO repmgr_%s.repl_nodes "
	        "VALUES (%d, '%s', '%s')",
	        myClusterName, myLocalId, myClusterName, conninfo);

	if (!PQexec(conn, sqlquery))
	{
		fprintf(stderr, "Cannot insert node details, %s\n",
		        PQerrorMessage(conn));
		PQfinish(conn);
		return;
	}

	PQfinish(conn);
	return;
}
Beispiel #5
0
static void
do_standby_register(void)
{
	PGconn 		*conn;
	PGconn		*master_conn;
	int			master_id;

	PGresult	*res;
	char 		sqlquery[QUERY_STR_LEN];

	char    	myClusterName[MAXLEN];
	int     	myLocalId   = -1;
	char 		conninfo[MAXLEN];

	char master_version[MAXVERSIONSTR];
	char standby_version[MAXVERSIONSTR];

	/*
	 * Read the configuration file: repmgr.conf
	 */
	parse_config(config_file, myClusterName, &myLocalId, conninfo);
	if (myLocalId == -1)
	{
		fprintf(stderr, "Node information is missing. "
		        "Check the configuration file.\n");
		exit(1);
	}

	conn = establishDBConnection(conninfo, true);

	/* should be v9 or better */
	pg_version(conn, standby_version);
	if (strcmp(standby_version, "") == 0)
	{
		PQfinish(conn);
		fprintf(stderr, _("%s needs standby to be PostgreSQL 9.0 or better\n"), progname);
		return;
	}

	/* Check we are a standby */
	if (!is_standby(conn))
	{
		fprintf(stderr, "repmgr: This node should be a standby\n");
		PQfinish(conn);
		return;
	}

	/* Check if there is a schema for this cluster */
	sprintf(sqlquery, "SELECT 1 FROM pg_namespace WHERE nspname = 'repmgr_%s'", myClusterName);
	res = PQexec(conn, sqlquery);
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		fprintf(stderr, "Can't get info about tablespaces: %s\n", PQerrorMessage(conn));
		PQclear(res);
		PQfinish(conn);
		return;
	}

	if (PQntuples(res) == 0)		/* schema doesn't exists */
	{
		fprintf(stderr, "Schema repmgr_%s doesn't exists.", myClusterName);
		PQclear(res);
		PQfinish(conn);
		return;
	}
	PQclear(res);

	/* check if there is a master in this cluster */
	master_conn = getMasterConnection(conn, myLocalId, myClusterName, &master_id);
	if (!master_conn)
		return;

	/* master should be v9 or better */
	pg_version(master_conn, master_version);
	if (strcmp(master_version, "") == 0)
	{
		PQfinish(conn);
		PQfinish(master_conn);
		fprintf(stderr, _("%s needs master to be PostgreSQL 9.0 or better\n"), progname);
		return;
	}

	/* master and standby version should match */
	if (strcmp(master_version, standby_version) != 0)
	{
		PQfinish(conn);
		PQfinish(master_conn);
		fprintf(stderr, _("%s needs versions of both master (%s) and standby (%s) to match.\n"),
		        progname, master_version, standby_version);
		return;
	}


	/* Now register the standby */
	if (force)
	{
		sprintf(sqlquery, "DELETE FROM repmgr_%s.repl_nodes "
		        " WHERE id = %d",
		        myClusterName, myLocalId);

		if (!PQexec(master_conn, sqlquery))
		{
			fprintf(stderr, "Cannot delete node details, %s\n",
			        PQerrorMessage(master_conn));
			PQfinish(master_conn);
			PQfinish(conn);
			return;
		}
	}

	sprintf(sqlquery, "INSERT INTO repmgr_%s.repl_nodes "
	        "VALUES (%d, '%s', '%s')",
	        myClusterName, myLocalId, myClusterName, conninfo);

	if (!PQexec(master_conn, sqlquery))
	{
		fprintf(stderr, "Cannot insert node details, %s\n",
		        PQerrorMessage(master_conn));
		PQfinish(master_conn);
		PQfinish(conn);
		return;
	}

	PQfinish(master_conn);
	PQfinish(conn);
	return;
}
Beispiel #6
0
static void
do_standby_follow(void)
{
	PGconn 		*conn;
	PGresult	*res;
	char 		sqlquery[QUERY_STR_LEN];
	char 		script[QUERY_STR_LEN];

	char    	myClusterName[MAXLEN];
	int     	myLocalId   = -1;
	char 		conninfo[MAXLEN];

	PGconn		*master_conn;
	int			master_id;

	int			r;
	char		data_dir[MAXLEN];

	char	master_version[MAXVERSIONSTR];
	char	standby_version[MAXVERSIONSTR];

	/*
	 * Read the configuration file: repmgr.conf
	 */
	parse_config(config_file, myClusterName, &myLocalId, conninfo);
	if (myLocalId == -1)
	{
		fprintf(stderr, "Node information is missing. "
		        "Check the configuration file.\n");
		exit(1);
	}

	/* We need to connect to check configuration */
	conn = establishDBConnection(conninfo, true);

	/* Check we are in a standby node */
	if (!is_standby(conn))
	{
		fprintf(stderr, "\n%s: The command should be executed in a standby node\n", progname);
		return;
	}

	/* should be v9 or better */
	pg_version(conn, standby_version);
	if (strcmp(standby_version, "") == 0)
	{
		PQfinish(conn);
		fprintf(stderr, _("\n%s needs standby to be PostgreSQL 9.0 or better\n"), progname);
		return;
	}

	/* we also need to check if there is any master in the cluster */
	master_conn = getMasterConnection(conn, myLocalId, myClusterName, &master_id);
	if (master_conn == NULL)
	{
		PQfinish(conn);
		fprintf(stderr, "There isn't a master to follow in this cluster");
		return;
	}

	/* Check we are going to point to a master */
	if (is_standby(master_conn))
	{
		PQfinish(conn);
		fprintf(stderr, "%s: The node to follow should be a master\n", progname);
		return;
	}

	/* should be v9 or better */
	pg_version(master_conn, master_version);
	if (strcmp(master_version, "") == 0)
	{
		PQfinish(conn);
		PQfinish(master_conn);
		fprintf(stderr, _("%s needs master to be PostgreSQL 9.0 or better\n"), progname);
		return;
	}

	/* master and standby version should match */
	if (strcmp(master_version, standby_version) != 0)
	{
		PQfinish(conn);
		PQfinish(master_conn);
		fprintf(stderr, _("%s needs versions of both master (%s) and standby (%s) to match.\n"),
		        progname, master_version, standby_version);
		return;
	}

	/*
	 * set the host and masterport variables with the master ones
	 * before closing the connection because we will need them to
	 * recreate the recovery.conf file
	 */
	host = malloc(20);
	masterport = malloc(10);
	strcpy(host, PQhost(master_conn));
	strcpy(masterport, PQport(master_conn));
	PQfinish(master_conn);

	if (verbose)
		printf(_("\n%s: Changing standby's master...\n"), progname);

	/* Get the data directory full path */
	sprintf(sqlquery, "SELECT setting "
	        " FROM pg_settings WHERE name = 'data_directory'");
	res = PQexec(conn, sqlquery);
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		fprintf(stderr, "Can't get info about data directory: %s\n", PQerrorMessage(conn));
		PQclear(res);
		PQfinish(conn);
		return;
	}
	strcpy(data_dir, PQgetvalue(res, 0, 0));
	PQclear(res);
	PQfinish(conn);

	/* write the recovery.conf file */
	if (!create_recovery_file(data_dir))
		return;

	/* Finally, restart the service */
	/* We assume the pg_ctl script is in the PATH */
	sprintf(script, "pg_ctl -D %s -m fast restart", data_dir);
	r = system(script);
	if (r != 0)
	{
		fprintf(stderr, "Can't restart service\n");
		return;
	}

	return;
}
Beispiel #7
0
/*
 * Decide which standby should become the new primary and act on it.
 *
 * Steps, as performed below:
 *   1. publish this node's last replayed xlog location through
 *      update_shared_memory();
 *   2. sleep SLEEP_MONITOR + 1 seconds so the other daemons can publish
 *      theirs;
 *   3. read the standby list from repl_nodes (ordered by priority), connect
 *      to each node and fetch repmgr_get_last_standby_location();
 *   4. give up when fewer than half of the nodes (standbys plus the master)
 *      are visible;
 *   5. pick the candidate with the highest xlog location and run the
 *      promote command when it is this node, the follow command otherwise;
 *   6. reconnect myLocalConn.
 *
 * At most as many nodes as fit in the local "nodes" array are considered.
 * Failures terminate the process with ERR_DB_QUERY, ERR_FAILOVER_FAIL or
 * ERR_BAD_CONFIG.
 */
static void
do_failover(void)
{
	PGresult *res1;
	PGresult *res2;
	char 	sqlquery[8192];

	int		total_nodes = 0;
	int		visible_nodes = 0;
	bool	find_best = false;

	int		i;
	int		r;

	int 	node;
	char	nodeConninfo[MAXLEN];

	unsigned int uxlogid;
	unsigned int uxrecoff;
	char last_wal_standby_applied[MAXLEN];

	PGconn	*nodeConn = NULL;

	/*
	 * will get info about until 50 nodes,
	 * which seems to be large enough for most scenarios
	 */
	nodeInfo nodes[50];
	nodeInfo best_candidate;

	const int	max_nodes = (int) (sizeof(nodes) / sizeof(nodes[0]));
	int		node_count;

	/* first we get info about this node, and update shared memory */
	snprintf(sqlquery, sizeof(sqlquery), "SELECT pg_last_xlog_replay_location()");
	res1 = PQexec(myLocalConn, sqlquery);
	if (PQresultStatus(res1) != PGRES_TUPLES_OK)
	{
		log_err(_("PQexec failed: %s.\nReport an invalid value to not be considered as new primary and exit.\n"), PQerrorMessage(myLocalConn));
		PQclear(res1);
		sprintf(last_wal_standby_applied, "'%X/%X'", 0, 0);
		update_shared_memory(last_wal_standby_applied);
		exit(ERR_DB_QUERY);
	}

	/* write last location in shared memory */
	update_shared_memory(PQgetvalue(res1, 0, 0));

	/* res1 is reused for the node list below; release this result first */
	PQclear(res1);

	/*
	 * we sleep the monitor time + one second
	 * we bet it should be enough for other repmgrd to update their own data
	 */
	sleep(SLEEP_MONITOR + 1);

	/* get a list of standby nodes, including myself */
	snprintf(sqlquery, sizeof(sqlquery), "SELECT id, conninfo "
	         "  FROM %s.repl_nodes "
	         " WHERE id IN (SELECT standby_node FROM %s.repl_status) "
	         "   AND cluster = '%s' "
	         " ORDER BY priority ",
	         repmgr_schema, repmgr_schema, local_options.cluster_name);

	res1 = PQexec(myLocalConn, sqlquery);
	if (PQresultStatus(res1) != PGRES_TUPLES_OK)
	{
		log_err(_("Can't get nodes info: %s\n"), PQerrorMessage(myLocalConn));
		PQclear(res1);
		PQfinish(myLocalConn);
		exit(ERR_DB_QUERY);
	}

	/* never index past the end of nodes[] */
	node_count = PQntuples(res1);
	if (node_count > max_nodes)
	{
		log_err(_("Found %d standby nodes, only the first %d will be considered\n"),
		        node_count, max_nodes);
		node_count = max_nodes;
	}

	/* ask for the locations */
	for (i = 0; i < node_count; i++)
	{
		node = atoi(PQgetvalue(res1, i, 0));
		/* Initialize on false so if we can't reach this node we know that later */
		nodes[i].is_ready = false;

		/* strncpy() does not terminate on truncation, so do it by hand */
		strncpy(nodeConninfo, PQgetvalue(res1, i, 1), MAXLEN - 1);
		nodeConninfo[MAXLEN - 1] = '\0';

		nodeConn = establishDBConnection(nodeConninfo, false);
		/* if we can't see the node just skip it */
		if (PQstatus(nodeConn) != CONNECTION_OK)
		{
			/* a failed connection attempt still owns a PGconn object */
			PQfinish(nodeConn);
			continue;
		}

		sqlquery_snprintf(sqlquery, "SELECT repmgr_get_last_standby_location()");
		res2 = PQexec(nodeConn, sqlquery);
		if (PQresultStatus(res2) != PGRES_TUPLES_OK)
		{
			log_info(_("Can't get node's last standby location: %s\n"), PQerrorMessage(nodeConn));
			log_info(_("Connection details: %s\n"), nodeConninfo);
			PQclear(res2);
			PQfinish(nodeConn);
			continue;
		}

		visible_nodes++;

		if (sscanf(PQgetvalue(res2, 0, 0), "%X/%X", &uxlogid, &uxrecoff) != 2)
		{
			log_info(_("could not parse transaction log location \"%s\"\n"), PQgetvalue(res2, 0, 0));

			/*
			 * Use the lowest location instead of uninitialised or stale
			 * values, matching the 0/0 reported above for an invalid one.
			 */
			uxlogid = 0;
			uxrecoff = 0;
		}

		nodes[i].nodeId = node;
		nodes[i].xlog_location.xlogid = uxlogid;
		nodes[i].xlog_location.xrecoff = uxrecoff;
		nodes[i].is_ready = true;

		PQclear(res2);
		PQfinish(nodeConn);
	}
	PQclear(res1);
	/* Close the connection to this server */
	PQfinish(myLocalConn);

	/*
	 * total nodes that are registered, include master which is a node but was
	 * not counted because it's not a standby
	 */
	total_nodes = node_count + 1;

	/*
	 * am i on the group that should keep alive?
	 * if i see less than half of total_nodes then i should do nothing
	 */
	if (visible_nodes < (total_nodes / 2.0))
	{
		log_err(_("Can't reach most of the nodes.\n"
		          "Let the other standby servers decide which one will be the primary.\n"
		          "Manual action will be needed to readd this node to the cluster.\n"));
		exit(ERR_FAILOVER_FAIL);
	}

	/*
	 * determine which one is the best candidate to promote to primary
	 */
	for (i = 0; i < total_nodes - 1; i++)
	{
		if (!nodes[i].is_ready)
			continue;
		else if (!find_best)
		{
			/* start with the first ready node, and then move on to the next one */
			best_candidate.nodeId                = nodes[i].nodeId;
			best_candidate.xlog_location.xlogid  = nodes[i].xlog_location.xlogid;
			best_candidate.xlog_location.xrecoff = nodes[i].xlog_location.xrecoff;
			best_candidate.is_ready              = nodes[i].is_ready;
			find_best = true;
		}

		/* we use the macros provided by xlogdefs.h to compare XLogPtr */
		/*
		 * Nodes are retrieved ordered by priority, so if the current
		 * best candidate is lower or equal to the next node's wal location
		 * then assign next node as the new best candidate.
		 */
		if (XLByteLE(best_candidate.xlog_location, nodes[i].xlog_location))
		{
			best_candidate.nodeId                = nodes[i].nodeId;
			best_candidate.xlog_location.xlogid  = nodes[i].xlog_location.xlogid;
			best_candidate.xlog_location.xrecoff = nodes[i].xlog_location.xrecoff;
			best_candidate.is_ready              = nodes[i].is_ready;
		}
	}

	/* once we know who is the best candidate, promote it */
	if (find_best && (best_candidate.nodeId == local_options.node))
	{
		if (verbose)
			log_info(_("%s: This node is the best candidate to be the new primary, promoting...\n"),
			         progname);
		log_debug(_("promote command is: \"%s\"\n"), local_options.promote_command);
		r = system(local_options.promote_command);
		if (r != 0)
		{
			log_err(_("%s: promote command failed. You could check and try it manually.\n"), progname);
			exit(ERR_BAD_CONFIG);
		}
	}
	else if (find_best)
	{
		if (verbose)
			log_info(_("%s: Node %d is the best candidate to be the new primary, we should follow it...\n"),
			         progname, best_candidate.nodeId);
		log_debug(_("follow command is: \"%s\"\n"), local_options.follow_command);
		/*
		 * New Primary need some time to be promoted.
		 * The follow command should take care of that.
		 */
		r = system(local_options.follow_command);
		if (r != 0)
		{
			log_err(_("%s: follow command failed. You could check and try it manually.\n"), progname);
			exit(ERR_BAD_CONFIG);
		}
	}
	else
	{
		log_err(_("%s: Did not find candidates. You should check and try manually.\n"), progname);
		exit(ERR_FAILOVER_FAIL);
	}

	/* and reconnect to the local database */
	myLocalConn = establishDBConnection(local_options.conninfo, true);
}
Beispiel #8
0
/*
 * repmgrd entry point.
 *
 * Parses the command line (-f/--config, -v/--verbose), reads repmgr.conf,
 * connects to the local node and classifies it as witness, standby or
 * primary.  Each mode then runs an endless monitoring loop that sleeps
 * SLEEP_MONITOR seconds per iteration and re-reads the configuration when
 * got_SIGHUP is set:
 *   - primary: CheckPrimaryConnection(); the process exits with status 1
 *     when that check fails;
 *   - witness / standby: WitnessMonitor() or StandbyMonitor() after a
 *     connection to the primary has been obtained.
 *
 * The cleanup code after the switch is only reached for an unrecognized
 * mode.  Configuration problems terminate the process with ERR_BAD_CONFIG.
 */
int
main(int argc, char **argv)
{
	static struct option long_options[] =
	{
		{"config", required_argument, NULL, 'f'},
		{"verbose", no_argument, NULL, 'v'},
		{NULL, 0, NULL, 0}
	};

	int			optindex;
	int			c;

	char standby_version[MAXVERSIONSTR];

	progname = get_progname(argv[0]);

	/* --help and --version are only recognised as the first argument */
	if (argc > 1)
	{
		if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
		{
			help(progname);
			exit(SUCCESS);
		}
		if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
		{
			printf("%s (PostgreSQL) " PG_VERSION "\n", progname);
			exit(SUCCESS);
		}
	}

	while ((c = getopt_long(argc, argv, "f:v", long_options, &optindex)) != -1)
	{
		switch (c)
		{
		case 'f':
			config_file = optarg;
			break;
		case 'v':
			verbose = true;
			break;
		default:
			usage();
			exit(ERR_BAD_CONFIG);
		}
	}

	setup_event_handlers();

	/*
	 * Read the configuration file: repmgr.conf
	 */
	parse_config(config_file, &local_options);
	if (local_options.node == -1)
	{
		log_err(_("Node information is missing. "
		          "Check the configuration file, or provide one if you have not done so.\n"));
		exit(ERR_BAD_CONFIG);
	}

	logger_init(progname, local_options.loglevel, local_options.logfacility);
	if (verbose)
		logger_min_verbose(LOG_INFO);

	snprintf(repmgr_schema, MAXLEN, "%s%s", DEFAULT_REPMGR_SCHEMA_PREFIX, local_options.cluster_name);

	log_info(_("%s Connecting to database '%s'\n"), progname, local_options.conninfo);
	myLocalConn = establishDBConnection(local_options.conninfo, true);

	/* should be v9 or better; an empty version string is treated as "too old" */
	log_info(_("%s Connected to database, checking its state\n"), progname);
	pg_version(myLocalConn, standby_version);
	if (strcmp(standby_version, "") == 0)
	{
		log_err(_("%s needs standby to be PostgreSQL 9.0 or better\n"), progname);
		PQfinish(myLocalConn);
		exit(ERR_BAD_CONFIG);
	}

	/*
	 * Set my server mode, establish a connection to primary
	 * and start monitor
	 */
	if (is_witness(myLocalConn, repmgr_schema, local_options.cluster_name, local_options.node))
		myLocalMode = WITNESS_MODE;
	else if (is_standby(myLocalConn))
		myLocalMode = STANDBY_MODE;
	else /* is the master */
		myLocalMode = PRIMARY_MODE;

	switch (myLocalMode)
	{
	case PRIMARY_MODE:
		primary_options.node = local_options.node;

		/* strncpy() does not terminate on truncation, so do it by hand */
		strncpy(primary_options.conninfo, local_options.conninfo, MAXLEN - 1);
		primary_options.conninfo[MAXLEN - 1] = '\0';

		/* the local node is the primary: both handles share one connection */
		primaryConn = myLocalConn;

		checkClusterConfiguration(myLocalConn, primaryConn);
		checkNodeConfiguration(local_options.conninfo);

		if (reload_configuration(config_file, &local_options))
		{
			PQfinish(myLocalConn);
			myLocalConn = establishDBConnection(local_options.conninfo, true);
			primaryConn = myLocalConn;
			update_registration();
		}

		log_info(_("%s Starting continuous primary connection check\n"), progname);
		/* Check that primary is still alive, and standbies are sending info */
		/*
		 * Every SLEEP_MONITOR seconds, do master checks
		 * XXX
		 * Check that standbies are sending info
		*/
		for (;;)
		{
			if (CheckPrimaryConnection())
			{
				/*
									CheckActiveStandbiesConnections();
									CheckInactiveStandbies();
				*/
				sleep(SLEEP_MONITOR);
			}
			else
			{
				/* XXX
				 * May we do something more verbose ?
				 */
				exit (1);
			}

			if (got_SIGHUP)
			{
				/* if we can reload, then could need to change myLocalConn */
				if (reload_configuration(config_file, &local_options))
				{
					PQfinish(myLocalConn);
					myLocalConn = establishDBConnection(local_options.conninfo, true);
					primaryConn = myLocalConn;
					update_registration();
				}
				got_SIGHUP = false;
			}
		}
		break;
	case WITNESS_MODE:
	case STANDBY_MODE:
		/* I need the id of the primary as well as a connection to it */
		log_info(_("%s Connecting to primary for cluster '%s'\n"),
		         progname, local_options.cluster_name);
		primaryConn = getMasterConnection(myLocalConn, repmgr_schema, local_options.node,
		                                  local_options.cluster_name,
		                                  &primary_options.node, NULL);
		if (primaryConn == NULL)
		{
			CloseConnections();
			exit(ERR_BAD_CONFIG);
		}

		checkClusterConfiguration(myLocalConn, primaryConn);
		checkNodeConfiguration(local_options.conninfo);

		if (reload_configuration(config_file, &local_options))
		{
			PQfinish(myLocalConn);
			myLocalConn = establishDBConnection(local_options.conninfo, true);
			update_registration();
		}

		/*
		 * Every SLEEP_MONITOR seconds, do checks
		 */
		if (myLocalMode == WITNESS_MODE)
		{
			log_info(_("%s Starting continuous witness node monitoring\n"), progname);
		}
		else if (myLocalMode == STANDBY_MODE)
		{
			log_info(_("%s Starting continuous standby node monitoring\n"), progname);
		}

		for (;;)
		{
			if (myLocalMode == WITNESS_MODE)
				WitnessMonitor();
			else if (myLocalMode == STANDBY_MODE)
				StandbyMonitor();
			sleep(SLEEP_MONITOR);

			if (got_SIGHUP)
			{
				/* if we can reload, then could need to change myLocalConn */
				if (reload_configuration(config_file, &local_options))
				{
					PQfinish(myLocalConn);
					myLocalConn = establishDBConnection(local_options.conninfo, true);
					update_registration();
				}
				got_SIGHUP = false;
			}
		}
		break;
	default:
		log_err(_("%s: Unrecognized mode for node %d\n"), progname, local_options.node);
	}

	/* Prevent a double-free */
	if (primaryConn == myLocalConn)
		myLocalConn = NULL;

	/* close the connection to the database and cleanup */
	CloseConnections();

	/* Shuts down logging system */
	logger_shutdown();

	return 0;
}
Beispiel #9
0
/*
 * reload_configuration()
 *
 * Re-read config_file into a scratch options struct, validate it, and only
 * if every check passes copy the new values over *orig_options.
 *
 * The cluster name, node number and node name are not allowed to change on
 * reload; failover mode, master_response_timeout, reconnect_attempts and
 * reconnect_intvl are range-checked; finally the new conninfo string is
 * verified by opening (and immediately closing) a test connection.
 *
 * Returns true when *orig_options was updated, false when the current
 * configuration was kept (a warning is logged saying why).
 */
bool
reload_configuration(char *config_file, t_configuration_options *orig_options)
{
	PGconn	*conn;

	t_configuration_options new_options;

	/*
	 * Re-read the configuration file: repmgr.conf
	 */
	log_info(_("Reloading configuration file and updating repmgr tables\n"));
	parse_config(config_file, &new_options);
	if (new_options.node == -1)
	{
		log_warning(_("\nCannot load new configuration, will keep current one.\n"));
		return false;
	}

	if (strcmp(new_options.cluster_name, orig_options->cluster_name) != 0)
	{
		log_warning(_("\nCannot change cluster name, will keep current configuration.\n"));
		return false;
	}

	if (new_options.node != orig_options->node)
	{
		log_warning(_("\nCannot change node number, will keep current configuration.\n"));
		return false;
	}

	/*
	 * node_name is a string buffer, so it must be compared by content.
	 * Comparing the two buffers with != compares their addresses, which
	 * always differ and would reject every reload.
	 */
	if (strcmp(new_options.node_name, orig_options->node_name) != 0)
	{
		log_warning(_("\nCannot change standby name, will keep current configuration.\n"));
		return false;
	}

	if (new_options.failover != MANUAL_FAILOVER && new_options.failover != AUTOMATIC_FAILOVER)
	{
		log_warning(_("\nNew value for failover is not valid. Should be MANUAL or AUTOMATIC.\n"));
		return false;
	}

	if (new_options.master_response_timeout <= 0)
	{
		log_warning(_("\nNew value for master_response_timeout is not valid. Should be greater than zero.\n"));
		return false;
	}

	if (new_options.reconnect_attempts < 0)
	{
		log_warning(_("\nNew value for reconnect_attempts is not valid. Should be greater or equal than zero.\n"));
		return false;
	}

	if (new_options.reconnect_intvl < 0)
	{
		log_warning(_("\nNew value for reconnect_interval is not valid. Should be greater or equal than zero.\n"));
		return false;
	}

	/* Test conninfo string */
	conn = establishDBConnection(new_options.conninfo, false);
	if (!conn || (PQstatus(conn) != CONNECTION_OK))
	{
		log_warning(_("\nconninfo string is not valid, will keep current configuration.\n"));

		/*
		 * A failed connection attempt still owns a PGconn object that has
		 * to be released, otherwise every bad reload leaks one.
		 */
		if (conn != NULL)
			PQfinish(conn);
		return false;
	}
	PQfinish(conn);

	/* Configuration seems ok, will load new values */
	strcpy(orig_options->cluster_name, new_options.cluster_name);
	orig_options->node = new_options.node;
	strcpy(orig_options->conninfo, new_options.conninfo);
	orig_options->failover = new_options.failover;
	orig_options->priority = new_options.priority;
	strcpy(orig_options->node_name, new_options.node_name);
	strcpy(orig_options->promote_command, new_options.promote_command);
	strcpy(orig_options->follow_command, new_options.follow_command);
	strcpy(orig_options->rsync_options, new_options.rsync_options);
	strcpy(orig_options->ssh_options, new_options.ssh_options);
	orig_options->master_response_timeout = new_options.master_response_timeout;
	orig_options->reconnect_attempts = new_options.reconnect_attempts;
	orig_options->reconnect_intvl = new_options.reconnect_intvl;
	/*
	 * XXX These ones can change with a simple SIGHUP?

		strcpy (orig_options->loglevel, new_options.loglevel);
		strcpy (orig_options->logfacility, new_options.logfacility);

		logger_shutdown();
		XXX do we have progname here ?
		logger_init(progname, orig_options.loglevel, orig_options.logfacility);
	*/

	return true;
}
Beispiel #10
0
/*
 * Program entry point.
 *
 * Handles --help/-? and --version/-V when given as the first argument, then
 * parses -f/--config <file> and -v/--verbose.  When no configuration file is
 * given, "./" CONFIG_FILE is used.  After reading the configuration it
 * connects to the local node, checks the server version, decides whether the
 * node is a standby or the primary, obtains a connection to the primary,
 * runs the cluster/node configuration checks and, on a standby, enters
 * MonitorCheck().
 *
 * Returns 0 on normal completion; exits with status 1 on bad options,
 * missing node information, an unsupported server version, out of memory,
 * or when no primary connection can be obtained.
 */
int
main(int argc, char **argv)
{
	static struct option long_options[] =
	{
		{"config", required_argument, NULL, 'f'},
		{"verbose", no_argument, NULL, 'v'},
		{NULL, 0, NULL, 0}
	};

	int			optindex;
	int			c;

	char conninfo[MAXLEN];
	char standby_version[MAXVERSIONSTR];

	progname = get_progname(argv[0]);

	if (argc > 1)
	{
		if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
		{
			help(progname);
			exit(0);
		}
		if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
		{
			printf("%s (PostgreSQL) " PG_VERSION "\n", progname);
			exit(0);
		}
	}


	while ((c = getopt_long(argc, argv, "f:v", long_options, &optindex)) != -1)
	{
		switch (c)
		{
		case 'f':
			config_file = optarg;
			break;
		case 'v':
			verbose = true;
			break;
		default:
			fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
			exit(1);
		}
	}

	setup_cancel_handler();

	if (config_file == NULL)
	{
		/* room for "./", the file name and its terminator, plus slack */
		size_t	path_len = 5 + sizeof(CONFIG_FILE);

		config_file = malloc(path_len);
		if (config_file == NULL)
		{
			/* writing through a NULL pointer below would be undefined */
			fprintf(stderr, _("%s: out of memory\n"), progname);
			exit(1);
		}
		snprintf(config_file, path_len, "./%s", CONFIG_FILE);
	}

	/*
	 * Read the configuration file: repmgr.conf
	 */
	parse_config(config_file, myClusterName, &myLocalId, conninfo);
	if (myLocalId == -1)
	{
		fprintf(stderr, "Node information is missing. "
		        "Check the configuration file.\n");
		exit(1);
	}

	myLocalConn = establishDBConnection(conninfo, true);

	/* should be v9 or better */
	pg_version(myLocalConn, standby_version);
	if (strcmp(standby_version, "") == 0)
	{
		PQfinish(myLocalConn);
		fprintf(stderr, _("%s needs standby to be PostgreSQL 9.0 or better\n"), progname);
		exit(1);
	}

	/*
	 * Set my server mode, establish a connection to primary
	 * and start monitor
	 */
	myLocalMode = is_standby(myLocalConn) ? STANDBY_MODE : PRIMARY_MODE;
	if (myLocalMode == PRIMARY_MODE)
	{
		/* on the primary the local connection doubles as the primary one */
		primaryId = myLocalId;
		strcpy(primaryConninfo, conninfo);
		primaryConn = myLocalConn;
	}
	else
	{
		/* I need the id of the primary as well as a connection to it */
		primaryConn = getMasterConnection(myLocalConn, myLocalId, myClusterName, &primaryId);
		if (primaryConn == NULL)
			exit(1);
	}

	checkClusterConfiguration();
	checkNodeConfiguration(conninfo);
	if (myLocalMode == STANDBY_MODE)
	{
		MonitorCheck();
	}

	/*
	 * Prevent a double-free: in primary mode both globals refer to the same
	 * connection.  NOTE(review): assumes CloseConnections() finishes both
	 * primaryConn and myLocalConn -- confirm against its definition.
	 */
	if (primaryConn == myLocalConn)
		myLocalConn = NULL;

	/* close the connection to the database and cleanup */
	CloseConnections();

	return 0;
}