예제 #1
0
파일: repmgrd.c 프로젝트: klando/repmgr
/*
 * Insert monitor info, this is basically the time and xlog replayed,
 * applied on standby and current xlog location in primary.
 * Also do the math to see how far are we in bytes for being uptodate
 */
static void
StandbyMonitor(void)
{
	PGresult *res;
	char monitor_standby_timestamp[MAXLEN];
	char last_wal_primary_location[MAXLEN];
	char last_wal_standby_received[MAXLEN];
	char last_wal_standby_applied[MAXLEN];

	unsigned long long int lsn_primary;
	unsigned long long int lsn_standby_received;
	unsigned long long int lsn_standby_applied;

	int	connection_retries;

	/*
	 * Check if the master is still available, if after 5 minutes of retries
	 * we cannot reconnect, try to get a new master.
	 */
	CheckPrimaryConnection(); // this take up to NUM_RETRY * SLEEP_RETRY seconds

	if (PQstatus(primaryConn) != CONNECTION_OK)
	{
		if (local_options.failover == MANUAL_FAILOVER)
		{
			log_err(_("We couldn't reconnect to master. Now checking if another node has been promoted.\n"));
			for (connection_retries = 0; connection_retries < 6; connection_retries++)
			{
				primaryConn = getMasterConnection(myLocalConn, repmgr_schema, local_options.node,
				                                  local_options.cluster_name, &primary_options.node, NULL);
				if (PQstatus(primaryConn) == CONNECTION_OK)
				{
					/* Connected, we can continue the process so break the loop */
					log_err(_("Connected to node %d, continue monitoring.\n"), primary_options.node);
					break;
				}
				else
				{
					log_err(_("We haven't found a new master, waiting before retry...\n"));
					/* wait 5 minutes before retries, after 6 failures (30 minutes) we stop trying */
					sleep(300);
				}
			}

			if (PQstatus(primaryConn) != CONNECTION_OK)
			{
				log_err(_("We couldn't reconnect for long enough, exiting...\n"));
				exit(ERR_DB_CON);
			}
		}
		else if (local_options.failover == AUTOMATIC_FAILOVER)
		{
			/*
			 * When we returns from this function we will have a new primary and
			 * a new primaryConn
			 */
			do_failover();
		}
	}

	/* Check if we still are a standby, we could have been promoted */
	if (!is_standby(myLocalConn))
	{
		log_err(_("It seems like we have been promoted, so exit from monitoring...\n"));
		CloseConnections();
		exit(ERR_PROMOTED);
	}

	/*
	 * first check if there is a command being executed,
	 * and if that is the case, cancel the query so i can
	 * insert the current record
	 */
	if (PQisBusy(primaryConn) == 1)
		CancelQuery();

	/* Get local xlog info */
	sqlquery_snprintf(
	    sqlquery,
	    "SELECT CURRENT_TIMESTAMP, pg_last_xlog_receive_location(), "
	    "pg_last_xlog_replay_location()");

	res = PQexec(myLocalConn, sqlquery);
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		log_err(_("PQexec failed: %s\n"), PQerrorMessage(myLocalConn));
		PQclear(res);
		/* if there is any error just let it be and retry in next loop */
		return;
	}

	strncpy(monitor_standby_timestamp, PQgetvalue(res, 0, 0), MAXLEN);
	strncpy(last_wal_standby_received , PQgetvalue(res, 0, 1), MAXLEN);
	strncpy(last_wal_standby_applied , PQgetvalue(res, 0, 2), MAXLEN);
	PQclear(res);

	/* Get primary xlog info */
	sqlquery_snprintf(sqlquery, "SELECT pg_current_xlog_location() ");

	res = PQexec(primaryConn, sqlquery);
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		log_err(_("PQexec failed: %s\n"), PQerrorMessage(primaryConn));
		PQclear(res);
		return;
	}

	strncpy(last_wal_primary_location, PQgetvalue(res, 0, 0), MAXLEN);
	PQclear(res);

	/* Calculate the lag */
	lsn_primary = walLocationToBytes(last_wal_primary_location);
	lsn_standby_received = walLocationToBytes(last_wal_standby_received);
	lsn_standby_applied = walLocationToBytes(last_wal_standby_applied);

	/*
	 * Build the SQL to execute on primary
	 */
	sqlquery_snprintf(sqlquery,
	                  "INSERT INTO %s.repl_monitor "
	                  "VALUES(%d, %d, '%s'::timestamp with time zone, "
	                  " '%s', '%s', "
	                  " %lld, %lld)", repmgr_schema,
	                  primary_options.node, local_options.node, monitor_standby_timestamp,
	                  last_wal_primary_location,
	                  last_wal_standby_received,
	                  (lsn_primary - lsn_standby_received),
	                  (lsn_standby_received - lsn_standby_applied));

	/*
	 * Execute the query asynchronously, but don't check for a result. We
	 * will check the result next time we pause for a monitor step.
	 */
	if (PQsendQuery(primaryConn, sqlquery) == 0)
		log_warning(_("Query could not be sent to primary. %s\n"),
		            PQerrorMessage(primaryConn));
}
예제 #2
0
/*
 * Insert monitor info, this is basically the time and xlog replayed,
 * applied on standby and current xlog location in primary.
 * Also do the math to see how far are we in bytes for being uptodate
 */
static void
MonitorExecute(void)
{
	PGresult *res;
	char monitor_standby_timestamp[MAXLEN];
	char last_wal_primary_location[MAXLEN];
	char last_wal_standby_received[MAXLEN];
	char last_wal_standby_applied[MAXLEN];

	unsigned long long int lsn_primary;
	unsigned long long int lsn_standby_received;
	unsigned long long int lsn_standby_applied;

	int	connection_retries;

	/*
	 * Check if the master is still available, if after 5 minutes of retries
	 * we cannot reconnect, try to get a new master.
	 */
	for (connection_retries = 0; connection_retries < 15; connection_retries++)
	{
		if (PQstatus(primaryConn) != CONNECTION_OK)
		{
			log_warning(_("Connection to master has been lost, trying to recover...\n"));
			/* wait 20 seconds between retries */
			sleep(20);

			PQreset(primaryConn);
		}
		else
		{
			if (connection_retries > 0)
			{
				log_notice(_("Connection to master has been restored, continue monitoring.\n"));
			}
			break;
		}
	}
	if (PQstatus(primaryConn) != CONNECTION_OK)
	{
		log_err(_("We couldn't reconnect to master. Now checking if another node has been promoted.\n"));
		for (connection_retries = 0; connection_retries < 6; connection_retries++)
		{
			primaryConn = getMasterConnection(myLocalConn, local_options.node,
			                                  local_options.cluster_name, &primary_options.node,NULL);
			if (PQstatus(primaryConn) == CONNECTION_OK)
			{
				/* Connected, we can continue the process so break the loop */
				log_err(_("Connected to node %d, continue monitoring.\n"), primary_options.node);
				break;
			}
			else
			{
				log_err(_("We haven't found a new master, waiting before retry...\n"));
				/* wait 5 minutes before retries, after 6 failures (30 minutes) we stop trying */
				sleep(300);
			}
		}
	}
	if (PQstatus(primaryConn) != CONNECTION_OK)
	{
		log_err(_("We couldn't reconnect for long enough, exiting...\n"));
		exit(ERR_DB_CON);
	}

	/* Check if we still are a standby, we could have been promoted */
	if (!is_standby(myLocalConn))
	{
		log_err(_("It seems like we have been promoted, so exit from monitoring...\n"));
		CloseConnections();
		exit(ERR_PROMOTED);
	}

	/*
	 * first check if there is a command being executed,
	 * and if that is the case, cancel the query so i can
	 * insert the current record
	 */
	if (PQisBusy(primaryConn) == 1)
		CancelQuery();

	/* Get local xlog info */
	sqlquery_snprintf(
	    sqlquery,
	    "SELECT CURRENT_TIMESTAMP, pg_last_xlog_receive_location(), "
	    "pg_last_xlog_replay_location()");

	res = PQexec(myLocalConn, sqlquery);
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		log_err("PQexec failed: %s\n", PQerrorMessage(myLocalConn));
		PQclear(res);
		/* if there is any error just let it be and retry in next loop */
		return;
	}

	strncpy(monitor_standby_timestamp, PQgetvalue(res, 0, 0), MAXLEN);
	strncpy(last_wal_standby_received , PQgetvalue(res, 0, 1), MAXLEN);
	strncpy(last_wal_standby_applied , PQgetvalue(res, 0, 2), MAXLEN);
	PQclear(res);

	/* Get primary xlog info */
	sqlquery_snprintf(sqlquery, "SELECT pg_current_xlog_location() ");

	res = PQexec(primaryConn, sqlquery);
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		log_err("PQexec failed: %s\n", PQerrorMessage(primaryConn));
		PQclear(res);
		return;
	}

	strncpy(last_wal_primary_location, PQgetvalue(res, 0, 0), MAXLEN);
	PQclear(res);

	/* Calculate the lag */
	lsn_primary = walLocationToBytes(last_wal_primary_location);
	lsn_standby_received = walLocationToBytes(last_wal_standby_received);
	lsn_standby_applied = walLocationToBytes(last_wal_standby_applied);

	if (only_one_entry && only_one_entry_desired)
	{
		sqlquery_snprintf(sqlquery,
		                  "UPDATE %s.repl_monitor "
		                  "VALUES(%d, %d, '%s'::timestamp with time zone, "
		                  " '%s', '%s', "
		                  " %lld, %lld)"
		                  "WHERE primary_node=%d AND secondary_node=%d", repmgr_schema,
		                  primary_options.node, local_options.node, monitor_standby_timestamp,
		                  last_wal_primary_location,
		                  last_wal_standby_received,
		                  (lsn_primary - lsn_standby_received),
		                  (lsn_standby_received - lsn_standby_applied));
		res = PQexec(primaryConn, sqlquery);
		if (PQresultStatus(res) != PGRES_TUPLES_OK)
		{
			log_err("PQexec failed: %s\n", PQerrorMessage(conn));
			PQclear(res);
			CloseConnections();
			exit(ERR_DB_QUERY);
		}
		if (PQntuples(res) != 1)
		{
			only_one_entry = false;
		}
		PQclear(res);
	}
	else
	{
		/*
		 * Build and send insert
		 */
		sqlquery_snprintf(sqlquery,
		                  "INSERT INTO %s.repl_monitor "
		                  "VALUES(%d, %d, '%s'::timestamp with time zone, "
		                  " '%s', '%s', "
		                  " %lld, %lld)", repmgr_schema,
		                  primary_options.node, local_options.node, monitor_standby_timestamp,
		                  last_wal_primary_location,
		                  last_wal_standby_received,
		                  (lsn_primary - lsn_standby_received),
		                  (lsn_standby_received - lsn_standby_applied));
		res = PQexec(primaryConn, sqlquery);
		if (PQresultStatus(res) != PGRES_TUPLES_OK)
		{
			log_err("PQexec failed: %s\n", PQerrorMessage(conn));
			PQclear(res);
			CloseConnections();
			exit(ERR_DB_QUERY);
		}
		PQclear(res);

		if (only_one_entry_desired)
		{
			/*
			 * Build the SQL to execute on primary
			 */
			sqlquery_snprintf(sqlquery,
			                  "DELETE FROM %s.repl_monitor "
			                  "WHERE primary_node=%d AND standby_node=%d AND last_monitor_time < '%s'::timestamp with time zone",
			                  repmgr_schema, primary_options.node, local_options.node, monitor_standby_timestamp);
			res = PQexec(primaryConn, sqlquery);
			if (PQresultStatus(res) != PGRES_TUPLES_OK)
			{
				log_err("PQexec failed: %s\n", PQerrorMessage(conn));
				PQclear(res);
				CloseConnections();
				exit(ERR_DB_QUERY);
			}
			PQclear(res);
			only_one_entry = true;
		}
	}
}
예제 #3
0
파일: repmgrd.c 프로젝트: gbartolini/repmgr
/*
 * Insert monitor info, this is basically the time and xlog replayed,
 * applied on standby and current xlog location in primary.
 * Also do the math to see how far are we in bytes for being uptodate
 */
static void
MonitorExecute(void)
{
	PGresult *res;
	char monitor_standby_timestamp[MAXLEN];
	char last_wal_primary_location[MAXLEN];
	char last_wal_standby_received[MAXLEN];
	char last_wal_standby_applied[MAXLEN];

	unsigned long long int lsn_primary;
	unsigned long long int lsn_standby_received;
	unsigned long long int lsn_standby_applied;

	int	connection_retries;

	/*
	 * Check if the master is still available, if after 5 minutes of retries
	 * we cannot reconnect, try to get a new master.
	 */
	for (connection_retries = 0; connection_retries < 15; connection_retries++)
	{
		if (PQstatus(primaryConn) != CONNECTION_OK)
		{
			fprintf(stderr, "\n%s: Connection to master has been lost, trying to recover...\n", progname);
			/* wait 20 seconds between retries */
			sleep(20);

			PQreset(primaryConn);
		}
		else
		{
			fprintf(stderr, "\n%s: Connection to master has been restored, continue monitoring.\n", progname);
			break;
		}
	}
	if (PQstatus(primaryConn) != CONNECTION_OK)
	{
		fprintf(stderr, "\n%s: We couldn't reconnect to master, checking if ", progname);
		fprintf(stderr, "%s: another node has been promoted.\n", progname);
		for (connection_retries = 0; connection_retries < 6; connection_retries++)
		{
			primaryConn = getMasterConnection(myLocalConn, myLocalId, myClusterName, &primaryId);
			if (PQstatus(primaryConn) == CONNECTION_OK)
			{
				/* Connected, we can continue the process so break the loop */
				fprintf(stderr, "\n%s: Connected to node %d, continue monitoring.\n", progname, primaryId);
				break;
			}
			else
			{
				fprintf(stderr, "\n%s: We haven't found a new master, waiting before retry...\n", progname);
				/* wait 5 minutes before retries, after 6 failures (30 minutes) we stop trying */
				sleep(300);
			}
		}
	}
	if (PQstatus(primaryConn) != CONNECTION_OK)
	{
		fprintf(stderr, "\n%s: We couldn't reconnect for long enough, exiting...\n", progname);
		exit(1);
	}

	/* Check if we still are a standby, we could have been promoted */
	if (!is_standby(myLocalConn))
	{
		fprintf(stderr, "\n%s: seems like we have been promoted, so exit from monitoring...\n",
		        progname);
		CloseConnections();
		exit(1);
	}

	/*
	 * first check if there is a command being executed,
	 * and if that is the case, cancel the query so i can
	 * insert the current record
	 */
	if (PQisBusy(primaryConn) == 1)
		CancelQuery();

	/* Get local xlog info */
	sprintf(sqlquery,
	        "SELECT CURRENT_TIMESTAMP, pg_last_xlog_receive_location(), "
	        "pg_last_xlog_replay_location()");

	res = PQexec(myLocalConn, sqlquery);
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		fprintf(stderr, "PQexec failed: %s\n", PQerrorMessage(myLocalConn));
		PQclear(res);
		/* if there is any error just let it be and retry in next loop */
		return;
	}

	strcpy(monitor_standby_timestamp, PQgetvalue(res, 0, 0));
	strcpy(last_wal_standby_received , PQgetvalue(res, 0, 1));
	strcpy(last_wal_standby_applied , PQgetvalue(res, 0, 2));
	PQclear(res);

	/* Get primary xlog info */
	sprintf(sqlquery, "SELECT pg_current_xlog_location() ");

	res = PQexec(primaryConn, sqlquery);
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		fprintf(stderr, "PQexec failed: %s\n", PQerrorMessage(primaryConn));
		PQclear(res);
		return;
	}

	strcpy(last_wal_primary_location, PQgetvalue(res, 0, 0));
	PQclear(res);

	/* Calculate the lag */
	lsn_primary = walLocationToBytes(last_wal_primary_location);
	lsn_standby_received = walLocationToBytes(last_wal_standby_received);
	lsn_standby_applied = walLocationToBytes(last_wal_standby_applied);

	/*
	 * Build the SQL to execute on primary
	 */
	sprintf(sqlquery,
	        "INSERT INTO repmgr_%s.repl_monitor "
	        "VALUES(%d, %d, '%s'::timestamp with time zone, "
	        " '%s', '%s', "
	        " %lld, %lld)", myClusterName,
	        primaryId, myLocalId, monitor_standby_timestamp,
	        last_wal_primary_location,
	        last_wal_standby_received,
	        (lsn_primary - lsn_standby_received),
	        (lsn_standby_received - lsn_standby_applied));

	/*
	 * Execute the query asynchronously, but don't check for a result. We
	 * will check the result next time we pause for a monitor step.
	 */
	if (PQsendQuery(primaryConn, sqlquery) == 0)
		fprintf(stderr, "Query could not be sent to primary. %s\n",
		        PQerrorMessage(primaryConn));
}