/* * Insert monitor info, this is basically the time and xlog replayed, * applied on standby and current xlog location in primary. * Also do the math to see how far are we in bytes for being uptodate */ static void StandbyMonitor(void) { PGresult *res; char monitor_standby_timestamp[MAXLEN]; char last_wal_primary_location[MAXLEN]; char last_wal_standby_received[MAXLEN]; char last_wal_standby_applied[MAXLEN]; unsigned long long int lsn_primary; unsigned long long int lsn_standby_received; unsigned long long int lsn_standby_applied; int connection_retries; /* * Check if the master is still available, if after 5 minutes of retries * we cannot reconnect, try to get a new master. */ CheckPrimaryConnection(); // this take up to NUM_RETRY * SLEEP_RETRY seconds if (PQstatus(primaryConn) != CONNECTION_OK) { if (local_options.failover == MANUAL_FAILOVER) { log_err(_("We couldn't reconnect to master. Now checking if another node has been promoted.\n")); for (connection_retries = 0; connection_retries < 6; connection_retries++) { primaryConn = getMasterConnection(myLocalConn, repmgr_schema, local_options.node, local_options.cluster_name, &primary_options.node, NULL); if (PQstatus(primaryConn) == CONNECTION_OK) { /* Connected, we can continue the process so break the loop */ log_err(_("Connected to node %d, continue monitoring.\n"), primary_options.node); break; } else { log_err(_("We haven't found a new master, waiting before retry...\n")); /* wait 5 minutes before retries, after 6 failures (30 minutes) we stop trying */ sleep(300); } } if (PQstatus(primaryConn) != CONNECTION_OK) { log_err(_("We couldn't reconnect for long enough, exiting...\n")); exit(ERR_DB_CON); } } else if (local_options.failover == AUTOMATIC_FAILOVER) { /* * When we returns from this function we will have a new primary and * a new primaryConn */ do_failover(); } } /* Check if we still are a standby, we could have been promoted */ if (!is_standby(myLocalConn)) { log_err(_("It seems like we have been promoted, so exit from monitoring...\n")); CloseConnections(); exit(ERR_PROMOTED); } /* * first check if there is a command being executed, * and if that is the case, cancel the query so i can * insert the current record */ if (PQisBusy(primaryConn) == 1) CancelQuery(); /* Get local xlog info */ sqlquery_snprintf( sqlquery, "SELECT CURRENT_TIMESTAMP, pg_last_xlog_receive_location(), " "pg_last_xlog_replay_location()"); res = PQexec(myLocalConn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_err(_("PQexec failed: %s\n"), PQerrorMessage(myLocalConn)); PQclear(res); /* if there is any error just let it be and retry in next loop */ return; } strncpy(monitor_standby_timestamp, PQgetvalue(res, 0, 0), MAXLEN); strncpy(last_wal_standby_received , PQgetvalue(res, 0, 1), MAXLEN); strncpy(last_wal_standby_applied , PQgetvalue(res, 0, 2), MAXLEN); PQclear(res); /* Get primary xlog info */ sqlquery_snprintf(sqlquery, "SELECT pg_current_xlog_location() "); res = PQexec(primaryConn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_err(_("PQexec failed: %s\n"), PQerrorMessage(primaryConn)); PQclear(res); return; } strncpy(last_wal_primary_location, PQgetvalue(res, 0, 0), MAXLEN); PQclear(res); /* Calculate the lag */ lsn_primary = walLocationToBytes(last_wal_primary_location); lsn_standby_received = walLocationToBytes(last_wal_standby_received); lsn_standby_applied = walLocationToBytes(last_wal_standby_applied); /* * Build the SQL to execute on primary */ sqlquery_snprintf(sqlquery, "INSERT INTO %s.repl_monitor " "VALUES(%d, %d, '%s'::timestamp with time zone, " " '%s', '%s', " " %lld, %lld)", repmgr_schema, primary_options.node, local_options.node, monitor_standby_timestamp, last_wal_primary_location, last_wal_standby_received, (lsn_primary - lsn_standby_received), (lsn_standby_received - lsn_standby_applied)); /* * Execute the query asynchronously, but don't check for a result. We * will check the result next time we pause for a monitor step. */ if (PQsendQuery(primaryConn, sqlquery) == 0) log_warning(_("Query could not be sent to primary. %s\n"), PQerrorMessage(primaryConn)); }
/* * Insert monitor info, this is basically the time and xlog replayed, * applied on standby and current xlog location in primary. * Also do the math to see how far are we in bytes for being uptodate */ static void MonitorExecute(void) { PGresult *res; char monitor_standby_timestamp[MAXLEN]; char last_wal_primary_location[MAXLEN]; char last_wal_standby_received[MAXLEN]; char last_wal_standby_applied[MAXLEN]; unsigned long long int lsn_primary; unsigned long long int lsn_standby_received; unsigned long long int lsn_standby_applied; int connection_retries; /* * Check if the master is still available, if after 5 minutes of retries * we cannot reconnect, try to get a new master. */ for (connection_retries = 0; connection_retries < 15; connection_retries++) { if (PQstatus(primaryConn) != CONNECTION_OK) { log_warning(_("Connection to master has been lost, trying to recover...\n")); /* wait 20 seconds between retries */ sleep(20); PQreset(primaryConn); } else { if (connection_retries > 0) { log_notice(_("Connection to master has been restored, continue monitoring.\n")); } break; } } if (PQstatus(primaryConn) != CONNECTION_OK) { log_err(_("We couldn't reconnect to master. Now checking if another node has been promoted.\n")); for (connection_retries = 0; connection_retries < 6; connection_retries++) { primaryConn = getMasterConnection(myLocalConn, local_options.node, local_options.cluster_name, &primary_options.node,NULL); if (PQstatus(primaryConn) == CONNECTION_OK) { /* Connected, we can continue the process so break the loop */ log_err(_("Connected to node %d, continue monitoring.\n"), primary_options.node); break; } else { log_err(_("We haven't found a new master, waiting before retry...\n")); /* wait 5 minutes before retries, after 6 failures (30 minutes) we stop trying */ sleep(300); } } } if (PQstatus(primaryConn) != CONNECTION_OK) { log_err(_("We couldn't reconnect for long enough, exiting...\n")); exit(ERR_DB_CON); } /* Check if we still are a standby, we could have been promoted */ if (!is_standby(myLocalConn)) { log_err(_("It seems like we have been promoted, so exit from monitoring...\n")); CloseConnections(); exit(ERR_PROMOTED); } /* * first check if there is a command being executed, * and if that is the case, cancel the query so i can * insert the current record */ if (PQisBusy(primaryConn) == 1) CancelQuery(); /* Get local xlog info */ sqlquery_snprintf( sqlquery, "SELECT CURRENT_TIMESTAMP, pg_last_xlog_receive_location(), " "pg_last_xlog_replay_location()"); res = PQexec(myLocalConn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_err("PQexec failed: %s\n", PQerrorMessage(myLocalConn)); PQclear(res); /* if there is any error just let it be and retry in next loop */ return; } strncpy(monitor_standby_timestamp, PQgetvalue(res, 0, 0), MAXLEN); strncpy(last_wal_standby_received , PQgetvalue(res, 0, 1), MAXLEN); strncpy(last_wal_standby_applied , PQgetvalue(res, 0, 2), MAXLEN); PQclear(res); /* Get primary xlog info */ sqlquery_snprintf(sqlquery, "SELECT pg_current_xlog_location() "); res = PQexec(primaryConn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_err("PQexec failed: %s\n", PQerrorMessage(primaryConn)); PQclear(res); return; } strncpy(last_wal_primary_location, PQgetvalue(res, 0, 0), MAXLEN); PQclear(res); /* Calculate the lag */ lsn_primary = walLocationToBytes(last_wal_primary_location); lsn_standby_received = walLocationToBytes(last_wal_standby_received); lsn_standby_applied = walLocationToBytes(last_wal_standby_applied); if (only_one_entry && only_one_entry_desired) { sqlquery_snprintf(sqlquery, "UPDATE %s.repl_monitor " "VALUES(%d, %d, '%s'::timestamp with time zone, " " '%s', '%s', " " %lld, %lld)" "WHERE primary_node=%d AND secondary_node=%d", repmgr_schema, primary_options.node, local_options.node, monitor_standby_timestamp, last_wal_primary_location, last_wal_standby_received, (lsn_primary - lsn_standby_received), (lsn_standby_received - lsn_standby_applied)); res = PQexec(primaryConn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_err("PQexec failed: %s\n", PQerrorMessage(conn)); PQclear(res); CloseConnections(); exit(ERR_DB_QUERY); } if (PQntuples(res) != 1) { only_one_entry = false; } PQclear(res); } else { /* * Build and send insert */ sqlquery_snprintf(sqlquery, "INSERT INTO %s.repl_monitor " "VALUES(%d, %d, '%s'::timestamp with time zone, " " '%s', '%s', " " %lld, %lld)", repmgr_schema, primary_options.node, local_options.node, monitor_standby_timestamp, last_wal_primary_location, last_wal_standby_received, (lsn_primary - lsn_standby_received), (lsn_standby_received - lsn_standby_applied)); res = PQexec(primaryConn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_err("PQexec failed: %s\n", PQerrorMessage(conn)); PQclear(res); CloseConnections(); exit(ERR_DB_QUERY); } PQclear(res); if (only_one_entry_desired) { /* * Build the SQL to execute on primary */ sqlquery_snprintf(sqlquery, "DELETE FROM %s.repl_monitor " "WHERE primary_node=%d AND standby_node=%d AND last_monitor_time < '%s'::timestamp with time zone", repmgr_schema, primary_options.node, local_options.node, monitor_standby_timestamp); res = PQexec(primaryConn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_err("PQexec failed: %s\n", PQerrorMessage(conn)); PQclear(res); CloseConnections(); exit(ERR_DB_QUERY); } PQclear(res); only_one_entry = true; } } }
/* * Insert monitor info, this is basically the time and xlog replayed, * applied on standby and current xlog location in primary. * Also do the math to see how far are we in bytes for being uptodate */ static void MonitorExecute(void) { PGresult *res; char monitor_standby_timestamp[MAXLEN]; char last_wal_primary_location[MAXLEN]; char last_wal_standby_received[MAXLEN]; char last_wal_standby_applied[MAXLEN]; unsigned long long int lsn_primary; unsigned long long int lsn_standby_received; unsigned long long int lsn_standby_applied; int connection_retries; /* * Check if the master is still available, if after 5 minutes of retries * we cannot reconnect, try to get a new master. */ for (connection_retries = 0; connection_retries < 15; connection_retries++) { if (PQstatus(primaryConn) != CONNECTION_OK) { fprintf(stderr, "\n%s: Connection to master has been lost, trying to recover...\n", progname); /* wait 20 seconds between retries */ sleep(20); PQreset(primaryConn); } else { fprintf(stderr, "\n%s: Connection to master has been restored, continue monitoring.\n", progname); break; } } if (PQstatus(primaryConn) != CONNECTION_OK) { fprintf(stderr, "\n%s: We couldn't reconnect to master, checking if ", progname); fprintf(stderr, "%s: another node has been promoted.\n", progname); for (connection_retries = 0; connection_retries < 6; connection_retries++) { primaryConn = getMasterConnection(myLocalConn, myLocalId, myClusterName, &primaryId); if (PQstatus(primaryConn) == CONNECTION_OK) { /* Connected, we can continue the process so break the loop */ fprintf(stderr, "\n%s: Connected to node %d, continue monitoring.\n", progname, primaryId); break; } else { fprintf(stderr, "\n%s: We haven't found a new master, waiting before retry...\n", progname); /* wait 5 minutes before retries, after 6 failures (30 minutes) we stop trying */ sleep(300); } } } if (PQstatus(primaryConn) != CONNECTION_OK) { fprintf(stderr, "\n%s: We couldn't reconnect for long enough, exiting...\n", progname); exit(1); } /* Check if we still are a standby, we could have been promoted */ if (!is_standby(myLocalConn)) { fprintf(stderr, "\n%s: seems like we have been promoted, so exit from monitoring...\n", progname); CloseConnections(); exit(1); } /* * first check if there is a command being executed, * and if that is the case, cancel the query so i can * insert the current record */ if (PQisBusy(primaryConn) == 1) CancelQuery(); /* Get local xlog info */ sprintf(sqlquery, "SELECT CURRENT_TIMESTAMP, pg_last_xlog_receive_location(), " "pg_last_xlog_replay_location()"); res = PQexec(myLocalConn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { fprintf(stderr, "PQexec failed: %s\n", PQerrorMessage(myLocalConn)); PQclear(res); /* if there is any error just let it be and retry in next loop */ return; } strcpy(monitor_standby_timestamp, PQgetvalue(res, 0, 0)); strcpy(last_wal_standby_received , PQgetvalue(res, 0, 1)); strcpy(last_wal_standby_applied , PQgetvalue(res, 0, 2)); PQclear(res); /* Get primary xlog info */ sprintf(sqlquery, "SELECT pg_current_xlog_location() "); res = PQexec(primaryConn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { fprintf(stderr, "PQexec failed: %s\n", PQerrorMessage(primaryConn)); PQclear(res); return; } strcpy(last_wal_primary_location, PQgetvalue(res, 0, 0)); PQclear(res); /* Calculate the lag */ lsn_primary = walLocationToBytes(last_wal_primary_location); lsn_standby_received = walLocationToBytes(last_wal_standby_received); lsn_standby_applied = walLocationToBytes(last_wal_standby_applied); /* * Build the SQL to execute on primary */ sprintf(sqlquery, "INSERT INTO repmgr_%s.repl_monitor " "VALUES(%d, %d, '%s'::timestamp with time zone, " " '%s', '%s', " " %lld, %lld)", myClusterName, primaryId, myLocalId, monitor_standby_timestamp, last_wal_primary_location, last_wal_standby_received, (lsn_primary - lsn_standby_received), (lsn_standby_received - lsn_standby_applied)); /* * Execute the query asynchronously, but don't check for a result. We * will check the result next time we pause for a monitor step. */ if (PQsendQuery(primaryConn, sqlquery) == 0) fprintf(stderr, "Query could not be sent to primary. %s\n", PQerrorMessage(primaryConn)); }