/*
 * is_pgup()
 *
 * Decide whether the server behind `conn` is reachable.  The connection
 * must report CONNECTION_OK and must also answer a trivial "SELECT 1"
 * within `timeout`.
 *
 * Any failure (bad status, cancel failure, timeout, query not sent) is
 * tolerated once: the connection is reset with PQreset() and the whole
 * check is repeated.  A second failure yields false.
 *
 * Returns true when the probe query completed, false otherwise.
 */
bool
is_pgup(PGconn *conn, int timeout)
{
	char		probe[QUERY_STR_LEN];
	bool		reset_done = false; /* have we already spent our one reset? */

	for (;;)
	{
		if (PQstatus(conn) == CONNECTION_OK)
		{
			/*
			 * Status looks fine; clear out anything still running and then
			 * push a SELECT 1 through to prove the link really works.
			 */
			if (cancel_query(conn, timeout) &&
				wait_connection_availability(conn, timeout) == 1)
			{
				sqlquery_snprintf(probe, "SELECT 1");

				if (PQsendQuery(conn, probe) == 0)
				{
					log_warning(_("PQsendQuery: Query could not be sent to primary. %s\n"),
								PQerrorMessage(conn));
				}
				else if (wait_connection_availability(conn, timeout) == 1)
				{
					/* probe answered in time */
					return true;
				}
			}
		}

		/*
		 * Something went wrong.  We might just have lost the connection
		 * once, so reconnect and try again -- but only a single time.
		 */
		if (reset_done)
			return false;

		PQreset(conn);
		reset_done = true;
	}
}
/*
 * close_connections()
 *
 * Shut down both database connections held by the daemon and clear the
 * global handles.  A query still in flight on the primary connection is
 * cancelled first (best effort; the result of the cancel is not checked).
 */
static void
close_connections()
{
	if (primary_conn != NULL)
	{
		if (PQisBusy(primary_conn) == 1)
			cancel_query(primary_conn, local_options.master_response_timeout);
	}

	if (my_local_conn != NULL)
	{
		PQfinish(my_local_conn);
	}

	/*
	 * The two handles can refer to the same connection; in that case it has
	 * already been finished above and must not be finished again.
	 */
	if (!(primary_conn == NULL || primary_conn == my_local_conn))
	{
		PQfinish(primary_conn);
	}

	my_local_conn = NULL;
	primary_conn = NULL;
}
/* * Insert monitor info, this is basically the time and xlog replayed, * applied on standby and current xlog location in primary. * Also do the math to see how far are we in bytes for being uptodate */ static void standby_monitor(void) { PGresult *res; char monitor_standby_timestamp[MAXLEN]; char last_wal_primary_location[MAXLEN]; char last_wal_standby_received[MAXLEN]; char last_wal_standby_applied[MAXLEN]; char last_wal_standby_applied_timestamp[MAXLEN]; char sqlquery[QUERY_STR_LEN]; unsigned long long int lsn_primary; unsigned long long int lsn_standby_received; unsigned long long int lsn_standby_applied; int connection_retries, ret; bool did_retry = false; /* * Check if the master is still available, if after 5 minutes of retries * we cannot reconnect, try to get a new master. */ check_connection(primary_conn, "master"); /* this take up to * local_options.reconnect_atte * mpts * * local_options.reconnect_intv * l seconds */ if (!check_connection(my_local_conn, "standby")) { log_err("Failed to connect to local node, exiting!\n"); terminate(1); } if (PQstatus(primary_conn) != CONNECTION_OK) { PQfinish(primary_conn); primary_conn = NULL; if (local_options.failover == MANUAL_FAILOVER) { log_err(_("We couldn't reconnect to master. 
Now checking if another node has been promoted.\n")); for (connection_retries = 0; connection_retries < 6; connection_retries++) { primary_conn = get_master_connection(my_local_conn, repmgr_schema, local_options.cluster_name, &primary_options.node, NULL); if (PQstatus(primary_conn) == CONNECTION_OK) { /* * Connected, we can continue the process so break the * loop */ log_err(_("Connected to node %d, continue monitoring.\n"), primary_options.node); break; } else { log_err(_("We haven't found a new master, waiting before retry...\n")); /* * wait local_options.retry_promote_interval_secs minutes * before retries, after 6 failures (6 * * local_options.monitor_interval_secs seconds) we stop * trying */ sleep(local_options.retry_promote_interval_secs); } } if (PQstatus(primary_conn) != CONNECTION_OK) { log_err(_("We couldn't reconnect for long enough, exiting...\n")); terminate(ERR_DB_CON); } } else if (local_options.failover == AUTOMATIC_FAILOVER) { /* * When we returns from this function we will have a new primary * and a new primary_conn */ do_failover(); return; } } /* Check if we still are a standby, we could have been promoted */ do { ret = is_standby(my_local_conn); switch (ret) { case 0: log_err(_("It seems like we have been promoted, so exit from monitoring...\n")); terminate(1); break; case -1: log_err(_("Standby node disappeared, trying to reconnect...\n")); did_retry = true; if (!check_connection(my_local_conn, "standby")) { terminate(0); } break; } } while (ret == -1); if (did_retry) { log_info(_("standby connection got back up again!\n")); } /* Fast path for the case where no history is requested */ if (!monitoring_history) return; /* * Cancel any query that is still being executed, so i can insert the * current record */ if (!cancel_query(primary_conn, local_options.master_response_timeout)) return; if (wait_connection_availability(primary_conn, local_options.master_response_timeout) != 1) return; /* Get local xlog info */ sqlquery_snprintf( sqlquery, 
"SELECT CURRENT_TIMESTAMP, pg_last_xlog_receive_location(), " "pg_last_xlog_replay_location(), pg_last_xact_replay_timestamp()"); res = PQexec(my_local_conn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_err(_("PQexec failed: %s\n"), PQerrorMessage(my_local_conn)); PQclear(res); /* if there is any error just let it be and retry in next loop */ return; } strncpy(monitor_standby_timestamp, PQgetvalue(res, 0, 0), MAXLEN); strncpy(last_wal_standby_received, PQgetvalue(res, 0, 1), MAXLEN); strncpy(last_wal_standby_applied, PQgetvalue(res, 0, 2), MAXLEN); strncpy(last_wal_standby_applied_timestamp, PQgetvalue(res, 0, 3), MAXLEN); PQclear(res); /* Get primary xlog info */ sqlquery_snprintf(sqlquery, "SELECT pg_current_xlog_location() "); res = PQexec(primary_conn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_err(_("PQexec failed: %s\n"), PQerrorMessage(primary_conn)); PQclear(res); return; } strncpy(last_wal_primary_location, PQgetvalue(res, 0, 0), MAXLEN); PQclear(res); /* Calculate the lag */ lsn_primary = wal_location_to_bytes(last_wal_primary_location); lsn_standby_received = wal_location_to_bytes(last_wal_standby_received); lsn_standby_applied = wal_location_to_bytes(last_wal_standby_applied); /* * Build the SQL to execute on primary */ sqlquery_snprintf(sqlquery, "INSERT INTO %s.repl_monitor " "VALUES(%d, %d, '%s'::timestamp with time zone, " " '%s'::timestamp with time zone, '%s', '%s', " " %lld, %lld)", repmgr_schema, primary_options.node, local_options.node, monitor_standby_timestamp, last_wal_standby_applied_timestamp, last_wal_primary_location, last_wal_standby_received, (lsn_primary - lsn_standby_received), (lsn_standby_received - lsn_standby_applied)); /* * Execute the query asynchronously, but don't check for a result. We will * check the result next time we pause for a monitor step. */ log_debug("standby_monitor: %s\n", sqlquery); if (PQsendQuery(primary_conn, sqlquery) == 0) log_warning(_("Query could not be sent to primary. 
%s\n"), PQerrorMessage(primary_conn)); }
static void witness_monitor(void) { char monitor_witness_timestamp[MAXLEN]; PGresult *res; char sqlquery[QUERY_STR_LEN]; /* * Check if the master is still available, if after 5 minutes of retries * we cannot reconnect, return false. */ check_connection(primary_conn, "master"); /* this take up to * local_options.reconnect_atte * mpts * * local_options.reconnect_intv * l seconds */ if (PQstatus(primary_conn) != CONNECTION_OK) { /* * If we can't reconnect, just exit... XXX we need to make witness * connect to the new master */ terminate(0); } /* Fast path for the case where no history is requested */ if (!monitoring_history) return; /* * Cancel any query that is still being executed, so i can insert the * current record */ if (!cancel_query(primary_conn, local_options.master_response_timeout)) return; if (wait_connection_availability(primary_conn, local_options.master_response_timeout) != 1) return; /* Get local xlog info */ sqlquery_snprintf(sqlquery, "SELECT CURRENT_TIMESTAMP "); res = PQexec(my_local_conn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_err(_("PQexec failed: %s\n"), PQerrorMessage(my_local_conn)); PQclear(res); /* if there is any error just let it be and retry in next loop */ return; } strcpy(monitor_witness_timestamp, PQgetvalue(res, 0, 0)); PQclear(res); /* * Build the SQL to execute on primary */ sqlquery_snprintf(sqlquery, "INSERT INTO %s.repl_monitor " "VALUES(%d, %d, '%s'::timestamp with time zone, " " pg_current_xlog_location(), null, " " 0, 0)", repmgr_schema, primary_options.node, local_options.node, monitor_witness_timestamp); /* * Execute the query asynchronously, but don't check for a result. We will * check the result next time we pause for a monitor step. */ log_debug("witness_monitor: %s\n", sqlquery); if (PQsendQuery(primary_conn, sqlquery) == 0) log_warning(_("Query could not be sent to primary. %s\n"), PQerrorMessage(primary_conn)); }