static void checkNodeConfiguration(char *conninfo) { PGresult *res; /* * Check if we have my node information in repl_nodes */ log_info(_("%s Checking node %d in cluster '%s'\n"), progname, local_options.node, local_options.cluster_name); sqlquery_snprintf(sqlquery, "SELECT * FROM %s.repl_nodes " " WHERE id = %d AND cluster = '%s' ", repmgr_schema, local_options.node, local_options.cluster_name); res = PQexec(myLocalConn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_err(_("PQexec failed: %s\n"), PQerrorMessage(myLocalConn)); PQclear(res); CloseConnections(); exit(ERR_BAD_CONFIG); } /* * If there isn't any results then we have not configured this node yet * in repmgr, if that is the case we will insert the node to the cluster, * except if it is a witness */ if (PQntuples(res) == 0) { PQclear(res); if (myLocalMode == WITNESS_MODE) { log_err(_("The witness is not configured\n")); CloseConnections(); exit(ERR_BAD_CONFIG); } /* Adding the node */ log_info(_("%s Adding node %d to cluster '%s'\n"), progname, local_options.node, local_options.cluster_name); sqlquery_snprintf(sqlquery, "INSERT INTO %s.repl_nodes " "VALUES (%d, '%s', '%s', 'f')", repmgr_schema, local_options.node, local_options.cluster_name, local_options.conninfo); if (!PQexec(primaryConn, sqlquery)) { log_err(_("Cannot insert node details, %s\n"), PQerrorMessage(primaryConn)); CloseConnections(); exit(ERR_BAD_CONFIG); } } PQclear(res); }
/**
 * Typed variant of guc_set: the pg_settings value and the supplied value are
 * both cast to the named PostgreSQL datatype before being compared with
 * the given operator.
 *
 * Returns 1 when the comparison yields at least one row, 0 when it yields
 * none, and -1 when the query itself fails.
 */
int
guc_set_typed(PGconn *conn, const char *parameter, const char *op,
              const char *value, const char *datatype)
{
    char        sqlquery[QUERY_STR_LEN];
    PGresult   *result;
    int         outcome;

    sqlquery_snprintf(sqlquery,
                      "SELECT true FROM pg_settings "
                      " WHERE name = '%s' AND setting::%s %s '%s'::%s",
                      parameter, datatype, op, value, datatype);

    result = PQexec(conn, sqlquery);

    if (PQresultStatus(result) == PGRES_TUPLES_OK)
    {
        /* No matching row means the condition does not hold. */
        outcome = (PQntuples(result) == 0) ? 0 : 1;
    }
    else
    {
        log_err(_("GUC setting check PQexec failed: %s"),
                PQerrorMessage(conn));
        outcome = -1;
    }

    PQclear(result);

    return outcome;
}
/*
 * Report whether the repmgr schema (as named by get_repmgr_schema()) is
 * present in pg_namespace on the given connection.
 *
 * Returns true only when the lookup succeeds and finds a row; a failed query
 * is logged as an error and reported as false.
 */
bool
check_cluster_schema(PGconn *conn)
{
    char        sqlquery[QUERY_STR_LEN];
    PGresult   *result;
    bool        schema_found = false;

    sqlquery_snprintf(sqlquery,
                      "SELECT 1 FROM pg_namespace WHERE nspname = '%s'",
                      get_repmgr_schema());

    log_debug(_("check_cluster_schema(): %s\n"), sqlquery);

    result = PQexec(conn, sqlquery);

    if (PQresultStatus(result) != PGRES_TUPLES_OK)
    {
        log_err(_("check_cluster_schema(): unable to check cluster schema: %s\n"),
                PQerrorMessage(conn));
    }
    else if (PQntuples(result) == 0)
    {
        /* schema doesn't exist */
        log_debug(_("check_cluster_schema(): schema '%s' doesn't exist\n"),
                  get_repmgr_schema());
    }
    else
    {
        schema_found = true;
    }

    PQclear(result);

    return schema_found;
}
/*
 * Point the repl_nodes record of node this_node_id (within cluster_name) at
 * a new upstream node.
 *
 * Returns true when the UPDATE completes with PGRES_COMMAND_OK, false (after
 * logging the connection's error message) otherwise.
 */
bool
update_node_record_set_upstream(PGconn *conn, char *cluster_name,
                                int this_node_id, int new_upstream_node_id)
{
    char        sqlquery[QUERY_STR_LEN];
    PGresult   *result;
    bool        updated;

    log_debug(_("update_node_record_set_upstream(): Updating node %i's upstream node to %i\n"),
              this_node_id, new_upstream_node_id);

    sqlquery_snprintf(sqlquery,
                      " UPDATE %s.repl_nodes "
                      " SET upstream_node_id = %i "
                      " WHERE cluster = '%s' "
                      " AND id = %i ",
                      get_repmgr_schema_quoted(conn),
                      new_upstream_node_id,
                      cluster_name,
                      this_node_id);

    result = PQexec(conn, sqlquery);

    updated = (PQresultStatus(result) == PGRES_COMMAND_OK);
    if (!updated)
    {
        log_err(_("Unable to set new upstream node id: %s\n"),
                PQerrorMessage(conn));
    }

    PQclear(result);

    return updated;
}
/*
 * Start a base backup on the given connection by calling pg_start_backup()
 * with a label of the form 'repmgr_standby_clone_<unix time>'.
 *
 * fast_checkpoint is passed through as the second argument of
 * pg_start_backup().
 *
 * Returns true when the query succeeds, false (after logging the error)
 * otherwise.
 *
 * first_wal_segment is accepted for interface compatibility but is not
 * written to. The parameter is a plain "char *" received by value, so the
 * previous implementation, which assigned a freshly allocated copy of the
 * WAL file name to it, could never hand that copy back to the caller and
 * simply leaked it on every call with a non-NULL argument. The allocation
 * has been removed.
 * NOTE(review): returning the segment name needs either a "char **"
 * parameter or a documented caller-owned buffer size; neither can be
 * introduced here without touching the callers.
 */
bool
start_backup(PGconn *conn, char *first_wal_segment, bool fast_checkpoint)
{
    char        sqlquery[QUERY_STR_LEN];
    PGresult   *res;

    /* See the header comment: the argument cannot carry a result back. */
    (void) first_wal_segment;

    /* time_t is cast explicitly so that it matches the %ld conversion. */
    sqlquery_snprintf(sqlquery,
                      "SELECT pg_catalog.pg_xlogfile_name(pg_catalog.pg_start_backup('repmgr_standby_clone_%ld', %s))",
                      (long) time(NULL),
                      fast_checkpoint ? "TRUE" : "FALSE");

    log_debug(_("standby clone: %s\n"), sqlquery);

    res = PQexec(conn, sqlquery);
    if (PQresultStatus(res) != PGRES_TUPLES_OK)
    {
        log_err(_("unable to start backup: %s\n"), PQerrorMessage(conn));
        PQclear(res);
        return false;
    }

    PQclear(res);

    return true;
}
/*
 * Remove the repl_nodes row whose id equals node.
 *
 * When action is non-NULL it is used as a prefix for a debug message showing
 * the generated statement. Returns true on PGRES_COMMAND_OK; otherwise logs
 * a warning and returns false.
 */
bool
delete_node_record(PGconn *conn, int node, char *action)
{
    PGresult   *result;
    char        sqlquery[QUERY_STR_LEN];
    bool        deleted;

    sqlquery_snprintf(sqlquery,
                      "DELETE FROM %s.repl_nodes "
                      " WHERE id = %d",
                      get_repmgr_schema_quoted(conn),
                      node);

    if (action != NULL)
    {
        log_debug(_("%s: %s\n"), action, sqlquery);
    }

    result = PQexec(conn, sqlquery);

    deleted = (result != NULL && PQresultStatus(result) == PGRES_COMMAND_OK);
    if (!deleted)
    {
        log_warning(_("Unable to delete node record: %s\n"),
                    PQerrorMessage(conn));
    }

    /* PQclear accepts a NULL result, so no guard is needed here. */
    PQclear(result);

    return deleted;
}
/*
 * Finish a base backup on the given connection by calling pg_stop_backup().
 *
 * Returns true when the query succeeds, false (after logging the error)
 * otherwise.
 *
 * last_wal_segment is accepted for interface compatibility but is not
 * written to. The parameter is a plain "char *" received by value, so the
 * previous implementation, which assigned a freshly allocated copy of the
 * WAL file name to it, could never hand that copy back to the caller and
 * simply leaked it on every call with a non-NULL argument. The allocation
 * has been removed.
 * NOTE(review): returning the segment name needs either a "char **"
 * parameter or a documented caller-owned buffer size; neither can be
 * introduced here without touching the callers.
 */
bool
stop_backup(PGconn *conn, char *last_wal_segment)
{
    char        sqlquery[QUERY_STR_LEN];
    PGresult   *res;

    /* See the header comment: the argument cannot carry a result back. */
    (void) last_wal_segment;

    sqlquery_snprintf(sqlquery,
                      "SELECT pg_catalog.pg_xlogfile_name(pg_catalog.pg_stop_backup())");

    res = PQexec(conn, sqlquery);
    if (PQresultStatus(res) != PGRES_TUPLES_OK)
    {
        log_err(_("unable to stop backup: %s\n"), PQerrorMessage(conn));
        PQclear(res);
        return false;
    }

    PQclear(res);

    return true;
}
/*
 * Confirm that the repl_nodes table of the configured repmgr schema can be
 * resolved on conn.
 *
 * On a query failure the process exits with ERR_DB_QUERY; when the query
 * succeeds but returns no row it exits with ERR_BAD_CONFIG. In both cases
 * the connections are closed first. The primary argument is not used.
 */
static void
checkClusterConfiguration(PGconn *conn, PGconn *primary)
{
    PGresult   *result;
    int         failure_code;

    log_info(_("%s Checking cluster configuration with schema '%s'\n"),
             progname, repmgr_schema);

    sqlquery_snprintf(sqlquery,
                      "SELECT oid FROM pg_class "
                      " WHERE oid = '%s.repl_nodes'::regclass",
                      repmgr_schema);

    result = PQexec(conn, sqlquery);

    if (PQresultStatus(result) == PGRES_TUPLES_OK)
    {
        if (PQntuples(result) > 0)
        {
            /* The table is there: nothing more to check. */
            PQclear(result);
            return;
        }

        /*
         * No rows: either no primary node has been configured in repmgr
         * yet, or the connection string points to the wrong database.
         *
         * XXX if we are the primary, should we try to create the tables
         * needed?
         */
        log_err("The replication cluster is not configured\n");
        failure_code = ERR_BAD_CONFIG;
    }
    else
    {
        log_err("PQexec failed: %s\n", PQerrorMessage(conn));
        failure_code = ERR_DB_QUERY;
    }

    PQclear(result);
    CloseConnections();
    exit(failure_code);
}
/*
 * Evaluate "setting <op> '<value>'" against the pg_settings row named by
 * parameter.
 *
 * Returns true when the comparison yields at least one row and false when it
 * yields none. If the query fails, the error is logged, the connection is
 * finished and the process exits with ERR_DB_QUERY.
 */
bool
guc_setted(PGconn *conn, const char *parameter, const char *op,
           const char *value)
{
    char        sqlquery[QUERY_STR_LEN];
    PGresult   *result;
    bool        condition_holds;

    sqlquery_snprintf(sqlquery,
                      "SELECT true FROM pg_settings "
                      " WHERE name = '%s' AND setting %s '%s'",
                      parameter, op, value);

    result = PQexec(conn, sqlquery);

    if (PQresultStatus(result) != PGRES_TUPLES_OK)
    {
        log_err(_("GUC setting check PQexec failed: %s"),
                PQerrorMessage(conn));
        PQclear(result);
        PQfinish(conn);
        exit(ERR_DB_QUERY);
    }

    condition_holds = (PQntuples(result) != 0);

    PQclear(result);

    return condition_holds;
}
/*
 * Return the total size of all databases in the cluster as formatted by
 * pg_size_pretty().
 *
 * The returned pointer refers to a static buffer inside this function: it
 * stays valid after the call, but its contents are overwritten by the next
 * call. (The value cannot be returned straight from PQgetvalue(), because
 * that storage belongs to the PGresult and is released by PQclear().)
 * Text longer than the buffer is truncated.
 *
 * If the query fails, the error is logged, the connection is finished and
 * the process exits with ERR_DB_QUERY.
 */
const char *
get_cluster_size(PGconn *conn)
{
    enum { CLUSTER_SIZE_BUF_LEN = 64 };

    static char size_buf[CLUSTER_SIZE_BUF_LEN];
    PGresult   *res;
    const char *value;
    unsigned int i;
    char        sqlquery[QUERY_STR_LEN];

    sqlquery_snprintf(sqlquery,
                      "SELECT pg_size_pretty(SUM(pg_database_size(oid))::bigint) "
                      " FROM pg_database ");

    res = PQexec(conn, sqlquery);
    if (PQresultStatus(res) != PGRES_TUPLES_OK)
    {
        log_err(_("Get cluster size PQexec failed: %s"),
                PQerrorMessage(conn));
        PQclear(res);
        PQfinish(conn);
        exit(ERR_DB_QUERY);
    }

    /* Copy the value out of the result before the result is freed. */
    value = PQgetvalue(res, 0, 0);
    for (i = 0; i + 1 < sizeof(size_buf) && value[i] != '\0'; i++)
        size_buf[i] = value[i];
    size_buf[i] = '\0';

    PQclear(res);

    return size_buf;
}
/*
 * Fetch the total size of all databases in the cluster, formatted by
 * pg_size_pretty(), into the caller-supplied buffer.
 *
 * size must point to at least MAXLEN bytes; the copy writes exactly that
 * many bytes and the result is always NUL-terminated (longer text is
 * truncated).
 *
 * Returns true on success. On a failed query the error is logged, size is
 * left untouched and false is returned.
 */
bool
get_cluster_size(PGconn *conn, char *size)
{
    PGresult   *res;
    char        sqlquery[QUERY_STR_LEN];

    sqlquery_snprintf(sqlquery,
                      "SELECT pg_catalog.pg_size_pretty(SUM(pg_catalog.pg_database_size(oid))::bigint) "
                      " FROM pg_database ");

    res = PQexec(conn, sqlquery);
    if (res == NULL || PQresultStatus(res) != PGRES_TUPLES_OK)
    {
        log_err(_("get_cluster_size(): PQexec failed: %s"),
                PQerrorMessage(conn));
        PQclear(res);
        return false;
    }

    /*
     * strncpy does not terminate the destination when the source fills it
     * completely, so terminate explicitly.
     */
    strncpy(size, PQgetvalue(res, 0, 0), MAXLEN);
    size[MAXLEN - 1] = '\0';

    PQclear(res);

    return true;
}
/*
 * Check the PQstatus of the connection and issue a 'SELECT 1' to confirm
 * that it is usable.
 *
 * A probe consists of: cancelling any running query, waiting for the
 * connection to become available, sending "SELECT 1" and waiting again.
 * If the status is bad or any probe step fails, the connection is reset
 * once and the whole check is repeated; a second failure returns false.
 */
bool
is_pgup(PGconn *conn, int timeout)
{
    char        sqlquery[QUERY_STR_LEN];

    /* Set once the single permitted reset has been spent. */
    bool        reset_done = false;

    for (;;)
    {
        if (PQstatus(conn) == CONNECTION_OK)
        {
            /* Send a SELECT 1 just to check if the connection is OK */
            if (cancel_query(conn, timeout) &&
                wait_connection_availability(conn, timeout) == 1)
            {
                sqlquery_snprintf(sqlquery, "SELECT 1");

                if (PQsendQuery(conn, sqlquery) == 0)
                {
                    log_warning(_("PQsendQuery: Query could not be sent to primary. %s\n"),
                                PQerrorMessage(conn));
                }
                else if (wait_connection_availability(conn, timeout) == 1)
                {
                    return true;
                }
            }
        }

        /*
         * Either the status was bad or the probe failed. Retry once,
         * because the connection might just have been lost a single time.
         */
        if (reset_done)
            return false;

        PQreset(conn);          /* reconnect */
        reset_done = true;
    }
}
/*
 * Look up a single configuration parameter in pg_settings and copy its value
 * into the caller-supplied buffer.
 *
 * output must point to at least MAXLEN bytes; the stored value is always
 * NUL-terminated (longer text is truncated).
 *
 * Returns true only when a row whose name equals setting was found and
 * copied. Returns false when the query fails or when no such row exists;
 * in those cases output is not written.
 */
bool
get_pg_setting(PGconn *conn, const char *setting, char *output)
{
    char        sqlquery[QUERY_STR_LEN];
    PGresult   *res;
    int         i;

    /*
     * Must start out false: with a true default, a lookup that matches no
     * row would report success and log a buffer that was never filled in.
     */
    bool        success = false;

    sqlquery_snprintf(sqlquery,
                      "SELECT name, setting "
                      " FROM pg_settings WHERE name = '%s'",
                      setting);

    log_debug(_("get_pg_setting(): %s\n"), sqlquery);

    res = PQexec(conn, sqlquery);
    if (res == NULL || PQresultStatus(res) != PGRES_TUPLES_OK)
    {
        log_err(_("get_pg_setting() - PQexec failed: %s"),
                PQerrorMessage(conn));
        PQclear(res);
        return false;
    }

    for (i = 0; i < PQntuples(res); i++)
    {
        if (strcmp(PQgetvalue(res, i, 0), setting) == 0)
        {
            /* strncpy alone does not guarantee a terminator. */
            strncpy(output, PQgetvalue(res, i, 1), MAXLEN);
            output[MAXLEN - 1] = '\0';
            success = true;
            break;
        }

        log_err(_("unknown parameter: %s"), PQgetvalue(res, i, 0));
    }

    if (success)
    {
        log_debug(_("get_pg_setting(): returned value is '%s'\n"), output);
    }

    PQclear(res);

    return success;
}
static void update_registration(void) { PGresult *res; sqlquery_snprintf(sqlquery, "UPDATE %s.repl_nodes " " SET conninfo = '%s', " " priority = %d " " WHERE id = %d", repmgr_schema, local_options.conninfo, local_options.priority, local_options.node); res = PQexec(primaryConn, sqlquery); if (PQresultStatus(res) != PGRES_COMMAND_OK) { log_err(_("Cannot update registration: %s\n"), PQerrorMessage(primaryConn)); CloseConnections(); exit(ERR_DB_CON); } PQclear(res); }
/*
 * Look up the id of the node recorded as the active master of the given
 * cluster.
 *
 * Returns NODE_NOT_FOUND when the query fails or when no such record
 * exists.
 *
 * Only the value stored in repl_nodes is reported; no attempt is made to
 * verify that the node is actually reachable.
 */
int
get_master_node_id(PGconn *conn, char *cluster)
{
    PGresult   *result;
    char        sqlquery[QUERY_STR_LEN];
    int         master_id = NODE_NOT_FOUND;

    sqlquery_snprintf(sqlquery,
                      "SELECT id "
                      " FROM %s.repl_nodes "
                      " WHERE cluster = '%s' "
                      " AND type = 'master' "
                      " AND active IS TRUE ",
                      get_repmgr_schema_quoted(conn),
                      cluster);

    result = PQexec(conn, sqlquery);

    if (PQresultStatus(result) != PGRES_TUPLES_OK)
    {
        log_err(_("get_master_node_id(): query failed\n%s\n"),
                PQerrorMessage(conn));
    }
    else if (PQntuples(result) == 0)
    {
        log_warning(_("get_master_node_id(): no active primary found\n"));
    }
    else
    {
        /* The first returned row is taken as the answer. */
        master_id = atoi(PQgetvalue(result, 0, 0));
    }

    PQclear(result);

    return master_id;
}
/*
 * Report whether node node_id of the given cluster is flagged as a witness
 * in <schema>.repl_nodes.
 *
 * Returns true only when exactly one row is returned and its witness column
 * reads "t". If the query fails, the error is logged, the connection is
 * finished and the process exits with ERR_DB_QUERY.
 */
bool
is_witness(PGconn *conn, char *schema, char *cluster, int node_id)
{
    char        sqlquery[QUERY_STR_LEN];
    PGresult   *rows;
    bool        witness_flag;

    sqlquery_snprintf(sqlquery,
                      "SELECT witness from %s.repl_nodes where cluster = '%s' and id = %d",
                      schema, cluster, node_id);

    rows = PQexec(conn, sqlquery);

    if (PQresultStatus(rows) != PGRES_TUPLES_OK)
    {
        log_err(_("Can't query server mode: %s"), PQerrorMessage(conn));
        PQclear(rows);
        PQfinish(conn);
        exit(ERR_DB_QUERY);
    }

    /* The value is only read when exactly one row came back. */
    witness_flag = (PQntuples(rows) == 1) &&
        (strcmp(PQgetvalue(rows, 0, 0), "t") == 0);

    PQclear(rows);

    return witness_flag;
}
/*
 * Issue "SET <config_param> TO TRUE|FALSE" on the given connection.
 *
 * Returns true when the statement completes with PGRES_COMMAND_OK; otherwise
 * logs the error and returns false.
 */
bool
set_config_bool(PGconn *conn, const char *config_param, bool state)
{
    PGresult   *result;
    char        sqlquery[QUERY_STR_LEN];
    const char *state_text = state ? "TRUE" : "FALSE";
    bool        applied;

    sqlquery_snprintf(sqlquery, "SET %s TO %s", config_param, state_text);

    result = PQexec(conn, sqlquery);

    applied = (PQresultStatus(result) == PGRES_COMMAND_OK);
    if (!applied)
    {
        log_err("unable to set '%s': %s\n",
                config_param, PQerrorMessage(conn));
    }

    PQclear(result);

    return applied;
}
/* * Insert monitor info, this is basically the time and xlog replayed, * applied on standby and current xlog location in primary. * Also do the math to see how far are we in bytes for being uptodate */ static void StandbyMonitor(void) { PGresult *res; char monitor_standby_timestamp[MAXLEN]; char last_wal_primary_location[MAXLEN]; char last_wal_standby_received[MAXLEN]; char last_wal_standby_applied[MAXLEN]; unsigned long long int lsn_primary; unsigned long long int lsn_standby_received; unsigned long long int lsn_standby_applied; int connection_retries; /* * Check if the master is still available, if after 5 minutes of retries * we cannot reconnect, try to get a new master. */ CheckPrimaryConnection(); // this take up to NUM_RETRY * SLEEP_RETRY seconds if (PQstatus(primaryConn) != CONNECTION_OK) { if (local_options.failover == MANUAL_FAILOVER) { log_err(_("We couldn't reconnect to master. Now checking if another node has been promoted.\n")); for (connection_retries = 0; connection_retries < 6; connection_retries++) { primaryConn = getMasterConnection(myLocalConn, repmgr_schema, local_options.node, local_options.cluster_name, &primary_options.node, NULL); if (PQstatus(primaryConn) == CONNECTION_OK) { /* Connected, we can continue the process so break the loop */ log_err(_("Connected to node %d, continue monitoring.\n"), primary_options.node); break; } else { log_err(_("We haven't found a new master, waiting before retry...\n")); /* wait 5 minutes before retries, after 6 failures (30 minutes) we stop trying */ sleep(300); } } if (PQstatus(primaryConn) != CONNECTION_OK) { log_err(_("We couldn't reconnect for long enough, exiting...\n")); exit(ERR_DB_CON); } } else if (local_options.failover == AUTOMATIC_FAILOVER) { /* * When we returns from this function we will have a new primary and * a new primaryConn */ do_failover(); } } /* Check if we still are a standby, we could have been promoted */ if (!is_standby(myLocalConn)) { log_err(_("It seems like we have been 
promoted, so exit from monitoring...\n")); CloseConnections(); exit(ERR_PROMOTED); } /* * first check if there is a command being executed, * and if that is the case, cancel the query so i can * insert the current record */ if (PQisBusy(primaryConn) == 1) CancelQuery(); /* Get local xlog info */ sqlquery_snprintf( sqlquery, "SELECT CURRENT_TIMESTAMP, pg_last_xlog_receive_location(), " "pg_last_xlog_replay_location()"); res = PQexec(myLocalConn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_err(_("PQexec failed: %s\n"), PQerrorMessage(myLocalConn)); PQclear(res); /* if there is any error just let it be and retry in next loop */ return; } strncpy(monitor_standby_timestamp, PQgetvalue(res, 0, 0), MAXLEN); strncpy(last_wal_standby_received , PQgetvalue(res, 0, 1), MAXLEN); strncpy(last_wal_standby_applied , PQgetvalue(res, 0, 2), MAXLEN); PQclear(res); /* Get primary xlog info */ sqlquery_snprintf(sqlquery, "SELECT pg_current_xlog_location() "); res = PQexec(primaryConn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_err(_("PQexec failed: %s\n"), PQerrorMessage(primaryConn)); PQclear(res); return; } strncpy(last_wal_primary_location, PQgetvalue(res, 0, 0), MAXLEN); PQclear(res); /* Calculate the lag */ lsn_primary = walLocationToBytes(last_wal_primary_location); lsn_standby_received = walLocationToBytes(last_wal_standby_received); lsn_standby_applied = walLocationToBytes(last_wal_standby_applied); /* * Build the SQL to execute on primary */ sqlquery_snprintf(sqlquery, "INSERT INTO %s.repl_monitor " "VALUES(%d, %d, '%s'::timestamp with time zone, " " '%s', '%s', " " %lld, %lld)", repmgr_schema, primary_options.node, local_options.node, monitor_standby_timestamp, last_wal_primary_location, last_wal_standby_received, (lsn_primary - lsn_standby_received), (lsn_standby_received - lsn_standby_applied)); /* * Execute the query asynchronously, but don't check for a result. We * will check the result next time we pause for a monitor step. 
*/ if (PQsendQuery(primaryConn, sqlquery) == 0) log_warning(_("Query could not be sent to primary. %s\n"), PQerrorMessage(primaryConn)); }
/*
 * Get a connection to the master by reading repl_nodes, connecting to each
 * non-witness node of the cluster in turn and asking it whether it is in
 * recovery.
 *
 * standby_conn          connection used to read the node list
 * schema                repmgr schema name (quoted here before use)
 * cluster               cluster name used to filter repl_nodes
 * master_id             out: id of the node currently being examined; reset
 *                       to -1 after a node is identified as a standby
 * master_conninfo_out   may be NULL. If it is non-NULL, it is assumed to
 *                       point to allocated memory of MAXCONNINFO in length,
 *                       and the connection string of each examined node is
 *                       placed there (ending with the master's when one is
 *                       found).
 *
 * Returns an open connection to the first node that reports
 * pg_is_in_recovery() = false, or NULL when no such node could be reached.
 * If the node list itself cannot be read, standby_conn is finished and the
 * process exits with ERR_DB_QUERY.
 */
PGconn *
getMasterConnection(PGconn *standby_conn, char *schema, char *cluster,
                    int *master_id, char *master_conninfo_out)
{
    PGconn     *master_conn = NULL;
    PGresult   *res1;
    PGresult   *res2;
    char        sqlquery[QUERY_STR_LEN];
    char        master_conninfo_stack[MAXCONNINFO];
    char       *master_conninfo = master_conninfo_stack;
    char        schema_quoted[MAXLEN];
    int         i;

    /*
     * If the caller wanted to get a copy of the connection info string, sub
     * out the local stack pointer for the pointer passed by the caller.
     */
    if (master_conninfo_out != NULL)
        master_conninfo = master_conninfo_out;

    /*
     * XXX: This is copied in at least two other procedures
     *
     * Assemble the quoted schema name
     */
    {
        char       *identifier = PQescapeIdentifier(standby_conn, schema,
                                                    strlen(schema));

        maxlen_snprintf(schema_quoted, "%s", identifier);
        PQfreemem(identifier);
    }

    /* find all nodes belonging to this cluster */
    log_info(_("finding node list for cluster '%s'\n"), cluster);

    sqlquery_snprintf(sqlquery,
                      "SELECT id, conninfo FROM %s.repl_nodes "
                      " WHERE cluster = '%s' and not witness",
                      schema_quoted, cluster);

    res1 = PQexec(standby_conn, sqlquery);
    if (PQresultStatus(res1) != PGRES_TUPLES_OK)
    {
        log_err(_("Can't get nodes info: %s\n"),
                PQerrorMessage(standby_conn));
        PQclear(res1);
        PQfinish(standby_conn);
        exit(ERR_DB_QUERY);
    }

    for (i = 0; i < PQntuples(res1); i++)
    {
        /* initialize with the values of the current node being processed */
        *master_id = atoi(PQgetvalue(res1, i, 0));

        /* strncpy alone does not guarantee a terminator. */
        strncpy(master_conninfo, PQgetvalue(res1, i, 1), MAXCONNINFO);
        master_conninfo[MAXCONNINFO - 1] = '\0';

        log_info(_("checking role of cluster node '%s'\n"), master_conninfo);
        master_conn = establishDBConnection(master_conninfo, false);

        if (PQstatus(master_conn) != CONNECTION_OK)
        {
            /*
             * A failed attempt may still hand back a connection object;
             * release it before moving on so it is not leaked. PQfinish
             * accepts NULL.
             */
            PQfinish(master_conn);
            master_conn = NULL;
            continue;
        }

        /*
         * Can't use the is_standby() function here because on error that
         * function closes the connection passed and exits. This still
         * needs to close master_conn first.
         */
        res2 = PQexec(master_conn, "SELECT pg_is_in_recovery()");

        if (PQresultStatus(res2) != PGRES_TUPLES_OK)
        {
            log_err(_("Can't get recovery state from this node: %s\n"),
                    PQerrorMessage(master_conn));
            PQclear(res2);
            PQfinish(master_conn);
            continue;
        }

        /* if false, this is the master */
        if (strcmp(PQgetvalue(res2, 0, 0), "f") == 0)
        {
            PQclear(res2);
            PQclear(res1);
            return master_conn;
        }
        else
        {
            /* if it is a standby clear info */
            PQclear(res2);
            PQfinish(master_conn);
            *master_id = -1;
        }
    }

    /*
     * If we finish this loop without finding a master then we don't have
     * the info or the master has failed (or we reached max_connections or
     * superuser_reserved_connections, anything else I'm missing?).
     *
     * Probably we will need to check the error to know if we need to start
     * failover procedure or just fix some situation on the standby.
     */
    PQclear(res1);
    return NULL;
}
static void witness_monitor(void) { char monitor_witness_timestamp[MAXLEN]; PGresult *res; char sqlquery[QUERY_STR_LEN]; /* * Check if the master is still available, if after 5 minutes of retries * we cannot reconnect, return false. */ check_connection(primary_conn, "master"); /* this take up to * local_options.reconnect_atte * mpts * * local_options.reconnect_intv * l seconds */ if (PQstatus(primary_conn) != CONNECTION_OK) { /* * If we can't reconnect, just exit... XXX we need to make witness * connect to the new master */ terminate(0); } /* Fast path for the case where no history is requested */ if (!monitoring_history) return; /* * Cancel any query that is still being executed, so i can insert the * current record */ if (!cancel_query(primary_conn, local_options.master_response_timeout)) return; if (wait_connection_availability(primary_conn, local_options.master_response_timeout) != 1) return; /* Get local xlog info */ sqlquery_snprintf(sqlquery, "SELECT CURRENT_TIMESTAMP "); res = PQexec(my_local_conn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_err(_("PQexec failed: %s\n"), PQerrorMessage(my_local_conn)); PQclear(res); /* if there is any error just let it be and retry in next loop */ return; } strcpy(monitor_witness_timestamp, PQgetvalue(res, 0, 0)); PQclear(res); /* * Build the SQL to execute on primary */ sqlquery_snprintf(sqlquery, "INSERT INTO %s.repl_monitor " "VALUES(%d, %d, '%s'::timestamp with time zone, " " pg_current_xlog_location(), null, " " 0, 0)", repmgr_schema, primary_options.node, local_options.node, monitor_witness_timestamp); /* * Execute the query asynchronously, but don't check for a result. We will * check the result next time we pause for a monitor step. */ log_debug("witness_monitor: %s\n", sqlquery); if (PQsendQuery(primary_conn, sqlquery) == 0) log_warning(_("Query could not be sent to primary. %s\n"), PQerrorMessage(primary_conn)); }
/* * Insert monitor info, this is basically the time and xlog replayed, * applied on standby and current xlog location in primary. * Also do the math to see how far are we in bytes for being uptodate */ static void standby_monitor(void) { PGresult *res; char monitor_standby_timestamp[MAXLEN]; char last_wal_primary_location[MAXLEN]; char last_wal_standby_received[MAXLEN]; char last_wal_standby_applied[MAXLEN]; char last_wal_standby_applied_timestamp[MAXLEN]; char sqlquery[QUERY_STR_LEN]; unsigned long long int lsn_primary; unsigned long long int lsn_standby_received; unsigned long long int lsn_standby_applied; int connection_retries, ret; bool did_retry = false; /* * Check if the master is still available, if after 5 minutes of retries * we cannot reconnect, try to get a new master. */ check_connection(primary_conn, "master"); /* this take up to * local_options.reconnect_atte * mpts * * local_options.reconnect_intv * l seconds */ if (!check_connection(my_local_conn, "standby")) { log_err("Failed to connect to local node, exiting!\n"); terminate(1); } if (PQstatus(primary_conn) != CONNECTION_OK) { PQfinish(primary_conn); primary_conn = NULL; if (local_options.failover == MANUAL_FAILOVER) { log_err(_("We couldn't reconnect to master. 
Now checking if another node has been promoted.\n")); for (connection_retries = 0; connection_retries < 6; connection_retries++) { primary_conn = get_master_connection(my_local_conn, repmgr_schema, local_options.cluster_name, &primary_options.node, NULL); if (PQstatus(primary_conn) == CONNECTION_OK) { /* * Connected, we can continue the process so break the * loop */ log_err(_("Connected to node %d, continue monitoring.\n"), primary_options.node); break; } else { log_err(_("We haven't found a new master, waiting before retry...\n")); /* * wait local_options.retry_promote_interval_secs minutes * before retries, after 6 failures (6 * * local_options.monitor_interval_secs seconds) we stop * trying */ sleep(local_options.retry_promote_interval_secs); } } if (PQstatus(primary_conn) != CONNECTION_OK) { log_err(_("We couldn't reconnect for long enough, exiting...\n")); terminate(ERR_DB_CON); } } else if (local_options.failover == AUTOMATIC_FAILOVER) { /* * When we returns from this function we will have a new primary * and a new primary_conn */ do_failover(); return; } } /* Check if we still are a standby, we could have been promoted */ do { ret = is_standby(my_local_conn); switch (ret) { case 0: log_err(_("It seems like we have been promoted, so exit from monitoring...\n")); terminate(1); break; case -1: log_err(_("Standby node disappeared, trying to reconnect...\n")); did_retry = true; if (!check_connection(my_local_conn, "standby")) { terminate(0); } break; } } while (ret == -1); if (did_retry) { log_info(_("standby connection got back up again!\n")); } /* Fast path for the case where no history is requested */ if (!monitoring_history) return; /* * Cancel any query that is still being executed, so i can insert the * current record */ if (!cancel_query(primary_conn, local_options.master_response_timeout)) return; if (wait_connection_availability(primary_conn, local_options.master_response_timeout) != 1) return; /* Get local xlog info */ sqlquery_snprintf( sqlquery, 
"SELECT CURRENT_TIMESTAMP, pg_last_xlog_receive_location(), " "pg_last_xlog_replay_location(), pg_last_xact_replay_timestamp()"); res = PQexec(my_local_conn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_err(_("PQexec failed: %s\n"), PQerrorMessage(my_local_conn)); PQclear(res); /* if there is any error just let it be and retry in next loop */ return; } strncpy(monitor_standby_timestamp, PQgetvalue(res, 0, 0), MAXLEN); strncpy(last_wal_standby_received, PQgetvalue(res, 0, 1), MAXLEN); strncpy(last_wal_standby_applied, PQgetvalue(res, 0, 2), MAXLEN); strncpy(last_wal_standby_applied_timestamp, PQgetvalue(res, 0, 3), MAXLEN); PQclear(res); /* Get primary xlog info */ sqlquery_snprintf(sqlquery, "SELECT pg_current_xlog_location() "); res = PQexec(primary_conn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_err(_("PQexec failed: %s\n"), PQerrorMessage(primary_conn)); PQclear(res); return; } strncpy(last_wal_primary_location, PQgetvalue(res, 0, 0), MAXLEN); PQclear(res); /* Calculate the lag */ lsn_primary = wal_location_to_bytes(last_wal_primary_location); lsn_standby_received = wal_location_to_bytes(last_wal_standby_received); lsn_standby_applied = wal_location_to_bytes(last_wal_standby_applied); /* * Build the SQL to execute on primary */ sqlquery_snprintf(sqlquery, "INSERT INTO %s.repl_monitor " "VALUES(%d, %d, '%s'::timestamp with time zone, " " '%s'::timestamp with time zone, '%s', '%s', " " %lld, %lld)", repmgr_schema, primary_options.node, local_options.node, monitor_standby_timestamp, last_wal_standby_applied_timestamp, last_wal_primary_location, last_wal_standby_received, (lsn_primary - lsn_standby_received), (lsn_standby_received - lsn_standby_applied)); /* * Execute the query asynchronously, but don't check for a result. We will * check the result next time we pause for a monitor step. */ log_debug("standby_monitor: %s\n", sqlquery); if (PQsendQuery(primary_conn, sqlquery) == 0) log_warning(_("Query could not be sent to primary. 
%s\n"), PQerrorMessage(primary_conn)); }
static void do_failover(void) { PGresult *res; char sqlquery[QUERY_STR_LEN]; int total_nodes = 0; int visible_nodes = 0; int ready_nodes = 0; bool find_best = false; int i; int r; uint32 uxlogid; uint32 uxrecoff; XLogRecPtr xlog_recptr; char last_wal_standby_applied[MAXLEN]; PGconn *node_conn = NULL; /* * will get info about until 50 nodes, which seems to be large enough for * most scenarios */ t_node_info nodes[50]; /* initialize to keep compiler quiet */ t_node_info best_candidate = {-1, "", InvalidXLogRecPtr, false, false, false}; /* get a list of standby nodes, including myself */ sprintf(sqlquery, "SELECT id, conninfo, witness " " FROM %s.repl_nodes " " WHERE cluster = '%s' " " ORDER BY priority, id ", repmgr_schema, local_options.cluster_name); res = PQexec(my_local_conn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_err(_("Can't get nodes' info: %s\n"), PQerrorMessage(my_local_conn)); PQclear(res); terminate(ERR_DB_QUERY); } /* * total nodes that are registered */ total_nodes = PQntuples(res); log_debug(_("%s: there are %d nodes registered\n"), progname, total_nodes); /* * Build an array with the nodes and indicate which ones are visible and * ready */ for (i = 0; i < total_nodes; i++) { nodes[i].node_id = atoi(PQgetvalue(res, i, 0)); strncpy(nodes[i].conninfo_str, PQgetvalue(res, i, 1), MAXLEN); nodes[i].is_witness = (strcmp(PQgetvalue(res, i, 2), "t") == 0) ? true : false; /* * Initialize on false so if we can't reach this node we know that * later */ nodes[i].is_visible = false; nodes[i].is_ready = false; XLAssignValue(nodes[i].xlog_location, 0, 0); log_debug(_("%s: node=%d conninfo=\"%s\" witness=%s\n"), progname, nodes[i].node_id, nodes[i].conninfo_str, (nodes[i].is_witness) ? 
"true" : "false"); node_conn = establish_db_connection(nodes[i].conninfo_str, false); /* if we can't see the node just skip it */ if (PQstatus(node_conn) != CONNECTION_OK) { if (node_conn != NULL) PQfinish(node_conn); continue; } visible_nodes++; nodes[i].is_visible = true; PQfinish(node_conn); } PQclear(res); log_debug(_("Total nodes counted: registered=%d, visible=%d\n"), total_nodes, visible_nodes); /* * am i on the group that should keep alive? if i see less than half of * total_nodes then i should do nothing */ if (visible_nodes < (total_nodes / 2.0)) { log_err(_("Can't reach most of the nodes.\n" "Let the other standby servers decide which one will be the primary.\n" "Manual action will be needed to readd this node to the cluster.\n")); terminate(ERR_FAILOVER_FAIL); } /* Query all the nodes to determine which ones are ready */ for (i = 0; i < total_nodes; i++) { /* if the node is not visible, skip it */ if (!nodes[i].is_visible) continue; if (nodes[i].is_witness) continue; node_conn = establish_db_connection(nodes[i].conninfo_str, false); /* * XXX This shouldn't happen, if this happens it means this is a major * problem maybe network outages? 
anyway, is better for a human to * react */ if (PQstatus(node_conn) != CONNECTION_OK) { log_err(_("It seems new problems are arising, manual intervention is needed\n")); terminate(ERR_FAILOVER_FAIL); } uxlogid = 0; uxrecoff = 0; sqlquery_snprintf(sqlquery, "SELECT pg_last_xlog_receive_location()"); res = PQexec(node_conn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_info(_("Can't get node's last standby location: %s\n"), PQerrorMessage(node_conn)); log_info(_("Connection details: %s\n"), nodes[i].conninfo_str); PQclear(res); PQfinish(node_conn); terminate(ERR_FAILOVER_FAIL); } if (sscanf(PQgetvalue(res, 0, 0), "%X/%X", &uxlogid, &uxrecoff) != 2) log_info(_("could not parse transaction log location \"%s\"\n"), PQgetvalue(res, 0, 0)); log_debug("XLog position of node %d: log id=%u (%X), offset=%u (%X)\n", nodes[i].node_id, uxlogid, uxlogid, uxrecoff, uxrecoff); /* If position is 0/0, error */ if (uxlogid == 0 && uxrecoff == 0) { PQclear(res); PQfinish(node_conn); log_info(_("InvalidXLogRecPtr detected in a standby\n")); terminate(ERR_FAILOVER_FAIL); } XLAssignValue(nodes[i].xlog_location, uxlogid, uxrecoff); PQclear(res); PQfinish(node_conn); } /* last we get info about this node, and update shared memory */ sprintf(sqlquery, "SELECT pg_last_xlog_receive_location()"); res = PQexec(my_local_conn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_err(_("PQexec failed: %s.\nReport an invalid value to not be " " considered as new primary and exit.\n"), PQerrorMessage(my_local_conn)); PQclear(res); sprintf(last_wal_standby_applied, "'%X/%X'", 0, 0); update_shared_memory(last_wal_standby_applied); terminate(ERR_DB_QUERY); } /* write last location in shared memory */ update_shared_memory(PQgetvalue(res, 0, 0)); PQclear(res); for (i = 0; i < total_nodes; i++) { while (!nodes[i].is_ready) { /* * the witness will always be masked as ready if it's still not * marked that way and avoid a useless query */ if (nodes[i].is_witness) { if 
(!nodes[i].is_ready) { nodes[i].is_ready = true; ready_nodes++; } break; } /* if the node is not visible, skip it */ if (!nodes[i].is_visible) break; /* if the node is ready there is nothing to check, skip it too */ if (nodes[i].is_ready) break; node_conn = establish_db_connection(nodes[i].conninfo_str, false); /* * XXX This shouldn't happen, if this happens it means this is a * major problem maybe network outages? anyway, is better for a * human to react */ if (PQstatus(node_conn) != CONNECTION_OK) { /* XXX */ log_info(_("At this point, it could be some race conditions " "that are acceptable, assume the node is restarting " "and starting failover procedure\n")); break; } uxlogid = 0; uxrecoff = 0; sqlquery_snprintf(sqlquery, "SELECT %s.repmgr_get_last_standby_location()", repmgr_schema); res = PQexec(node_conn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_err(_("PQexec failed: %s.\nReport an invalid value to not" "be considered as new primary and exit.\n"), PQerrorMessage(node_conn)); PQclear(res); PQfinish(node_conn); terminate(ERR_DB_QUERY); } if (sscanf(PQgetvalue(res, 0, 0), "%X/%X", &uxlogid, &uxrecoff) != 2) { log_info(_("could not parse transaction log location \"%s\"\n"), PQgetvalue(res, 0, 0)); /* we can't do anything but fail at this point... 
*/ if (*PQgetvalue(res, 0, 0) == '\0') { log_crit("Whoops, seems as if shared_preload_libraries=repmgr_funcs is not set!\n"); exit(ERR_BAD_CONFIG); } } PQclear(res); PQfinish(node_conn); /* If position is 0/0, keep checking */ if (uxlogid == 0 && uxrecoff == 0) continue; XLAssignValue(xlog_recptr, uxlogid, uxrecoff); if (XLByteLT(nodes[i].xlog_location, xlog_recptr)) { XLAssignValue(nodes[i].xlog_location, uxlogid, uxrecoff); } log_debug("Last XLog position of node %d: log id=%u (%X), offset=%u (%X)\n", nodes[i].node_id, uxlogid, uxlogid, uxrecoff, uxrecoff); ready_nodes++; nodes[i].is_ready = true; } } /* Close the connection to this server */ PQfinish(my_local_conn); my_local_conn = NULL; /* * determine which one is the best candidate to promote to primary */ for (i = 0; i < total_nodes; i++) { /* witness is never a good candidate */ if (nodes[i].is_witness) continue; if (!nodes[i].is_ready || !nodes[i].is_visible) continue; if (!find_best) { /* * start with the first ready node, and then move on to the next * one */ best_candidate.node_id = nodes[i].node_id; XLAssign(best_candidate.xlog_location, nodes[i].xlog_location); best_candidate.is_ready = nodes[i].is_ready; best_candidate.is_witness = nodes[i].is_witness; find_best = true; } /* we use the macros provided by xlogdefs.h to compare XLogRecPtr */ /* * Nodes are retrieved ordered by priority, so if the current best * candidate is lower than the next node's wal location then assign * next node as the new best candidate. */ if (XLByteLT(best_candidate.xlog_location, nodes[i].xlog_location)) { best_candidate.node_id = nodes[i].node_id; XLAssign(best_candidate.xlog_location, nodes[i].xlog_location); best_candidate.is_ready = nodes[i].is_ready; best_candidate.is_witness = nodes[i].is_witness; } } /* once we know who is the best candidate, promote it */ if (find_best && (best_candidate.node_id == local_options.node)) { if (best_candidate.is_witness) { log_err(_("%s: Node selected as new master is a witness. 
Can't be promoted.\n"), progname); terminate(ERR_FAILOVER_FAIL); } /* wait */ sleep(5); if (verbose) log_info(_("%s: This node is the best candidate to be the new primary, promoting...\n"), progname); log_debug(_("promote command is: \"%s\"\n"), local_options.promote_command); if (log_type == REPMGR_STDERR && *local_options.logfile) { fflush(stderr); } r = system(local_options.promote_command); if (r != 0) { log_err(_("%s: promote command failed. You could check and try it manually.\n"), progname); terminate(ERR_BAD_CONFIG); } } else if (find_best) { /* wait */ sleep(10); if (verbose) log_info(_("%s: Node %d is the best candidate to be the new primary, we should follow it...\n"), progname, best_candidate.node_id); log_debug(_("follow command is: \"%s\"\n"), local_options.follow_command); /* * New Primary need some time to be promoted. The follow command * should take care of that. */ if (log_type == REPMGR_STDERR && *local_options.logfile) { fflush(stderr); } r = system(local_options.follow_command); if (r != 0) { log_err(_("%s: follow command failed. You could check and try it manually.\n"), progname); terminate(ERR_BAD_CONFIG); } } else { log_err(_("%s: Did not find candidates. You should check and try manually.\n"), progname); terminate(ERR_FAILOVER_FAIL); } /* to force it to re-calculate mode and master node */ failover_done = true; /* and reconnect to the local database */ my_local_conn = establish_db_connection(local_options.conninfo, true); }
/* * Insert monitor info, this is basically the time and xlog replayed, * applied on standby and current xlog location in primary. * Also do the math to see how far are we in bytes for being uptodate */ static void MonitorExecute(void) { PGresult *res; char monitor_standby_timestamp[MAXLEN]; char last_wal_primary_location[MAXLEN]; char last_wal_standby_received[MAXLEN]; char last_wal_standby_applied[MAXLEN]; unsigned long long int lsn_primary; unsigned long long int lsn_standby_received; unsigned long long int lsn_standby_applied; int connection_retries; /* * Check if the master is still available, if after 5 minutes of retries * we cannot reconnect, try to get a new master. */ for (connection_retries = 0; connection_retries < 15; connection_retries++) { if (PQstatus(primaryConn) != CONNECTION_OK) { log_warning(_("Connection to master has been lost, trying to recover...\n")); /* wait 20 seconds between retries */ sleep(20); PQreset(primaryConn); } else { if (connection_retries > 0) { log_notice(_("Connection to master has been restored, continue monitoring.\n")); } break; } } if (PQstatus(primaryConn) != CONNECTION_OK) { log_err(_("We couldn't reconnect to master. 
Now checking if another node has been promoted.\n")); for (connection_retries = 0; connection_retries < 6; connection_retries++) { primaryConn = getMasterConnection(myLocalConn, local_options.node, local_options.cluster_name, &primary_options.node,NULL); if (PQstatus(primaryConn) == CONNECTION_OK) { /* Connected, we can continue the process so break the loop */ log_err(_("Connected to node %d, continue monitoring.\n"), primary_options.node); break; } else { log_err(_("We haven't found a new master, waiting before retry...\n")); /* wait 5 minutes before retries, after 6 failures (30 minutes) we stop trying */ sleep(300); } } } if (PQstatus(primaryConn) != CONNECTION_OK) { log_err(_("We couldn't reconnect for long enough, exiting...\n")); exit(ERR_DB_CON); } /* Check if we still are a standby, we could have been promoted */ if (!is_standby(myLocalConn)) { log_err(_("It seems like we have been promoted, so exit from monitoring...\n")); CloseConnections(); exit(ERR_PROMOTED); } /* * first check if there is a command being executed, * and if that is the case, cancel the query so i can * insert the current record */ if (PQisBusy(primaryConn) == 1) CancelQuery(); /* Get local xlog info */ sqlquery_snprintf( sqlquery, "SELECT CURRENT_TIMESTAMP, pg_last_xlog_receive_location(), " "pg_last_xlog_replay_location()"); res = PQexec(myLocalConn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_err("PQexec failed: %s\n", PQerrorMessage(myLocalConn)); PQclear(res); /* if there is any error just let it be and retry in next loop */ return; } strncpy(monitor_standby_timestamp, PQgetvalue(res, 0, 0), MAXLEN); strncpy(last_wal_standby_received , PQgetvalue(res, 0, 1), MAXLEN); strncpy(last_wal_standby_applied , PQgetvalue(res, 0, 2), MAXLEN); PQclear(res); /* Get primary xlog info */ sqlquery_snprintf(sqlquery, "SELECT pg_current_xlog_location() "); res = PQexec(primaryConn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_err("PQexec failed: %s\n", 
PQerrorMessage(primaryConn)); PQclear(res); return; } strncpy(last_wal_primary_location, PQgetvalue(res, 0, 0), MAXLEN); PQclear(res); /* Calculate the lag */ lsn_primary = walLocationToBytes(last_wal_primary_location); lsn_standby_received = walLocationToBytes(last_wal_standby_received); lsn_standby_applied = walLocationToBytes(last_wal_standby_applied); if (only_one_entry && only_one_entry_desired) { sqlquery_snprintf(sqlquery, "UPDATE %s.repl_monitor " "VALUES(%d, %d, '%s'::timestamp with time zone, " " '%s', '%s', " " %lld, %lld)" "WHERE primary_node=%d AND secondary_node=%d", repmgr_schema, primary_options.node, local_options.node, monitor_standby_timestamp, last_wal_primary_location, last_wal_standby_received, (lsn_primary - lsn_standby_received), (lsn_standby_received - lsn_standby_applied)); res = PQexec(primaryConn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_err("PQexec failed: %s\n", PQerrorMessage(conn)); PQclear(res); CloseConnections(); exit(ERR_DB_QUERY); } if (PQntuples(res) != 1) { only_one_entry = false; } PQclear(res); } else { /* * Build and send insert */ sqlquery_snprintf(sqlquery, "INSERT INTO %s.repl_monitor " "VALUES(%d, %d, '%s'::timestamp with time zone, " " '%s', '%s', " " %lld, %lld)", repmgr_schema, primary_options.node, local_options.node, monitor_standby_timestamp, last_wal_primary_location, last_wal_standby_received, (lsn_primary - lsn_standby_received), (lsn_standby_received - lsn_standby_applied)); res = PQexec(primaryConn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_err("PQexec failed: %s\n", PQerrorMessage(conn)); PQclear(res); CloseConnections(); exit(ERR_DB_QUERY); } PQclear(res); if (only_one_entry_desired) { /* * Build the SQL to execute on primary */ sqlquery_snprintf(sqlquery, "DELETE FROM %s.repl_monitor " "WHERE primary_node=%d AND standby_node=%d AND last_monitor_time < '%s'::timestamp with time zone", repmgr_schema, primary_options.node, local_options.node, monitor_standby_timestamp); 
res = PQexec(primaryConn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_err("PQexec failed: %s\n", PQerrorMessage(conn)); PQclear(res); CloseConnections(); exit(ERR_DB_QUERY); } PQclear(res); only_one_entry = true; } } }
/*
 * copy_configuration()
 *
 * Copy records in master's `repl_nodes` table to witness database
 *
 * This is used by `repmgr` when setting up the witness database, and
 * `repmgrd` after a failover event occurs
 *
 * The witness copy of `repl_nodes` is truncated first, then every row read
 * from the master is re-created through create_node_record().
 *
 * Returns true when all records were copied, false as soon as any step
 * fails (the witness table may then be empty or partially filled).
 */
bool
copy_configuration(PGconn *masterconn, PGconn *witnessconn, char *cluster_name)
{
	/* sized like every other sqlquery_snprintf() target in this file */
	char		sqlquery[QUERY_STR_LEN];
	PGresult   *res;
	int			i;

	sqlquery_snprintf(sqlquery, "TRUNCATE TABLE %s.repl_nodes",
					  get_repmgr_schema_quoted(witnessconn));
	log_debug("copy_configuration: %s\n", sqlquery);
	res = PQexec(witnessconn, sqlquery);

	if (!res || PQresultStatus(res) != PGRES_COMMAND_OK)
	{
		fprintf(stderr, "Cannot clean node details in the witness, %s\n",
				PQerrorMessage(witnessconn));
		PQclear(res);
		return false;
	}

	/* the TRUNCATE result is no longer needed; free it before reusing res */
	PQclear(res);

	sqlquery_snprintf(sqlquery,
					  "SELECT id, type, upstream_node_id, name, conninfo, priority, slot_name FROM %s.repl_nodes",
					  get_repmgr_schema_quoted(masterconn));
	res = PQexec(masterconn, sqlquery);

	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		fprintf(stderr, "Can't get configuration from master: %s\n",
				PQerrorMessage(masterconn));
		PQclear(res);
		return false;
	}

	for (i = 0; i < PQntuples(res); i++)
	{
		bool		node_record_created;
		char	   *node_conninfo = PQgetvalue(res, i, 4);

		log_debug(_("copy_configuration(): %s\n"), node_conninfo);

		/* empty upstream_node_id / slot_name columns mean "not set" */
		node_record_created = create_node_record(witnessconn,
												 "copy_configuration",
												 atoi(PQgetvalue(res, i, 0)),
												 PQgetvalue(res, i, 1),
												 strlen(PQgetvalue(res, i, 2))
												 ? atoi(PQgetvalue(res, i, 2))
												 : NO_UPSTREAM_NODE,
												 cluster_name,
												 PQgetvalue(res, i, 3),
												 PQgetvalue(res, i, 4),
												 atoi(PQgetvalue(res, i, 5)),
												 strlen(PQgetvalue(res, i, 6))
												 ? PQgetvalue(res, i, 6)
												 : NULL
			);

		if (node_record_created == false)
		{
			fprintf(stderr, "Unable to copy node record to witness database: %s\n",
					PQerrorMessage(witnessconn));
			PQclear(res);
			return false;
		}
	}

	PQclear(res);

	return true;
}
/*
 * create_node_record()
 *
 * Create an entry in the `repl_nodes` table.
 *
 * conn           connection on which the INSERT is executed
 * action         label used as a prefix when logging the query at debug
 *                level; pass NULL to skip that log line
 * node           node id
 * type           node type; when it is "standby" and no upstream node is
 *                given, the id returned by get_master_node_id() is used
 * upstream_node  upstream node id, or NO_UPSTREAM_NODE
 * cluster_name   cluster name
 * node_name      node name
 * conninfo       connection string stored for the node
 * priority       node priority
 * slot_name      replication slot name; NULL or "" is stored as SQL NULL
 *
 * Text values are sent as query parameters rather than spliced into the SQL
 * text, so values containing single quotes (as conninfo strings with quoted
 * settings do) are stored correctly.
 *
 * Returns true on success, false (after logging a warning) otherwise.
 *
 * XXX we should pass the record parameters as a struct.
 */
bool
create_node_record(PGconn *conn, char *action, int node, char *type, int upstream_node, char *cluster_name, char *node_name, char *conninfo, int priority, char *slot_name)
{
	char		sqlquery[QUERY_STR_LEN];
	char		upstream_node_id[MAXLEN];
	const char *param_values[5];
	PGresult   *res;

	if (upstream_node == NO_UPSTREAM_NODE)
	{
		/*
		 * No explicit upstream node id provided for standby - attempt to
		 * get primary node id
		 */
		if (strcmp(type, "standby") == 0)
		{
			int			primary_node_id = get_master_node_id(conn, cluster_name);

			maxlen_snprintf(upstream_node_id, "%i", primary_node_id);
		}
		else
		{
			maxlen_snprintf(upstream_node_id, "%s", "NULL");
		}
	}
	else
	{
		maxlen_snprintf(upstream_node_id, "%i", upstream_node);
	}

	/* $1..$5: a NULL pointer is transmitted as SQL NULL */
	param_values[0] = type;
	param_values[1] = cluster_name;
	param_values[2] = node_name;
	param_values[3] = conninfo;
	param_values[4] = (slot_name != NULL && slot_name[0] != '\0') ? slot_name : NULL;

	/*
	 * Integer values (and the upstream id, which is either digits or the
	 * keyword NULL) are formatted locally and remain part of the SQL text.
	 */
	sqlquery_snprintf(sqlquery,
					  "INSERT INTO %s.repl_nodes "
					  "       (id, type, upstream_node_id, cluster, "
					  "        name, conninfo, slot_name, priority) "
					  "VALUES (%i, $1, %s, $2, $3, $4, $5, %i) ",
					  get_repmgr_schema_quoted(conn),
					  node,
					  upstream_node_id,
					  priority);

	if (action != NULL)
	{
		log_debug(_("%s: %s\n"), action, sqlquery);
	}

	/* parameter types are left for the server to infer from the columns */
	res = PQexecParams(conn, sqlquery, 5, NULL, param_values, NULL, NULL, 0);

	if (!res || PQresultStatus(res) != PGRES_COMMAND_OK)
	{
		log_warning(_("Unable to create node record: %s\n"),
					PQerrorMessage(conn));
		PQclear(res);
		return false;
	}

	PQclear(res);

	return true;
}
/*
 * create_event_record()
 *
 * Record an event in `repl_events` (when possible) and run the configured
 * event notification command (when there is one).
 *
 * conn        connection used for the INSERT; may be NULL, in which case no
 *             database record is written
 * options     configuration providing `event_notification_command` and the
 *             optional `event_notifications` filter list
 * node_id     id of the node the event relates to
 * event       event type
 * successful  whether the reported action succeeded
 * details     free-form details; may be NULL
 *
 * Placeholders expanded in the notification command:
 *   %n node id, %e event type, %d details, %s "1"/"0", %t timestamp
 *
 * Returns false if the INSERT failed or the notification command returned
 * non-zero, true otherwise.
 */
bool
create_event_record(PGconn *conn, t_configuration_options *options, int node_id, char *event, bool successful, char *details)
{
	char		sqlquery[QUERY_STR_LEN];
	PGresult   *res;
	char		event_timestamp[MAXLEN] = "";
	bool		success = true;
	struct tm	ts;

	char		parsed_command[MAXPGPATH];
	const char *src_ptr;
	char	   *dst_ptr;
	char	   *end_ptr;
	int			r;

	/*
	 * A record is only written when we have a connection handle and the
	 * repmgr schema name has been initialised; without a configuration file
	 * (e.g. `repmgr standby clone`) we would not know which schema to use.
	 */
	if (conn != NULL && strcmp(repmgr_schema, DEFAULT_REPMGR_SCHEMA_PREFIX) != 0)
	{
		/* node id goes over the wire in binary, hence network byte order */
		int			node_id_be = htonl(node_id);
		char	   *successful_str = successful ? "TRUE" : "FALSE";

		const char *values[4] = {
			(char *) &node_id_be,
			event,
			successful_str,
			details
		};
		int			lengths[4] = {sizeof(node_id_be), 0, 0, 0};
		int			binary[4] = {1, 0, 0, 0};

		sqlquery_snprintf(sqlquery,
						  " INSERT INTO %s.repl_events ( "
						  " node_id, "
						  " event, "
						  " successful, "
						  " details "
						  " ) "
						  " VALUES ($1, $2, $3, $4) "
						  " RETURNING event_timestamp ",
						  get_repmgr_schema_quoted(conn));

		res = PQexecParams(conn, sqlquery, 4, NULL, values, lengths, binary, 0);

		if (res && PQresultStatus(res) == PGRES_TUPLES_OK)
		{
			/* keep the server-side timestamp for the notification command */
			strncpy(event_timestamp, PQgetvalue(res, 0, 0), MAXLEN);
			log_debug(_("Event timestamp is: %s\n"), event_timestamp);
		}
		else
		{
			log_warning(_("Unable to create event record: %s\n"),
						PQerrorMessage(conn));
			success = false;
		}

		PQclear(res);
	}

	/*
	 * No timestamp from the database (no connection, or the INSERT failed):
	 * build one locally. The format is not quite PostgreSQL's but is close
	 * enough for diagnostic use.
	 */
	if (event_timestamp[0] == '\0')
	{
		time_t		now = time(NULL);

		ts = *localtime(&now);
		strftime(event_timestamp, MAXLEN, "%Y-%m-%d %H:%M:%S%z", &ts);
	}

	/* nothing more to do unless a notification command is configured */
	if (options->event_notification_command[0] == '\0')
		return success;

	/*
	 * When 'event_notifications' lists specific event types, only those
	 * trigger the command; an empty list means every event does.
	 */
	if (options->event_notifications.head != NULL)
	{
		EventNotificationListCell *cell = options->event_notifications.head;
		bool		listed = false;

		while (cell != NULL && !listed)
		{
			if (strcmp(event, cell->event_type) == 0)
				listed = true;
			else
				cell = cell->next;
		}

		if (!listed)
		{
			log_debug(_("Not executing notification script for event type '%s'\n"), event);
			return success;
		}
	}

	/* expand placeholders, never writing past end_ptr */
	dst_ptr = parsed_command;
	end_ptr = parsed_command + MAXPGPATH - 1;
	*end_ptr = '\0';

	for (src_ptr = options->event_notification_command; *src_ptr; src_ptr++)
	{
		char		node_id_str[32];
		const char *replacement = NULL;
		bool		is_placeholder = false;
		bool		emit = true;

		if (*src_ptr == '%')
		{
			is_placeholder = true;

			switch (src_ptr[1])
			{
				case 'n':
					/* %n: node id */
					snprintf(node_id_str, sizeof(node_id_str), "%i", node_id);
					replacement = node_id_str;
					break;
				case 'e':
					/* %e: event type */
					replacement = event;
					break;
				case 'd':
					/* %d: details (expands to nothing when not provided) */
					replacement = details;
					emit = (details != NULL);
					break;
				case 's':
					/* %s: successful */
					replacement = successful ? "1" : "0";
					break;
				case 't':
					/* %t: timestamp */
					replacement = event_timestamp;
					break;
				default:
					/* otherwise treat the % as not special */
					is_placeholder = false;
					break;
			}
		}

		if (!is_placeholder)
		{
			/* ordinary character (or a lone '%'): copy it if there is room */
			if (dst_ptr < end_ptr)
				*dst_ptr++ = *src_ptr;
			continue;
		}

		/* step over the placeholder letter */
		src_ptr++;

		if (emit)
		{
			strlcpy(dst_ptr, replacement, end_ptr - dst_ptr);
			dst_ptr += strlen(dst_ptr);
		}
	}

	*dst_ptr = '\0';

	log_debug(_("Executing: %s\n"), parsed_command);

	r = system(parsed_command);
	if (r != 0)
	{
		log_warning(_("Unable to execute event notification command\n"));
		success = false;
	}

	return success;
}
/*
 * create_replication_slot()
 *
 * Make sure a physical replication slot called `slot_name` is available on
 * the server behind `conn`.
 *
 * An existing physical slot that is inactive is reused as-is. The call
 * fails when the existing slot is not a physical slot, when it is already
 * active, or when either query fails.
 *
 * Returns true when the slot can be used, false otherwise (an error is
 * logged in that case).
 */
bool
create_replication_slot(PGconn *conn, char *slot_name)
{
	char		sqlquery[QUERY_STR_LEN];
	PGresult   *res;

	/*
	 * Check whether slot exists already; if it exists and is active, that
	 * means another active standby is using it, which creates an error situation;
	 * if not we can reuse it as-is
	 */
	sqlquery_snprintf(sqlquery,
					  "SELECT active, slot_type "
					  " FROM pg_replication_slots "
					  " WHERE slot_name = '%s' ",
					  slot_name);

	res = PQexec(conn, sqlquery);
	if (!res || PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		log_err(_("unable to query pg_replication_slots: %s\n"),
				PQerrorMessage(conn));
		PQclear(res);
		return false;
	}

	if (PQntuples(res))
	{
		if (strcmp(PQgetvalue(res, 0, 1), "physical") != 0)
		{
			log_err(_("Slot '%s' exists and is not a physical slot\n"),
					slot_name);
			PQclear(res);

			/* res has been freed: it must not be inspected any further */
			return false;
		}

		if (strcmp(PQgetvalue(res, 0, 0), "f") == 0)
		{
			PQclear(res);
			log_debug(_("Replication slot '%s' exists but is inactive; reusing\n"),
					  slot_name);

			return true;
		}

		PQclear(res);
		log_err(_("Slot '%s' already exists as an active slot\n"),
				slot_name);
		return false;
	}

	/* no slot of that name: release the lookup result and create one */
	PQclear(res);

	sqlquery_snprintf(sqlquery,
					  "SELECT * FROM pg_create_physical_replication_slot('%s')",
					  slot_name);

	log_debug(_("create_replication_slot(): Creating slot '%s' on primary\n"), slot_name);

	res = PQexec(conn, sqlquery);
	if (!res || PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		log_err(_("unable to create slot '%s' on the primary node: %s\n"),
				slot_name,
				PQerrorMessage(conn));
		PQclear(res);
		return false;
	}

	PQclear(res);
	return true;
}
/*
 * get_master_connection()
 *
 * Find the current master of `cluster` by connecting to each non-witness
 * node registered in `repl_nodes` and asking whether it is in recovery.
 *
 * standby_conn         connection used to read the node list
 * cluster              cluster name
 * master_id            if not NULL, receives the master's node id, or
 *                      NODE_NOT_FOUND when no master was located
 * master_conninfo_out  if not NULL, used as the working buffer for the
 *                      conninfo strings and therefore holds the master's
 *                      conninfo on success; its content is only meaningful
 *                      when a connection is returned. As with
 *                      get_upstream_connection(), the buffer is assumed to
 *                      hold at least MAXCONNINFO bytes.
 *
 * Returns an open connection to the master (the caller must PQfinish() it),
 * or NULL when the node list cannot be read or no node reports itself as
 * master.
 */
PGconn *
get_master_connection(PGconn *standby_conn, char *cluster,
					  int *master_id, char *master_conninfo_out)
{
	PGconn	   *master_conn = NULL;
	PGresult   *res1;
	PGresult   *res2;
	char		sqlquery[QUERY_STR_LEN];
	char		master_conninfo_stack[MAXCONNINFO];
	char	   *master_conninfo = master_conninfo_stack;

	int			i,
				node_id;

	if (master_id != NULL)
	{
		*master_id = NODE_NOT_FOUND;
	}

	/*
	 * If the caller wanted to get a copy of the connection info string, sub
	 * out the local stack pointer for the pointer passed by the caller.
	 */
	if (master_conninfo_out != NULL)
		master_conninfo = master_conninfo_out;

	/* find all nodes belonging to this cluster */
	log_info(_("finding node list for cluster '%s'\n"),
			 cluster);

	sqlquery_snprintf(sqlquery,
					  "SELECT id, conninfo "
					  " FROM %s.repl_nodes "
					  " WHERE cluster = '%s' "
					  " AND type != 'witness' ",
					  get_repmgr_schema_quoted(standby_conn),
					  cluster);

	res1 = PQexec(standby_conn, sqlquery);
	if (PQresultStatus(res1) != PGRES_TUPLES_OK)
	{
		log_err(_("unable to retrieve node records: %s\n"),
				PQerrorMessage(standby_conn));
		PQclear(res1);
		return NULL;
	}

	for (i = 0; i < PQntuples(res1); i++)
	{
		/* initialize with the values of the current node being processed */
		node_id = atoi(PQgetvalue(res1, i, 0));

		/* strncpy() does not terminate on truncation, so do it explicitly */
		strncpy(master_conninfo, PQgetvalue(res1, i, 1), MAXCONNINFO - 1);
		master_conninfo[MAXCONNINFO - 1] = '\0';

		log_info(_("checking role of cluster node '%i'\n"),
				 node_id);
		master_conn = establish_db_connection(master_conninfo, false);

		if (PQstatus(master_conn) != CONNECTION_OK)
		{
			/* unreachable node: release the handle and try the next one */
			if (master_conn != NULL)
				PQfinish(master_conn);
			continue;
		}

		/*
		 * Can't use the is_standby() function here because on error that
		 * function closes the connection passed and exits. This still needs
		 * to close master_conn first.
		 */
		res2 = PQexec(master_conn, "SELECT pg_catalog.pg_is_in_recovery()");

		if (PQresultStatus(res2) != PGRES_TUPLES_OK)
		{
			log_err(_("unable to retrieve recovery state from this node: %s\n"),
					PQerrorMessage(master_conn));
			PQclear(res2);
			PQfinish(master_conn);
			continue;
		}

		/* if false, this is the master */
		if (strcmp(PQgetvalue(res2, 0, 0), "f") == 0)
		{
			PQclear(res2);
			PQclear(res1);
			log_debug(_("get_master_connection(): current master node is %i\n"), node_id);

			if (master_id != NULL)
			{
				*master_id = node_id;
			}

			return master_conn;
		}
		else
		{
			/* if it is a standby, clear info */
			PQclear(res2);
			PQfinish(master_conn);
		}
	}

	/*
	 * If we finish this loop without finding a master then we doesn't have
	 * the info or the master has failed (or we reached max_connections or
	 * superuser_reserved_connections, anything else I'm missing?).
	 *
	 * Probably we will need to check the error to know if we need to start
	 * failover procedure or just fix some situation on the standby.
	 */
	PQclear(res1);
	return NULL;
}
/*
 * get_upstream_connection()
 *
 * Returns connection to node's upstream node
 *
 * standby_conn           connection used to look up the upstream record
 * cluster                cluster name
 * node_id                id of the node whose upstream is wanted
 * upstream_node_id_ptr   if not NULL, receives the upstream node's id
 * upstream_conninfo_out  if not NULL, receives the upstream conninfo string;
 *                        up to MAXCONNINFO bytes are written to it
 *
 * Returns an open connection (the caller must PQfinish() it), or NULL when
 * the lookup fails, no upstream record exists, or the upstream node cannot
 * be reached.
 *
 * NOTE: will attempt to connect even if node is marked as inactive
 */
PGconn *
get_upstream_connection(PGconn *standby_conn, char *cluster, int node_id,
						int *upstream_node_id_ptr, char *upstream_conninfo_out)
{
	PGconn	   *upstream_conn = NULL;
	PGresult   *res;
	char		sqlquery[QUERY_STR_LEN];
	char		upstream_conninfo_stack[MAXCONNINFO];
	char	   *upstream_conninfo = upstream_conninfo_stack;

	/*
	 * If the caller wanted to get a copy of the connection info string, sub
	 * out the local stack pointer for the pointer passed by the caller.
	 */
	if (upstream_conninfo_out != NULL)
		upstream_conninfo = upstream_conninfo_out;

	/* result columns: 0 = conninfo, 1 = name, 2 = id */
	sqlquery_snprintf(sqlquery,
					  " SELECT un.conninfo, un.name, un.id "
					  " FROM %s.repl_nodes un "
					  "INNER JOIN %s.repl_nodes n "
					  " ON (un.id = n.upstream_node_id AND un.cluster = n.cluster)"
					  " WHERE n.cluster = '%s' "
					  " AND n.id = %i ",
					  get_repmgr_schema_quoted(standby_conn),
					  get_repmgr_schema_quoted(standby_conn),
					  cluster,
					  node_id);

	log_debug("get_upstream_connection(): %s\n", sqlquery);

	res = PQexec(standby_conn, sqlquery);

	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		log_err(_("unable to get conninfo for upstream server: %s\n"),
				PQerrorMessage(standby_conn));
		PQclear(res);
		return NULL;
	}

	if (!PQntuples(res))
	{
		log_notice(_("no record found for upstream server\n"));
		PQclear(res);
		return NULL;
	}

	/* strncpy() does not terminate on truncation, so do it explicitly */
	strncpy(upstream_conninfo, PQgetvalue(res, 0, 0), MAXCONNINFO - 1);
	upstream_conninfo[MAXCONNINFO - 1] = '\0';

	/* the id is the third selected column, not the node name in column 1 */
	if (upstream_node_id_ptr != NULL)
		*upstream_node_id_ptr = atoi(PQgetvalue(res, 0, 2));

	PQclear(res);

	log_debug("conninfo is: '%s'\n", upstream_conninfo);
	upstream_conn = establish_db_connection(upstream_conninfo, false);

	if (PQstatus(upstream_conn) != CONNECTION_OK)
	{
		log_err(_("unable to connect to upstream node: %s\n"),
				PQerrorMessage(upstream_conn));

		/* a failed handle still owns memory: release it */
		if (upstream_conn != NULL)
			PQfinish(upstream_conn);

		return NULL;
	}

	return upstream_conn;
}
static void do_failover(void) { PGresult *res1; PGresult *res2; char sqlquery[8192]; int total_nodes = 0; int visible_nodes = 0; bool find_best = false; int i; int r; int node; char nodeConninfo[MAXLEN]; unsigned int uxlogid; unsigned int uxrecoff; char last_wal_standby_applied[MAXLEN]; PGconn *nodeConn = NULL; /* * will get info about until 50 nodes, * which seems to be large enough for most scenarios */ nodeInfo nodes[50]; nodeInfo best_candidate; /* first we get info about this node, and update shared memory */ sprintf(sqlquery, "SELECT pg_last_xlog_replay_location()"); res1 = PQexec(myLocalConn, sqlquery); if (PQresultStatus(res1) != PGRES_TUPLES_OK) { log_err(_("PQexec failed: %s.\nReport an invalid value to not be considered as new primary and exit.\n"), PQerrorMessage(myLocalConn)); PQclear(res1); sprintf(last_wal_standby_applied, "'%X/%X'", 0, 0); update_shared_memory(last_wal_standby_applied); exit(ERR_DB_QUERY); } /* write last location in shared memory */ update_shared_memory(PQgetvalue(res1, 0, 0)); /* * we sleep the monitor time + one second * we bet it should be enough for other repmgrd to update their own data */ sleep(SLEEP_MONITOR + 1); /* get a list of standby nodes, including myself */ sprintf(sqlquery, "SELECT id, conninfo " " FROM %s.repl_nodes " " WHERE id IN (SELECT standby_node FROM %s.repl_status) " " AND cluster = '%s' " " ORDER BY priority ", repmgr_schema, repmgr_schema, local_options.cluster_name); res1 = PQexec(myLocalConn, sqlquery); if (PQresultStatus(res1) != PGRES_TUPLES_OK) { log_err(_("Can't get nodes info: %s\n"), PQerrorMessage(myLocalConn)); PQclear(res1); PQfinish(myLocalConn); exit(ERR_DB_QUERY); } /* ask for the locations */ for (i = 0; i < PQntuples(res1); i++) { node = atoi(PQgetvalue(res1, i, 0)); /* Initialize on false so if we can't reach this node we know that later */ nodes[i].is_ready = false; strncpy(nodeConninfo, PQgetvalue(res1, i, 1), MAXLEN); nodeConn = establishDBConnection(nodeConninfo, false); /* if we can't 
see the node just skip it */ if (PQstatus(nodeConn) != CONNECTION_OK) continue; sqlquery_snprintf(sqlquery, "SELECT repmgr_get_last_standby_location()"); res2 = PQexec(nodeConn, sqlquery); if (PQresultStatus(res2) != PGRES_TUPLES_OK) { log_info(_("Can't get node's last standby location: %s\n"), PQerrorMessage(nodeConn)); log_info(_("Connection details: %s\n"), nodeConninfo); PQclear(res2); PQfinish(nodeConn); continue; } visible_nodes++; if (sscanf(PQgetvalue(res2, 0, 0), "%X/%X", &uxlogid, &uxrecoff) != 2) log_info(_("could not parse transaction log location \"%s\"\n"), PQgetvalue(res2, 0, 0)); nodes[i].nodeId = node; nodes[i].xlog_location.xlogid = uxlogid; nodes[i].xlog_location.xrecoff = uxrecoff; nodes[i].is_ready = true; PQclear(res2); PQfinish(nodeConn); } PQclear(res1); /* Close the connection to this server */ PQfinish(myLocalConn); /* * total nodes that are registered, include master which is a node but was * not counted because it's not a standby */ total_nodes = i + 1; /* * am i on the group that should keep alive? 
* if i see less than half of total_nodes then i should do nothing */ if (visible_nodes < (total_nodes / 2.0)) { log_err(_("Can't reach most of the nodes.\n" "Let the other standby servers decide which one will be the primary.\n" "Manual action will be needed to readd this node to the cluster.\n")); exit(ERR_FAILOVER_FAIL); } /* * determine which one is the best candidate to promote to primary */ for (i = 0; i < total_nodes - 1; i++) { if (!nodes[i].is_ready) continue; else if (!find_best) { /* start with the first ready node, and then move on to the next one */ best_candidate.nodeId = nodes[i].nodeId; best_candidate.xlog_location.xlogid = nodes[i].xlog_location.xlogid; best_candidate.xlog_location.xrecoff = nodes[i].xlog_location.xrecoff; best_candidate.is_ready = nodes[i].is_ready; find_best = true; } /* we use the macros provided by xlogdefs.h to compare XLogPtr */ /* * Nodes are retrieved ordered by priority, so if the current * best candidate is lower or equal to the next node's wal location * then assign next node as the new best candidate. */ if (XLByteLE(best_candidate.xlog_location, nodes[i].xlog_location)) { best_candidate.nodeId = nodes[i].nodeId; best_candidate.xlog_location.xlogid = nodes[i].xlog_location.xlogid; best_candidate.xlog_location.xrecoff = nodes[i].xlog_location.xrecoff; best_candidate.is_ready = nodes[i].is_ready; } } /* once we know who is the best candidate, promote it */ if (find_best && (best_candidate.nodeId == local_options.node)) { if (verbose) log_info(_("%s: This node is the best candidate to be the new primary, promoting...\n"), progname); log_debug(_("promote command is: \"%s\"\n"), local_options.promote_command); r = system(local_options.promote_command); if (r != 0) { log_err(_("%s: promote command failed. 
You could check and try it manually.\n"), progname); exit(ERR_BAD_CONFIG); } } else if (find_best) { if (verbose) log_info(_("%s: Node %d is the best candidate to be the new primary, we should follow it...\n"), progname, best_candidate.nodeId); log_debug(_("follow command is: \"%s\"\n"), local_options.follow_command); /* * New Primary need some time to be promoted. * The follow command should take care of that. */ r = system(local_options.follow_command); if (r != 0) { log_err(_("%s: follow command failed. You could check and try it manually.\n"), progname); exit(ERR_BAD_CONFIG); } } else { log_err(_("%s: Did not find candidates. You should check and try manually.\n"), progname); exit(ERR_FAILOVER_FAIL); } /* and reconnect to the local database */ myLocalConn = establishDBConnection(local_options.conninfo, true); }