/*
 * Get a connection to the master by reading repl_nodes, creating a connection
 * to each non-witness node (one at a time) and asking it whether it is in
 * recovery.  The first node that reports it is not in recovery is the master.
 *
 * standby_conn         open connection used to read <schema>.repl_nodes
 * schema               schema name; it is quoted here with
 *                      PQescapeIdentifier() before being put in the query
 * cluster              cluster name used to filter repl_nodes
 * master_id            out: set to the id of each node as it is examined,
 *                      so it holds the master's id on success; reset to -1
 *                      when the examined node turns out to be a standby
 * master_conninfo_out  may be NULL.  If it is non-NULL, it is assumed to
 *                      point to allocated memory of MAXCONNINFO in length,
 *                      and the conninfo string of the node being examined
 *                      (the master's, on success) is placed there.
 *
 * Returns an open connection to the master, or NULL if no node that could be
 * reached and queried reported itself as master.  If the schema name cannot
 * be quoted or the node list cannot be read, standby_conn is closed and the
 * process exits with ERR_DB_QUERY.
 */
PGconn *
getMasterConnection(PGconn *standby_conn, char *schema, char *cluster,
                    int *master_id, char *master_conninfo_out)
{
    PGconn     *master_conn = NULL;
    PGresult   *res1;
    PGresult   *res2;
    char        sqlquery[QUERY_STR_LEN];
    char        master_conninfo_stack[MAXCONNINFO];
    char       *master_conninfo = master_conninfo_stack;
    char        schema_quoted[MAXLEN];
    char       *identifier;
    int         i;

    /*
     * If the caller wanted to get a copy of the connection info string, sub
     * out the local stack pointer for the pointer passed by the caller.
     */
    if (master_conninfo_out != NULL)
        master_conninfo = master_conninfo_out;

    /*
     * XXX: This is copied in at least two other procedures
     *
     * Assemble the quoted schema name.  PQescapeIdentifier() returns NULL on
     * failure, which must not be handed to a "%s" conversion.
     */
    identifier = PQescapeIdentifier(standby_conn, schema, strlen(schema));
    if (identifier == NULL)
    {
        log_err(_("Can't quote schema name: %s\n"),
                PQerrorMessage(standby_conn));
        PQfinish(standby_conn);
        exit(ERR_DB_QUERY);
    }
    maxlen_snprintf(schema_quoted, "%s", identifier);
    PQfreemem(identifier);

    /* find all nodes belonging to this cluster */
    log_info(_("finding node list for cluster '%s'\n"), cluster);

    sqlquery_snprintf(sqlquery, "SELECT id, conninfo FROM %s.repl_nodes "
                      " WHERE cluster = '%s' and not witness",
                      schema_quoted, cluster);

    res1 = PQexec(standby_conn, sqlquery);
    if (PQresultStatus(res1) != PGRES_TUPLES_OK)
    {
        log_err(_("Can't get nodes info: %s\n"),
                PQerrorMessage(standby_conn));
        PQclear(res1);
        PQfinish(standby_conn);
        exit(ERR_DB_QUERY);
    }

    for (i = 0; i < PQntuples(res1); i++)
    {
        /* initialize with the values of the current node being processed */
        *master_id = atoi(PQgetvalue(res1, i, 0));

        /* strncpy() does not terminate on truncation, so do it explicitly */
        strncpy(master_conninfo, PQgetvalue(res1, i, 1), MAXCONNINFO);
        master_conninfo[MAXCONNINFO - 1] = '\0';

        log_info(_("checking role of cluster node '%s'\n"),
                 master_conninfo);
        master_conn = establishDBConnection(master_conninfo, false);

        if (PQstatus(master_conn) != CONNECTION_OK)
        {
            /* a failed connection attempt still owns memory: release it */
            PQfinish(master_conn);
            continue;
        }

        /*
         * Can't use the is_standby() function here because on error that
         * function closes the connection passed and exits.  This still
         * needs to close master_conn first.
         */
        res2 = PQexec(master_conn, "SELECT pg_is_in_recovery()");

        if (PQresultStatus(res2) != PGRES_TUPLES_OK)
        {
            log_err(_("Can't get recovery state from this node: %s\n"),
                    PQerrorMessage(master_conn));
            PQclear(res2);
            PQfinish(master_conn);
            continue;
        }

        /* if false, this is the master */
        if (strcmp(PQgetvalue(res2, 0, 0), "f") == 0)
        {
            PQclear(res2);
            PQclear(res1);
            return master_conn;
        }

        /* it is a standby: clear info and try the next node */
        PQclear(res2);
        PQfinish(master_conn);
        *master_id = -1;
    }

    /*
     * If we finish this loop without finding a master then we don't have
     * the info or the master has failed (or we reached max_connections or
     * superuser_reserved_connections, anything else I'm missing?).
     *
     * Probably we will need to check the error to know if we need to start
     * the failover procedure or just fix some situation on the standby.
     */
    PQclear(res1);
    return NULL;
}
static void do_standby_promote(void) { PGconn *conn; PGresult *res; char sqlquery[QUERY_STR_LEN]; char script[QUERY_STR_LEN]; char myClusterName[MAXLEN]; int myLocalId = -1; char conninfo[MAXLEN]; PGconn *old_master_conn; int old_master_id; int r; char data_dir[MAXLEN]; char recovery_file_path[MAXLEN]; char recovery_done_path[MAXLEN]; char standby_version[MAXVERSIONSTR]; /* * Read the configuration file: repmgr.conf */ parse_config(config_file, myClusterName, &myLocalId, conninfo); if (myLocalId == -1) { fprintf(stderr, "Node information is missing. " "Check the configuration file.\n"); exit(1); } /* We need to connect to check configuration */ conn = establishDBConnection(conninfo, true); /* we need v9 or better */ pg_version(conn, standby_version); if (strcmp(standby_version, "") == 0) { PQfinish(conn); fprintf(stderr, _("%s needs standby to be PostgreSQL 9.0 or better\n"), progname); return; } /* Check we are in a standby node */ if (!is_standby(conn)) { fprintf(stderr, "repmgr: The command should be executed in a standby node\n"); return; } /* we also need to check if there isn't any master already */ old_master_conn = getMasterConnection(conn, myLocalId, myClusterName, &old_master_id); if (old_master_conn != NULL) { PQfinish(old_master_conn); fprintf(stderr, "There is a master already in this cluster"); return; } if (verbose) printf(_("\n%s: Promoting standby...\n"), progname); /* Get the data directory full path and the last subdirectory */ sprintf(sqlquery, "SELECT setting " " FROM pg_settings WHERE name = 'data_directory'"); res = PQexec(conn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { fprintf(stderr, "Can't get info about data directory: %s\n", PQerrorMessage(conn)); PQclear(res); PQfinish(conn); return; } strcpy(data_dir, PQgetvalue(res, 0, 0)); PQclear(res); PQfinish(conn); sprintf(recovery_file_path, "%s/%s", data_dir, RECOVERY_FILE); sprintf(recovery_done_path, "%s/%s", data_dir, RECOVERY_DONE_FILE); rename(recovery_file_path, 
recovery_done_path); /* We assume the pg_ctl script is in the PATH */ sprintf(script, "pg_ctl -D %s -m fast restart", data_dir); r = system(script); if (r != 0) { fprintf(stderr, "Can't restart service\n"); return; } /* reconnect to check we got promoted */ /* * XXX i'm removing this because it gives an annoying message saying couldn't connect * but is just the server starting up * conn = establishDBConnection(conninfo, true); * if (is_standby(conn)) * fprintf(stderr, "\n%s: STANDBY PROMOTE failed, this is still a standby node.\n", progname); * else * fprintf(stderr, "\n%s: you should REINDEX any hash indexes you have.\n", progname); * PQfinish(conn); */ return; }
int main(int argc, char **argv) { static struct option long_options[] = { {"config", required_argument, NULL, 'f'}, {"verbose", no_argument, NULL, 'v'}, {"no-history", no_argument, NULL, 'H'}, {NULL, 0, NULL, 0} }; int optindex; int c; char standby_version[MAXVERSIONSTR]; progname = get_progname(argv[0]); if (argc > 1) { if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) { help(progname); exit(SUCCESS); } if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0) { printf("%s (PostgreSQL) " PG_VERSION "\n", progname); exit(SUCCESS); } } while ((c = getopt_long(argc, argv, "f:vH", long_options, &optindex)) != -1) { switch (c) { case 'f': config_file = optarg; break; case 'v': verbose = true; break; case 'H': /* no-history */ only_one_entry_desired = true; break; default: usage(); exit(ERR_BAD_CONFIG); } } setup_cancel_handler(); /* * Read the configuration file: repmgr.conf */ parse_config(config_file, &local_options); if (local_options.node == -1) { log_err("Node information is missing. " "Check the configuration file, or provide one if you have not done so.\n"); exit(ERR_BAD_CONFIG); } logger_init(progname, local_options.loglevel, local_options.logfacility); if (verbose) logger_min_verbose(LOG_INFO); snprintf(repmgr_schema, MAXLEN, "%s%s", DEFAULT_REPMGR_SCHEMA_PREFIX, local_options.cluster_name); log_info(_("%s Connecting to database '%s'\n"), progname, local_options.conninfo); myLocalConn = establishDBConnection(local_options.conninfo, true); /* should be v9 or better */ log_info(_("%s Connected to database, checking its state\n"), progname); pg_version(myLocalConn, standby_version); if (strcmp(standby_version, "") == 0) { PQfinish(myLocalConn); log_err(_("%s needs standby to be PostgreSQL 9.0 or better\n"), progname); exit(ERR_BAD_CONFIG); } /* * Set my server mode, establish a connection to primary * and start monitor */ myLocalMode = is_standby(myLocalConn) ? 
STANDBY_MODE : PRIMARY_MODE; if (myLocalMode == PRIMARY_MODE) { primary_options.node = local_options.node; strncpy(primary_options.conninfo, local_options.conninfo, MAXLEN); primaryConn = myLocalConn; } else { /* I need the id of the primary as well as a connection to it */ log_info(_("%s Connecting to primary for cluster '%s'\n"), progname, local_options.cluster_name); primaryConn = getMasterConnection(myLocalConn, local_options.node, local_options.cluster_name, &primary_options.node,NULL); if (primaryConn == NULL) { CloseConnections(); exit(ERR_BAD_CONFIG); } } checkClusterConfiguration(myLocalConn,primaryConn); checkNodeConfiguration(local_options.conninfo); if (myLocalMode == STANDBY_MODE) { log_info(_("%s Starting continuous standby node monitoring\n"), progname); MonitorCheck(); } else { log_info(_("%s This is a primary node, program not needed here; exiting'\n"), progname); } /* Prevent a double-free */ if (primaryConn == myLocalConn) myLocalConn = NULL; /* close the connection to the database and cleanup */ CloseConnections(); /* Shuts down logging system */ logger_shutdown(); return 0; }
static void do_master_register(void) { PGconn *conn; PGresult *res; char sqlquery[QUERY_STR_LEN]; char myClusterName[MAXLEN]; int myLocalId = -1; char conninfo[MAXLEN]; bool schema_exists = false; char master_version[MAXVERSIONSTR]; /* * Read the configuration file: repmgr.conf */ parse_config(config_file, myClusterName, &myLocalId, conninfo); if (myLocalId == -1) { fprintf(stderr, "Node information is missing. " "Check the configuration file.\n"); exit(1); } conn = establishDBConnection(conninfo, true); /* master should be v9 or better */ pg_version(conn, master_version); if (strcmp(master_version, "") == 0) { PQfinish(conn); fprintf(stderr, _("%s needs master to be PostgreSQL 9.0 or better\n"), progname); return; } /* Check we are a master */ if (is_standby(conn)) { fprintf(stderr, "repmgr: This node should be a master\n"); PQfinish(conn); return; } /* Check if there is a schema for this cluster */ sprintf(sqlquery, "SELECT 1 FROM pg_namespace WHERE nspname = 'repmgr_%s'", myClusterName); res = PQexec(conn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { fprintf(stderr, "Can't get info about schemas: %s\n", PQerrorMessage(conn)); PQclear(res); PQfinish(conn); return; } if (PQntuples(res) > 0) /* schema exists */ { if (!force) /* and we are not forcing so error */ { fprintf(stderr, "Schema repmgr_%s already exists.", myClusterName); PQclear(res); PQfinish(conn); return; } schema_exists = true; } PQclear(res); if (!schema_exists) { /* ok, create the schema */ sprintf(sqlquery, "CREATE SCHEMA repmgr_%s", myClusterName); if (!PQexec(conn, sqlquery)) { fprintf(stderr, "Cannot create the schema repmgr_%s: %s\n", myClusterName, PQerrorMessage(conn)); PQfinish(conn); return; } /* ... 
the tables */ sprintf(sqlquery, "CREATE TABLE repmgr_%s.repl_nodes ( " " id integer primary key, " " cluster text not null, " " conninfo text not null)", myClusterName); if (!PQexec(conn, sqlquery)) { fprintf(stderr, "Cannot create the table repmgr_%s.repl_nodes: %s\n", myClusterName, PQerrorMessage(conn)); PQfinish(conn); return; } sprintf(sqlquery, "CREATE TABLE repmgr_%s.repl_monitor ( " " primary_node INTEGER NOT NULL, " " standby_node INTEGER NOT NULL, " " last_monitor_time TIMESTAMP WITH TIME ZONE NOT NULL, " " last_wal_primary_location TEXT NOT NULL, " " last_wal_standby_location TEXT NOT NULL, " " replication_lag BIGINT NOT NULL, " " apply_lag BIGINT NOT NULL) ", myClusterName); if (!PQexec(conn, sqlquery)) { fprintf(stderr, "Cannot create the table repmgr_%s.repl_monitor: %s\n", myClusterName, PQerrorMessage(conn)); PQfinish(conn); return; } /* and the view */ sprintf(sqlquery, "CREATE VIEW repmgr_%s.repl_status AS " " WITH monitor_info AS (SELECT *, ROW_NUMBER() OVER (PARTITION BY primary_node, standby_node " " ORDER BY last_monitor_time desc) " " FROM repmgr_%s.repl_monitor) " " SELECT primary_node, standby_node, last_monitor_time, last_wal_primary_location, " " last_wal_standby_location, pg_size_pretty(replication_lag) replication_lag, " " pg_size_pretty(apply_lag) apply_lag, age(now(), last_monitor_time) AS time_lag " " FROM monitor_info a " " WHERE row_number = 1", myClusterName, myClusterName); if (!PQexec(conn, sqlquery)) { fprintf(stderr, "Cannot create the view repmgr_%s.repl_status: %s\n", myClusterName, PQerrorMessage(conn)); PQfinish(conn); return; } } else { PGconn *master_conn; int id; /* Ensure there isn't any other master already registered */ master_conn = getMasterConnection(conn, myLocalId, myClusterName, &id); if (master_conn != NULL) { PQfinish(master_conn); fprintf(stderr, "There is a master already in this cluster"); return; } } /* Now register the master */ if (force) { sprintf(sqlquery, "DELETE FROM repmgr_%s.repl_nodes " " WHERE 
id = %d", myClusterName, myLocalId); if (!PQexec(conn, sqlquery)) { fprintf(stderr, "Cannot delete node details, %s\n", PQerrorMessage(conn)); PQfinish(conn); return; } } sprintf(sqlquery, "INSERT INTO repmgr_%s.repl_nodes " "VALUES (%d, '%s', '%s')", myClusterName, myLocalId, myClusterName, conninfo); if (!PQexec(conn, sqlquery)) { fprintf(stderr, "Cannot insert node details, %s\n", PQerrorMessage(conn)); PQfinish(conn); return; } PQfinish(conn); return; }
static void do_standby_register(void) { PGconn *conn; PGconn *master_conn; int master_id; PGresult *res; char sqlquery[QUERY_STR_LEN]; char myClusterName[MAXLEN]; int myLocalId = -1; char conninfo[MAXLEN]; char master_version[MAXVERSIONSTR]; char standby_version[MAXVERSIONSTR]; /* * Read the configuration file: repmgr.conf */ parse_config(config_file, myClusterName, &myLocalId, conninfo); if (myLocalId == -1) { fprintf(stderr, "Node information is missing. " "Check the configuration file.\n"); exit(1); } conn = establishDBConnection(conninfo, true); /* should be v9 or better */ pg_version(conn, standby_version); if (strcmp(standby_version, "") == 0) { PQfinish(conn); fprintf(stderr, _("%s needs standby to be PostgreSQL 9.0 or better\n"), progname); return; } /* Check we are a standby */ if (!is_standby(conn)) { fprintf(stderr, "repmgr: This node should be a standby\n"); PQfinish(conn); return; } /* Check if there is a schema for this cluster */ sprintf(sqlquery, "SELECT 1 FROM pg_namespace WHERE nspname = 'repmgr_%s'", myClusterName); res = PQexec(conn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { fprintf(stderr, "Can't get info about tablespaces: %s\n", PQerrorMessage(conn)); PQclear(res); PQfinish(conn); return; } if (PQntuples(res) == 0) /* schema doesn't exists */ { fprintf(stderr, "Schema repmgr_%s doesn't exists.", myClusterName); PQclear(res); PQfinish(conn); return; } PQclear(res); /* check if there is a master in this cluster */ master_conn = getMasterConnection(conn, myLocalId, myClusterName, &master_id); if (!master_conn) return; /* master should be v9 or better */ pg_version(master_conn, master_version); if (strcmp(master_version, "") == 0) { PQfinish(conn); PQfinish(master_conn); fprintf(stderr, _("%s needs master to be PostgreSQL 9.0 or better\n"), progname); return; } /* master and standby version should match */ if (strcmp(master_version, standby_version) != 0) { PQfinish(conn); PQfinish(master_conn); fprintf(stderr, _("%s needs versions 
of both master (%s) and standby (%s) to match.\n"), progname, master_version, standby_version); return; } /* Now register the standby */ if (force) { sprintf(sqlquery, "DELETE FROM repmgr_%s.repl_nodes " " WHERE id = %d", myClusterName, myLocalId); if (!PQexec(master_conn, sqlquery)) { fprintf(stderr, "Cannot delete node details, %s\n", PQerrorMessage(master_conn)); PQfinish(master_conn); PQfinish(conn); return; } } sprintf(sqlquery, "INSERT INTO repmgr_%s.repl_nodes " "VALUES (%d, '%s', '%s')", myClusterName, myLocalId, myClusterName, conninfo); if (!PQexec(master_conn, sqlquery)) { fprintf(stderr, "Cannot insert node details, %s\n", PQerrorMessage(master_conn)); PQfinish(master_conn); PQfinish(conn); return; } PQfinish(master_conn); PQfinish(conn); return; }
static void do_standby_follow(void) { PGconn *conn; PGresult *res; char sqlquery[QUERY_STR_LEN]; char script[QUERY_STR_LEN]; char myClusterName[MAXLEN]; int myLocalId = -1; char conninfo[MAXLEN]; PGconn *master_conn; int master_id; int r; char data_dir[MAXLEN]; char master_version[MAXVERSIONSTR]; char standby_version[MAXVERSIONSTR]; /* * Read the configuration file: repmgr.conf */ parse_config(config_file, myClusterName, &myLocalId, conninfo); if (myLocalId == -1) { fprintf(stderr, "Node information is missing. " "Check the configuration file.\n"); exit(1); } /* We need to connect to check configuration */ conn = establishDBConnection(conninfo, true); /* Check we are in a standby node */ if (!is_standby(conn)) { fprintf(stderr, "\n%s: The command should be executed in a standby node\n", progname); return; } /* should be v9 or better */ pg_version(conn, standby_version); if (strcmp(standby_version, "") == 0) { PQfinish(conn); fprintf(stderr, _("\n%s needs standby to be PostgreSQL 9.0 or better\n"), progname); return; } /* we also need to check if there is any master in the cluster */ master_conn = getMasterConnection(conn, myLocalId, myClusterName, &master_id); if (master_conn == NULL) { PQfinish(conn); fprintf(stderr, "There isn't a master to follow in this cluster"); return; } /* Check we are going to point to a master */ if (is_standby(master_conn)) { PQfinish(conn); fprintf(stderr, "%s: The node to follow should be a master\n", progname); return; } /* should be v9 or better */ pg_version(master_conn, master_version); if (strcmp(master_version, "") == 0) { PQfinish(conn); PQfinish(master_conn); fprintf(stderr, _("%s needs master to be PostgreSQL 9.0 or better\n"), progname); return; } /* master and standby version should match */ if (strcmp(master_version, standby_version) != 0) { PQfinish(conn); PQfinish(master_conn); fprintf(stderr, _("%s needs versions of both master (%s) and standby (%s) to match.\n"), progname, master_version, standby_version); return; } /* 
* set the host and masterport variables with the master ones * before closing the connection because we will need them to * recreate the recovery.conf file */ host = malloc(20); masterport = malloc(10); strcpy(host, PQhost(master_conn)); strcpy(masterport, PQport(master_conn)); PQfinish(master_conn); if (verbose) printf(_("\n%s: Changing standby's master...\n"), progname); /* Get the data directory full path */ sprintf(sqlquery, "SELECT setting " " FROM pg_settings WHERE name = 'data_directory'"); res = PQexec(conn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { fprintf(stderr, "Can't get info about data directory: %s\n", PQerrorMessage(conn)); PQclear(res); PQfinish(conn); return; } strcpy(data_dir, PQgetvalue(res, 0, 0)); PQclear(res); PQfinish(conn); /* write the recovery.conf file */ if (!create_recovery_file(data_dir)) return; /* Finally, restart the service */ /* We assume the pg_ctl script is in the PATH */ sprintf(script, "pg_ctl -D %s -m fast restart", data_dir); r = system(script); if (r != 0) { fprintf(stderr, "Can't restart service\n"); return; } return; }
static void do_failover(void) { PGresult *res1; PGresult *res2; char sqlquery[8192]; int total_nodes = 0; int visible_nodes = 0; bool find_best = false; int i; int r; int node; char nodeConninfo[MAXLEN]; unsigned int uxlogid; unsigned int uxrecoff; char last_wal_standby_applied[MAXLEN]; PGconn *nodeConn = NULL; /* * will get info about until 50 nodes, * which seems to be large enough for most scenarios */ nodeInfo nodes[50]; nodeInfo best_candidate; /* first we get info about this node, and update shared memory */ sprintf(sqlquery, "SELECT pg_last_xlog_replay_location()"); res1 = PQexec(myLocalConn, sqlquery); if (PQresultStatus(res1) != PGRES_TUPLES_OK) { log_err(_("PQexec failed: %s.\nReport an invalid value to not be considered as new primary and exit.\n"), PQerrorMessage(myLocalConn)); PQclear(res1); sprintf(last_wal_standby_applied, "'%X/%X'", 0, 0); update_shared_memory(last_wal_standby_applied); exit(ERR_DB_QUERY); } /* write last location in shared memory */ update_shared_memory(PQgetvalue(res1, 0, 0)); /* * we sleep the monitor time + one second * we bet it should be enough for other repmgrd to update their own data */ sleep(SLEEP_MONITOR + 1); /* get a list of standby nodes, including myself */ sprintf(sqlquery, "SELECT id, conninfo " " FROM %s.repl_nodes " " WHERE id IN (SELECT standby_node FROM %s.repl_status) " " AND cluster = '%s' " " ORDER BY priority ", repmgr_schema, repmgr_schema, local_options.cluster_name); res1 = PQexec(myLocalConn, sqlquery); if (PQresultStatus(res1) != PGRES_TUPLES_OK) { log_err(_("Can't get nodes info: %s\n"), PQerrorMessage(myLocalConn)); PQclear(res1); PQfinish(myLocalConn); exit(ERR_DB_QUERY); } /* ask for the locations */ for (i = 0; i < PQntuples(res1); i++) { node = atoi(PQgetvalue(res1, i, 0)); /* Initialize on false so if we can't reach this node we know that later */ nodes[i].is_ready = false; strncpy(nodeConninfo, PQgetvalue(res1, i, 1), MAXLEN); nodeConn = establishDBConnection(nodeConninfo, false); /* if we can't 
see the node just skip it */ if (PQstatus(nodeConn) != CONNECTION_OK) continue; sqlquery_snprintf(sqlquery, "SELECT repmgr_get_last_standby_location()"); res2 = PQexec(nodeConn, sqlquery); if (PQresultStatus(res2) != PGRES_TUPLES_OK) { log_info(_("Can't get node's last standby location: %s\n"), PQerrorMessage(nodeConn)); log_info(_("Connection details: %s\n"), nodeConninfo); PQclear(res2); PQfinish(nodeConn); continue; } visible_nodes++; if (sscanf(PQgetvalue(res2, 0, 0), "%X/%X", &uxlogid, &uxrecoff) != 2) log_info(_("could not parse transaction log location \"%s\"\n"), PQgetvalue(res2, 0, 0)); nodes[i].nodeId = node; nodes[i].xlog_location.xlogid = uxlogid; nodes[i].xlog_location.xrecoff = uxrecoff; nodes[i].is_ready = true; PQclear(res2); PQfinish(nodeConn); } PQclear(res1); /* Close the connection to this server */ PQfinish(myLocalConn); /* * total nodes that are registered, include master which is a node but was * not counted because it's not a standby */ total_nodes = i + 1; /* * am i on the group that should keep alive? 
* if i see less than half of total_nodes then i should do nothing */ if (visible_nodes < (total_nodes / 2.0)) { log_err(_("Can't reach most of the nodes.\n" "Let the other standby servers decide which one will be the primary.\n" "Manual action will be needed to readd this node to the cluster.\n")); exit(ERR_FAILOVER_FAIL); } /* * determine which one is the best candidate to promote to primary */ for (i = 0; i < total_nodes - 1; i++) { if (!nodes[i].is_ready) continue; else if (!find_best) { /* start with the first ready node, and then move on to the next one */ best_candidate.nodeId = nodes[i].nodeId; best_candidate.xlog_location.xlogid = nodes[i].xlog_location.xlogid; best_candidate.xlog_location.xrecoff = nodes[i].xlog_location.xrecoff; best_candidate.is_ready = nodes[i].is_ready; find_best = true; } /* we use the macros provided by xlogdefs.h to compare XLogPtr */ /* * Nodes are retrieved ordered by priority, so if the current * best candidate is lower or equal to the next node's wal location * then assign next node as the new best candidate. */ if (XLByteLE(best_candidate.xlog_location, nodes[i].xlog_location)) { best_candidate.nodeId = nodes[i].nodeId; best_candidate.xlog_location.xlogid = nodes[i].xlog_location.xlogid; best_candidate.xlog_location.xrecoff = nodes[i].xlog_location.xrecoff; best_candidate.is_ready = nodes[i].is_ready; } } /* once we know who is the best candidate, promote it */ if (find_best && (best_candidate.nodeId == local_options.node)) { if (verbose) log_info(_("%s: This node is the best candidate to be the new primary, promoting...\n"), progname); log_debug(_("promote command is: \"%s\"\n"), local_options.promote_command); r = system(local_options.promote_command); if (r != 0) { log_err(_("%s: promote command failed. 
You could check and try it manually.\n"), progname); exit(ERR_BAD_CONFIG); } } else if (find_best) { if (verbose) log_info(_("%s: Node %d is the best candidate to be the new primary, we should follow it...\n"), progname, best_candidate.nodeId); log_debug(_("follow command is: \"%s\"\n"), local_options.follow_command); /* * New Primary need some time to be promoted. * The follow command should take care of that. */ r = system(local_options.follow_command); if (r != 0) { log_err(_("%s: follow command failed. You could check and try it manually.\n"), progname); exit(ERR_BAD_CONFIG); } } else { log_err(_("%s: Did not find candidates. You should check and try manually.\n"), progname); exit(ERR_FAILOVER_FAIL); } /* and reconnect to the local database */ myLocalConn = establishDBConnection(local_options.conninfo, true); }
int main(int argc, char **argv) { static struct option long_options[] = { {"config", required_argument, NULL, 'f'}, {"verbose", no_argument, NULL, 'v'}, {NULL, 0, NULL, 0} }; int optindex; int c; char standby_version[MAXVERSIONSTR]; progname = get_progname(argv[0]); if (argc > 1) { if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) { help(progname); exit(SUCCESS); } if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0) { printf("%s (PostgreSQL) " PG_VERSION "\n", progname); exit(SUCCESS); } } while ((c = getopt_long(argc, argv, "f:v", long_options, &optindex)) != -1) { switch (c) { case 'f': config_file = optarg; break; case 'v': verbose = true; break; default: usage(); exit(ERR_BAD_CONFIG); } } setup_event_handlers(); /* * Read the configuration file: repmgr.conf */ parse_config(config_file, &local_options); if (local_options.node == -1) { log_err(_("Node information is missing. " "Check the configuration file, or provide one if you have not done so.\n")); exit(ERR_BAD_CONFIG); } logger_init(progname, local_options.loglevel, local_options.logfacility); if (verbose) logger_min_verbose(LOG_INFO); snprintf(repmgr_schema, MAXLEN, "%s%s", DEFAULT_REPMGR_SCHEMA_PREFIX, local_options.cluster_name); log_info(_("%s Connecting to database '%s'\n"), progname, local_options.conninfo); myLocalConn = establishDBConnection(local_options.conninfo, true); /* should be v9 or better */ log_info(_("%s Connected to database, checking its state\n"), progname); pg_version(myLocalConn, standby_version); if (strcmp(standby_version, "") == 0) { log_err(_("%s needs standby to be PostgreSQL 9.0 or better\n"), progname); PQfinish(myLocalConn); exit(ERR_BAD_CONFIG); } /* * Set my server mode, establish a connection to primary * and start monitor */ if (is_witness(myLocalConn, repmgr_schema, local_options.cluster_name, local_options.node)) myLocalMode = WITNESS_MODE; else if (is_standby(myLocalConn)) myLocalMode = STANDBY_MODE; else /* is the master */ myLocalMode 
= PRIMARY_MODE; switch (myLocalMode) { case PRIMARY_MODE: primary_options.node = local_options.node; strncpy(primary_options.conninfo, local_options.conninfo, MAXLEN); primaryConn = myLocalConn; checkClusterConfiguration(myLocalConn, primaryConn); checkNodeConfiguration(local_options.conninfo); if (reload_configuration(config_file, &local_options)) { PQfinish(myLocalConn); myLocalConn = establishDBConnection(local_options.conninfo, true); primaryConn = myLocalConn; update_registration(); } log_info(_("%s Starting continuous primary connection check\n"), progname); /* Check that primary is still alive, and standbies are sending info */ /* * Every SLEEP_MONITOR seconds, do master checks * XXX * Check that standbies are sending info */ for (;;) { if (CheckPrimaryConnection()) { /* CheckActiveStandbiesConnections(); CheckInactiveStandbies(); */ sleep(SLEEP_MONITOR); } else { /* XXX * May we do something more verbose ? */ exit (1); } if (got_SIGHUP) { /* if we can reload, then could need to change myLocalConn */ if (reload_configuration(config_file, &local_options)) { PQfinish(myLocalConn); myLocalConn = establishDBConnection(local_options.conninfo, true); primaryConn = myLocalConn; update_registration(); } got_SIGHUP = false; } } break; case WITNESS_MODE: case STANDBY_MODE: /* I need the id of the primary as well as a connection to it */ log_info(_("%s Connecting to primary for cluster '%s'\n"), progname, local_options.cluster_name); primaryConn = getMasterConnection(myLocalConn, repmgr_schema, local_options.node, local_options.cluster_name, &primary_options.node, NULL); if (primaryConn == NULL) { CloseConnections(); exit(ERR_BAD_CONFIG); } checkClusterConfiguration(myLocalConn, primaryConn); checkNodeConfiguration(local_options.conninfo); if (reload_configuration(config_file, &local_options)) { PQfinish(myLocalConn); myLocalConn = establishDBConnection(local_options.conninfo, true); update_registration(); } /* * Every SLEEP_MONITOR seconds, do checks */ if 
(myLocalMode == WITNESS_MODE) { log_info(_("%s Starting continuous witness node monitoring\n"), progname); } else if (myLocalMode == STANDBY_MODE) { log_info(_("%s Starting continuous standby node monitoring\n"), progname); } for (;;) { if (myLocalMode == WITNESS_MODE) WitnessMonitor(); else if (myLocalMode == STANDBY_MODE) StandbyMonitor(); sleep(SLEEP_MONITOR); if (got_SIGHUP) { /* if we can reload, then could need to change myLocalConn */ if (reload_configuration(config_file, &local_options)) { PQfinish(myLocalConn); myLocalConn = establishDBConnection(local_options.conninfo, true); update_registration(); } got_SIGHUP = false; } } break; default: log_err(_("%s: Unrecognized mode for node %d\n"), progname, local_options.node); } /* Prevent a double-free */ if (primaryConn == myLocalConn) myLocalConn = NULL; /* close the connection to the database and cleanup */ CloseConnections(); /* Shuts down logging system */ logger_shutdown(); return 0; }
/*
 * Re-read config_file and, if the new settings are acceptable, copy them
 * into *orig_options.
 *
 * The new configuration is rejected (and *orig_options left untouched) when
 * it has no node id, when it changes the cluster name, node number or node
 * name, when failover, master_response_timeout, reconnect_attempts or
 * reconnect_intvl hold invalid values, or when a test connection using the
 * new conninfo cannot be established.
 *
 * loglevel and logfacility are deliberately not reloaded (XXX: can these
 * change with a simple SIGHUP?).
 *
 * Returns true when the new values were loaded, false otherwise.
 */
bool
reload_configuration(char *config_file, t_configuration_options *orig_options)
{
    PGconn     *conn;
    t_configuration_options new_options;

    /*
     * Re-read the configuration file: repmgr.conf
     */
    log_info(_("Reloading configuration file and updating repmgr tables\n"));
    parse_config(config_file, &new_options);
    if (new_options.node == -1)
    {
        log_warning(_("\nCannot load new configuration, will keep current one.\n"));
        return false;
    }

    if (strcmp(new_options.cluster_name, orig_options->cluster_name) != 0)
    {
        log_warning(_("\nCannot change cluster name, will keep current configuration.\n"));
        return false;
    }

    if (new_options.node != orig_options->node)
    {
        log_warning(_("\nCannot change node number, will keep current configuration.\n"));
        return false;
    }

    /*
     * Compare the names themselves: "!=" on the two members compares
     * addresses, which differ for two distinct option structs.
     */
    if (strcmp(new_options.node_name, orig_options->node_name) != 0)
    {
        log_warning(_("\nCannot change standby name, will keep current configuration.\n"));
        return false;
    }

    if (new_options.failover != MANUAL_FAILOVER && new_options.failover != AUTOMATIC_FAILOVER)
    {
        log_warning(_("\nNew value for failover is not valid. Should be MANUAL or AUTOMATIC.\n"));
        return false;
    }

    if (new_options.master_response_timeout <= 0)
    {
        log_warning(_("\nNew value for master_response_timeout is not valid. Should be greater than zero.\n"));
        return false;
    }

    if (new_options.reconnect_attempts < 0)
    {
        log_warning(_("\nNew value for reconnect_attempts is not valid. Should be greater or equal than zero.\n"));
        return false;
    }

    if (new_options.reconnect_intvl < 0)
    {
        log_warning(_("\nNew value for reconnect_interval is not valid. Should be greater or equal than zero.\n"));
        return false;
    }

    /* Test conninfo string */
    conn = establishDBConnection(new_options.conninfo, false);
    if (!conn || (PQstatus(conn) != CONNECTION_OK))
    {
        log_warning(_("\nconninfo string is not valid, will keep current configuration.\n"));
        /* a failed connection attempt still owns memory: release it */
        if (conn)
            PQfinish(conn);
        return false;
    }
    PQfinish(conn);

    /* Configuration seems ok, will load new values */
    strcpy(orig_options->cluster_name, new_options.cluster_name);
    orig_options->node = new_options.node;
    strcpy(orig_options->conninfo, new_options.conninfo);
    orig_options->failover = new_options.failover;
    orig_options->priority = new_options.priority;
    strcpy(orig_options->node_name, new_options.node_name);
    strcpy(orig_options->promote_command, new_options.promote_command);
    strcpy(orig_options->follow_command, new_options.follow_command);
    strcpy(orig_options->rsync_options, new_options.rsync_options);
    strcpy(orig_options->ssh_options, new_options.ssh_options);
    orig_options->master_response_timeout = new_options.master_response_timeout;
    orig_options->reconnect_attempts = new_options.reconnect_attempts;
    orig_options->reconnect_intvl = new_options.reconnect_intvl;

    return true;
}
int main(int argc, char **argv) { static struct option long_options[] = { {"config", required_argument, NULL, 'f'}, {"verbose", no_argument, NULL, 'v'}, {NULL, 0, NULL, 0} }; int optindex; int c; char conninfo[MAXLEN]; char standby_version[MAXVERSIONSTR]; progname = get_progname(argv[0]); if (argc > 1) { if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) { help(progname); exit(0); } if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0) { printf("%s (PostgreSQL) " PG_VERSION "\n", progname); exit(0); } } while ((c = getopt_long(argc, argv, "f:v", long_options, &optindex)) != -1) { switch (c) { case 'f': config_file = optarg; break; case 'v': verbose = true; break; default: fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); exit(1); } } setup_cancel_handler(); if (config_file == NULL) { config_file = malloc(5 + sizeof(CONFIG_FILE)); sprintf(config_file, "./%s", CONFIG_FILE); } /* * Read the configuration file: repmgr.conf */ parse_config(config_file, myClusterName, &myLocalId, conninfo); if (myLocalId == -1) { fprintf(stderr, "Node information is missing. " "Check the configuration file.\n"); exit(1); } myLocalConn = establishDBConnection(conninfo, true); /* should be v9 or better */ pg_version(myLocalConn, standby_version); if (strcmp(standby_version, "") == 0) { PQfinish(myLocalConn); fprintf(stderr, _("%s needs standby to be PostgreSQL 9.0 or better\n"), progname); exit(1); } /* * Set my server mode, establish a connection to primary * and start monitor */ myLocalMode = is_standby(myLocalConn) ? 
STANDBY_MODE : PRIMARY_MODE; if (myLocalMode == PRIMARY_MODE) { primaryId = myLocalId; strcpy(primaryConninfo, conninfo); primaryConn = myLocalConn; } else { /* I need the id of the primary as well as a connection to it */ primaryConn = getMasterConnection(myLocalConn, myLocalId, myClusterName, &primaryId); if (primaryConn == NULL) exit(1); } checkClusterConfiguration(); checkNodeConfiguration(conninfo); if (myLocalMode == STANDBY_MODE) { MonitorCheck(); } /* close the connection to the database and cleanup */ CloseConnections(); return 0; }