/*
 * Detach (fail over) backend node "node_id".
 *
 * If "gracefully" is false, the failover request is simply queued and we
 * return immediately.  Otherwise we first verify the request, block new
 * client connections, wait for existing frontends to disconnect, perform
 * the failover, and wait for its completion before accepting connections
 * again.
 *
 * Returns 0 on success, -1 if waiting for frontends to close timed out.
 */
static int pool_detach_node(int node_id, bool gracefully)
{
	if (!gracefully)
	{
		/* non-graceful detach: queue the failover request and return */
		degenerate_backend_set_ex(&node_id, 1, true, false);
		return 0;
	}

	/* Check if the NODE DOWN can be executed on
	 * the given node id.
	 * NOTE(review): the final "true" argument presumably makes this a
	 * validation-only call — confirm against degenerate_backend_set_ex().
	 */
	degenerate_backend_set_ex(&node_id, 1, true, true);

	/*
	 * Wait until all frontends exit
	 */
	*InRecovery = RECOVERY_DETACH;	/* This will ensure that new incoming
					 * connection requests are blocked */

	if (wait_connection_closed())
	{
		/* wait timed out: re-enable connections and report failure */
		finish_recovery();
		return -1;
	}

	/*
	 * Clear any stale wakeup left over from a previous failover/failback
	 * BEFORE issuing the request, so the completion signal below cannot
	 * be lost.
	 */
	pcp_worker_wakeup_request = 0;

	/*
	 * Now all frontends have gone. Let's do failover.
	 */
	degenerate_backend_set_ex(&node_id, 1, true, false);

	/*
	 * Wait for failover completed.  The flag is set asynchronously
	 * (signal-driven), so poll it once per second.
	 */
	while (!pcp_worker_wakeup_request)
	{
		struct timeval t = {1, 0};
		select(0, NULL, NULL, NULL, &t);
	}
	pcp_worker_wakeup_request = 0;

	/*
	 * Start to accept incoming connections and send SIGUSR2 to pgpool
	 * parent to distribute SIGUSR2 all pgpool children.
	 */
	finish_recovery();

	return 0;
}
/*
 * Promote backend node "node_id" (e.g. make a standby the new primary).
 *
 * If "gracefully" is false, the promote request is simply sent and we
 * return immediately.  Otherwise we block new client connections, wait
 * for existing frontends to disconnect, send the promote request, and
 * wait for the resulting failover to complete before accepting
 * connections again.
 *
 * Returns 0 on success, -1 if waiting for frontends to close timed out.
 */
static int pool_promote_node(int node_id, bool gracefully)
{
	if (!gracefully)
	{
		promote_backend(node_id);	/* send promote request */
		return 0;
	}

	/*
	 * Wait until all frontends exit
	 */
	*InRecovery = RECOVERY_PROMOTE;	/* This will ensure that new incoming
					 * connection requests are blocked */

	if (wait_connection_closed())
	{
		/* wait timed out: re-enable connections and report failure */
		finish_recovery();
		return -1;
	}

	/*
	 * Clear any stale wakeup BEFORE sending the promote request.
	 * Clearing it afterwards (as the previous code did) opens a race:
	 * if the failover completes and the wakeup arrives between
	 * promote_backend() and the reset, the wakeup is lost and the poll
	 * loop below waits indefinitely.  This also matches the ordering
	 * used by pool_detach_node().
	 */
	pcp_worker_wakeup_request = 0;

	/*
	 * Now all frontends have gone. Let's do failover.
	 */
	promote_backend(node_id);	/* send promote request */

	/*
	 * Wait for failover completed.  The flag is set asynchronously
	 * (signal-driven), so poll it once per second.
	 */
	while (!pcp_worker_wakeup_request)
	{
		struct timeval t = {1, 0};
		select(0, NULL, NULL, NULL, &t);
	}
	pcp_worker_wakeup_request = 0;

	/*
	 * Start to accept incoming connections and send SIGUSR2 to pgpool
	 * parent to distribute SIGUSR2 all pgpool children.
	 */
	finish_recovery();

	return 0;
}
/*
 * Handle one received watchdog packet and send back the appropriate
 * response packet on "sock".
 *
 * The sender's WdInfo (or node/lock payload) is extracted from
 * "recv_pack", the local watchdog list is updated, and the response
 * packet number is chosen per request type.  Returns the result of
 * wd_send_packet() (WD_NG on bad input or authentication failure).
 */
static int wd_send_response(int sock, WdPacket * recv_pack)
{
	int rtn = WD_NG;
	WdInfo * p, *q;
	WdNodeInfo * node;
	WdLockInfo * lock;
	WdPacket send_packet;
	struct timeval tv;
	char pack_str[WD_MAX_PACKET_STRING];
	int pack_str_len;
	char hash[(MD5_PASSWD_LEN+1)*2];
	bool is_node_packet = false;

	if (recv_pack == NULL)
	{
		return rtn;
	}
	/* packet_no defaults to 0 via this memset when no case sets it */
	memset(&send_packet, 0, sizeof(WdPacket));
	p = &(recv_pack->wd_body.wd_info);

	/* authentication: verify the packet hash when an authkey is configured */
	if (strlen(pool_config->wd_authkey))
	{
		/* calculate hash from packet */
		pack_str_len = wd_packet_to_string(*recv_pack, pack_str, sizeof(pack_str));
		wd_calc_hash(pack_str, pack_str_len, hash);

		if (strcmp(recv_pack->hash, hash))
		{
			pool_log("wd_send_response: watchdog authentication failed");
			rtn = wd_authentication_failed(sock);
			return rtn;
		}
	}

	/* set response packet no */
	switch (recv_pack->packet_no)
	{
		/* add request into the watchdog list */
		case WD_ADD_REQ:
			p = &(recv_pack->wd_body.wd_info);
			if (wd_set_wd_list(p->hostname,p->pgpool_port, p->wd_port,
			                   p->delegate_ip, &(p->tv), p->status) > 0)
			{
				send_packet.packet_no = WD_ADD_ACCEPT;
			}
			else
			{
				send_packet.packet_no = WD_ADD_REJECT;
			}
			memcpy(&(send_packet.wd_body.wd_info), WD_MYSELF, sizeof(WdInfo));
			break;

		/* announce candidacy to be the new master */
		case WD_STAND_FOR_MASTER:
			p = &(recv_pack->wd_body.wd_info);
			wd_set_wd_list(p->hostname,p->pgpool_port, p->wd_port,
			               p->delegate_ip, &(p->tv), p->status);
			/* check exist master */
			if ((q = wd_is_alive_master()) != NULL)
			{
				/* vote against the candidate: a live master already exists */
				send_packet.packet_no = WD_MASTER_EXIST;
				memcpy(&(send_packet.wd_body.wd_info), q, sizeof(WdInfo));
			}
			else
			{
				/*
				 * Bump our own timestamp past the candidate's so we do
				 * not out-rank it in a subsequent election round.
				 */
				if (WD_MYSELF->tv.tv_sec <= p->tv.tv_sec )
				{
					memcpy(&tv,&(p->tv),sizeof(struct timeval));
					tv.tv_sec += 1;
					wd_set_myself(&tv, WD_NORMAL);
				}
				/* vote for the candidate */
				send_packet.packet_no = WD_VOTE_YOU;
				memcpy(&(send_packet.wd_body.wd_info), WD_MYSELF, sizeof(WdInfo));
			}
			break;

		/* announce assumption to be the new master */
		case WD_DECLARE_NEW_MASTER:
			p = &(recv_pack->wd_body.wd_info);
			wd_set_wd_list(p->hostname,p->pgpool_port, p->wd_port,
			               p->delegate_ip, &(p->tv), p->status);
			if (WD_MYSELF->status == WD_MASTER)
			{
				/* resign master server: release the delegate IP first */
				pool_log("wd_declare_new_master: ifconfig down to resign master server");
				wd_IP_down();
				wd_set_myself(NULL, WD_NORMAL);
			}
			send_packet.packet_no = WD_READY;
			memcpy(&(send_packet.wd_body.wd_info), WD_MYSELF, sizeof(WdInfo));
			break;

		/* announce to assume lock holder */
		case WD_STAND_FOR_LOCK_HOLDER:
			p = &(recv_pack->wd_body.wd_info);
			wd_set_wd_list(p->hostname,p->pgpool_port, p->wd_port,
			               p->delegate_ip, &(p->tv), p->status);
			/* only master handles lock holder privilege */
			if (WD_MYSELF->status == WD_MASTER)
			{
				/*
				 * Reject if a lock holder already exists.
				 * NOTE(review): the original comment claimed the
				 * opposite ("if there are no lock holder yet"), and
				 * when no holder exists (or we are not master)
				 * packet_no is left as 0 from the memset above —
				 * confirm that the default reply is intended here.
				 */
				if (wd_get_lock_holder() != NULL)
				{
					send_packet.packet_no = WD_LOCK_HOLDER_EXIST;
				}
			}
			memcpy(&(send_packet.wd_body.wd_info), WD_MYSELF, sizeof(WdInfo));
			break;

		/* announce to become lock holder */
		case WD_DECLARE_LOCK_HOLDER:
			p = &(recv_pack->wd_body.wd_info);
			wd_set_wd_list(p->hostname,p->pgpool_port, p->wd_port,
			               p->delegate_ip, &(p->tv), p->status);
			wd_set_lock_holder(p, true);
			send_packet.packet_no = WD_READY;
			memcpy(&(send_packet.wd_body.wd_info), WD_MYSELF, sizeof(WdInfo));
			break;

		/* announce to resign lock holder */
		case WD_RESIGN_LOCK_HOLDER:
			p = &(recv_pack->wd_body.wd_info);
			wd_set_wd_list(p->hostname,p->pgpool_port, p->wd_port,
			               p->delegate_ip, &(p->tv), p->status);
			wd_set_lock_holder(p, false);
			send_packet.packet_no = WD_READY;
			memcpy(&(send_packet.wd_body.wd_info), WD_MYSELF, sizeof(WdInfo));
			break;

		/* interlocking started on the sender — no explicit reply packet set */
		case WD_START_INTERLOCK:
			p = &(recv_pack->wd_body.wd_info);
			wd_set_wd_list(p->hostname,p->pgpool_port, p->wd_port,
			               p->delegate_ip, &(p->tv), p->status);
			wd_set_interlocking(p, true);
			break;

		/* interlocking ended on the sender — no explicit reply packet set */
		case WD_END_INTERLOCK:
			p = &(recv_pack->wd_body.wd_info);
			wd_set_wd_list(p->hostname,p->pgpool_port, p->wd_port,
			               p->delegate_ip, &(p->tv), p->status);
			wd_set_interlocking(p, false);
			break;

		/* announce that server is down */
		case WD_SERVER_DOWN:
			p = &(recv_pack->wd_body.wd_info);
			wd_set_wd_list(p->hostname,p->pgpool_port, p->wd_port,
			               p->delegate_ip, &(p->tv), WD_DOWN);
			send_packet.packet_no = WD_READY;
			memcpy(&(send_packet.wd_body.wd_info), WD_MYSELF, sizeof(WdInfo));
			/* escalate to master if we are the oldest surviving node */
			if (wd_am_I_oldest() == WD_OK && WD_MYSELF->status != WD_MASTER)
			{
				wd_escalation();
			}
			break;

		/* announce start online recovery */
		case WD_START_RECOVERY:
			if (*InRecovery != RECOVERY_INIT)
			{
				/* another recovery is already in progress */
				send_packet.packet_no = WD_NODE_FAILED;
			}
			else
			{
				send_packet.packet_no = WD_NODE_READY;
				*InRecovery = RECOVERY_ONLINE;
				if (wait_connection_closed() != 0)
				{
					send_packet.packet_no = WD_NODE_FAILED;
				}
			}
			break;

		case WD_END_RECOVERY:
			send_packet.packet_no = WD_NODE_READY;
			*InRecovery = RECOVERY_INIT;
			/* tell the pgpool parent to resume normal operation */
			kill(wd_ppid, SIGUSR2);
			break;

		case WD_FAILBACK_REQUEST:
			node = &(recv_pack->wd_body.wd_node_info);
			wd_set_node_mask(WD_FAILBACK_REQUEST,node->node_id_set,node->node_num);
			is_node_packet = true;
			send_packet.packet_no = WD_NODE_READY;
			break;

		case WD_DEGENERATE_BACKEND:
			node = &(recv_pack->wd_body.wd_node_info);
			wd_set_node_mask(WD_DEGENERATE_BACKEND,node->node_id_set, node->node_num);
			is_node_packet = true;
			send_packet.packet_no = WD_NODE_READY;
			break;

		case WD_PROMOTE_BACKEND:
			node = &(recv_pack->wd_body.wd_node_info);
			wd_set_node_mask(WD_PROMOTE_BACKEND,node->node_id_set, node->node_num);
			is_node_packet = true;
			send_packet.packet_no = WD_NODE_READY;
			break;

		case WD_UNLOCK_REQUEST:
			lock = &(recv_pack->wd_body.wd_lock_info);
			wd_set_lock(lock->lock_id, false);
			send_packet.packet_no = WD_LOCK_READY;
			break;

		default:
			send_packet.packet_no = WD_INVALID;
			memcpy(&(send_packet.wd_body.wd_info), WD_MYSELF, sizeof(WdInfo));
			break;
	}

	/* send response packet */
	rtn = wd_send_packet(sock, &send_packet);

	/* send node request signal.
	 * wd_node_request_signal() uses a semaphore lock internally, so should be
	 * called after sending a response packet to prevent dead lock.
	 */
	if (is_node_packet)
		wd_node_request_signal(recv_pack->packet_no, node);

	return rtn;
}
/*
 * Start online recovery.
 * "recovery_node" is the node to be recovered.
 * Master or primary node is chosen in this function.
 *
 * On any failure an ERROR is raised via ereport(), which performs a
 * non-local exit; the PG_TRY/PG_CATCH block below guarantees the libpq
 * connection is closed on that path before the error is re-thrown.
 */
void start_recovery(int recovery_node)
{
	int node_id;
	BackendInfo *backend;
	BackendInfo *recovery_backend;
	PGconn *conn;
	int failback_wait_count;
#define FAILBACK_WAIT_MAX_RETRY 5 /* 5 seconds should be enough for failback operation */

	ereport(LOG,
		(errmsg("starting recovering node %d", recovery_node)));

	/* reject out-of-range node ids before indexing backend arrays */
	if ( (recovery_node < 0) || (recovery_node >= pool_config->backend_desc->num_backends) )
		ereport(ERROR,
			(errmsg("node recovery failed, node id: %d is not valid", recovery_node)));

	/* only a detached (dead) node may be recovered */
	if (VALID_BACKEND(recovery_node))
		ereport(ERROR,
			(errmsg("node recovery failed, node id: %d is alive", recovery_node)));

	/* select master/primary node */
	node_id = MASTER_SLAVE ? PRIMARY_NODE_ID : REAL_MASTER_NODE_ID;
	backend = &pool_config->backend_desc->backend_info[node_id];

	/* get node info to be recovered */
	recovery_backend = &pool_config->backend_desc->backend_info[recovery_node];

	conn = connect_backend_libpq(backend);
	if (conn == NULL)
		ereport(ERROR,
			(errmsg("node recovery failed, unable to connect to master node: %d ", node_id)));

	PG_TRY();
	{
		/* 1st stage */
		if (REPLICATION)
		{
			exec_checkpoint(conn);
			ereport(LOG,
				(errmsg("node recovery, CHECKPOINT in the 1st stage done")));
		}

		exec_recovery(conn, backend, recovery_backend, FIRST_STAGE);

		ereport(LOG,
			(errmsg("node recovery, 1st stage is done")));

		if (REPLICATION)
		{
			ereport(LOG,
				(errmsg("node recovery, starting 2nd stage")));

			/* 2nd stage: block new connections while replication catches up */
			*InRecovery = RECOVERY_ONLINE;
			if (pool_config->use_watchdog)
			{
				/* announce start recovery to the other watchdog nodes */
				if (WD_OK != wd_start_recovery())
					ereport(ERROR,
						(errmsg("node recovery failed, failed to send start recovery packet")));
			}

			if (wait_connection_closed() != 0)
				ereport(ERROR,
					(errmsg("node recovery failed, waiting connection closed in the other pgpools timeout")));

			ereport(LOG,
				(errmsg("node recovery, all connections from clients have been closed")));

			exec_checkpoint(conn);
			ereport(LOG,
				(errmsg("node recovery"),
				 errdetail("CHECKPOINT in the 2nd stage done")));

			exec_recovery(conn, backend, recovery_backend, SECOND_STAGE);
		}

		exec_remote_start(conn, recovery_backend);

		check_postmaster_started(recovery_backend);

		ereport(LOG,
			(errmsg("node recovery, node: %d restarted", recovery_node)));

		/*
		 * reset failover completion flag. this is necessary since
		 * previous failover/failback will set the flag to 1.
		 */
		pcp_wakeup_request = 0;

		/* send failback request to pgpool parent */
		send_failback_request(recovery_node);

		/* wait for failback (bounded poll — see FAILBACK_WAIT_MAX_RETRY) */
		failback_wait_count = 0;
		while (!pcp_wakeup_request)
		{
			struct timeval t = {1, 0};
			/* polling SIGUSR2 signal every 1 sec */
			select(0, NULL, NULL, NULL, &t);
			failback_wait_count++;
			if (failback_wait_count >= FAILBACK_WAIT_MAX_RETRY)
			{
				ereport(LOG,
					(errmsg("node recovery"),
					 errdetail("waiting for wake up request is timeout(%d seconds)", FAILBACK_WAIT_MAX_RETRY)));
				break;
			}
		}
		pcp_wakeup_request = 0;
	}
	PG_CATCH();
	{
		/* make sure the libpq connection is not leaked on the error path */
		PQfinish(conn);
		PG_RE_THROW();
	}
	PG_END_TRY();

	PQfinish(conn);

	ereport(LOG,
		(errmsg("recovery done")));
}
/* * Start online recovery. * "recovery_node" is the node to be recovered. * Master or primary node is chosen in this function. */ int start_recovery(int recovery_node) { int node_id; BackendInfo *backend; BackendInfo *recovery_backend; PGconn *conn; int failback_wait_count; #define FAILBACK_WAIT_MAX_RETRY 5 /* 5 seconds should be enough for failback operation */ pool_log("starting recovering node %d", recovery_node); if (VALID_BACKEND(recovery_node)) { pool_error("start_recovery: backend node %d is alive", recovery_node); return 1; } Req_info->kind = NODE_RECOVERY_REQUEST; /* select master/primary node */ node_id = MASTER_SLAVE ? PRIMARY_NODE_ID : REAL_MASTER_NODE_ID; backend = &pool_config->backend_desc->backend_info[node_id]; /* get node info to be recovered */ recovery_backend = &pool_config->backend_desc->backend_info[recovery_node]; conn = connect_backend_libpq(backend); if (conn == NULL) { pool_error("start_recovery: could not connect master node (%d)", node_id); return 1; } /* 1st stage */ if (REPLICATION) { if (exec_checkpoint(conn) != 0) { PQfinish(conn); pool_error("start_recovery: CHECKPOINT failed"); return 1; } pool_log("CHECKPOINT in the 1st stage done"); } if (exec_recovery(conn, recovery_backend, FIRST_STAGE) != 0) { PQfinish(conn); return 1; } pool_log("1st stage is done"); if (REPLICATION) { pool_log("starting 2nd stage"); /* 2nd stage */ *InRecovery = RECOVERY_ONLINE; if (pool_config->use_watchdog) { /* announce start recovery */ if (WD_OK != wd_start_recovery()) { PQfinish(conn); pool_error("start_recovery: timeover for waiting connection closed in the other pgpools"); return 1; } } if (wait_connection_closed() != 0) { PQfinish(conn); pool_error("start_recovery: timeover for waiting connection closed"); return 1; } pool_log("all connections from clients have been closed"); if (exec_checkpoint(conn) != 0) { PQfinish(conn); pool_error("start_recovery: CHECKPOINT failed"); return 1; } pool_log("CHECKPOINT in the 2nd stage done"); if 
(exec_recovery(conn, recovery_backend, SECOND_STAGE) != 0) { PQfinish(conn); return 1; } } if (exec_remote_start(conn, recovery_backend) != 0) { PQfinish(conn); pool_error("start_recovery: remote start failed"); return 1; } if (check_postmaster_started(recovery_backend)) { PQfinish(conn); pool_error("start_recovery: check start failed"); return 1; } pool_log("%d node restarted", recovery_node); /* * reset failover completion flag. this is necessary since * previous failover/failback will set the flag to 1. */ pcp_wakeup_request = 0; /* send failback request to pgpool parent */ send_failback_request(recovery_node); /* wait for failback */ failback_wait_count = 0; while (!pcp_wakeup_request) { struct timeval t = {1, 0}; /* polling SIGUSR2 signal every 1 sec */ select(0, NULL, NULL, NULL, &t); failback_wait_count++; if (failback_wait_count >= FAILBACK_WAIT_MAX_RETRY) { pool_log("start_recovery: waiting for wake up request is timeout(%d seconds)", FAILBACK_WAIT_MAX_RETRY); break; } } pcp_wakeup_request = 0; PQfinish(conn); pool_log("recovery done"); return 0; }