/*
 * Notify the parent process of backend connection errors using SIGUSR1.
 *
 * node_id_set: array of backend node ids for which fail over is requested
 * count:       number of entries in node_id_set
 *
 * Does nothing in parallel mode.  Entries that are out of range or do not
 * refer to a valid backend are logged and skipped.  The request is stored
 * in Req_info while REQUEST_INFO_SEM is held, then the parent is signalled.
 */
void degenerate_backend_set(int *node_id_set, int count)
{
    pid_t parent = getppid();
    int i;

    if (pool_config->parallel_mode)
    {
        return;
    }

    pool_semaphore_lock(REQUEST_INFO_SEM);
    Req_info->kind = NODE_DOWN_REQUEST;
    for (i = 0; i < count; i++)
    {
        if (node_id_set[i] < 0 || node_id_set[i] >= MAX_NUM_BACKENDS ||
            !VALID_BACKEND(node_id_set[i]))
        {
            /* report the rejected node id itself, not its index in the set */
            pool_log("notice_backend_error: node %d is not valid backend.", node_id_set[i]);
            continue;
        }

        pool_log("notice_backend_error: %d fail over request from pid %d", node_id_set[i], getpid());
        Req_info->node_id[i] = node_id_set[i];
    }
    kill(parent, SIGUSR1);
    pool_semaphore_unlock(REQUEST_INFO_SEM);
}
/*
 * Forward a notification response ('A') to the frontend.
 *
 * Reads a pid and a condition string from every valid backend and forwards
 * the values read from the master node.
 *
 * Returns POOL_ERROR if reading the pid fails, POOL_END if reading the
 * condition string (or duplicating it) fails, otherwise the status of the
 * final write to the frontend.
 */
POOL_STATUS NotificationResponse(POOL_CONNECTION *frontend,
                                 POOL_CONNECTION_POOL *backend)
{
    int pid, pid1 = 0;          /* pid1 initialized so it is never sent uninitialized */
    char *condition, *condition1 = NULL;
    int len, len1 = 0;
    int i;
    POOL_STATUS status;

    pool_write(frontend, "A", 1);

    for (i=0;i<NUM_BACKENDS;i++)
    {
        if (!VALID_BACKEND(i))
            continue;

        if (pool_read(CONNECTION(backend, i), &pid, sizeof(pid)) < 0)
        {
            /* the master's copy may already have been taken */
            free(condition1);
            return POOL_ERROR;
        }

        condition = pool_read_string(CONNECTION(backend, i), &len, 0);
        if (condition == NULL)
        {
            free(condition1);
            return POOL_END;
        }

        if (IS_MASTER_NODE_ID(i))
        {
            pid1 = pid;
            len1 = len;
            free(condition1);
            condition1 = strdup(condition);
            if (condition1 == NULL)
                return POOL_END;
        }
    }

    pool_write(frontend, &pid1, sizeof(pid1));
    status = pool_write_and_flush(frontend, condition1, len1);
    free(condition1);
    return status;
}
/* * Establish persistent connection to backend */ static void establish_persistent_connection(void) { int i; BackendInfo *bkinfo; POOL_CONNECTION_POOL_SLOT *s; for (i=0;i<NUM_BACKENDS;i++) { if (!VALID_BACKEND(i)) continue; if (slots[i] == NULL) { bkinfo = pool_get_node_info(i); s = make_persistent_db_connection(bkinfo->backend_hostname, bkinfo->backend_port, "postgres", pool_config->sr_check_user, pool_config->sr_check_password, true); if (s) slots[i] = s; else slots[i] = NULL; } } }
/* * disconnect and release a connection to the database */ void pool_discard_cp(char *user, char *database, int protoMajor) { POOL_CONNECTION_POOL *p = pool_get_cp(user, database, protoMajor, 0); ConnectionInfo *info; int i, freed = 0; if (p == NULL) { pool_error("pool_discard_cp: cannot get connection pool for user %s datbase %s", user, database); return; } for (i=0;i<NUM_BACKENDS;i++) { if (!VALID_BACKEND(i)) continue; if (!freed) { pool_free_startup_packet(CONNECTION_SLOT(p, i)->sp); freed = 1; } pool_close(CONNECTION(p, i)); free(CONNECTION_SLOT(p, i)); } info = p->info; memset(p, 0, sizeof(POOL_CONNECTION_POOL)); p->info = info; memset(p->info, 0, sizeof(ConnectionInfo) * MAX_NUM_BACKENDS); }
/*
 * Forward a notice message ('N') to the frontend.
 *
 * A notice string is read from every valid backend; the one read last is
 * what gets forwarded.  Returns POOL_END on read or write failure,
 * POOL_CONTINUE otherwise.
 */
POOL_STATUS NoticeResponse(POOL_CONNECTION *frontend,
                           POOL_CONNECTION_POOL *backend)
{
    char *notice = NULL;
    int notice_len;
    int node;

    for (node = 0; node < NUM_BACKENDS; node++)
    {
        if (!VALID_BACKEND(node))
            continue;

        /* read notice message */
        notice = pool_read_string(CONNECTION(backend, node), &notice_len, 0);
        if (!notice)
            return POOL_END;
    }

    /* forward to the frontend */
    pool_write(frontend, "N", 1);
    if (pool_write_and_flush(frontend, notice, notice_len) >= 0)
        return POOL_CONTINUE;

    return POOL_END;
}
/*
 * Forward a command completion response ('C') to the frontend.
 *
 * The command tag is read from the master and from every other valid
 * backend.  Each backend's transaction state is set to 'T' on BEGIN and to
 * 'I' on COMMIT/ROLLBACK.  A tag length mismatch between the master and
 * another backend ends the session unless the master's tag is an INSERT.
 * The master's tag is what is forwarded.
 *
 * Returns POOL_END on failure, otherwise the result of pool_flush().
 */
POOL_STATUS CompletedResponse(POOL_CONNECTION *frontend,
                              POOL_CONNECTION_POOL *backend)
{
    int i;
    char *string = NULL;
    char *string1 = NULL;
    int len, len1 = 0;

    /* read command tag */
    string = pool_read_string(MASTER(backend), &len, 0);
    if (string == NULL)
        return POOL_END;
    else if (!strncmp(string, "BEGIN", 5))
        TSTATE(backend, MASTER_NODE_ID) = 'T';
    else if (!strncmp(string, "COMMIT", 6) || !strncmp(string, "ROLLBACK", 8))
        TSTATE(backend, MASTER_NODE_ID) = 'I';

    /* keep a private copy of the master's tag for the later reads */
    len1 = len;
    string1 = strdup(string);
    if (string1 == NULL)
        return POOL_END;

    for (i=0;i<NUM_BACKENDS;i++)
    {
        if (!VALID_BACKEND(i) || IS_MASTER_NODE_ID(i))
            continue;

        /* read command tag */
        string = pool_read_string(CONNECTION(backend, i), &len, 0);
        if (string == NULL)
        {
            /* do not leak the master's copy on this error path */
            free(string1);
            return POOL_END;
        }
        else if (!strncmp(string, "BEGIN", 5))
            TSTATE(backend, i) = 'T';
        else if (!strncmp(string, "COMMIT", 6) || !strncmp(string, "ROLLBACK", 8))
            TSTATE(backend, i) = 'I';

        if (len != len1)
        {
            /* master values first, then the i th backend's values */
            pool_debug("CompletedResponse: message length does not match between master(%d \"%s\",) and %d th server (%d \"%s\",)",
                       len1, string1, i, len, string);

            /* we except INSERT, because INSERT response has OID */
            if (strncmp(string1, "INSERT", 6))
            {
                free(string1);
                return POOL_END;
            }
        }
    }

    /* forward to the frontend */
    pool_write(frontend, "C", 1);
    pool_debug("CompletedResponse: string: \"%s\"", string1);
    if (pool_write(frontend, string1, len1) < 0)
    {
        free(string1);
        return POOL_END;
    }

    free(string1);
    return pool_flush(frontend);
}
/*
 * Forward an error message ('E') to the frontend.
 *
 * An error string is read from every valid backend; the one read last is
 * forwarded.  Afterwards every valid backend whose transaction state is 'T'
 * is switched to 'E'.
 *
 * Returns POOL_END on read/write failure; otherwise POOL_CONTINUE, or the
 * result of raise_intentional_error_if_need() when a session context exists.
 */
POOL_STATUS ErrorResponse(POOL_CONNECTION *frontend,
                          POOL_CONNECTION_POOL *backend)
{
    char *message = NULL;
    int message_len;
    int node;
    POOL_STATUS status = POOL_CONTINUE;

    for (node = 0; node < NUM_BACKENDS; node++)
    {
        if (!VALID_BACKEND(node))
            continue;

        /* read error message */
        message = pool_read_string(CONNECTION(backend, node), &message_len, 0);
        if (!message)
            return POOL_END;
    }

    /* forward to the frontend */
    pool_write(frontend, "E", 1);
    if (pool_write_and_flush(frontend, message, message_len) < 0)
        return POOL_END;

    /*
     * A session context may be absent because this function is also
     * called by pool_do_auth.
     */
    if (pool_get_session_context())
        status = raise_intentional_error_if_need(backend);

    /* change transaction state */
    for (node = 0; node < NUM_BACKENDS; node++)
    {
        if (VALID_BACKEND(node) && TSTATE(backend, node) == 'T')
            TSTATE(backend, node) = 'E';
    }

    return status;
}
/*
 * Create a new connection pool entry, send the startup packet to every
 * valid backend and authenticate.
 *
 * Returns the new pool entry, or NULL on failure.  On every failure path
 * the frontend connection is closed.
 */
static POOL_CONNECTION_POOL *connect_backend(StartupPacket *sp, POOL_CONNECTION *frontend)
{
    POOL_CONNECTION_POOL *pool_entry;
    int node;

    /* connect to the backend */
    pool_entry = pool_create_cp();
    if (!pool_entry)
    {
        pool_send_error_message(frontend, sp->major, "XX000", "connection cache is full", "",
                                "increase max_pool", __FILE__, __LINE__);
        pool_close(frontend);
        pool_free_startup_packet(sp);
        return NULL;
    }

    for (node = 0; node < NUM_BACKENDS; node++)
    {
        if (!VALID_BACKEND(node))
            continue;

        /* set DB node id */
        CONNECTION(pool_entry, node)->db_node_id = node;

        /* mark this is a backend connection */
        CONNECTION(pool_entry, node)->isbackend = 1;

        pool_ssl_negotiate_clientserver(CONNECTION(pool_entry, node));

        /* save startup packet info */
        CONNECTION_SLOT(pool_entry, node)->sp = sp;

        /* send startup packet */
        if (send_startup_packet(CONNECTION_SLOT(pool_entry, node)) >= 0)
            continue;

        pool_error("do_child: fails to send startup packet to the %d th backend", node);
        pool_discard_cp(sp->user, sp->database, sp->major);
        pool_close(frontend);
        return NULL;
    }

    /* do authentication stuff */
    if (pool_do_auth(frontend, pool_entry))
    {
        pool_close(frontend);
        pool_discard_cp(sp->user, sp->database, sp->major);
        return NULL;
    }

    return pool_entry;
}
/*
 * Pick a load balancing node at random, weighted by backend_weight.
 *
 * A random point is drawn in [0, sum of the weights of valid backends] and
 * the valid, positively weighted backends are walked in order while that
 * point is not below the running weight total.  Falls back to
 * MASTER_NODE_ID when no backend qualifies.
 */
int select_load_balancing_node(void)
{
    int chosen = MASTER_NODE_ID;
    double weight_sum = 0.0;
    double cumulative = 0.0;
    double r;
    int node;

    /* total weight of all valid backends */
    for (node = 0; node < NUM_BACKENDS; node++)
    {
        if (VALID_BACKEND(node))
            weight_sum += BACKEND_INFO(node).backend_weight;
    }

#if defined(sun) || defined(__sun)
    r = (((double)rand())/RAND_MAX) * weight_sum;
#else
    r = (((double)random())/RAND_MAX) * weight_sum;
#endif

    for (node = 0; node < NUM_BACKENDS; node++)
    {
        if (!VALID_BACKEND(node) || !(BACKEND_INFO(node).backend_weight > 0.0))
            continue;

        /* stop as soon as the random point falls below the running total */
        if (!(r >= cumulative))
            break;

        chosen = node;
        cumulative += BACKEND_INFO(node).backend_weight;
    }

    pool_debug("select_load_balancing_node: selected backend id is %d", chosen);
    return chosen;
}
/*
 * Forward a cursor response ('P') to the frontend.
 *
 * The cursor name is read from the master and from every other valid
 * backend; a length mismatch is treated as an error.  The master's name is
 * what is forwarded.
 *
 * Returns POOL_END on failure, POOL_CONTINUE otherwise.
 */
POOL_STATUS CursorResponse(POOL_CONNECTION *frontend,
                           POOL_CONNECTION_POOL *backend)
{
    char *string = NULL;
    char *string1 = NULL;
    int len, len1 = 0;
    int i;

    /* read cursor name */
    string = pool_read_string(MASTER(backend), &len, 0);
    if (string == NULL)
        return POOL_END;

    /* keep a private copy of the master's name for the later reads */
    len1 = len;
    string1 = strdup(string);
    if (string1 == NULL)
        return POOL_END;

    for (i=0;i<NUM_BACKENDS;i++)
    {
        if (!VALID_BACKEND(i) || IS_MASTER_NODE_ID(i))
            continue;

        /* read cursor name */
        string = pool_read_string(CONNECTION(backend, i), &len, 0);
        if (string == NULL)
        {
            /* do not leak the master's copy on this error path */
            free(string1);
            return POOL_END;
        }

        if (len != len1)
        {
            /* master length first, then the i th backend's length */
            pool_error("CursorResponse: length does not match between master(%d) and %d th backend(%d)",
                       len1, i, len);
            pool_error("CursorResponse: master(%s) %d th backend(%s)", string1, i, string);
            free(string1);
            return POOL_END;
        }
    }

    /* forward to the frontend */
    pool_write(frontend, "P", 1);
    if (pool_write(frontend, string1, len1) < 0)
    {
        free(string1);
        return POOL_END;
    }
    free(string1);

    if (pool_flush(frontend))
        return POOL_END;

    return POOL_CONTINUE;
}
/*
 * Send a failback request for node_id to the parent process using SIGUSR1.
 *
 * The request is refused (and logged) when node_id is out of range or
 * already refers to a valid backend.  Req_info is only written once the
 * node id has been accepted, so a refused request leaves it untouched.
 */
void send_failback_request(int node_id)
{
    pid_t parent = getppid();

    pool_log("send_failback_request: fail back %d th node request from pid %d", node_id, getpid());

    /* validate before touching the request area */
    if (node_id < 0 || node_id >= MAX_NUM_BACKENDS || VALID_BACKEND(node_id))
    {
        pool_error("send_failback_request: node %d is alive.", node_id);
        return;
    }

    Req_info->kind = NODE_UP_REQUEST;
    Req_info->node_id[0] = node_id;

    kill(parent, SIGUSR1);
}
/*
 * Forward an empty query response ('I') to the frontend.
 *
 * One byte is consumed from every valid backend, then 'I' followed by a
 * single NUL byte is sent to the frontend.
 *
 * Returns POOL_END when a backend read fails, otherwise the status of the
 * final write.
 */
POOL_STATUS EmptyQueryResponse(POOL_CONNECTION *frontend,
                               POOL_CONNECTION_POOL *backend)
{
    char unused;
    int node;

    for (node = 0; node < NUM_BACKENDS; node++)
    {
        if (!VALID_BACKEND(node))
            continue;

        if (pool_read(CONNECTION(backend, node), &unused, sizeof(unused)) < 0)
            return POOL_END;
    }

    pool_write(frontend, "I", 1);
    return pool_write_and_flush(frontend, "", 1);
}
/*
 * Clear the "connected" flag in the connection info of every valid backend
 * for the given process / pool slot.  Stops with a warning at the first
 * entry that cannot be obtained.
 */
void pool_coninfo_unset_frontend_connected(int proc_id, int pool_index)
{
    int node;

    for (node = 0; node < NUM_BACKENDS; node++)
    {
        ConnectionInfo *entry;

        if (VALID_BACKEND(node))
        {
            entry = pool_coninfo(proc_id, pool_index, node);
            if (!entry)
            {
                elog(WARNING,"failed to get connection info while marking the frontend is not connected for pool");
                return;
            }
            entry->connected = false;
        }
    }
}
/* * signal handler for SIGUSR1 * close all idle connections */ static RETSIGTYPE close_idle_connection(int sig) { int i, j; POOL_CONNECTION_POOL *p = pool_connection_pool; ConnectionInfo *info; pool_debug("child receives close connection request"); for (j=0;j<pool_config->max_pool;j++, p++) { if (!MASTER_CONNECTION(p)) continue; if (!MASTER_CONNECTION(p)->sp) continue; if (MASTER_CONNECTION(p)->sp->user == NULL) continue; if (MASTER_CONNECTION(p)->closetime > 0) /* idle connection? */ { pool_debug("close_idle_connection: close idle connection: user %s database %s", MASTER_CONNECTION(p)->sp->user, MASTER_CONNECTION(p)->sp->database); pool_send_frontend_exits(p); for (i=0;i<NUM_BACKENDS;i++) { if (!VALID_BACKEND(i)) continue; if (i == 0) { /* only first backend allocated the memory for the start up packet */ pool_free_startup_packet(CONNECTION_SLOT(p, i)->sp); } pool_close(CONNECTION(p, i)); } info = p->info; memset(p, 0, sizeof(POOL_CONNECTION_POOL)); p->info = info; memset(p->info, 0, sizeof(ConnectionInfo)); } } }
/*
 * Calculate the next master node id.
 *
 * Returns the index of the first usable backend, or the configured number
 * of backends when none qualifies.
 */
static int get_next_master_node(void)
{
    int node = 0;

    while (node < pool_config->backend_desc->num_backends)
    {
        if (RAW_MODE)
        {
            /*
             * VALID_BACKEND must not be used in raw mode: it is true only
             * for the master node id, so standby nodes would always be
             * rejected.  Check the backend status directly instead.
             */
            if (BACKEND_INFO(node).backend_status == CON_CONNECT_WAIT)
                return node;
        }
        else if (VALID_BACKEND(node))
        {
            return node;
        }
        node++;
    }

    return node;
}
/*
 * Send an extended protocol message to the backends and wait for their
 * responses.
 *
 * query_context: current query context (parse tree, original and rewritten
 *                query strings)
 * kind:          pointer to the message kind character ('P', 'E', ...)
 * len, contents: message payload
 * send_type / node_id select the target nodes:
 *   send_type <  0: every valid node except node_id
 *   send_type == 0: every valid node
 *   send_type >  0: node_id only
 *
 * Returns POOL_CONTINUE on success and POOL_END on failure.  When waiting
 * for a response fails, a cancel request built from the master connection's
 * pid/key is issued before returning.
 */
POOL_STATUS pool_extended_send_and_wait(POOL_QUERY_CONTEXT *query_context, char *kind, int len, char *contents, int send_type, int node_id)
{
    POOL_SESSION_CONTEXT *session_context;
    POOL_CONNECTION *frontend;
    POOL_CONNECTION_POOL *backend;
    bool is_commit;
    bool is_begin_read_write;
    int i;
    int str_len;
    int rewritten_len;
    char *str;
    char *rewritten_begin;

    session_context = pool_get_session_context();
    frontend = session_context->frontend;
    backend = session_context->backend;
    is_commit = is_commit_or_rollback_query(query_context->parse_tree);
    is_begin_read_write = false;
    str_len = 0;
    rewritten_len = 0;
    str = NULL;
    rewritten_begin = NULL;

    /*
     * If the query is BEGIN READ WRITE or
     * BEGIN ... SERIALIZABLE in master/slave mode,
     * we send BEGIN to slaves/standbys instead.
     * original_query which is BEGIN READ WRITE is sent to primary.
     * rewritten_query which is BEGIN is sent to standbys.
     */
    if (pool_need_to_treat_as_if_default_transaction(query_context))
    {
        is_begin_read_write = true;

        /* only a Parse message gets a rewritten payload; it is freed on every exit below */
        if (*kind == 'P')
        {
            rewritten_begin = remove_read_write(len, contents, &rewritten_len);
            if (rewritten_begin == NULL)
                return POOL_END;
        }
    }

    /* without a rewritten payload every node receives the caller's contents */
    if (!rewritten_begin)
    {
        str_len = len;
        str = contents;
    }

    /* Send query */
    for (i=0;i<NUM_BACKENDS;i++)
    {
        if (!VALID_BACKEND(i))
            continue;
        else if (send_type < 0 && i == node_id)
            continue;
        else if (send_type > 0 && i != node_id)
            continue;

        /*
         * If in reset context, we send COMMIT/ABORT to nodes those
         * are not in I(idle) state.  This will ensure that
         * transactions are closed.
         */
        if (is_commit && session_context->reset_context && TSTATE(backend, i) == 'I')
        {
            pool_unset_node_to_be_sent(query_context, i);
            continue;
        }

        /* primary gets the original payload, every other node the rewritten one */
        if (rewritten_begin)
        {
            if (REAL_PRIMARY_NODE_ID == i)
            {
                str = contents;
                str_len = len;
            }
            else
            {
                str = rewritten_begin;
                str_len = rewritten_len;
            }
        }

        if (pool_config->log_per_node_statement)
        {
            char msgbuf[QUERY_STRING_BUFFER_LEN];
            char *stmt;

            /* Parse/Execute are logged with the statement text, other kinds by kind only */
            if (*kind == 'P' || *kind == 'E')
            {
                if (query_context->rewritten_query)
                {
                    if (is_begin_read_write)
                    {
                        if (REAL_PRIMARY_NODE_ID == i)
                            stmt = query_context->original_query;
                        else
                            stmt = query_context->rewritten_query;
                    }
                    else
                    {
                        stmt = query_context->rewritten_query;
                    }
                }
                else
                {
                    stmt = query_context->original_query;
                }

                if (*kind == 'P')
                    snprintf(msgbuf, sizeof(msgbuf), "Parse: %s", stmt);
                else
                    snprintf(msgbuf, sizeof(msgbuf), "Execute: %s", stmt);
            }
            else
            {
                snprintf(msgbuf, sizeof(msgbuf), "%c message", *kind);
            }

            per_node_statement_log(backend, i, msgbuf);
        }

        if (send_extended_protocol_message(backend, i, kind, str_len, str) != POOL_CONTINUE)
        {
            free(rewritten_begin);
            return POOL_END;
        }
    }

    /* from here on str is only used as the statement text for error logging */
    if (!is_begin_read_write)
    {
        if (query_context->rewritten_query)
            str = query_context->rewritten_query;
        else
            str = query_context->original_query;
    }

    /* Wait for response */
    for (i=0;i<NUM_BACKENDS;i++)
    {
        if (!VALID_BACKEND(i))
            continue;
        else if (send_type < 0 && i == node_id)
            continue;
        else if (send_type > 0 && i != node_id)
            continue;

        /*
         * If in master/slave mode, we do not send COMMIT/ABORT to
         * slaves/standbys if it's in I(idle) state.
         *
         * NOTE(review): the send loop above has no matching master/slave
         * skip, only the reset-context one -- confirm the two loops are
         * meant to differ.
         */
        if (is_commit && MASTER_SLAVE && !IS_MASTER_NODE_ID(i) && TSTATE(backend, i) == 'I')
        {
            continue;
        }

        if (is_begin_read_write)
        {
            if (REAL_PRIMARY_NODE_ID == i)
                str = query_context->original_query;
            else
                str = query_context->rewritten_query;
        }

        if (wait_for_query_response(frontend, CONNECTION(backend, i), MAJOR(backend)) != POOL_CONTINUE)
        {
            /* Cancel current transaction */
            CancelPacket cancel_packet;

            cancel_packet.protoVersion = htonl(PROTO_CANCEL);
            cancel_packet.pid = MASTER_CONNECTION(backend)->pid;
            cancel_packet.key= MASTER_CONNECTION(backend)->key;
            cancel_request(&cancel_packet);

            free(rewritten_begin);
            return POOL_END;
        }

        /*
         * Check if some error detected.  If so, emit
         * log. This is useful when invalid encoding error
         * occurs. In this case, PostgreSQL does not report
         * what statement caused that error and make users
         * confused.
         */
        per_node_error_log(backend, i, str, "pool_send_and_wait: Error or notice message from backend: ", true);
    }

    free(rewritten_begin);
    return POOL_CONTINUE;
}
/*
 * Send a simple query to the backends and wait for their responses.
 *
 * query_context: current query context (parse tree, original and rewritten
 *                query strings with their lengths)
 * send_type / node_id select the target nodes:
 *   send_type <  0: every valid node except node_id
 *   send_type == 0: every valid node
 *   send_type >  0: node_id only
 *
 * Returns POOL_CONTINUE on success and POOL_END on failure.  When waiting
 * for a response fails, a cancel request built from the master connection's
 * pid/key is issued before returning.
 */
POOL_STATUS pool_send_and_wait(POOL_QUERY_CONTEXT *query_context, int send_type, int node_id)
{
    POOL_SESSION_CONTEXT *session_context;
    POOL_CONNECTION *frontend;
    POOL_CONNECTION_POOL *backend;
    bool is_commit;
    bool is_begin_read_write;
    int i;
    int len;
    char *string;

    session_context = pool_get_session_context();
    frontend = session_context->frontend;
    backend = session_context->backend;
    is_commit = is_commit_or_rollback_query(query_context->parse_tree);
    is_begin_read_write = false;
    len = 0;
    string = NULL;

    /*
     * If the query is BEGIN READ WRITE or
     * BEGIN ... SERIALIZABLE in master/slave mode,
     * we send BEGIN to slaves/standbys instead.
     * original_query which is BEGIN READ WRITE is sent to primary.
     * rewritten_query which is BEGIN is sent to standbys.
     */
    if (pool_need_to_treat_as_if_default_transaction(query_context))
    {
        /* the query text is then chosen per node inside the loops below */
        is_begin_read_write = true;
    }
    else
    {
        /* one text for all nodes: prefer the rewritten query when there is one */
        if (query_context->rewritten_query)
        {
            len = query_context->rewritten_length;
            string = query_context->rewritten_query;
        }
        else
        {
            len = query_context->original_length;
            string = query_context->original_query;
        }
    }

    /* Send query */
    for (i=0;i<NUM_BACKENDS;i++)
    {
        if (!VALID_BACKEND(i))
            continue;
        else if (send_type < 0 && i == node_id)
            continue;
        else if (send_type > 0 && i != node_id)
            continue;

        /*
         * If in master/slave mode, we do not send COMMIT/ABORT to
         * slaves/standbys if it's in I(idle) state.
         */
        if (is_commit && MASTER_SLAVE && !IS_MASTER_NODE_ID(i) && TSTATE(backend, i) == 'I')
        {
            pool_unset_node_to_be_sent(query_context, i);
            continue;
        }

        /*
         * If in reset context, we send COMMIT/ABORT to nodes those
         * are not in I(idle) state.  This will ensure that
         * transactions are closed.
         */
        if (is_commit && session_context->reset_context && TSTATE(backend, i) == 'I')
        {
            pool_unset_node_to_be_sent(query_context, i);
            continue;
        }

        /* primary gets the original query, every other node the rewritten one */
        if (is_begin_read_write)
        {
            if (REAL_PRIMARY_NODE_ID == i)
            {
                len = query_context->original_length;
                string = query_context->original_query;
            }
            else
            {
                len = query_context->rewritten_length;
                string = query_context->rewritten_query;
            }
        }

        per_node_statement_log(backend, i, string);

        if (send_simplequery_message(CONNECTION(backend, i), len, string, MAJOR(backend)) != POOL_CONTINUE)
        {
            return POOL_END;
        }
    }

    /* Wait for response */
    for (i=0;i<NUM_BACKENDS;i++)
    {
        if (!VALID_BACKEND(i))
            continue;
        else if (send_type < 0 && i == node_id)
            continue;
        else if (send_type > 0 && i != node_id)
            continue;

#ifdef NOT_USED
        /*
         * If in master/slave mode, we do not send COMMIT/ABORT to
         * slaves/standbys if it's in I(idle) state.
         */
        if (is_commit && MASTER_SLAVE && !IS_MASTER_NODE_ID(i) && TSTATE(backend, i) == 'I')
        {
            continue;
        }
#endif

        /* pick the statement text to report in the error log for this node */
        if (is_begin_read_write)
        {
            if(REAL_PRIMARY_NODE_ID == i)
                string = query_context->original_query;
            else
                string = query_context->rewritten_query;
        }

        if (wait_for_query_response(frontend, CONNECTION(backend, i), MAJOR(backend)) != POOL_CONTINUE)
        {
            /* Cancel current transaction */
            CancelPacket cancel_packet;

            cancel_packet.protoVersion = htonl(PROTO_CANCEL);
            cancel_packet.pid = MASTER_CONNECTION(backend)->pid;
            cancel_packet.key= MASTER_CONNECTION(backend)->key;
            cancel_request(&cancel_packet);

            return POOL_END;
        }

        /*
         * Check if some error detected.  If so, emit
         * log. This is useful when invalid encoding error
         * occurs. In this case, PostgreSQL does not report
         * what statement caused that error and make users
         * confused.
         */
        per_node_error_log(backend, i, string, "pool_send_and_wait: Error or notice message from backend: ", true);
    }

    return POOL_CONTINUE;
}
/*
 * Start online recovery.
 * "recovery_node" is the node to be recovered.
 * Master or primary node is chosen in this function.
 *
 * Sequence: validate the node id, connect to the master/primary node, run
 * the first recovery stage (and in replication mode the second stage, each
 * preceded by a CHECKPOINT), start the recovered node remotely, check that
 * its postmaster started, send a failback request and wait (bounded) for
 * the wake up request.
 *
 * Failures are reported through ereport(ERROR, ...).  The libpq connection
 * is closed on both the normal path and the PG_CATCH path.
 */
void start_recovery(int recovery_node)
{
    int node_id;
    BackendInfo *backend;
    BackendInfo *recovery_backend;
    PGconn *conn;
    int failback_wait_count;
#define FAILBACK_WAIT_MAX_RETRY 5 /* 5 seconds should be enough for failback operation */

    ereport(LOG,
            (errmsg("starting recovering node %d", recovery_node)));

    /* the node must exist ... */
    if ( (recovery_node < 0) || (recovery_node >= pool_config->backend_desc->num_backends) )
        ereport(ERROR,
                (errmsg("node recovery failed, node id: %d is not valid", recovery_node)));

    /* ... and must not currently be a valid (alive) backend */
    if (VALID_BACKEND(recovery_node))
        ereport(ERROR,
                (errmsg("node recovery failed, node id: %d is alive", recovery_node)));

    /* select master/primary node */
    node_id = MASTER_SLAVE ? PRIMARY_NODE_ID : REAL_MASTER_NODE_ID;
    backend = &pool_config->backend_desc->backend_info[node_id];

    /* get node info to be recovered */
    recovery_backend = &pool_config->backend_desc->backend_info[recovery_node];

    conn = connect_backend_libpq(backend);
    if (conn == NULL)
        ereport(ERROR,
                (errmsg("node recovery failed, unable to connect to master node: %d ", node_id)));

    PG_TRY();
    {
        /* 1st stage */
        if (REPLICATION)
        {
            exec_checkpoint(conn);
            ereport(LOG,
                    (errmsg("node recovery, CHECKPOINT in the 1st stage done")));
        }

        exec_recovery(conn, backend, recovery_backend, FIRST_STAGE);

        ereport(LOG,
                (errmsg("node recovery, 1st stage is done")));

        if (REPLICATION)
        {
            ereport(LOG,
                    (errmsg("node recovery, starting 2nd stage")));

            /* 2nd stage */
            *InRecovery = RECOVERY_ONLINE;
            if (pool_config->use_watchdog)
            {
                /* announce start recovery */
                if (WD_OK != wd_start_recovery())
                    ereport(ERROR,
                            (errmsg("node recovery failed, failed to send start recovery packet")));
            }

            /* the 2nd stage only proceeds once wait_connection_closed() reports success */
            if (wait_connection_closed() != 0)
                ereport(ERROR,
                        (errmsg("node recovery failed, waiting connection closed in the other pgpools timeout")));

            ereport(LOG,
                    (errmsg("node recovery, all connections from clients have been closed")));

            exec_checkpoint(conn);

            ereport(LOG,
                    (errmsg("node recovery"),
                     errdetail("CHECKPOINT in the 2nd stage done")));

            exec_recovery(conn, backend, recovery_backend, SECOND_STAGE);
        }

        exec_remote_start(conn, recovery_backend);

        check_postmaster_started(recovery_backend);

        ereport(LOG,
                (errmsg("node recovery, node: %d restarted", recovery_node)));

        /*
         * reset failover completion flag.  this is necessary since
         * previous failover/failback will set the flag to 1.
         */
        pcp_wakeup_request = 0;

        /* send failback request to pgpool parent */
        send_failback_request(recovery_node);

        /* wait for failback */
        failback_wait_count = 0;
        while (!pcp_wakeup_request)
        {
            struct timeval t = {1, 0};
            /* polling SIGUSR2 signal every 1 sec */
            select(0, NULL, NULL, NULL, &t);
            failback_wait_count++;
            /* give up waiting (log only, not an error) after the retry limit */
            if (failback_wait_count >= FAILBACK_WAIT_MAX_RETRY)
            {
                ereport(LOG,
                        (errmsg("node recovery"),
                         errdetail("waiting for wake up request is timeout(%d seconds)",
                                   FAILBACK_WAIT_MAX_RETRY)));
                break;
            }
        }
        pcp_wakeup_request = 0;
    }
    PG_CATCH();
    {
        /* release the libpq connection before propagating the error */
        PQfinish(conn);
        PG_RE_THROW();
    }
    PG_END_TRY();

    PQfinish(conn);

    ereport(LOG,
            (errmsg("recovery done")));
}
/* * perform accept() and return new fd */ static POOL_CONNECTION *do_accept(int unix_fd, int inet_fd, struct timeval *timeout) { fd_set readmask; int fds; int save_errno; SockAddr saddr; int fd = 0; int afd; int inet = 0; POOL_CONNECTION *cp; #ifdef ACCEPT_PERFORMANCE struct timeval now1, now2; static long atime; static int cnt; #endif struct timeval *timeoutval; struct timeval tv1, tv2, tmback = {0, 0}; set_ps_display("wait for connection request", false); /* Destroy session context for just in case... */ pool_session_context_destroy(); FD_ZERO(&readmask); FD_SET(unix_fd, &readmask); if (inet_fd) FD_SET(inet_fd, &readmask); if (timeout->tv_sec == 0 && timeout->tv_usec == 0) timeoutval = NULL; else { timeoutval = timeout; tmback.tv_sec = timeout->tv_sec; tmback.tv_usec = timeout->tv_usec; gettimeofday(&tv1, NULL); #ifdef DEBUG pool_log("before select = {%d, %d}", timeoutval->tv_sec, timeoutval->tv_usec); pool_log("g:before select = {%d, %d}", tv1.tv_sec, tv1.tv_usec); #endif } fds = select(Max(unix_fd, inet_fd)+1, &readmask, NULL, NULL, timeoutval); save_errno = errno; /* check backend timer is expired */ if (backend_timer_expired) { pool_backend_timer(); backend_timer_expired = 0; } /* * following code fragment computes remaining timeout val in a * portable way. Linux does this automatically but other platforms do not. */ if (timeoutval) { gettimeofday(&tv2, NULL); tmback.tv_usec -= tv2.tv_usec - tv1.tv_usec; tmback.tv_sec -= tv2.tv_sec - tv1.tv_sec; if (tmback.tv_usec < 0) { tmback.tv_sec--; if (tmback.tv_sec < 0) { timeout->tv_sec = 0; timeout->tv_usec = 0; } else { tmback.tv_usec += 1000000; timeout->tv_sec = tmback.tv_sec; timeout->tv_usec = tmback.tv_usec; } } #ifdef DEBUG pool_log("g:after select = {%d, %d}", tv2.tv_sec, tv2.tv_usec); pool_log("after select = {%d, %d}", timeout->tv_sec, timeout->tv_usec); #endif } errno = save_errno; if (fds == -1) { if (errno == EAGAIN || errno == EINTR) return NULL; pool_error("select() failed. 
reason %s", strerror(errno)); return NULL; } /* timeout */ if (fds == 0) { return NULL; } if (FD_ISSET(unix_fd, &readmask)) { fd = unix_fd; } if (FD_ISSET(inet_fd, &readmask)) { fd = inet_fd; inet++; } /* * Note that some SysV systems do not work here. For those * systems, we need some locking mechanism for the fd. */ memset(&saddr, 0, sizeof(saddr)); saddr.salen = sizeof(saddr.addr); #ifdef ACCEPT_PERFORMANCE gettimeofday(&now1,0); #endif retry_accept: /* wait if recovery is started */ while (*InRecovery == 1) { pause(); } afd = accept(fd, (struct sockaddr *)&saddr.addr, &saddr.salen); save_errno = errno; /* check backend timer is expired */ if (backend_timer_expired) { pool_backend_timer(); backend_timer_expired = 0; } errno = save_errno; if (afd < 0) { if (errno == EINTR && *InRecovery) goto retry_accept; /* * "Resource temporarily unavailable" (EAGAIN or EWOULDBLOCK) * can be silently ignored. And EINTR can be ignored. */ if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) pool_error("accept() failed. 
reason: %s", strerror(errno)); return NULL; } #ifdef ACCEPT_PERFORMANCE gettimeofday(&now2,0); atime += (now2.tv_sec - now1.tv_sec)*1000000 + (now2.tv_usec - now1.tv_usec); cnt++; if (cnt % 100 == 0) { pool_log("cnt: %d atime: %ld", cnt, atime); } #endif /* reload config file */ if (got_sighup) { pool_get_config(get_config_file_name(), RELOAD_CONFIG); if (pool_config->enable_pool_hba) { load_hba(get_hba_file_name()); if (strcmp("", pool_config->pool_passwd)) pool_reopen_passwd_file(); } if (pool_config->parallel_mode) pool_memset_system_db_info(system_db_info->info); got_sighup = 0; } connection_count_up(); accepted = 1; if (pool_config->parallel_mode) { /* * do not accept new connection if any of DB node or SystemDB is down when operating in * parallel mode */ int i; for (i=0;i<NUM_BACKENDS;i++) { if (BACKEND_INFO(i).backend_status == CON_DOWN || SYSDB_STATUS == CON_DOWN) { StartupPacket *sp; char *msg = "pgpool is not available in parallel query mode"; if (SYSDB_STATUS == CON_DOWN) pool_log("Cannot accept() new connection. SystemDB is down"); else pool_log("Cannot accept() new connection. %d th backend is down", i); if ((cp = pool_open(afd)) == NULL) { close(afd); child_exit(1); } sp = read_startup_packet(cp); if (sp == NULL) { /* failed to read the startup packet. return to the accept() loop */ pool_close(cp); child_exit(1); } pool_debug("do_accept: send error message to frontend"); if (sp->major == PROTO_MAJOR_V3) { char buf[256]; if (SYSDB_STATUS == CON_DOWN) snprintf(buf, sizeof(buf), "SystemDB is down"); else snprintf(buf, sizeof(buf), "%d th backend is down", i); pool_send_error_message(cp, sp->major, "08S01", msg, buf, ((SYSDB_STATUS == CON_DOWN) ? 
"repair the SystemDB and restart pgpool" : "repair the backend and restart pgpool"), __FILE__, __LINE__); } else { pool_send_error_message(cp, sp->major, 0, msg, "", "", "", 0); } pool_close(cp); child_exit(1); } } } else { /* * do not accept new connection if all DB nodes are down when operating in * non parallel mode */ int i; int found = 0; for (i=0;i<NUM_BACKENDS;i++) { if (VALID_BACKEND(i)) { found = 1; } } if (found == 0) { pool_log("Cannot accept() new connection. all backends are down"); child_exit(1); } } pool_debug("I am %d accept fd %d", getpid(), afd); pool_getnameinfo_all(&saddr, remote_host, remote_port); snprintf(remote_ps_data, sizeof(remote_ps_data), remote_port[0] == '\0' ? "%s" : "%s(%s)", remote_host, remote_port); set_ps_display("accept connection", false); /* log who is connecting */ if (pool_config->log_connections) { pool_log("connection received: host=%s%s%s", remote_host, remote_port[0] ? " port=" : "", remote_port); } /* set NODELAY and KEEPALIVE options if INET connection */ if (inet) { int on = 1; if (setsockopt(afd, IPPROTO_TCP, TCP_NODELAY, (char *) &on, sizeof(on)) < 0) { pool_error("do_accept: setsockopt() failed: %s", strerror(errno)); close(afd); return NULL; } if (setsockopt(afd, SOL_SOCKET, SO_KEEPALIVE, (char *) &on, sizeof(on)) < 0) { pool_error("do_accept: setsockopt() failed: %s", strerror(errno)); close(afd); return NULL; } } if ((cp = pool_open(afd)) == NULL) { close(afd); return NULL; } /* save ip address for hba */ memcpy(&cp->raddr, &saddr, sizeof(SockAddr)); if (cp->raddr.addr.ss_family == 0) cp->raddr.addr.ss_family = AF_UNIX; return cp; }
/*
 * Forward an ASCII data row ('D') to the frontend.
 *
 * num_fields: number of fields in the row.
 *
 * The NULL bitmap is read from every valid backend; the master's copy is
 * forwarded and decides which fields carry data.  For every non-NULL field
 * the master's size word and data are forwarded, and the corresponding
 * size/data are consumed from every other valid backend.  Differences
 * between backends are only logged.
 *
 * Returns POOL_CONTINUE on success (also when num_fields yields an empty
 * bitmap) and POOL_END on read/write failure.
 */
POOL_STATUS AsciiRow(POOL_CONNECTION *frontend,
                     POOL_CONNECTION_POOL *backend,
                     short num_fields)
{
    static char nullmap[8192], nullmap1[8192];
    int nbytes;
    int i, j;
    unsigned char mask;
    int size, size1 = 0;
    char *buf = NULL, *sendbuf = NULL;
    char msgbuf[1024];

    pool_write(frontend, "D", 1);

    nbytes = (num_fields + 7)/8;

    if (nbytes <= 0)
        return POOL_CONTINUE;

    /* NULL map: nullmap1 keeps the master's copy, nullmap is scratch space */
    pool_read(MASTER(backend), nullmap, nbytes);
    memcpy(nullmap1, nullmap, nbytes);
    for (i=0;i<NUM_BACKENDS;i++)
    {
        if (VALID_BACKEND(i) && !IS_MASTER_NODE_ID(i))
        {
            pool_read(CONNECTION(backend, i), nullmap, nbytes);
            if (memcmp(nullmap, nullmap1, nbytes))
            {
                /* XXX: NULLMAP maybe different among
                   backends. If we were a paranoid, we have to treat
                   this as a fatal error. However in the real world
                   we'd better to adapt this situation. Just throw a
                   log... */
                pool_debug("AsciiRow: NULLMAP differ between master and %d th backend", i);
            }
        }
    }

    if (pool_write(frontend, nullmap1, nbytes) < 0)
        return POOL_END;

    mask = 0;

    for (i = 0;i<num_fields;i++)
    {
        if (mask == 0)
            mask = 0x80;

        /*
         * NOT NULL?  Decide from the master's map: it is the one the
         * frontend received and the one the master's data stream follows.
         * nullmap may hold another backend's bitmap by now.
         */
        if (mask & nullmap1[i/8])
        {
            /* field size */
            if (pool_read(MASTER(backend), &size, sizeof(int)) < 0)
                return POOL_END;

            size1 = ntohl(size) - 4;

            /* read and send actual data only when size > 0 */
            if (size1 > 0)
            {
                sendbuf = pool_read2(MASTER(backend), size1);
                if (sendbuf == NULL)
                    return POOL_END;
            }

            /* forward to frontend */
            pool_write(frontend, &size, sizeof(int));
            pool_write(frontend, sendbuf, size1);

            /*
             * Build the debug copy with an explicit precision: sendbuf is
             * not known to be NUL terminated and is NULL/stale when the
             * field carries no data.
             */
            if (size1 > 0)
                snprintf(msgbuf, sizeof(msgbuf), "%.*s", size1, sendbuf);
            else
                msgbuf[0] = '\0';
            pool_debug("AsciiRow: len: %d data: %s", size1, msgbuf);

            for (j=0;j<NUM_BACKENDS;j++)
            {
                if (VALID_BACKEND(j) && !IS_MASTER_NODE_ID(j))
                {
                    /* field size */
                    if (pool_read(CONNECTION(backend, j), &size, sizeof(int)) < 0)
                        return POOL_END;

                    buf = NULL;
                    size = ntohl(size) - 4;

                    /* XXX: field size maybe different among
                       backends. If we were a paranoid, we have to treat
                       this as a fatal error. However in the real world
                       we'd better to adapt this situation. Just throw a
                       log... */
                    if (size != size1)
                        /* both sizes are already in host byte order here */
                        pool_debug("AsciiRow: %d th field size does not match between master(%d) and %d th backend(%d)",
                                   i, size1, j, size);

                    /* read and send actual data only when size > 0 */
                    if (size > 0)
                    {
                        buf = pool_read2(CONNECTION(backend, j), size);
                        if (buf == NULL)
                            return POOL_END;
                    }
                }
            }
        }

        mask >>= 1;
    }

    if (pool_flush(frontend))
        return POOL_END;

    return POOL_CONTINUE;
}
/*
 * Forward a function call result ('V') to the frontend.
 *
 * Each part of the message (result indicator byte, optional length and
 * value, trailing byte) is consumed from every valid backend in turn, each
 * from its own connection.  The indicator byte, length and value forwarded
 * are the ones read last.
 *
 * Returns POOL_ERROR when a backend read fails, otherwise the result of
 * pool_flush().
 */
POOL_STATUS FunctionResultResponse(POOL_CONNECTION *frontend,
                                   POOL_CONNECTION_POOL *backend)
{
    char dummy;
    int len;
    char *result = 0;
    int i;

    pool_write(frontend, "V", 1);

    for (i=0;i<NUM_BACKENDS;i++)
    {
        if (VALID_BACKEND(i))
        {
            if (pool_read(CONNECTION(backend, i), &dummy, 1) < 0)
                return POOL_ERROR;
        }
    }
    pool_write(frontend, &dummy, 1);

    /* non empty result? */
    if (dummy == 'G')
    {
        for (i=0;i<NUM_BACKENDS;i++)
        {
            if (VALID_BACKEND(i))
            {
                /* length of result in bytes */
                if (pool_read(CONNECTION(backend, i), &len, sizeof(len)) < 0)
                    return POOL_ERROR;
            }
        }
        pool_write(frontend, &len, sizeof(len));

        len = ntohl(len);

        for (i=0;i<NUM_BACKENDS;i++)
        {
            if (VALID_BACKEND(i))
            {
                /*
                 * result value itself: read from this backend's own
                 * connection, so that every backend's stream is drained
                 * exactly once.
                 */
                if ((result = pool_read2(CONNECTION(backend, i), len)) == NULL)
                    return POOL_ERROR;
            }
        }
        pool_write(frontend, result, len);
    }

    for (i=0;i<NUM_BACKENDS;i++)
    {
        if (VALID_BACKEND(i))
        {
            /* unused ('0'), again one byte per backend connection */
            if (pool_read(CONNECTION(backend, i), &dummy, 1) < 0)
                return POOL_ERROR;
        }
    }
    pool_write(frontend, "0", 1);

    return pool_flush(frontend);
}
/*
 * Process a cancel request.
 *
 * sp: cancel packet holding the pid/key to look for.
 *
 * The connection info of every child / pool slot / backend is searched for
 * a matching pid and key.  When found, a cancel packet carrying each
 * backend's own pid/key (taken from the consecutive connection info entries
 * of that pool slot) is sent to every valid backend over a freshly opened
 * connection.  An unknown key is logged and ignored.
 */
void cancel_request(CancelPacket *sp)
{
    int len;
    int fd;
    POOL_CONNECTION *con;
    int i,j,k;
    ConnectionInfo *c = NULL;
    CancelPacket cp;
    bool found = false;

    pool_debug("Cancel request received");

    /* look for cancel key from shmem info */
    for (i=0;i<pool_config->num_init_children;i++)
    {
        for (j=0;j<pool_config->max_pool;j++)
        {
            for (k=0;k<NUM_BACKENDS;k++)
            {
                c = pool_coninfo(i, j, k);
                pool_debug("con_info: address:%p database:%s user:%s pid:%d key:%d i:%d",
                           c, c->database, c->user, ntohl(c->pid), ntohl(c->key),i);

                if (c->pid == sp->pid && c->key == sp->key)
                {
                    pool_debug("found pid:%d key:%d i:%d",ntohl(c->pid), ntohl(c->key),i);
                    /* rewind to the first backend's entry of this pool slot */
                    c = pool_coninfo(i, j, 0);
                    found = true;
                    goto found;
                }
            }
        }
    }

 found:
    if (!found)
    {
        pool_error("cancel_request: invalid cancel key: pid:%d key:%d",ntohl(sp->pid), ntohl(sp->key));
        return; /* invalid key */
    }

    /* c advances together with i so that it always refers to backend i */
    for (i=0;i<NUM_BACKENDS;i++,c++)
    {
        if (!VALID_BACKEND(i))
            continue;

        if (*(BACKEND_INFO(i).backend_hostname) == '/')
            fd = connect_unix_domain_socket(i, TRUE);
        else
            fd = connect_inet_domain_socket(i, TRUE);

        if (fd < 0)
        {
            pool_error("Could not create socket for sending cancel request for backend %d", i);
            return;
        }

        con = pool_open(fd);
        if (con == NULL)
        {
            /* nothing owns the socket yet, so release it here */
            close(fd);
            return;
        }

        len = htonl(sizeof(len) + sizeof(CancelPacket));
        pool_write(con, &len, sizeof(len));

        cp.protoVersion = sp->protoVersion;
        cp.pid = c->pid;
        cp.key = c->key;

        pool_log("cancel_request: canceling backend pid:%d key: %d", ntohl(cp.pid),ntohl(cp.key));

        if (pool_write_and_flush(con, &cp, sizeof(CancelPacket)) < 0)
            pool_error("Could not send cancel request packet for backend %d", i);

        pool_close(con);

        /*
         * this is needed to ensure that the next DB node executes the
         * query supposed to be canceled.
         */
        sleep(1);
    }
}
/*
 * Reuse an existing pooled backend connection for a new frontend.
 *
 * The old startup packet is freed once and replaced by sp in every valid
 * backend slot, the frontend is re-authenticated, and (protocol 3 only) the
 * application name and the parameter statuses are sent.  Finally a
 * ReadyForQuery message is sent to the frontend.
 *
 * Returns true on success and false on failure.
 */
static bool connect_using_existing_connection(POOL_CONNECTION *frontend, POOL_CONNECTION_POOL *backend, StartupPacket *sp)
{
    int node;
    int old_packet_released = 0;

    /* Save startup packet info */
    for (node = 0; node < NUM_BACKENDS; node++)
    {
        if (!VALID_BACKEND(node))
            continue;

        if (!old_packet_released)
        {
            pool_free_startup_packet(backend->slots[node]->sp);
            old_packet_released = 1;
        }
        backend->slots[node]->sp = sp;
    }

    /* Reuse existing connection to backend */
    if (pool_do_reauth(frontend, backend))
        goto reject;

    if (MAJOR(backend) == 3)
    {
        char set_cmd[1024];

        /*
         * If we have received application_name in the start up
         * packet, we send SET command to backend. Also we add or
         * replace existing application_name data.
         */
        if (sp->application_name)
        {
            /* NOTE(review): application_name is embedded in the SQL text unescaped */
            snprintf(set_cmd, sizeof(set_cmd), "SET application_name TO '%s'", sp->application_name);

            for (node = 0; node < NUM_BACKENDS; node++)
            {
                if (!VALID_BACKEND(node))
                    continue;

                if (do_command(frontend, CONNECTION(backend, node),
                               set_cmd, MAJOR(backend),
                               MASTER_CONNECTION(backend)->pid,
                               MASTER_CONNECTION(backend)->key, 0) != POOL_CONTINUE)
                {
                    pool_error("connect_using_existing_connection: do_command failed. command: %s", set_cmd);
                    return false;
                }
            }

            pool_add_param(&MASTER(backend)->params, "application_name", sp->application_name);
        }

        if (send_params(frontend, backend))
            goto reject;
    }

    /* Send ReadyForQuery to frontend */
    pool_write(frontend, "Z", 1);

    if (MAJOR(backend) == 3)
    {
        int len;
        char tstate;

        len = htonl(5);
        pool_write(frontend, &len, sizeof(len));
        tstate = TSTATE(backend, MASTER_NODE_ID);
        pool_write(frontend, &tstate, 1);
    }

    if (pool_flush(frontend) < 0)
        goto reject;

    return true;

reject:
    /* shared failure path: drop the frontend and undo the connection count */
    pool_close(frontend);
    connection_count_down();
    return false;
}
/*
 * Relay a RowDescription message from the backends to the frontend.
 *
 * The message is read from the master and from every other valid backend.
 * The master's values are forwarded to the frontend; the values of the other
 * backends are only compared against them.
 *
 * frontend: connection the message is forwarded to
 * backend:  backend connections the message is read from
 * result:   receives the number of fields (host byte order)
 *
 * Returns POOL_FATAL when the number of fields, a field name length or a
 * field size differs between backends, POOL_END when a field name cannot be
 * read or a checked write fails, and otherwise the result of pool_flush().
 * Differences in type oid or modifier are only reported with pool_debug().
 */
int RowDescription(POOL_CONNECTION *frontend,
                   POOL_CONNECTION_POOL *backend,
                   short *result)
{
    short num_fields, num_fields1 = 0;
    int oid, mod;
    int oid1, mod1;
    short size, size1;
    char *string;
    int len, len1;
    int i;

    /* values suffixed with "1" always hold what the master sent */
    pool_read(MASTER(backend), &num_fields, sizeof(short));
    num_fields1 = num_fields;
    for (i=0;i<NUM_BACKENDS;i++)
    {
        if (VALID_BACKEND(i) && !IS_MASTER_NODE_ID(i))
        {
            /* # of fields (could be 0) */
            pool_read(CONNECTION(backend, i), &num_fields, sizeof(short));
            if (num_fields != num_fields1)
            {
                pool_error("RowDescription: num_fields does not match between backends master(%d) and %d th backend(%d)",
                           ntohs(num_fields1), i, ntohs(num_fields));
                return POOL_FATAL;
            }
        }
    }

    /* forward it to the frontend (still in network byte order) */
    pool_write(frontend, "T", 1);
    pool_write(frontend, &num_fields1, sizeof(short));
    num_fields = ntohs(num_fields1);

    for (i = 0;i<num_fields;i++)
    {
        int j;

        /* field name */
        string = pool_read_string(MASTER(backend), &len, 0);
        if (string == NULL)
            return POOL_END;
        len1 = len;
        if (pool_write(frontend, string, len) < 0)
            return POOL_END;

        for (j=0;j<NUM_BACKENDS;j++)
        {
            if (VALID_BACKEND(j) && !IS_MASTER_NODE_ID(j))
            {
                string = pool_read_string(CONNECTION(backend, j), &len, 0);
                if (string == NULL)
                    return POOL_END;

                if (len != len1)
                {
                    /* len is used as a byte count above, so it is printed as is */
                    pool_error("RowDescription: field length does not match between backends master(%d) and %d th backend(%d)",
                               len1, j, len);
                    return POOL_FATAL;
                }
            }
        }

        /* type oid */
        pool_read(MASTER(backend), &oid, sizeof(int));
        oid1 = oid;
        pool_debug("RowDescription: type oid: %d", ntohl(oid));
        for (j=0;j<NUM_BACKENDS;j++)
        {
            if (VALID_BACKEND(j) && !IS_MASTER_NODE_ID(j))
            {
                pool_read(CONNECTION(backend, j), &oid, sizeof(int));

                /* we do not regard oid mismatch as fatal */
                if (oid != oid1)
                {
                    pool_debug("RowDescription: field oid does not match between backends master(%d) and %d th backend(%d)",
                               ntohl(oid1), j, ntohl(oid));
                }
            }
        }
        if (pool_write(frontend, &oid1, sizeof(int)) < 0)
            return POOL_END;

        /* size */
        pool_read(MASTER(backend), &size, sizeof(short));
        size1 = size;
        for (j=0;j<NUM_BACKENDS;j++)
        {
            if (VALID_BACKEND(j) && !IS_MASTER_NODE_ID(j))
            {
                pool_read(CONNECTION(backend, j), &size, sizeof(short));
                /* compare this backend's size with the master's */
                if (size != size1)
                {
                    pool_error("RowDescription: field size does not match between backends master(%d) and %d th backend(%d)",
                               ntohs(size1), j, ntohs(size));
                    return POOL_FATAL;
                }
            }
        }
        pool_debug("RowDescription: field size: %d", ntohs(size1));
        pool_write(frontend, &size1, sizeof(short));

        /* modifier */
        pool_read(MASTER(backend), &mod, sizeof(int));
        /* the modifier is read as an int, hence ntohl */
        pool_debug("RowDescription: modifier: %d", ntohl(mod));
        mod1 = mod;
        for (j=0;j<NUM_BACKENDS;j++)
        {
            if (VALID_BACKEND(j) && !IS_MASTER_NODE_ID(j))
            {
                pool_read(CONNECTION(backend, j), &mod, sizeof(int));
                if (mod != mod1)
                {
                    pool_debug("RowDescription: modifier does not match between backends master(%d) and %d th backend(%d)",
                               ntohl(mod1), j, ntohl(mod));
                }
            }
        }
        if (pool_write(frontend, &mod1, sizeof(int)) < 0)
            return POOL_END;
    }

    *result = num_fields;

    return pool_flush(frontend);
}
/*
 * create actual connections to backends
 *
 * For every backend accepted by VALID_BACKEND() a slot is allocated,
 * connected through create_cp() and stored in "p".  Returns "p" when at
 * least one backend was connected, otherwise NULL.  NULL is also returned
 * when the slot allocation or pool_init_params() fails.
 */
static POOL_CONNECTION_POOL *new_connection(POOL_CONNECTION_POOL *p)
{
    int connected = 0;
    int node;

    for (node = 0; node < NUM_BACKENDS; node++)
    {
        POOL_CONNECTION_POOL_SLOT *slot;

        pool_debug("new_connection: connecting %d backend", node);

        if (!VALID_BACKEND(node))
        {
            pool_debug("new_connection: skipping slot %d because backend_status = %d",
                       node, BACKEND_INFO(node).backend_status);
            continue;
        }

        slot = malloc(sizeof(*slot));
        if (!slot)
        {
            pool_error("new_connection: malloc() failed");
            return NULL;
        }

        if (!create_cp(slot, node))
        {
            /* the backend could not be connected */
            pool_error("new_connection: create_cp() failed");

            /*
             * A failover is requested only when the configuration allows
             * it; child_exit() is called in both cases.
             */
            if (!pool_config->fail_over_on_backend_error)
                pool_log("new_connection: do not failover because fail_over_on_backend_error is off");
            else
                notice_backend_error(node);

            child_exit(1);
        }

        /* register the freshly connected slot */
        p->info[node].create_time = time(NULL);
        p->slots[node] = slot;

        if (pool_init_params(&slot->con->params) != 0)
            return NULL;

        BACKEND_INFO(node).backend_status = CON_UP;
        connected++;
    }

    return (connected > 0) ? p : NULL;
}
/* * create a connection pool by user and database */ POOL_CONNECTION_POOL *pool_create_cp(void) { int i, freed = 0; time_t closetime; POOL_CONNECTION_POOL *oldestp; POOL_CONNECTION_POOL *ret; ConnectionInfo *info; POOL_CONNECTION_POOL *p = pool_connection_pool; if (p == NULL) { pool_error("pool_create_cp: pool_connection_pool is not initialized"); return NULL; } for (i=0;i<pool_config->max_pool;i++) { if (MASTER_CONNECTION(p) == NULL) { ret = new_connection(p); if (ret) pool_index = i; return ret; } p++; } pool_debug("no empty connection slot was found"); /* * no empty connection slot was found. look for the oldest connection and discard it. */ oldestp = p = pool_connection_pool; closetime = MASTER_CONNECTION(p)->closetime; pool_index = 0; for (i=0;i<pool_config->max_pool;i++) { pool_debug("user: %s database: %s closetime: %ld", MASTER_CONNECTION(p)->sp->user, MASTER_CONNECTION(p)->sp->database, MASTER_CONNECTION(p)->closetime); if (MASTER_CONNECTION(p)->closetime < closetime) { closetime = MASTER_CONNECTION(p)->closetime; oldestp = p; pool_index = i; } p++; } p = oldestp; pool_send_frontend_exits(p); pool_debug("discarding old %zd th connection. user: %s database: %s", oldestp - pool_connection_pool, MASTER_CONNECTION(p)->sp->user, MASTER_CONNECTION(p)->sp->database); for (i=0;i<NUM_BACKENDS;i++) { if (!VALID_BACKEND(i)) continue; if (!freed) { pool_free_startup_packet(CONNECTION_SLOT(p, i)->sp); freed = 1; } pool_close(CONNECTION(p, i)); free(CONNECTION_SLOT(p, i)); } info = p->info; memset(p, 0, sizeof(POOL_CONNECTION_POOL)); p->info = info; memset(p->info, 0, sizeof(ConnectionInfo) * MAX_NUM_BACKENDS); ret = new_connection(p); return ret; }
void pool_backend_timer(void) { #define TMINTMAX 0x7fffffff POOL_CONNECTION_POOL *p = pool_connection_pool; int i, j; time_t now; time_t nearest = TMINTMAX; ConnectionInfo *info; POOL_SETMASK(&BlockSig); now = time(NULL); pool_debug("pool_backend_timer_handler called at %ld", now); for (i=0;i<pool_config->max_pool;i++, p++) { if (!MASTER_CONNECTION(p)) continue; if (!MASTER_CONNECTION(p)->sp) continue; if (MASTER_CONNECTION(p)->sp->user == NULL) continue; /* timer expire? */ if (MASTER_CONNECTION(p)->closetime) { int freed = 0; pool_debug("pool_backend_timer_handler: expire time: %ld", MASTER_CONNECTION(p)->closetime+pool_config->connection_life_time); if (now >= (MASTER_CONNECTION(p)->closetime+pool_config->connection_life_time)) { /* discard expired connection */ pool_debug("pool_backend_timer_handler: expires user %s database %s", MASTER_CONNECTION(p)->sp->user, MASTER_CONNECTION(p)->sp->database); pool_send_frontend_exits(p); for (j=0;j<NUM_BACKENDS;j++) { if (!VALID_BACKEND(j)) continue; if (!freed) { pool_free_startup_packet(CONNECTION_SLOT(p, j)->sp); freed = 1; } pool_close(CONNECTION(p, j)); free(CONNECTION_SLOT(p, j)); } info = p->info; memset(p, 0, sizeof(POOL_CONNECTION_POOL)); p->info = info; memset(p->info, 0, sizeof(ConnectionInfo) * MAX_NUM_BACKENDS); /* prepare to shutdown connections to system db */ if(pool_config->system_db_dynamic_connection && (pool_config->parallel_mode || pool_config->enable_query_cache)) { if (system_db_info->pgconn) pool_close_libpq_connection(); if (pool_system_db_connection() && pool_system_db_connection()->con) { pool_send_frontend_exit(pool_system_db_connection()->con); pool_close(pool_system_db_connection()->con); } if( system_db_info->connection ) { free( system_db_info->connection ); memset(system_db_info->connection, 0, sizeof(POOL_CONNECTION_POOL_SLOT)); system_db_info->connection = NULL; } } } else { /* look for nearest timer */ if (MASTER_CONNECTION(p)->closetime < nearest) nearest = 
MASTER_CONNECTION(p)->closetime; } } } /* any remaining timer */ if (nearest != TMINTMAX) { nearest = pool_config->connection_life_time - (now - nearest); if (nearest <= 0) nearest = 1; pool_signal(SIGALRM, pool_backend_timer_handler); alarm(nearest); } POOL_SETMASK(&UnBlockSig); }
/* * backend connection error, failover/failback request, if possible * failover() must be called under protecting signals. */ static void failover(void) { int i; int node_id; int new_master; int nodes[MAX_NUM_BACKENDS]; pool_debug("failover_handler called"); memset(nodes, 0, sizeof(int) * MAX_NUM_BACKENDS); /* * this could happen in a child process if a signal has been sent * before resetting signal handler */ if (getpid() != mypid) { pool_debug("failover_handler: I am not parent"); kill(pcp_pid, SIGUSR2); return; } /* * processing SIGTERM, SIGINT or SIGQUIT */ if (exiting) { pool_debug("failover_handler called while exiting"); kill(pcp_pid, SIGUSR2); return; } /* * processing fail over or switch over */ if (switching) { pool_debug("failover_handler called while switching"); kill(pcp_pid, SIGUSR2); return; } pool_semaphore_lock(REQUEST_INFO_SEM); if (Req_info->kind == CLOSE_IDLE_REQUEST) { pool_semaphore_unlock(REQUEST_INFO_SEM); kill_all_children(SIGUSR1); kill(pcp_pid, SIGUSR2); return; } /* * if not in replication mode/master slave mode, we treat this a restart request. * otherwise we need to check if we have already failovered. */ pool_debug("failover_handler: starting to select new master node"); switching = 1; node_id = Req_info->node_id[0]; /* failback request? */ if (Req_info->kind == NODE_UP_REQUEST) { if (node_id >= MAX_NUM_BACKENDS || (Req_info->kind == NODE_UP_REQUEST && VALID_BACKEND(node_id)) || (Req_info->kind == NODE_DOWN_REQUEST && !VALID_BACKEND(node_id))) { pool_semaphore_unlock(REQUEST_INFO_SEM); pool_error("failover_handler: invalid node_id %d status:%d MAX_NUM_BACKENDS: %d", node_id, BACKEND_INFO(node_id).backend_status, MAX_NUM_BACKENDS); kill(pcp_pid, SIGUSR2); switching = 0; return; } pool_log("starting fail back. 
reconnect host %s(%d)", BACKEND_INFO(node_id).backend_hostname, BACKEND_INFO(node_id).backend_port); BACKEND_INFO(node_id).backend_status = CON_CONNECT_WAIT; /* unset down status */ trigger_failover_command(node_id, pool_config->failback_command); } else { int cnt = 0; for (i = 0; i < MAX_NUM_BACKENDS; i++) { if (Req_info->node_id[i] != -1 && VALID_BACKEND(Req_info->node_id[i])) { pool_log("starting degeneration. shutdown host %s(%d)", BACKEND_INFO(Req_info->node_id[i]).backend_hostname, BACKEND_INFO(Req_info->node_id[i]).backend_port); BACKEND_INFO(Req_info->node_id[i]).backend_status = CON_DOWN; /* set down status */ /* save down node */ nodes[Req_info->node_id[i]] = 1; cnt++; } } if (cnt == 0) { pool_log("failover: no backends are degenerated"); pool_semaphore_unlock(REQUEST_INFO_SEM); kill(pcp_pid, SIGUSR2); switching = 0; return; } } new_master = get_next_master_node(); if (new_master == pool_config->backend_desc->num_backends) { pool_error("failover_handler: no valid DB node found"); } /* * Before we tried to minimize restarting pgpool to protect existing * connections from clients to pgpool children. What we did here was, * if children other than master went down, we did not fail over. * This is wrong. Think about following scenario. If someone * accidentally plugs out the network cable, the TCP/IP stack keeps * retrying for long time (typically 2 hours). The only way to stop * the retry is restarting the process. Bottom line is, we need to * restart all children in any case. See pgpool-general list posting * "TCP connections are *not* closed when a backend timeout" on Jul 13 * 2008 for more details. */ #ifdef NOT_USED else { if (Req_info->master_node_id == new_master && *InRecovery == 0) { pool_log("failover_handler: do not restart pgpool. same master node %d was selected", new_master); if (Req_info->kind == NODE_UP_REQUEST) { pool_log("failback done. 
reconnect host %s(%d)", BACKEND_INFO(node_id).backend_hostname, BACKEND_INFO(node_id).backend_port); } else { pool_log("failover done. shutdown host %s(%d)", BACKEND_INFO(node_id).backend_hostname, BACKEND_INFO(node_id).backend_port); } /* exec failover_command */ for (i = 0; i < pool_config->backend_desc->num_backends; i++) { if (nodes[i]) trigger_failover_command(i, pool_config->failover_command); } pool_semaphore_unlock(REQUEST_INFO_SEM); switching = 0; kill(pcp_pid, SIGUSR2); switching = 0; return; } } #endif /* kill all children */ for (i = 0; i < pool_config->num_init_children; i++) { pid_t pid = pids[i].pid; if (pid) { kill(pid, SIGQUIT); pool_debug("failover_handler: kill %d", pid); } } /* exec failover_command */ for (i = 0; i < pool_config->backend_desc->num_backends; i++) { if (nodes[i]) trigger_failover_command(i, pool_config->failover_command); } pool_log("failover_handler: set new master node: %d", new_master); Req_info->master_node_id = new_master; /* no need to wait since it will be done in reap_handler */ #ifdef NOT_USED while (wait(NULL) > 0) ; if (errno != ECHILD) pool_error("failover_handler: wait() failed. reason:%s", strerror(errno)); #endif memset(Req_info->node_id, -1, sizeof(int) * MAX_NUM_BACKENDS); pool_semaphore_unlock(REQUEST_INFO_SEM); /* fork the children */ for (i=0;i<pool_config->num_init_children;i++) { pids[i].pid = fork_a_child(unix_fd, inet_fd, i); pids[i].start_time = time(NULL); } if (Req_info->kind == NODE_UP_REQUEST) { pool_log("failback done. reconnect host %s(%d)", BACKEND_INFO(node_id).backend_hostname, BACKEND_INFO(node_id).backend_port); } else { pool_log("failover done. shutdown host %s(%d)", BACKEND_INFO(node_id).backend_hostname, BACKEND_INFO(node_id).backend_port); } switching = 0; /* kick wakeup_handler in pcp_child to notice that * faiover/failback done */ kill(pcp_pid, SIGUSR2); }
/* * find connection by user and database */ POOL_CONNECTION_POOL *pool_get_cp(char *user, char *database, int protoMajor, int check_socket) { #ifdef HAVE_SIGPROCMASK sigset_t oldmask; #else int oldmask; #endif int i, freed = 0; ConnectionInfo *info; POOL_CONNECTION_POOL *p = pool_connection_pool; if (p == NULL) { pool_error("pool_get_cp: pool_connection_pool is not initialized"); return NULL; } POOL_SETMASK2(&BlockSig, &oldmask); for (i=0;i<pool_config->max_pool;i++) { if (MASTER_CONNECTION(p) && MASTER_CONNECTION(p)->sp && MASTER_CONNECTION(p)->sp->major == protoMajor && MASTER_CONNECTION(p)->sp->user != NULL && strcmp(MASTER_CONNECTION(p)->sp->user, user) == 0 && strcmp(MASTER_CONNECTION(p)->sp->database, database) == 0) { int sock_broken = 0; int j; /* mark this connection is under use */ MASTER_CONNECTION(p)->closetime = 0; for (j=0;j<NUM_BACKENDS;j++) { p->info[j].counter++; } POOL_SETMASK(&oldmask); if (check_socket) { for (j=0;j<NUM_BACKENDS;j++) { if (!VALID_BACKEND(j)) continue; if (CONNECTION_SLOT(p, j)) { sock_broken = check_socket_status(CONNECTION(p, j)->fd); if (sock_broken < 0) break; } else { sock_broken = -1; break; } } if (sock_broken < 0) { pool_log("connection closed. retry to create new connection pool."); for (j=0;j<NUM_BACKENDS;j++) { if (!VALID_BACKEND(j) || (CONNECTION_SLOT(p, j) == NULL)) continue; if (!freed) { pool_free_startup_packet(CONNECTION_SLOT(p, j)->sp); freed = 1; } pool_close(CONNECTION(p, j)); free(CONNECTION_SLOT(p, j)); } info = p->info; memset(p, 0, sizeof(POOL_CONNECTION_POOL_SLOT)); p->info = info; memset(p->info, 0, sizeof(ConnectionInfo) * MAX_NUM_BACKENDS); POOL_SETMASK(&oldmask); return NULL; } } POOL_SETMASK(&oldmask); pool_index = i; return p; } p++; } POOL_SETMASK(&oldmask); return NULL; }
/* * Start online recovery. * "recovery_node" is the node to be recovered. * Master or primary node is chosen in this function. */ int start_recovery(int recovery_node) { int node_id; BackendInfo *backend; BackendInfo *recovery_backend; PGconn *conn; int failback_wait_count; #define FAILBACK_WAIT_MAX_RETRY 5 /* 5 seconds should be enough for failback operation */ pool_log("starting recovering node %d", recovery_node); if (VALID_BACKEND(recovery_node)) { pool_error("start_recovery: backend node %d is alive", recovery_node); return 1; } Req_info->kind = NODE_RECOVERY_REQUEST; /* select master/primary node */ node_id = MASTER_SLAVE ? PRIMARY_NODE_ID : REAL_MASTER_NODE_ID; backend = &pool_config->backend_desc->backend_info[node_id]; /* get node info to be recovered */ recovery_backend = &pool_config->backend_desc->backend_info[recovery_node]; conn = connect_backend_libpq(backend); if (conn == NULL) { pool_error("start_recovery: could not connect master node (%d)", node_id); return 1; } /* 1st stage */ if (REPLICATION) { if (exec_checkpoint(conn) != 0) { PQfinish(conn); pool_error("start_recovery: CHECKPOINT failed"); return 1; } pool_log("CHECKPOINT in the 1st stage done"); } if (exec_recovery(conn, recovery_backend, FIRST_STAGE) != 0) { PQfinish(conn); return 1; } pool_log("1st stage is done"); if (REPLICATION) { pool_log("starting 2nd stage"); /* 2nd stage */ *InRecovery = RECOVERY_ONLINE; if (pool_config->use_watchdog) { /* announce start recovery */ if (WD_OK != wd_start_recovery()) { PQfinish(conn); pool_error("start_recovery: timeover for waiting connection closed in the other pgpools"); return 1; } } if (wait_connection_closed() != 0) { PQfinish(conn); pool_error("start_recovery: timeover for waiting connection closed"); return 1; } pool_log("all connections from clients have been closed"); if (exec_checkpoint(conn) != 0) { PQfinish(conn); pool_error("start_recovery: CHECKPOINT failed"); return 1; } pool_log("CHECKPOINT in the 2nd stage done"); if 
(exec_recovery(conn, recovery_backend, SECOND_STAGE) != 0) { PQfinish(conn); return 1; } } if (exec_remote_start(conn, recovery_backend) != 0) { PQfinish(conn); pool_error("start_recovery: remote start failed"); return 1; } if (check_postmaster_started(recovery_backend)) { PQfinish(conn); pool_error("start_recovery: check start failed"); return 1; } pool_log("%d node restarted", recovery_node); /* * reset failover completion flag. this is necessary since * previous failover/failback will set the flag to 1. */ pcp_wakeup_request = 0; /* send failback request to pgpool parent */ send_failback_request(recovery_node); /* wait for failback */ failback_wait_count = 0; while (!pcp_wakeup_request) { struct timeval t = {1, 0}; /* polling SIGUSR2 signal every 1 sec */ select(0, NULL, NULL, NULL, &t); failback_wait_count++; if (failback_wait_count >= FAILBACK_WAIT_MAX_RETRY) { pool_log("start_recovery: waiting for wake up request is timeout(%d seconds)", FAILBACK_WAIT_MAX_RETRY); break; } } pcp_wakeup_request = 0; PQfinish(conn); pool_log("recovery done"); return 0; }