/* * Establish persistent connection to backend */ static void establish_persistent_connection(void) { int i; BackendInfo *bkinfo; POOL_CONNECTION_POOL_SLOT *s; for (i=0;i<NUM_BACKENDS;i++) { if (!VALID_BACKEND(i)) continue; if (slots[i] == NULL) { bkinfo = pool_get_node_info(i); s = make_persistent_db_connection(bkinfo->backend_hostname, bkinfo->backend_port, "postgres", pool_config->sr_check_user, pool_config->sr_check_password, true); if (s) slots[i] = s; else slots[i] = NULL; } } }
/*
 * Send the information of one backend node to a PCP client.
 *
 * frontend: PCP connection the reply is written to.
 * buf:      node ID as decimal text (converted with atoi()).
 *
 * The reply is a single packet: the type byte 'i', a length word in network
 * byte order (the length counts itself plus the payload), then the payload
 * made of NUL-terminated strings in this order: "CommandComplete", host
 * name, port, status, weight.
 *
 * If the node ID does not resolve to a node, an ERROR is reported through
 * ereport().
 *
 * NOTE(review): status[] has room for a single digit only, so a
 * backend_status outside 0..9 is truncated by snprintf() - confirm that
 * the status values never exceed one digit.
 */
static void inform_node_info(PCP_CONNECTION *frontend,char *buf)
{
	char code[] = "CommandComplete";
	char port_str[6];
	char status[2];
	char weight_str[20];
	size_t host_len;
	size_t port_len;
	size_t status_len;
	size_t weight_len;
	int wsize;
	const int node_id = atoi(buf);
	BackendInfo *bi = pool_get_node_info(node_id);

	if (bi == NULL)
		ereport(ERROR,
				(errmsg("informing node info failed"),
				 errdetail("invalid node ID")));

	ereport(DEBUG2,
			(errmsg("PCP: informing node info"),
			 errdetail("retrieved node information from shared memory")));

	/* Render the numeric fields as text. */
	snprintf(port_str, sizeof(port_str), "%d", bi->backend_port);
	snprintf(status, sizeof(status), "%d", bi->backend_status);
	snprintf(weight_str, sizeof(weight_str), "%f", bi->backend_weight);

	/* Every field goes on the wire with its terminating NUL. */
	host_len = strlen(bi->backend_hostname) + 1;
	port_len = strlen(port_str) + 1;
	status_len = strlen(status) + 1;
	weight_len = strlen(weight_str) + 1;

	wsize = htonl(sizeof(code) + host_len + port_len + status_len +
				  weight_len + sizeof(int));

	/* Packet header: type byte followed by the length word. */
	pcp_write(frontend, "i", 1);
	pcp_write(frontend, &wsize, sizeof(int));

	/* Packet payload. */
	pcp_write(frontend, code, sizeof(code));
	pcp_write(frontend, bi->backend_hostname, host_len);
	pcp_write(frontend, port_str, port_len);
	pcp_write(frontend, status, status_len);
	pcp_write(frontend, weight_str, weight_len);

	do_pcp_flush(frontend);
}
POOL_REPORT_NODES* get_nodes(int *nrows) { int i; POOL_REPORT_NODES* nodes = malloc(NUM_BACKENDS * sizeof(POOL_REPORT_NODES)); BackendInfo *bi = NULL; for (i = 0; i < NUM_BACKENDS; i++) { bi = pool_get_node_info(i); snprintf(nodes[i].node_id, POOLCONFIG_MAXSTATLEN, "%d", i); strncpy(nodes[i].hostname, bi->backend_hostname, strlen(bi->backend_hostname)+1); snprintf(nodes[i].port, POOLCONFIG_MAXIDENTLEN, "%d", bi->backend_port); snprintf(nodes[i].status, POOLCONFIG_MAXSTATLEN, "%d", bi->backend_status); snprintf(nodes[i].lb_weight, POOLCONFIG_MAXWEIGHTLEN, "%f", bi->backend_weight/RAND_MAX); } *nrows = i; return nodes; }
/*
 * Decide where to send queries (thus expecting response).
 *
 * Clears the node map of query_context, marks the backend node(s) that must
 * receive the query, and finally stores in
 * query_context->virtual_master_node_id the lowest node index that is marked
 * in query_context->where_to_send[].
 *
 * query_context: context of the query being routed. When it is NULL an error
 *                is logged and nothing else happens.
 * query:         query text. It is checked for the NO_LOAD_BALANCE prefix
 *                and handed to the classification helpers.
 * node:          statement node of the query; its kind is tested with IsA().
 *
 * The routing depends on the running mode, tested in this order: RAW_MODE,
 * MASTER_SLAVE with a multi statement query, MASTER_SLAVE, REPLICATION or
 * PARALLEL_MODE. Any other mode logs an error and returns with the node map
 * cleared and virtual_master_node_id untouched.
 *
 * NOTE(review): the result of pool_get_session_context() is dereferenced
 * without a NULL check - confirm it cannot return NULL here.
 */
void pool_where_to_send(POOL_QUERY_CONTEXT *query_context, char *query, Node *node)
{
	POOL_SESSION_CONTEXT *session_context;
	POOL_CONNECTION_POOL *backend;
	int i;

	if (!query_context)
	{
		pool_error("pool_where_to_send: no query context");
		return;
	}

	session_context = pool_get_session_context();
	backend = session_context->backend;

	/*
	 * Zap out DB node map
	 */
	pool_clear_node_to_be_sent(query_context);

	/*
	 * If there is "NO LOAD BALANCE" comment, we send only to master node.
	 * The comparison is a case insensitive prefix match, so the comment has
	 * to be at the very beginning of the query text.
	 */
	if (!strncasecmp(query, NO_LOAD_BALANCE, NO_LOAD_BALANCE_COMMENT_SZ))
	{
		pool_set_node_to_be_sent(query_context, MASTER_SLAVE ? PRIMARY_NODE_ID : REAL_MASTER_NODE_ID);

		/* The virtual master is the first node marked in the map. */
		for (i=0;i<NUM_BACKENDS;i++)
		{
			if (query_context->where_to_send[i])
			{
				query_context->virtual_master_node_id = i;
				break;
			}
		}
		return;
	}

	/*
	 * In raw mode, we send only to master node. Simple enough.
	 */
	if (RAW_MODE)
	{
		pool_set_node_to_be_sent(query_context, REAL_MASTER_NODE_ID);
	}
	else if (MASTER_SLAVE && query_context->is_multi_statement)
	{
		/*
		 * If we are in master/slave mode and we have a multi statement
		 * query, we should send it to the primary server only. Otherwise
		 * it is possible to send a write query to standby servers
		 * because we only use the first element of the multi
		 * statement query and don't care about the rest. Typical
		 * situation where we are bugged by this is, "BEGIN;DELETE
		 * FROM table;END". Note that from pgpool-II 3.1.0
		 * transactional statements such as "BEGIN" are unconditionally
		 * sent to all nodes (see send_to_where() for more details).
		 * Someday we might be able to understand all parts of multi
		 * statement queries, but until that day we need this band
		 * aid.
		 *
		 * NOTE(review): the test below repeats the condition of the
		 * enclosing "else if", so it is always true at this point.
		 */
		if (query_context->is_multi_statement)
		{
			pool_set_node_to_be_sent(query_context, PRIMARY_NODE_ID);
		}
	}
	else if (MASTER_SLAVE)
	{
		POOL_DEST dest;
		POOL_MEMORY_POOL *old_context;

		/*
		 * Classify the query while the memory context of the query is
		 * active, then switch back to the previous context.
		 */
		old_context = pool_memory_context_switch_to(query_context->memory_context);
		dest = send_to_where(node, query);
		pool_memory_context_switch_to(old_context);

		pool_debug("send_to_where: %d query: %s", dest, query);

		/* Should be sent to primary only? */
		if (dest == POOL_PRIMARY)
		{
			pool_set_node_to_be_sent(query_context, PRIMARY_NODE_ID);
		}
		/* Should be sent to both primary and standby? */
		else if (dest == POOL_BOTH)
		{
			pool_setall_node_to_be_sent(query_context);
		}
		/*
		 * Ok, we might be able to load balance the SELECT query.
		 */
		else
		{
			if (pool_config->load_balance_mode &&
				is_select_query(node, query) &&
				MAJOR(backend) == PROTO_MAJOR_V3)
			{
				/*
				 * If (we are outside of an explicit transaction) OR
				 * (the transaction has not issued a write query yet, AND
				 * the transaction has not failed, AND
				 * transaction isolation level is not SERIALIZABLE)
				 * we might be able to load balance.
				 */
				if (TSTATE(backend, PRIMARY_NODE_ID) == 'I' ||
					(!pool_is_writing_transaction() &&
					 !pool_is_failed_transaction() &&
					 pool_get_transaction_isolation() != POOL_SERIALIZABLE))
				{
					/*
					 * NOTE(review): bkinfo is dereferenced below without a
					 * NULL check - confirm load_balance_node_id is always
					 * a valid node ID.
					 */
					BackendInfo *bkinfo = pool_get_node_info(session_context->load_balance_node_id);

					/*
					 * Load balance if possible
					 */

					/*
					 * If replication delay is too much, we prefer to send to the primary.
					 * This only applies to streaming replication mode with a
					 * non zero delay_threshold.
					 */
					if (!strcmp(pool_config->master_slave_sub_mode, MODE_STREAMREP) &&
						pool_config->delay_threshold &&
						bkinfo->standby_delay > pool_config->delay_threshold)
					{
						pool_set_node_to_be_sent(query_context, PRIMARY_NODE_ID);
					}

					/*
					 * If a writing function call is used,
					 * we prefer to send to the primary.
					 */
					else if (pool_has_function_call(node))
					{
						pool_set_node_to_be_sent(query_context, PRIMARY_NODE_ID);
					}

					/*
					 * If system catalog is used in the SELECT, we
					 * prefer to send to the primary. Example: SELECT
					 * * FROM pg_class WHERE relname = 't1'; Because
					 * 't1' is a constant, it's hard to recognize as
					 * table name. The most common use of such a query is
					 * against the system catalog, and the table name can
					 * be a temporary table, so it's best to query
					 * against the primary system catalog.
					 * Please note that this test must be done *before*
					 * test using pool_has_temp_table.
					 */
					else if (pool_has_system_catalog(node))
					{
						pool_set_node_to_be_sent(query_context, PRIMARY_NODE_ID);
					}

					/*
					 * If temporary table is used in the SELECT,
					 * we prefer to send to the primary.
					 * The check is skipped when check_temp_table is off.
					 */
					else if (pool_config->check_temp_table && pool_has_temp_table(node))
					{
						pool_set_node_to_be_sent(query_context, PRIMARY_NODE_ID);
					}

					/*
					 * If unlogged table is used in the SELECT,
					 * we prefer to send to the primary.
					 */
					else if (pool_has_unlogged_table(node))
					{
						pool_set_node_to_be_sent(query_context, PRIMARY_NODE_ID);
					}
					else
					{
						/* Nothing prevents load balancing: use the session's node. */
						pool_set_node_to_be_sent(query_context, session_context->load_balance_node_id);
					}
				}
				else
				{
					/* Send to the primary only */
					pool_set_node_to_be_sent(query_context, PRIMARY_NODE_ID);
				}
			}
			else
			{
				/* Send to the primary only */
				pool_set_node_to_be_sent(query_context, PRIMARY_NODE_ID);
			}
		}
	}
	else if (REPLICATION || PARALLEL_MODE)
	{
		if (is_select_query(node, query))
		{
			/*
			 * If a writing function call is used or replicate_select is true,
			 * we prefer to send to all nodes.
			 */
			if ((pool_has_function_call(node) || pool_config->replicate_select))
			{
				pool_setall_node_to_be_sent(query_context);
			}
			else if (pool_config->load_balance_mode &&
					 MAJOR(backend) == PROTO_MAJOR_V3 &&
					 TSTATE(backend, MASTER_NODE_ID) == 'I')
			{
				/* load balance */
				pool_set_node_to_be_sent(query_context, session_context->load_balance_node_id);
			}
			else
			{
				/* only send to master node */
				pool_set_node_to_be_sent(query_context, REAL_MASTER_NODE_ID);
			}
		}
		/* Cursor statements follow the loadbalance_cursor flag of the query context. */
		else if (IsA(node, DeclareCursorStmt) || IsA(node, ClosePortalStmt) || IsA(node, FetchStmt))
		{
			if (query_context->loadbalance_cursor)
			{
				if (pool_config->load_balance_mode &&
					MAJOR(backend) == PROTO_MAJOR_V3 &&
					TSTATE(backend, MASTER_NODE_ID) == 'I')
				{
					/* load balance */
					pool_set_node_to_be_sent(query_context, session_context->load_balance_node_id);
				}
				else
				{
					/* only send to master node */
					pool_set_node_to_be_sent(query_context, REAL_MASTER_NODE_ID);
				}
			}
			else
			{
				/* send to all nodes */
				pool_setall_node_to_be_sent(query_context);
			}
		}
		else
		{
			/* send to all nodes */
			pool_setall_node_to_be_sent(query_context);
		}
	}
	else
	{
		pool_error("pool_where_to_send: unknown mode");
		return;
	}

	/*
	 * EXECUTE?
	 * Look up the sent message with the same name, first with kind 'Q' and
	 * then with kind 'P', and let pool_copy_prep_where() carry its node map
	 * over to this query.
	 */
	if (IsA(node, ExecuteStmt))
	{
		POOL_SENT_MESSAGE *msg;

		msg = pool_get_sent_message('Q', ((ExecuteStmt *)node)->name);
		if (!msg)
			msg = pool_get_sent_message('P', ((ExecuteStmt *)node)->name);
		if (msg)
			pool_copy_prep_where(msg->query_context->where_to_send, query_context->where_to_send);
	}

	/*
	 * DEALLOCATE?
	 */
	else if (IsA(node, DeallocateStmt))
	{
		where_to_send_deallocate(query_context, node);
	}

	/*
	 * The virtual master is the first node marked in the map. When no node
	 * is marked, virtual_master_node_id keeps its previous value.
	 */
	for (i=0;i<NUM_BACKENDS;i++)
	{
		if (query_context->where_to_send[i])
		{
			query_context->virtual_master_node_id = i;
			break;
		}
	}

	return;
}
/* * trigger_failover_command: execute specified command at failover. * command_line is null-terminated string. */ static int trigger_failover_command(int node, const char *command_line) { int r = 0; String *exec_cmd; char port_buf[6]; char buf[2]; BackendInfo *info; if (command_line == NULL || (strlen(command_line) == 0)) return 0; /* check nodeID */ if (node < 0 || node > NUM_BACKENDS) return -1; info = pool_get_node_info(node); if (!info) return -1; buf[1] = '\0'; pool_memory = pool_memory_create(PREPARE_BLOCK_SIZE); if (!pool_memory) { pool_error("trigger_failover_command: pool_memory_create() failed"); return -1; } exec_cmd = init_string(""); while (*command_line) { if (*command_line == '%') { if (*(command_line + 1)) { char val = *(command_line + 1); switch (val) { case 'p': /* port */ snprintf(port_buf, sizeof(port_buf), "%d", info->backend_port); string_append_char(exec_cmd, port_buf); break; case 'D': /* database directory */ string_append_char(exec_cmd, info->backend_data_directory); break; case 'd': /* node id */ snprintf(port_buf, sizeof(port_buf), "%d", node); string_append_char(exec_cmd, port_buf); break; case 'h': /* host name */ string_append_char(exec_cmd, info->backend_hostname); break; case 'm': /* new master node id */ snprintf(port_buf, sizeof(port_buf), "%d", get_next_master_node()); string_append_char(exec_cmd, port_buf); break; case 'M': /* old master node id */ snprintf(port_buf, sizeof(port_buf), "%d", MASTER_NODE_ID); string_append_char(exec_cmd, port_buf); break; case '%': /* escape */ string_append_char(exec_cmd, "%"); break; default: /* ignore */ break; } command_line++; } } else { buf[0] = *command_line; string_append_char(exec_cmd, buf); } command_line++; } if (strlen(exec_cmd->data) != 0) { pool_log("execute command: %s", exec_cmd->data); r = system(exec_cmd->data); } pool_memory_delete(pool_memory, 0); pool_memory = NULL; return r; }
/* * Check replicaton time lag */ static void check_replication_time_lag(void) { int i; int active_nodes = 0; POOL_STATUS sts; POOL_SELECT_RESULT *res; unsigned long long int lsn[MAX_NUM_BACKENDS]; char *query; BackendInfo *bkinfo; unsigned long long int lag; if (NUM_BACKENDS <= 1) { /* If there's only one node, there's no point to do checking */ return; } /* Count healthy nodes */ for (i=0;i<NUM_BACKENDS;i++) { if (VALID_BACKEND(i)) active_nodes++; } if (active_nodes <= 1) { /* If there's only one or less active node, there's no point * to do checking */ return; } for (i=0;i<NUM_BACKENDS;i++) { if (!VALID_BACKEND(i)) continue; if (!slots[i]) { pool_debug("check_replication_time_lag: DB node is valid but no persistent connection"); pool_error("check_replication_time_lag: could not connect to DB node %d, check sr_check_user and sr_check_password", i); return; } if (PRIMARY_NODE_ID == i) { query = "SELECT pg_current_xlog_location()"; } else { query = "SELECT pg_last_xlog_replay_location()"; } sts = do_query(slots[i]->con, query, &res, PROTO_MAJOR_V3); if (sts != POOL_CONTINUE) { pool_error("check_replication_time_lag: %s failed", query); return; } if (!res) { pool_error("check_replication_time_lag: %s result is null", query); return; } if (res->numrows <= 0) { pool_error("check_replication_time_lag: %s returns no rows", query); free_select_result(res); return; } if (res->data[0] == NULL) { pool_error("check_replication_time_lag: %s returns no data", query); free_select_result(res); return; } if (res->nullflags[0] == -1) { pool_log("check_replication_time_lag: %s returns NULL", query); free_select_result(res); lsn[i] = 0; } else { lsn[i] = text_to_lsn(res->data[0]); free_select_result(res); } } for (i=0;i<NUM_BACKENDS;i++) { if (!VALID_BACKEND(i)) continue; /* Set standby delay value */ bkinfo = pool_get_node_info(i); lag = (lsn[PRIMARY_NODE_ID] > lsn[i]) ? 
lsn[PRIMARY_NODE_ID] - lsn[i] : 0; if (PRIMARY_NODE_ID == i) { bkinfo->standby_delay = 0; } else { bkinfo->standby_delay = lag; /* Log delay if necessary */ if ((!strcmp(pool_config->log_standby_delay, "always") && lag > 0) || (pool_config->delay_threshold && !strcmp(pool_config->log_standby_delay, "if_over_threshold") && lag > pool_config->delay_threshold)) { pool_log("Replication of node:%d is behind %llu bytes from the primary server (node:%d)", i, lsn[PRIMARY_NODE_ID] - lsn[i], PRIMARY_NODE_ID); } } } }