/* * Execute the CREATE BARRIER command. Write a BARRIER WAL record and flush the * WAL buffers to disk before returning to the caller. Writing the WAL record * does not guarantee successful completion of the barrier command. */ void ProcessCreateBarrierExecute(const char *id) { StringInfoData buf; if (!IsConnFromCoord()) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("The CREATE BARRIER EXECUTE message is expected to " "arrive from a Coordinator"))); { XLogRecData rdata[1]; XLogRecPtr recptr; rdata[0].data = (char *) id; rdata[0].len = strlen(id) + 1; rdata[0].buffer = InvalidBuffer; rdata[0].next = NULL; recptr = XLogInsert(RM_BARRIER_ID, XLOG_BARRIER_CREATE, rdata); XLogFlush(recptr); } pq_beginmessage(&buf, 'b'); pq_sendstring(&buf, id); pq_endmessage(&buf); pq_flush(); }
/* ----------------
 *		ReadyForQuery - tell dest that we are ready for a new query
 *
 *		For remote destinations a 'Z' message is emitted: with protocol 3.0
 *		and up it carries the transaction block status code, with protocol
 *		2.0 it is an empty message, and older protocols get no message.
 *		The output is flushed in every remote case; doing the flush here
 *		lets most other places skip it and so reduces the number of
 *		separate packets sent.
 *
 *		All other destinations are no-ops.
 * ----------------
 */
void
ReadyForQuery(CommandDest dest)
{
	int			proto_major = PG_PROTOCOL_MAJOR(FrontendProtocol);

	switch (dest)
	{
		case DestNone:
		case DestDebug:
		case DestSPI:
		case DestTuplestore:
		case DestIntoRel:
		case DestCopyOut:
			/* nothing to tell these destinations */
			break;

		case DestRemote:
		case DestRemoteExecute:
			if (proto_major >= 3)
			{
				StringInfoData ready;

				pq_beginmessage(&ready, 'Z');
				pq_sendbyte(&ready, TransactionBlockStatusCode());
				pq_endmessage(&ready);
			}
			else if (proto_major >= 2)
				pq_putemptymessage('Z');

			/* End of cycle: push everything out regardless of protocol */
			pq_flush();
			break;
	}
}
/*
 * CreateRemoteSource - build a Source that reads COPY data from the frontend.
 *
 * Announces the start of copy-in to the client in the form appropriate for
 * the negotiated protocol version, installs the matching read callback, and
 * flushes so the client knows it may begin sending.
 *
 * path: not used by this function.
 * desc: tuple descriptor; its non-dropped attributes determine the column
 *		 count announced to a protocol 3.0+ client.
 *
 * Raises ERROR for binary COPY on pre-3.0 protocols.
 */
static Source *
CreateRemoteSource(const char *path, TupleDesc desc)
{
	RemoteSource *self = (RemoteSource *) palloc0(sizeof(RemoteSource));
	int			proto_major = PG_PROTOCOL_MAJOR(FrontendProtocol);

	self->base.close = (SourceCloseProc) RemoteSourceClose;

	if (proto_major >= 3)
	{
		/* new way: 'G' message with overall and per-column format codes */
		StringInfoData msg;
		int16		format;
		int			live_cols = 0;
		int			col;

		self->base.read = (SourceReadProc) RemoteSourceRead;

		/* dropped attributes are not announced to the client */
		for (col = 0; col < desc->natts; col++)
		{
			if (!desc->attrs[col]->attisdropped)
				live_cols++;
		}

		format = (IsBinaryCopy() ? 1 : 0);

		pq_beginmessage(&msg, 'G');
		pq_sendbyte(&msg, format);		/* overall format */
		pq_sendint(&msg, live_cols, 2);
		for (col = 0; col < live_cols; col++)
			pq_sendint(&msg, format, 2);	/* per-column formats */
		pq_endmessage(&msg);

		self->buffer = makeStringInfo();
	}
	else
	{
		/*
		 * old (2.0) and very old (pre-2.0) way: no binary support, and the
		 * two differ only in the byte of the empty message sent.
		 */
		self->base.read = (SourceReadProc) RemoteSourceReadOld;

		if (IsBinaryCopy())
			ereport(ERROR,
					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
					 errmsg("COPY BINARY is not supported to stdout or from stdin")));

		if (proto_major >= 2)
			pq_putemptymessage('G');
		else
			pq_putemptymessage('D');
	}

	/* We *must* flush here to ensure FE knows it can send. */
	pq_flush();

	return (Source *) self;
}
/*
 * putEndLocationReply - report a WAL end location to the client.
 *
 * Sends an 's' message whose payload is the xlogid and xrecoff fields of
 * *endLocation, each as a 4-byte integer, and flushes it immediately.
 */
static void
putEndLocationReply(XLogRecPtr *endLocation)
{
	StringInfoData reply;

	pq_beginmessage(&reply, 's');
	pq_sendint(&reply, endLocation->xlogid, 4);
	pq_sendint(&reply, endLocation->xrecoff, 4);
	pq_endmessage(&reply);

	pq_flush();
}
/*
 * START_REPLICATION
 *
 * Switch this backend into WAL streaming: register with the postmaster as a
 * WAL sender, verify wal_level, enter the CATCHUP state, tell the client to
 * start copy-both mode, and set the send position to cmd->startpoint.
 *
 * Raises FATAL when wal_level is 'minimal'.
 */
static void
StartReplication(StartReplicationCmd *cmd)
{
	StringInfoData copy_both;

	/*
	 * Tell the postmaster we are streaming.  From now on it treats us as a
	 * WAL sender: we outlive the bgwriter and are killed last during
	 * shutdown, which gives us the chance to ship all remaining WAL,
	 * including the shutdown checkpoint.  There is no way back from this,
	 * and no WAL records may be written by us afterwards.
	 */
	MarkPostmasterChildWalSender();
	SendPostmasterSignal(PMSIGNAL_ADVANCE_STATE_MACHINE);

	/*
	 * Make sure WAL carries enough information for log-shipping.
	 *
	 * NOTE: only the current wal_level is examined.  pg_xlog may still hold
	 * old WAL generated under 'minimal', so this is not bulletproof; it is
	 * here to produce a friendly error that hints at the right
	 * configuration.
	 */
	if (wal_level == WAL_LEVEL_MINIMAL)
		ereport(FATAL,
				(errcode(ERRCODE_CANNOT_CONNECT_NOW),
				 errmsg("standby connections not allowed because wal_level=minimal")));

	/*
	 * A freshly connected standby lags the primary.  Some users, synchronous
	 * replication for one, need this initial catchup phase to be a distinct
	 * state so that actions can be triggered when it changes later.  We may
	 * sit in this state for a long time, which is precisely why it has to be
	 * observable.
	 */
	WalSndSetState(WALSNDSTATE_CATCHUP);

	/* CopyBothResponse: a zero format byte and a zero 16-bit count */
	pq_beginmessage(&copy_both, 'W');
	pq_sendbyte(&copy_both, 0);
	pq_sendint(&copy_both, 0, 2);
	pq_endmessage(&copy_both);
	pq_flush();

	/* xlog records are shipped starting from the position the client sent */
	sentPtr = cmd->startpoint;
}
/*
 * START_REPLICATION
 *
 * Register with the postmaster as a WAL sender, verify wal_level, tell the
 * client to start copy-both mode, and set the send position to
 * cmd->startpoint.
 *
 * Raises FATAL when wal_level is 'minimal'.
 */
static void
StartReplication(StartReplicationCmd * cmd)
{
	StringInfoData copy_both;

	/*
	 * Tell the postmaster we are streaming.  From now on it treats us as a
	 * WAL sender: we outlive the bgwriter and are killed last during
	 * shutdown, which gives us the chance to ship all remaining WAL,
	 * including the shutdown checkpoint.  There is no way back from this,
	 * and no WAL records may be written by us afterwards.
	 */
	MarkPostmasterChildWalSender();

	/*
	 * Make sure WAL carries enough information for log-shipping.
	 *
	 * NOTE: only the current wal_level is examined.  pg_xlog may still hold
	 * old WAL generated under 'minimal', so this is not bulletproof; it is
	 * here to produce a friendly error that hints at the right
	 * configuration.
	 */
	if (wal_level == WAL_LEVEL_MINIMAL)
		ereport(FATAL,
				(errcode(ERRCODE_CANNOT_CONNECT_NOW),
				 errmsg("standby connections not allowed because wal_level=minimal")));

	/* CopyBothResponse: a zero format byte and a zero 16-bit count */
	pq_beginmessage(&copy_both, 'W');
	pq_sendbyte(&copy_both, 0);
	pq_sendint(&copy_both, 0, 2);
	pq_endmessage(&copy_both);
	pq_flush();

	/* xlog records are shipped starting from the position the client sent */
	sentPtr = cmd->startpoint;
}
/*
 * Send an authentication request packet to the frontend.
 *
 * The 'R' message always starts with the request code as a 4-byte integer.
 * Depending on the code it is followed by the MD5 salt (4 bytes), the crypt
 * salt (2 bytes), or, in GSSAPI/SSPI builds, the pending GSS output token.
 *
 * The message is flushed right away, except for AUTH_REQ_OK.
 */
static void
sendAuthRequest(Port *port, AuthRequest areq)
{
	StringInfoData request;

	pq_beginmessage(&request, 'R');
	pq_sendint(&request, (int32) areq, sizeof(int32));

	switch (areq)
	{
		case AUTH_REQ_MD5:
			/* salt for MD5-encrypted passwords */
			pq_sendbytes(&request, port->md5Salt, 4);
			break;

		case AUTH_REQ_CRYPT:
			/* salt for crypt()-encrypted passwords */
			pq_sendbytes(&request, port->cryptSalt, 2);
			break;

#if defined(ENABLE_GSS) || defined(ENABLE_SSPI)
		case AUTH_REQ_GSS_CONT:

			/*
			 * Authentication data for the next step of the GSSAPI or SSPI
			 * negotiation, if there is any.
			 */
			if (port->gss->outbuf.length > 0)
			{
				elog(DEBUG4, "sending GSS token of length %u",
					 (unsigned int) port->gss->outbuf.length);

				pq_sendbytes(&request, port->gss->outbuf.value,
							 port->gss->outbuf.length);
			}
			break;
#endif

		default:
			/* no extra payload */
			break;
	}

	pq_endmessage(&request);

	/*
	 * The client must see every request promptly.  AUTH_REQ_OK is the one
	 * exception: it need not go out until we are ready for queries.
	 */
	if (areq != AUTH_REQ_OK)
		pq_flush();
}
/*
 * ProcessGTMEndBackup - handle an "end backup" request.
 *
 * Releases the thr_lock of every registered GTM thread other than the
 * calling one, marks the calling thread as GTM_THREAD_RUNNING, and answers
 * on myport with an 'S' message carrying END_BACKUP_RESULT.
 *
 * myport:	connection to reply on.
 * message: incoming request; it must contain no further payload.
 */
void
ProcessGTMEndBackup(Port *myport, StringInfo message)
{
	int			ii;
	GTM_ThreadInfo *my_threadinfo;
	StringInfoData buf;

	pq_getmsgend(message);
	my_threadinfo = GetMyThreadInfo;

	/* Our own lock is deliberately skipped; only the other threads' are released */
	for (ii = 0; ii < GTMThreads->gt_array_size; ii++)
	{
		if (GTMThreads->gt_threads[ii] &&
			GTMThreads->gt_threads[ii] != my_threadinfo)
			GTM_RWLockRelease(&GTMThreads->gt_threads[ii]->thr_lock);
	}
	my_threadinfo->thr_status = GTM_THREAD_RUNNING;

	pq_beginmessage(&buf, 'S');
	pq_sendint(&buf, END_BACKUP_RESULT, 4);
	pq_endmessage(myport, &buf);
	pq_flush(myport);
}
/*
 * ProcessGTMBeginBackup - handle a "begin backup" request.
 *
 * Acquires, in write mode, the thr_lock of every registered GTM thread other
 * than the calling one, marks the calling thread as GTM_THREAD_BACKUP, and
 * answers on myport with an 'S' message carrying BEGIN_BACKUP_RESULT.
 *
 * myport:	connection to reply on.
 * message: incoming request; it must contain no further payload.
 */
void
ProcessGTMBeginBackup(Port *myport, StringInfo message)
{
	int			ii;
	GTM_ThreadInfo *my_threadinfo;
	StringInfoData buf;

	pq_getmsgend(message);
	my_threadinfo = GetMyThreadInfo;

	/* Our own lock is deliberately skipped; only the other threads' are taken */
	for (ii = 0; ii < GTMThreads->gt_array_size; ii++)
	{
		if (GTMThreads->gt_threads[ii] &&
			GTMThreads->gt_threads[ii] != my_threadinfo)
			GTM_RWLockAcquire(&GTMThreads->gt_threads[ii]->thr_lock,
							  GTM_LOCKMODE_WRITE);
	}
	my_threadinfo->thr_status = GTM_THREAD_BACKUP;

	pq_beginmessage(&buf, 'S');
	pq_sendint(&buf, BEGIN_BACKUP_RESULT, 4);
	pq_endmessage(myport, &buf);
	pq_flush(myport);
}
static void send_buffer() { if (buffer_len > 0) { StringInfoData msgbuf; char *cursor = buffer; while (--buffer_len > 0) { if (*cursor == '\0') *cursor = '\n'; cursor++; } if (*cursor != '\0') ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("internal error"), errdetail("Wrong message format detected"))); pq_beginmessage(&msgbuf, 'N'); if (PG_PROTOCOL_MAJOR(FrontendProtocol) >= 3) { pq_sendbyte(&msgbuf, PG_DIAG_MESSAGE_PRIMARY); pq_sendstring(&msgbuf, buffer); pq_sendbyte(&msgbuf, '\0'); } else { *cursor++ = '\n'; *cursor = '\0'; pq_sendstring(&msgbuf, buffer); } pq_endmessage(&msgbuf); pq_flush(); } }
/* * Mark the completion of an on-going barrier. We must have remembered the * barrier ID when we received the CREATE BARRIER PREPARE command */ void ProcessCreateBarrierEnd(const char *id) { StringInfoData buf; if (!IS_PGXC_COORDINATOR || !IsConnFromCoord()) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("The CREATE BARRIER END message is expected to " "arrive at a Coordinator from another Coordinator"))); LWLockRelease(BarrierLock); pq_beginmessage(&buf, 'b'); pq_sendstring(&buf, id); pq_endmessage(&buf); pq_flush(); /* * TODO Stop the timer */ }
/* * Prepare ourselves for an incoming BARRIER. We must disable all new 2PC * commits and let the ongoing commits to finish. We then remember the * barrier id (so that it can be matched with the final END message) and * tell the driving Coordinator to proceed with the next step. * * A simple way to implement this is to grab a lock in an exclusive mode * while all other backend starting a 2PC will grab the lock in shared * mode. So as long as we hold the exclusive lock, no other backend start a * new 2PC and there can not be any 2PC in-progress. This technique would * rely on assumption that an exclusive lock requester is not starved by * share lock requesters. * * Note: To ensure that the 2PC are not blocked for a long time, we should * set a timeout. The lock should be release after the timeout and the * barrier should be canceled. */ void ProcessCreateBarrierPrepare(const char *id) { StringInfoData buf; if (!IS_PGXC_COORDINATOR || !IsConnFromCoord()) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("The CREATE BARRIER PREPARE message is expected to " "arrive at a Coordinator from another Coordinator"))); LWLockAcquire(BarrierLock, LW_EXCLUSIVE); pq_beginmessage(&buf, 'b'); pq_sendstring(&buf, id); pq_endmessage(&buf); pq_flush(); /* * TODO Start a timer to terminate the pending barrier after a specified * timeout */ }
/*
 * Send an authentication request packet to the frontend.
 *
 * The 'R' message always starts with the request code as a 4-byte integer,
 * followed by the MD5 salt (4 bytes) or the crypt salt (2 bytes) when the
 * request asks for an encrypted password.
 *
 * The message is flushed right away, except for AUTH_REQ_OK.
 */
static void
sendAuthRequest(Port *port, AuthRequest areq)
{
	StringInfoData request;

	pq_beginmessage(&request, 'R');
	pq_sendint(&request, (int32) areq, sizeof(int32));

	switch (areq)
	{
		case AUTH_REQ_MD5:
			/* salt for MD5-encrypted passwords */
			pq_sendbytes(&request, port->md5Salt, 4);
			break;

		case AUTH_REQ_CRYPT:
			/* salt for crypt()-encrypted passwords */
			pq_sendbytes(&request, port->cryptSalt, 2);
			break;

		default:
			/* no extra payload */
			break;
	}

	pq_endmessage(&request);

	/*
	 * The client must see every request promptly.  AUTH_REQ_OK is the one
	 * exception: it need not go out until we are ready for queries.
	 */
	if (areq != AUTH_REQ_OK)
		pq_flush();
}
/* * Execute commands from walreceiver, until we enter streaming mode. */ static void WalSndHandshake(void) { StringInfoData input_message; bool replication_started = false; initStringInfo(&input_message); while (!replication_started) { int firstchar; /* Wait for a command to arrive */ firstchar = pq_getbyte(); /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. */ if (!PostmasterIsAlive(true)) exit(1); /* * Check for any other interesting events that happened while we * slept. */ if (got_SIGHUP) { got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); } if (firstchar != EOF) { /* * Read the message contents. This is expected to be done without * blocking because we've been able to get message type code. */ if (pq_getmessage(&input_message, 0)) firstchar = EOF; /* suitable message already logged */ } /* Handle the very limited subset of commands expected in this phase */ switch (firstchar) { case 'Q': /* Query message */ { const char *query_string; XLogRecPtr recptr; query_string = pq_getmsgstring(&input_message); pq_getmsgend(&input_message); if (strcmp(query_string, "IDENTIFY_SYSTEM") == 0) { StringInfoData buf; char sysid[32]; char tli[11]; /* * Reply with a result set with one row, two columns. 
* First col is system ID, and second is timeline ID */ snprintf(sysid, sizeof(sysid), UINT64_FORMAT, GetSystemIdentifier()); snprintf(tli, sizeof(tli), "%u", ThisTimeLineID); /* Send a RowDescription message */ pq_beginmessage(&buf, 'T'); pq_sendint(&buf, 2, 2); /* 2 fields */ /* first field */ pq_sendstring(&buf, "systemid"); /* col name */ pq_sendint(&buf, 0, 4); /* table oid */ pq_sendint(&buf, 0, 2); /* attnum */ pq_sendint(&buf, TEXTOID, 4); /* type oid */ pq_sendint(&buf, -1, 2); /* typlen */ pq_sendint(&buf, 0, 4); /* typmod */ pq_sendint(&buf, 0, 2); /* format code */ /* second field */ pq_sendstring(&buf, "timeline"); /* col name */ pq_sendint(&buf, 0, 4); /* table oid */ pq_sendint(&buf, 0, 2); /* attnum */ pq_sendint(&buf, INT4OID, 4); /* type oid */ pq_sendint(&buf, 4, 2); /* typlen */ pq_sendint(&buf, 0, 4); /* typmod */ pq_sendint(&buf, 0, 2); /* format code */ pq_endmessage(&buf); /* Send a DataRow message */ pq_beginmessage(&buf, 'D'); pq_sendint(&buf, 2, 2); /* # of columns */ pq_sendint(&buf, strlen(sysid), 4); /* col1 len */ pq_sendbytes(&buf, (char *) &sysid, strlen(sysid)); pq_sendint(&buf, strlen(tli), 4); /* col2 len */ pq_sendbytes(&buf, (char *) tli, strlen(tli)); pq_endmessage(&buf); /* Send CommandComplete and ReadyForQuery messages */ EndCommand("SELECT", DestRemote); ReadyForQuery(DestRemote); /* ReadyForQuery did pq_flush for us */ } else if (sscanf(query_string, "START_REPLICATION %X/%X", &recptr.xlogid, &recptr.xrecoff) == 2) { StringInfoData buf; /* * Check that we're logging enough information in the * WAL for log-shipping. * * NOTE: This only checks the current value of * wal_level. Even if the current setting is not * 'minimal', there can be old WAL in the pg_xlog * directory that was created with 'minimal'. So this * is not bulletproof, the purpose is just to give a * user-friendly error message that hints how to * configure the system correctly. 
*/ if (wal_level == WAL_LEVEL_MINIMAL) ereport(FATAL, (errcode(ERRCODE_CANNOT_CONNECT_NOW), errmsg("standby connections not allowed because wal_level=minimal"))); /* Send a CopyOutResponse message, and start streaming */ pq_beginmessage(&buf, 'H'); pq_sendbyte(&buf, 0); pq_sendint(&buf, 0, 2); pq_endmessage(&buf); pq_flush(); /* * Initialize position to the received one, then the * xlog records begin to be shipped from that position */ sentPtr = recptr; /* break out of the loop */ replication_started = true; } else { ereport(FATAL, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("invalid standby query string: %s", query_string))); } break; } case 'X': /* standby is closing the connection */ proc_exit(0); case EOF: /* standby disconnected unexpectedly */ ereport(COMMERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("unexpected EOF on standby connection"))); proc_exit(0); default: ereport(FATAL, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("invalid standby handshake message type %d", firstchar))); } } }
/* Main loop of walsender process */ static int WalSndLoop(void) { char *output_message; bool caughtup = false; /* * Allocate buffer that will be used for each output message. We do this * just once to reduce palloc overhead. The buffer must be made large * enough for maximum-sized messages. */ output_message = palloc(1 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE); /* Loop forever, unless we get an error */ for (;;) { /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. */ if (!PostmasterIsAlive(true)) exit(1); /* Process any requests or signals received recently */ if (got_SIGHUP) { got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); } /* * When SIGUSR2 arrives, we send all outstanding logs up to the * shutdown checkpoint record (i.e., the latest record) and exit. */ if (ready_to_stop) { if (!XLogSend(output_message, &caughtup)) break; if (caughtup) shutdown_requested = true; } /* Normal exit from the walsender is here */ if (shutdown_requested) { /* Inform the standby that XLOG streaming was done */ pq_puttextmessage('C', "COPY 0"); pq_flush(); proc_exit(0); } /* * If we had sent all accumulated WAL in last round, nap for the * configured time before retrying. */ if (caughtup) { /* * Even if we wrote all the WAL that was available when we started * sending, more might have arrived while we were sending this * batch. We had the latch set while sending, so we have not * received any signals from that time. Let's arm the latch * again, and after that check that we're still up-to-date. */ ResetLatch(&MyWalSnd->latch); if (!XLogSend(output_message, &caughtup)) break; if (caughtup && !got_SIGHUP && !ready_to_stop && !shutdown_requested) { /* * XXX: We don't really need the periodic wakeups anymore, * WaitLatchOrSocket should reliably wake up as soon as * something interesting happens. 
*/ /* Sleep */ WaitLatchOrSocket(&MyWalSnd->latch, MyProcPort->sock, WalSndDelay * 1000L); } /* Check if the connection was closed */ CheckClosedConnection(); } else { /* Attempt to send the log once every loop */ if (!XLogSend(output_message, &caughtup)) break; } } /* * Get here on send failure. Clean up and exit. * * Reset whereToSendOutput to prevent ereport from attempting to send any * more messages to the standby. */ if (whereToSendOutput == DestRemote) whereToSendOutput = DestNone; proc_exit(0); return 1; /* keep the compiler quiet */ }
/* Main loop of walsender process */ static int WalSndLoop(void) { char *output_message; bool caughtup = false; /* * Allocate buffer that will be used for each output message. We do this * just once to reduce palloc overhead. The buffer must be made large * enough for maximum-sized messages. */ output_message = palloc(1 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE); /* * Allocate buffer that will be used for processing reply messages. As * above, do this just once to reduce palloc overhead. */ initStringInfo(&reply_message); /* Initialize the last reply timestamp */ last_reply_timestamp = GetCurrentTimestamp(); /* Loop forever, unless we get an error */ for (;;) { /* Clear any already-pending wakeups */ ResetLatch(&MyWalSnd->latch); /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. */ if (!PostmasterIsAlive()) exit(1); /* Process any requests or signals received recently */ if (got_SIGHUP) { got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); SyncRepInitConfig(); } /* Normal exit from the walsender is here */ if (walsender_shutdown_requested) { /* Inform the standby that XLOG streaming is done */ pq_puttextmessage('C', "COPY 0"); pq_flush(); proc_exit(0); } /* Check for input from the client */ ProcessRepliesIfAny(); /* * If we don't have any pending data in the output buffer, try to send * some more. If there is some, we don't bother to call XLogSend * again until we've flushed it ... but we'd better assume we are not * caught up. */ if (!pq_is_send_pending()) XLogSend(output_message, &caughtup); else caughtup = false; /* Try to flush pending output to the client */ if (pq_flush_if_writable() != 0) break; /* If nothing remains to be sent right now ... */ if (caughtup && !pq_is_send_pending()) { /* * If we're in catchup state, move to streaming. 
This is an * important state change for users to know about, since before * this point data loss might occur if the primary dies and we * need to failover to the standby. The state change is also * important for synchronous replication, since commits that * started to wait at that point might wait for some time. */ if (MyWalSnd->state == WALSNDSTATE_CATCHUP) { ereport(DEBUG1, (errmsg("standby \"%s\" has now caught up with primary", application_name))); WalSndSetState(WALSNDSTATE_STREAMING); } /* * When SIGUSR2 arrives, we send any outstanding logs up to the * shutdown checkpoint record (i.e., the latest record) and exit. * This may be a normal termination at shutdown, or a promotion, * the walsender is not sure which. */ if (walsender_ready_to_stop) { /* ... let's just be real sure we're caught up ... */ XLogSend(output_message, &caughtup); if (caughtup && !pq_is_send_pending()) { walsender_shutdown_requested = true; continue; /* don't want to wait more */ } } } /* * We don't block if not caught up, unless there is unsent data * pending in which case we'd better block until the socket is * write-ready. This test is only needed for the case where XLogSend * loaded a subset of the available data but then pq_flush_if_writable * flushed it all --- we should immediately try to send more. 
*/ if (caughtup || pq_is_send_pending()) { TimestampTz finish_time = 0; long sleeptime = -1; int wakeEvents; wakeEvents = WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_SOCKET_READABLE; if (pq_is_send_pending()) wakeEvents |= WL_SOCKET_WRITEABLE; /* Determine time until replication timeout */ if (replication_timeout > 0) { long secs; int usecs; finish_time = TimestampTzPlusMilliseconds(last_reply_timestamp, replication_timeout); TimestampDifference(GetCurrentTimestamp(), finish_time, &secs, &usecs); sleeptime = secs * 1000 + usecs / 1000; /* Avoid Assert in WaitLatchOrSocket if timeout is past */ if (sleeptime < 0) sleeptime = 0; wakeEvents |= WL_TIMEOUT; } /* Sleep until something happens or replication timeout */ WaitLatchOrSocket(&MyWalSnd->latch, wakeEvents, MyProcPort->sock, sleeptime); /* * Check for replication timeout. Note we ignore the corner case * possibility that the client replied just as we reached the * timeout ... he's supposed to reply *before* that. */ if (replication_timeout > 0 && GetCurrentTimestamp() >= finish_time) { /* * Since typically expiration of replication timeout means * communication problem, we don't send the error message to * the standby. */ ereport(COMMERROR, (errmsg("terminating walsender process due to replication timeout"))); break; } } } /* * Get here on send failure. Clean up and exit. * * Reset whereToSendOutput to prevent ereport from attempting to send any * more messages to the standby. */ if (whereToSendOutput == DestRemote) whereToSendOutput = DestNone; proc_exit(0); return 1; /* keep the compiler quiet */ }
/*
 * START_REPLICATION
 *
 * Switch this backend into WAL streaming: register with the postmaster as a
 * WAL sender, arrange for self-termination if we are a cascading walsender
 * whose server is no longer in recovery, enter the CATCHUP state, tell the
 * client to start copy-both mode, and set the send position to
 * cmd->startpoint.
 */
static void
StartReplication(StartReplicationCmd *cmd)
{
	StringInfoData copy_both;

	/*
	 * Tell the postmaster we are streaming.  From now on it treats us as a
	 * WAL sender: we outlive the bgwriter and are killed last during
	 * shutdown, which gives us the chance to ship all remaining WAL,
	 * including the shutdown checkpoint.  There is no way back from this,
	 * and no WAL records may be written by us afterwards.
	 */
	MarkPostmasterChildWalSender();
	SendPostmasterSignal(PMSIGNAL_ADVANCE_STATE_MACHINE);

	/*
	 * On promotion of a cascading standby the postmaster kills cascading
	 * walsenders with SIGUSR2.  It signals only processes already marked as
	 * WAL senders, so a walsender that had not yet marked itself when the
	 * signal went out misses it and unexpectedly survives the promotion.
	 *
	 * To close that hole, a cascading walsender that finds recovery is NOT
	 * in progress does what the SIGUSR2 handler would have done: it sets
	 * walsender_ready_to_stop, which makes the walsender end later.
	 *
	 * The postmaster normally logs the termination of cascading walsenders,
	 * but that is racy too: if no walsender other than this process existed
	 * before this point, the postmaster believes there is none and
	 * suppresses the message.  So the message is always emitted here.
	 * Duplicates are possible but unlikely, and not worth extra code to
	 * suppress.
	 */
	if (am_cascading_walsender && !RecoveryInProgress())
	{
		ereport(LOG,
				(errmsg("terminating walsender process to force cascaded standby "
						"to update timeline and reconnect")));
		walsender_ready_to_stop = true;
	}

	/*
	 * WAL is assumed to carry enough information for log-shipping, since
	 * that is checked in PostmasterMain().
	 *
	 * NOTE: wal_level can only change at shutdown, so in most cases it is
	 * difficult for WAL written at wal_level='minimal' to still be visible
	 * to us.
	 */

	/*
	 * A freshly connected standby lags the primary.  Some users, synchronous
	 * replication for one, need this initial catchup phase to be a distinct
	 * state so that actions can be triggered when it changes later.  We may
	 * sit in this state for a long time, which is precisely why it has to be
	 * observable.
	 */
	WalSndSetState(WALSNDSTATE_CATCHUP);

	/* CopyBothResponse: a zero format byte and a zero 16-bit count */
	pq_beginmessage(&copy_both, 'W');
	pq_sendbyte(&copy_both, 0);
	pq_sendint(&copy_both, 0, 2);
	pq_endmessage(&copy_both);
	pq_flush();

	/* xlog records are shipped starting from the position the client sent */
	sentPtr = cmd->startpoint;
}
/* Main loop of walsender process */ static int WalSndLoop(void) { char *output_message; bool caughtup = false; /* * Allocate buffer that will be used for each output message. We do this * just once to reduce palloc overhead. The buffer must be made large * enough for maximum-sized messages. */ output_message = palloc(1 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE); /* * Allocate buffer that will be used for processing reply messages. As * above, do this just once to reduce palloc overhead. */ initStringInfo(&reply_message); /* Initialize the last reply timestamp */ last_reply_timestamp = GetCurrentTimestamp(); /* Loop forever, unless we get an error */ for (;;) { /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. */ if (!PostmasterIsAlive(true)) exit(1); /* Process any requests or signals received recently */ if (got_SIGHUP) { got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); SyncRepInitConfig(); } /* Normal exit from the walsender is here */ if (walsender_shutdown_requested) { /* Inform the standby that XLOG streaming was done */ pq_puttextmessage('C', "COPY 0"); pq_flush(); proc_exit(0); } /* * If we don't have any pending data in the output buffer, try to send * some more. */ if (!pq_is_send_pending()) { XLogSend(output_message, &caughtup); /* * Even if we wrote all the WAL that was available when we started * sending, more might have arrived while we were sending this * batch. We had the latch set while sending, so we have not * received any signals from that time. Let's arm the latch again, * and after that check that we're still up-to-date. */ if (caughtup && !pq_is_send_pending()) { ResetLatch(&MyWalSnd->latch); XLogSend(output_message, &caughtup); } } /* Flush pending output to the client */ if (pq_flush_if_writable() != 0) break; /* * When SIGUSR2 arrives, we send any outstanding logs up to the * shutdown checkpoint record (i.e., the latest record) and exit. 
*/ if (walsender_ready_to_stop && !pq_is_send_pending()) { XLogSend(output_message, &caughtup); ProcessRepliesIfAny(); if (caughtup && !pq_is_send_pending()) walsender_shutdown_requested = true; } if ((caughtup || pq_is_send_pending()) && !got_SIGHUP && !walsender_shutdown_requested) { TimestampTz finish_time = 0; long sleeptime; /* Reschedule replication timeout */ if (replication_timeout > 0) { long secs; int usecs; finish_time = TimestampTzPlusMilliseconds(last_reply_timestamp, replication_timeout); TimestampDifference(GetCurrentTimestamp(), finish_time, &secs, &usecs); sleeptime = secs * 1000 + usecs / 1000; if (WalSndDelay < sleeptime) sleeptime = WalSndDelay; } else { /* * XXX: Without timeout, we don't really need the periodic * wakeups anymore, WaitLatchOrSocket should reliably wake up * as soon as something interesting happens. */ sleeptime = WalSndDelay; } /* Sleep */ WaitLatchOrSocket(&MyWalSnd->latch, MyProcPort->sock, true, pq_is_send_pending(), sleeptime); /* Check for replication timeout */ if (replication_timeout > 0 && GetCurrentTimestamp() >= finish_time) { /* * Since typically expiration of replication timeout means * communication problem, we don't send the error message to * the standby. */ ereport(COMMERROR, (errmsg("terminating walsender process due to replication timeout"))); break; } } /* * If we're in catchup state, see if its time to move to streaming. * This is an important state change for users, since before this * point data loss might occur if the primary dies and we need to * failover to the standby. The state change is also important for * synchronous replication, since commits that started to wait at that * point might wait for some time. */ if (MyWalSnd->state == WALSNDSTATE_CATCHUP && caughtup) { ereport(DEBUG1, (errmsg("standby \"%s\" has now caught up with primary", application_name))); WalSndSetState(WALSNDSTATE_STREAMING); } ProcessRepliesIfAny(); } /* * Get here on send failure. Clean up and exit. 
* * Reset whereToSendOutput to prevent ereport from attempting to send any * more messages to the standby. */ if (whereToSendOutput == DestRemote) whereToSendOutput = DestNone; proc_exit(0); return 1; /* keep the compiler quiet */ }
/*
 * Send up to MAX_SEND_SIZE bytes of WAL that has been flushed to disk but
 * not yet sent to the client.
 *
 * msgbuf is the work area in which the outgoing message is assembled; the
 * caller passes it in so the buffer need not be re-palloc'd on every cycle.
 * Its size must be 1 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE.
 *
 * *caughtup is set to true when no unsent WAL remains after this call and
 * to false otherwise.
 *
 * Returns true if OK, false if the flush to the client failed.
 */
static bool
XLogSend(char *msgbuf, bool *caughtup)
{
	XLogRecPtr	flushed;
	XLogRecPtr	from;
	XLogRecPtr	upto;
	Size		datalen;
	WalDataMessageHeader hdr;

	/*
	 * Only data that has been written out and fsync'd may be sent.  With the
	 * current XLogRead() we cannot go beyond what has been written anyway,
	 * and sending WAL that is not safely on the master's disk is unsafe: if
	 * the master crashed and restarted, slaves must not have applied WAL
	 * that the master lost.
	 */
	flushed = GetFlushRecPtr();

	/* Quick exit if nothing to do */
	if (XLByteLE(flushed, sentPtr))
	{
		*caughtup = true;
		return true;
	}

	/*
	 * Decide how much goes into this message: everything, if that is no more
	 * than MAX_SEND_SIZE bytes; otherwise MAX_SEND_SIZE bytes rounded back
	 * to a logfile or page boundary.
	 *
	 * The rounding is more than a performance matter.  Walreceiver depends
	 * on a WAL record never being split across two messages.  A long record
	 * is split into continuation records at page boundaries, so a page
	 * boundary is always a safe place to cut.  It is also assumed that the
	 * flush pointer never points into the middle of a WAL record.
	 */
	from = sentPtr;
	if (from.xrecoff >= XLogFileSize)
	{
		/*
		 * crossing a logid boundary, skip the non-existent last log segment
		 * in previous logical log file.
		 */
		from.xlogid += 1;
		from.xrecoff = 0;
	}

	upto = from;
	XLByteAdvance(upto, MAX_SEND_SIZE);
	if (upto.xlogid != from.xlogid)
	{
		/* Don't cross a logfile boundary within one message */
		Assert(upto.xlogid == from.xlogid + 1);
		upto.xlogid = from.xlogid;
		upto.xrecoff = XLogFileSize;
	}

	if (!XLByteLE(flushed, upto))
	{
		/* more remains after this message: round down to page boundary */
		upto.xrecoff -= (upto.xrecoff % XLOG_BLCKSZ);
		*caughtup = false;
	}
	else
	{
		/* we went beyond the flush pointer, back off */
		upto = flushed;
		*caughtup = true;
	}

	datalen = upto.xrecoff - from.xrecoff;
	Assert(datalen <= MAX_SEND_SIZE);

	/*
	 * OK to read and send the slice.
	 */
	msgbuf[0] = 'w';

	/* WAL is read straight into the output buffer to avoid extra memcpy calls */
	XLogRead(msgbuf + 1 + sizeof(WalDataMessageHeader), from, datalen);

	/* The header is filled last so the send timestamp is as late as possible */
	hdr.dataStart = from;
	hdr.walEnd = flushed;
	hdr.sendTime = GetCurrentTimestamp();

	memcpy(msgbuf + 1, &hdr, sizeof(WalDataMessageHeader));

	pq_putmessage('d', msgbuf, 1 + sizeof(WalDataMessageHeader) + datalen);

	/* Flush pending output to the client */
	if (pq_flush())
		return false;

	sentPtr = upto;

	/* Update shared memory status */
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile WalSnd *shared = MyWalSnd;

		SpinLockAcquire(&shared->mutex);
		shared->sentPtr = sentPtr;
		SpinLockRelease(&shared->mutex);
	}

	/* Report progress of XLOG streaming in PS display */
	if (update_process_title)
	{
		char		title[50];

		snprintf(title, sizeof(title), "streaming %X/%X",
				 sentPtr.xlogid, sentPtr.xrecoff);
		set_ps_display(title, false);
	}

	return true;
}
/* * -------------------------------------------------------------- * ProcessIncomingNotify * * Deal with arriving NOTIFYs from other backends. * This is called either directly from the SIGUSR2 signal handler, * or the next time control reaches the outer idle loop. * Scan pg_listener for arriving notifies, report them to my front end, * and clear the notification field in pg_listener until next time. * * NOTE: since we are outside any transaction, we must create our own. * -------------------------------------------------------------- */ static void ProcessIncomingNotify(void) { Relation lRel; TupleDesc tdesc; ScanKeyData key[1]; HeapScanDesc scan; HeapTuple lTuple, rTuple; Datum value[Natts_pg_listener]; char repl[Natts_pg_listener], nulls[Natts_pg_listener]; bool catchup_enabled; /* Must prevent SIGUSR1 interrupt while I am running */ catchup_enabled = DisableCatchupInterrupt(); if (Trace_notify) elog(DEBUG1, "ProcessIncomingNotify"); set_ps_display("notify interrupt", false); notifyInterruptOccurred = 0; StartTransactionCommand(); lRel = heap_open(ListenerRelationId, ExclusiveLock); tdesc = RelationGetDescr(lRel); /* Scan only entries with my listenerPID */ ScanKeyInit(&key[0], Anum_pg_listener_pid, BTEqualStrategyNumber, F_INT4EQ, Int32GetDatum(MyProcPid)); scan = heap_beginscan(lRel, SnapshotNow, 1, key); /* Prepare data for rewriting 0 into notification field */ nulls[0] = nulls[1] = nulls[2] = ' '; repl[0] = repl[1] = repl[2] = ' '; repl[Anum_pg_listener_notify - 1] = 'r'; value[0] = value[1] = value[2] = (Datum) 0; value[Anum_pg_listener_notify - 1] = Int32GetDatum(0); while ((lTuple = heap_getnext(scan, ForwardScanDirection)) != NULL) { Form_pg_listener listener = (Form_pg_listener) GETSTRUCT(lTuple); char *relname = NameStr(listener->relname); int32 sourcePID = listener->notification; if (sourcePID != 0) { /* Notify the frontend */ if (Trace_notify) elog(DEBUG1, "ProcessIncomingNotify: received %s from %d", relname, (int) sourcePID); 
NotifyMyFrontEnd(relname, sourcePID); /* * Rewrite the tuple with 0 in notification column. * * simple_heap_update is safe here because no one else would have * tried to UNLISTEN us, so there can be no uncommitted changes. */ rTuple = heap_modifytuple(lTuple, tdesc, value, nulls, repl); simple_heap_update(lRel, &lTuple->t_self, rTuple); #ifdef NOT_USED /* currently there are no indexes */ CatalogUpdateIndexes(lRel, rTuple); #endif } } heap_endscan(scan); /* * We do NOT release the lock on pg_listener here; we need to hold it * until end of transaction (which is about to happen, anyway) to ensure * that other backends see our tuple updates when they look. Otherwise, a * transaction started after this one might mistakenly think it doesn't * need to send this backend a new NOTIFY. */ heap_close(lRel, NoLock); CommitTransactionCommand(); /* * Must flush the notify messages to ensure frontend gets them promptly. */ pq_flush(); set_ps_display("idle", false); if (Trace_notify) elog(DEBUG1, "ProcessIncomingNotify: done"); if (catchup_enabled) EnableCatchupInterrupt(); }
static void MPPnoticeReceiver(void * arg, const PGresult * res) { PQExpBufferData msgbuf; PGMessageField *pfield; int elevel = INFO; char * sqlstate = "00000"; char * severity = "WARNING"; char * file = ""; char * line = NULL; char * func = ""; char message[1024]; char * detail = NULL; char * hint = NULL; char * context = NULL; SegmentDatabaseDescriptor *segdbDesc = (SegmentDatabaseDescriptor *) arg; if (!res) return; strcpy(message,"missing error text"); for (pfield = res->errFields; pfield != NULL; pfield = pfield->next) { switch (pfield->code) { case PG_DIAG_SEVERITY: severity = pfield->contents; if (strcmp(pfield->contents,"WARNING")==0) elevel = WARNING; else if (strcmp(pfield->contents,"NOTICE")==0) elevel = NOTICE; else if (strcmp(pfield->contents,"DEBUG1")==0 || strcmp(pfield->contents,"DEBUG")==0) elevel = DEBUG1; else if (strcmp(pfield->contents,"DEBUG2")==0) elevel = DEBUG2; else if (strcmp(pfield->contents,"DEBUG3")==0) elevel = DEBUG3; else if (strcmp(pfield->contents,"DEBUG4")==0) elevel = DEBUG4; else if (strcmp(pfield->contents,"DEBUG5")==0) elevel = DEBUG5; else elevel = INFO; break; case PG_DIAG_SQLSTATE: sqlstate = pfield->contents; break; case PG_DIAG_MESSAGE_PRIMARY: strncpy(message, pfield->contents, 800); message[800] = '\0'; if (segdbDesc && segdbDesc->whoami && strlen(segdbDesc->whoami) < 200) { strcat(message," ("); strcat(message, segdbDesc->whoami); strcat(message,")"); } break; case PG_DIAG_MESSAGE_DETAIL: detail = pfield->contents; break; case PG_DIAG_MESSAGE_HINT: hint = pfield->contents; break; case PG_DIAG_STATEMENT_POSITION: case PG_DIAG_INTERNAL_POSITION: case PG_DIAG_INTERNAL_QUERY: break; case PG_DIAG_CONTEXT: context = pfield->contents; break; case PG_DIAG_SOURCE_FILE: file = pfield->contents; break; case PG_DIAG_SOURCE_LINE: line = pfield->contents; break; case PG_DIAG_SOURCE_FUNCTION: func = pfield->contents; break; case PG_DIAG_GP_PROCESS_TAG: break; default: break; } } if (elevel < client_min_messages && elevel != INFO) 
return; /* * We use PQExpBufferData instead of StringInfoData * because the former uses malloc, the latter palloc. * We are in a thread, and we CANNOT use palloc since it's not * thread safe. We cannot call elog or ereport either for the * same reason. */ initPQExpBuffer(&msgbuf); if (PG_PROTOCOL_MAJOR(FrontendProtocol) >= 3) { /* New style with separate fields */ appendPQExpBufferChar(&msgbuf, PG_DIAG_SEVERITY); appendBinaryPQExpBuffer(&msgbuf, severity, strlen(severity)+1); appendPQExpBufferChar(&msgbuf, PG_DIAG_SQLSTATE); appendBinaryPQExpBuffer(&msgbuf, sqlstate, strlen(sqlstate)+1); /* M field is required per protocol, so always send something */ appendPQExpBufferChar(&msgbuf, PG_DIAG_MESSAGE_PRIMARY); appendBinaryPQExpBuffer(&msgbuf, message , strlen(message) + 1); if (detail) { appendPQExpBufferChar(&msgbuf, PG_DIAG_MESSAGE_DETAIL); appendBinaryPQExpBuffer(&msgbuf, detail, strlen(detail)+1); } if (hint) { appendPQExpBufferChar(&msgbuf, PG_DIAG_MESSAGE_HINT); appendBinaryPQExpBuffer(&msgbuf, hint, strlen(hint)+1); } if (context) { appendPQExpBufferChar(&msgbuf, PG_DIAG_CONTEXT); appendBinaryPQExpBuffer(&msgbuf, context, strlen(context)+1); } if (file) { appendPQExpBufferChar(&msgbuf, PG_DIAG_SOURCE_FILE); appendBinaryPQExpBuffer(&msgbuf, file, strlen(file)+1); } if (line) { appendPQExpBufferChar(&msgbuf, PG_DIAG_SOURCE_LINE); appendBinaryPQExpBuffer(&msgbuf, line, strlen(line)+1); } if (func) { appendPQExpBufferChar(&msgbuf, PG_DIAG_SOURCE_FUNCTION); appendBinaryPQExpBuffer(&msgbuf, func, strlen(func)+1); } } else { appendPQExpBuffer(&msgbuf, "%s: ", severity); appendBinaryPQExpBuffer(&msgbuf, message, strlen(message)); appendPQExpBufferChar(&msgbuf, '\n'); appendPQExpBufferChar(&msgbuf, '\0'); } appendPQExpBufferChar(&msgbuf, '\0'); /* terminator */ pq_putmessage('N', msgbuf.data, msgbuf.len); termPQExpBuffer(&msgbuf); pq_flush(); }
/* * Receive Startup packet * Response Client Authentication */ int FileRepConnServer_ReceiveStartupPacket(void) { uint32 length; int status = STATUS_OK; char *buf = NULL; pq_init(); status = FileRepConnServer_ReceiveMessageLength(&length); if (status != STATUS_OK) { goto exit; } if (length < (uint32) sizeof(ProtocolVersion) || length > MAX_STARTUP_PACKET_LENGTH) { status = STATUS_ERROR; ereport(WARNING, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("invalid length of startup packet"), FileRep_errcontext())); goto exit; } buf = (char *)malloc(length +1); if (buf == NULL) { ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("not enough memory to allocate buffer for startup packet"), FileRep_errcontext())); } memset(buf, 0, length + 1); if (pq_getbytes(buf, length) == EOF) { status = STATUS_ERROR; ereport(WARNING, (errcode_for_socket_access(), errmsg("receive EOF on connection: %m"), FileRep_errcontext())); goto exit; } port->proto = ntohl(*((ProtocolVersion *) buf)); if (PG_PROTOCOL_MAJOR(port->proto) >= 3) { /* uint32 offset = sizeof(ProtocolVersion);*/ /* * tell the client that it is authorized (no pg_hba.conf and * password are required). */ StringInfoData buf; /* sends AUTH_REQ_OK back to client */ FakeClientAuthentication(port); /* send to client that we are ready to receive data */ /* similar to ReadyForQuery(DestRemoteExecute); */ pq_beginmessage(&buf, 'Z'); pq_sendbyte(&buf, 'I'); pq_endmessage(&buf); pq_flush(); } else { ereport(WARNING, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("not supported version"), FileRep_errcontext())); } exit: if (buf) { free(buf); buf = NULL; } return status; }