/*
 * Copy 'count' bytes of WAL, beginning at location 'startptr', into 'buf'.
 *
 * XXX it would probably be better to pull the data straight out of the
 * WAL buffers when possible.
 *
 * One WAL segment is opened, and kept open, in the global file descriptor
 * sendFile.  So once XLogRead has been used there is always one descriptor
 * left open until the process ends, but never more than one.
 *
 * The whole read is repeated from 'startptr' when a cascading walsender
 * finds its needreload flag set after the read (see the end of the loop).
 */
void
XLogRead(char *buf, XLogRecPtr startptr, Size count)
{
	uint32		lastRemovedLog;
	uint32		lastRemovedSeg;
	uint32		log;
	uint32		seg;
	bool		reread;

	do
	{
		/* (Re)start from the caller's original request */
		char	   *dst = buf;
		XLogRecPtr	cur = startptr;
		Size		remaining = count;

		reread = false;

		while (remaining > 0)
		{
			uint32		startoff = cur.xrecoff % XLogSegSize;
			int			segbytes;
			int			readbytes;

			if (sendFile < 0 || !XLByteInSeg(cur, sendId, sendSeg))
			{
				char		path[MAXPGPATH];

				/* No segment open, or the wrong one: switch segments */
				if (sendFile >= 0)
					close(sendFile);

				XLByteToSeg(cur, sendId, sendSeg);
				XLogFilePath(path, ThisTimeLineID, sendId, sendSeg);

				sendFile = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
				if (sendFile < 0)
				{
					if (errno != ENOENT)
						ereport(ERROR,
								(errcode_for_file_access(),
								 errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
										path, sendId, sendSeg)));
					else
					{
						/*
						 * A missing file is taken to mean that the standby
						 * asked for a WAL segment so old that it has already
						 * been removed or recycled.
						 */
						char		filename[MAXFNAMELEN];

						XLogFileName(filename, ThisTimeLineID, sendId, sendSeg);
						ereport(ERROR,
								(errcode_for_file_access(),
								 errmsg("requested WAL segment %s has already been removed",
										filename)));
					}
				}
				sendOff = 0;
			}

			/* Reposition only when the tracked offset is not where we need it */
			if (sendOff != startoff)
			{
				if (lseek(sendFile, (off_t) startoff, SEEK_SET) < 0)
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("could not seek in log file %u, segment %u to offset %u: %m",
									sendId, sendSeg, startoff)));
				sendOff = startoff;
			}

			/* Never read past the end of the current segment */
			segbytes = (remaining > (XLogSegSize - startoff))
				? (XLogSegSize - startoff)
				: remaining;

			readbytes = read(sendFile, dst, segbytes);
			if (readbytes <= 0)
				ereport(ERROR,
						(errcode_for_file_access(),
				errmsg("could not read from log file %u, segment %u, offset %u, "
					   "length %lu: %m",
					   sendId, sendSeg, sendOff, (unsigned long) segbytes)));

			/* Account for what was actually read (may be a short read) */
			XLByteAdvance(cur, readbytes);
			sendOff += readbytes;
			remaining -= readbytes;
			dst += readbytes;
		}

		/*
		 * Validate the data only now that it is in the buffer.  The segment
		 * existed when we opened it, but it may have been recycled or
		 * removed while we were reading; read() still succeeds then, yet
		 * the bytes we got may already have been overwritten with new WAL
		 * records.
		 */
		XLogGetLastRemoved(&lastRemovedLog, &lastRemovedSeg);
		XLByteToSeg(startptr, log, seg);
		if (log < lastRemovedLog ||
			(log == lastRemovedLog && seg <= lastRemovedSeg))
		{
			char		filename[MAXFNAMELEN];

			XLogFileName(filename, ThisTimeLineID, log, seg);
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("requested WAL segment %s has already been removed",
							filename)));
		}

		/*
		 * During recovery the WAL file we have open might be replaced by a
		 * file of the same name retrieved from archive, so what we read has
		 * to be re-validated afterwards.  If it is not valid, reopen the
		 * file and read it all again.
		 */
		if (am_cascading_walsender)
		{
			/* use volatile pointer to prevent code rearrangement */
			volatile WalSnd *walsnd = MyWalSnd;
			bool		reload;

			/* Fetch and clear the flag in one critical section */
			SpinLockAcquire(&walsnd->mutex);
			reload = walsnd->needreload;
			walsnd->needreload = false;
			SpinLockRelease(&walsnd->mutex);

			if (reload && sendFile >= 0)
			{
				close(sendFile);
				sendFile = -1;
				reread = true;
			}
		}
	} while (reread);
}
/* * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk, * but not yet sent to the client, and buffer it in the libpq output * buffer. * * msgbuf is a work area in which the output message is constructed. It's * passed in just so we can avoid re-palloc'ing the buffer on each cycle. * It must be of size 1 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE. * * If there is no unsent WAL remaining, *caughtup is set to true, otherwise * *caughtup is set to false. */ static void XLogSend(char *msgbuf, bool *caughtup) { XLogRecPtr SendRqstPtr; XLogRecPtr startptr; XLogRecPtr endptr; Size nbytes; WalDataMessageHeader msghdr; /* * Attempt to send all data that's already been written out and fsync'd to * disk. We cannot go further than what's been written out given the * current implementation of XLogRead(). And in any case it's unsafe to * send WAL that is not securely down to disk on the master: if the master * subsequently crashes and restarts, slaves must not have applied any WAL * that gets lost on the master. */ SendRqstPtr = am_cascading_walsender ? GetStandbyFlushRecPtr() : GetFlushRecPtr(); /* Quick exit if nothing to do */ if (XLByteLE(SendRqstPtr, sentPtr)) { *caughtup = true; return; } /* * Figure out how much to send in one message. If there's no more than * MAX_SEND_SIZE bytes to send, send everything. Otherwise send * MAX_SEND_SIZE bytes, but round back to logfile or page boundary. * * The rounding is not only for performance reasons. Walreceiver relies on * the fact that we never split a WAL record across two messages. Since a * long WAL record is split at page boundary into continuation records, * page boundary is always a safe cut-off point. We also assume that * SendRqstPtr never points to the middle of a WAL record. */ startptr = sentPtr; if (startptr.xrecoff >= XLogFileSize) { /* * crossing a logid boundary, skip the non-existent last log segment * in previous logical log file. 
*/ startptr.xlogid += 1; startptr.xrecoff = 0; } endptr = startptr; XLByteAdvance(endptr, MAX_SEND_SIZE); if (endptr.xlogid != startptr.xlogid) { /* Don't cross a logfile boundary within one message */ Assert(endptr.xlogid == startptr.xlogid + 1); endptr.xlogid = startptr.xlogid; endptr.xrecoff = XLogFileSize; } /* if we went beyond SendRqstPtr, back off */ if (XLByteLE(SendRqstPtr, endptr)) { endptr = SendRqstPtr; *caughtup = true; } else { /* round down to page boundary. */ endptr.xrecoff -= (endptr.xrecoff % XLOG_BLCKSZ); *caughtup = false; } nbytes = endptr.xrecoff - startptr.xrecoff; Assert(nbytes <= MAX_SEND_SIZE); /* * OK to read and send the slice. */ msgbuf[0] = 'w'; /* * Read the log directly into the output buffer to avoid extra memcpy * calls. */ XLogRead(msgbuf + 1 + sizeof(WalDataMessageHeader), startptr, nbytes); /* * We fill the message header last so that the send timestamp is taken as * late as possible. */ msghdr.dataStart = startptr; msghdr.walEnd = SendRqstPtr; msghdr.sendTime = GetCurrentTimestamp(); memcpy(msgbuf + 1, &msghdr, sizeof(WalDataMessageHeader)); pq_putmessage_noblock('d', msgbuf, 1 + sizeof(WalDataMessageHeader) + nbytes); sentPtr = endptr; /* Update shared memory status */ { /* use volatile pointer to prevent code rearrangement */ volatile WalSnd *walsnd = MyWalSnd; SpinLockAcquire(&walsnd->mutex); walsnd->sentPtr = sentPtr; SpinLockRelease(&walsnd->mutex); } /* Report progress of XLOG streaming in PS display */ if (update_process_title) { char activitymsg[50]; snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%X", sentPtr.xlogid, sentPtr.xrecoff); set_ps_display(activitymsg, false); } return; }
/* * Receive a log stream starting at the specified position. * * If sysidentifier is specified, validate that both the system * identifier and the timeline matches the specified ones * (by sending an extra IDENTIFY_SYSTEM command) * * All received segments will be written to the directory * specified by basedir. * * The stream_stop callback will be called every time data * is received, and whenever a segment is completed. If it returns * true, the streaming will stop and the function * return. As long as it returns false, streaming will continue * indefinitely. * * standby_message_timeout controls how often we send a message * back to the master letting it know our progress, in seconds. * This message will only contain the write location, and never * flush or replay. * * Note: The log position *must* be at a log segment start! */ bool ReceiveXlogStream(PGconn *conn, XLogRecPtr startpos, uint32 timeline, char *sysidentifier, char *basedir, stream_stop_callback stream_stop, int standby_message_timeout, bool rename_partial) { char query[128]; char current_walfile_name[MAXPGPATH]; PGresult *res; char *copybuf = NULL; int64 last_status = -1; XLogRecPtr blockpos = InvalidXLogRecPtr; if (sysidentifier != NULL) { /* Validate system identifier and timeline hasn't changed */ res = PQexec(conn, "IDENTIFY_SYSTEM"); if (PQresultStatus(res) != PGRES_TUPLES_OK) { fprintf(stderr, _("%s: could not send replication command \"%s\": %s"), progname, "IDENTIFY_SYSTEM", PQerrorMessage(conn)); PQclear(res); return false; } if (PQnfields(res) != 3 || PQntuples(res) != 1) { fprintf(stderr, _("%s: could not identify system: got %d rows and %d fields, expected %d rows and %d fields\n"), progname, PQntuples(res), PQnfields(res), 1, 3); PQclear(res); return false; } if (strcmp(sysidentifier, PQgetvalue(res, 0, 0)) != 0) { fprintf(stderr, _("%s: system identifier does not match between base backup and streaming connection\n"), progname); PQclear(res); return false; } if (timeline != 
atoi(PQgetvalue(res, 0, 1))) { fprintf(stderr, _("%s: timeline does not match between base backup and streaming connection\n"), progname); PQclear(res); return false; } PQclear(res); } /* Initiate the replication stream at specified location */ snprintf(query, sizeof(query), "START_REPLICATION %X/%X", startpos.xlogid, startpos.xrecoff); res = PQexec(conn, query); if (PQresultStatus(res) != PGRES_COPY_BOTH) { fprintf(stderr, _("%s: could not send replication command \"%s\": %s"), progname, "START_REPLICATION", PQresultErrorMessage(res)); PQclear(res); return false; } PQclear(res); /* * Receive the actual xlog data */ while (1) { int r; int xlogoff; int bytes_left; int bytes_written; int64 now; if (copybuf != NULL) { PQfreemem(copybuf); copybuf = NULL; } /* * Check if we should continue streaming, or abort at this point. */ if (stream_stop && stream_stop(blockpos, timeline, false)) { if (walfile != -1 && !close_walfile(basedir, current_walfile_name, rename_partial)) /* Potential error message is written by close_walfile */ goto error; return true; } /* * Potentially send a status message to the master */ now = localGetCurrentTimestamp(); if (standby_message_timeout > 0 && localTimestampDifferenceExceeds(last_status, now, standby_message_timeout)) { /* Time to send feedback! */ char replybuf[sizeof(StandbyReplyMessage) + 1]; StandbyReplyMessage *replymsg; replymsg = (StandbyReplyMessage *) (replybuf + 1); replymsg->write = blockpos; replymsg->flush = InvalidXLogRecPtr; replymsg->apply = InvalidXLogRecPtr; replymsg->sendTime = now; replybuf[0] = 'r'; if (PQputCopyData(conn, replybuf, sizeof(replybuf)) <= 0 || PQflush(conn)) { fprintf(stderr, _("%s: could not send feedback packet: %s"), progname, PQerrorMessage(conn)); goto error; } last_status = now; } r = PQgetCopyData(conn, ©buf, 1); if (r == 0) { /* * In async mode, and no data available. We block on reading but * not more than the specified timeout, so that we can send a * response back to the client. 
*/ fd_set input_mask; struct timeval timeout; struct timeval *timeoutptr; FD_ZERO(&input_mask); FD_SET(PQsocket(conn), &input_mask); if (standby_message_timeout) { TimestampTz targettime; long secs; int usecs; targettime = TimestampTzPlusMilliseconds(last_status, standby_message_timeout - 1); localTimestampDifference(now, targettime, &secs, &usecs); if (secs <= 0) timeout.tv_sec = 1; /* Always sleep at least 1 sec */ else timeout.tv_sec = secs; timeout.tv_usec = usecs; timeoutptr = &timeout; } else timeoutptr = NULL; r = select(PQsocket(conn) + 1, &input_mask, NULL, NULL, timeoutptr); if (r == 0 || (r < 0 && errno == EINTR)) { /* * Got a timeout or signal. Continue the loop and either * deliver a status packet to the server or just go back into * blocking. */ continue; } else if (r < 0) { fprintf(stderr, _("%s: select() failed: %s\n"), progname, strerror(errno)); goto error; } /* Else there is actually data on the socket */ if (PQconsumeInput(conn) == 0) { fprintf(stderr, _("%s: could not receive data from WAL stream: %s"), progname, PQerrorMessage(conn)); goto error; } continue; } if (r == -1) /* End of copy stream */ break; if (r == -2) { fprintf(stderr, _("%s: could not read COPY data: %s"), progname, PQerrorMessage(conn)); goto error; } if (copybuf[0] == 'k') { /* * keepalive message, sent in 9.2 and newer. We just ignore this * message completely, but need to skip past it in the stream. 
*/ if (r != STREAMING_KEEPALIVE_SIZE) { fprintf(stderr, _("%s: keepalive message has incorrect size %d\n"), progname, r); goto error; } continue; } else if (copybuf[0] != 'w') { fprintf(stderr, _("%s: unrecognized streaming header: \"%c\"\n"), progname, copybuf[0]); goto error; } if (r < STREAMING_HEADER_SIZE + 1) { fprintf(stderr, _("%s: streaming header too small: %d\n"), progname, r); goto error; } /* Extract WAL location for this block */ memcpy(&blockpos, copybuf + 1, 8); xlogoff = blockpos.xrecoff % XLOG_SEG_SIZE; /* * Verify that the initial location in the stream matches where we * think we are. */ if (walfile == -1) { /* No file open yet */ if (xlogoff != 0) { fprintf(stderr, _("%s: received transaction log record for offset %u with no file open\n"), progname, xlogoff); goto error; } } else { /* More data in existing segment */ /* XXX: store seek value don't reseek all the time */ if (lseek(walfile, 0, SEEK_CUR) != xlogoff) { fprintf(stderr, _("%s: got WAL data offset %08x, expected %08x\n"), progname, xlogoff, (int) lseek(walfile, 0, SEEK_CUR)); goto error; } } bytes_left = r - STREAMING_HEADER_SIZE; bytes_written = 0; while (bytes_left) { int bytes_to_write; /* * If crossing a WAL boundary, only write up until we reach * XLOG_SEG_SIZE. 
*/ if (xlogoff + bytes_left > XLOG_SEG_SIZE) bytes_to_write = XLOG_SEG_SIZE - xlogoff; else bytes_to_write = bytes_left; if (walfile == -1) { walfile = open_walfile(blockpos, timeline, basedir, current_walfile_name); if (walfile == -1) /* Error logged by open_walfile */ goto error; } if (write(walfile, copybuf + STREAMING_HEADER_SIZE + bytes_written, bytes_to_write) != bytes_to_write) { fprintf(stderr, _("%s: could not write %u bytes to WAL file \"%s\": %s\n"), progname, bytes_to_write, current_walfile_name, strerror(errno)); goto error; } /* Write was successful, advance our position */ bytes_written += bytes_to_write; bytes_left -= bytes_to_write; XLByteAdvance(blockpos, bytes_to_write); xlogoff += bytes_to_write; /* Did we reach the end of a WAL segment? */ if (blockpos.xrecoff % XLOG_SEG_SIZE == 0) { if (!close_walfile(basedir, current_walfile_name, false)) /* Error message written in close_walfile() */ goto error; xlogoff = 0; if (stream_stop != NULL) { /* * Callback when the segment finished, and return if it * told us to. */ if (stream_stop(blockpos, timeline, true)) return true; } } } /* No more data left to write, start receiving next copy packet */ } /* * The only way to get out of the loop is if the server shut down the * replication stream. If it's a controlled shutdown, the server will send * a shutdown message, and we'll return the latest xlog location that has * been streamed. 
*/ res = PQgetResult(conn); if (PQresultStatus(res) != PGRES_COMMAND_OK) { fprintf(stderr, _("%s: unexpected termination of replication stream: %s"), progname, PQresultErrorMessage(res)); goto error; } PQclear(res); /* Complain if we've not reached stop point yet */ if (stream_stop != NULL && !stream_stop(blockpos, timeline, false)) { fprintf(stderr, _("%s: replication stream was terminated before stop point\n"), progname); goto error; } if (copybuf != NULL) PQfreemem(copybuf); if (walfile != -1 && close(walfile) != 0) fprintf(stderr, _("%s: could not close file \"%s\": %s\n"), progname, current_walfile_name, strerror(errno)); walfile = -1; return true; error: if (copybuf != NULL) PQfreemem(copybuf); if (walfile != -1 && close(walfile) != 0) fprintf(stderr, _("%s: could not close file \"%s\": %s\n"), progname, current_walfile_name, strerror(errno)); walfile = -1; return false; }
/*
 * Fill 'buf' with 'nbytes' bytes of WAL, beginning at location 'recptr'.
 *
 * XXX it would probably be better to pull the data straight out of the
 * WAL buffers when possible.
 *
 * One WAL segment is opened, and kept open, in the global file descriptor
 * sendFile.  So once XLogRead has been used there is always one descriptor
 * left open until the process ends, but never more than one.
 */
void
XLogRead(char *buf, XLogRecPtr recptr, Size nbytes)
{
	/* Remember where the request began; recptr is advanced as we read */
	XLogRecPtr	origin = recptr;
	uint32		lastRemovedLog;
	uint32		lastRemovedSeg;
	uint32		log;
	uint32		seg;

	while (nbytes > 0)
	{
		uint32		startoff = recptr.xrecoff % XLogSegSize;
		int			segbytes;
		int			readbytes;

		if (sendFile < 0 || !XLByteInSeg(recptr, sendId, sendSeg))
		{
			char		path[MAXPGPATH];

			/* No segment open, or the wrong one: switch segments */
			if (sendFile >= 0)
				close(sendFile);

			XLByteToSeg(recptr, sendId, sendSeg);
			XLogFilePath(path, ThisTimeLineID, sendId, sendSeg);

			sendFile = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
			if (sendFile < 0)
			{
				if (errno != ENOENT)
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
									path, sendId, sendSeg)));
				else
				{
					/*
					 * A missing file is taken to mean that the standby
					 * asked for a WAL segment so old that it has already
					 * been removed or recycled.
					 */
					char		filename[MAXFNAMELEN];

					XLogFileName(filename, ThisTimeLineID, sendId, sendSeg);
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("requested WAL segment %s has already been removed",
									filename)));
				}
			}
			sendOff = 0;
		}

		/* Reposition only when the tracked offset is not where we need it */
		if (sendOff != startoff)
		{
			if (lseek(sendFile, (off_t) startoff, SEEK_SET) < 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not seek in log file %u, segment %u to offset %u: %m",
								sendId, sendSeg, startoff)));
			sendOff = startoff;
		}

		/* Never read past the end of the current segment */
		segbytes = (nbytes > (XLogSegSize - startoff))
			? (XLogSegSize - startoff)
			: nbytes;

		readbytes = read(sendFile, buf, segbytes);
		if (readbytes <= 0)
			ereport(ERROR,
					(errcode_for_file_access(),
			errmsg("could not read from log file %u, segment %u, offset %u, "
				   "length %lu: %m",
				   sendId, sendSeg, sendOff, (unsigned long) segbytes)));

		/* Account for what was actually read (may be a short read) */
		XLByteAdvance(recptr, readbytes);
		sendOff += readbytes;
		nbytes -= readbytes;
		buf += readbytes;
	}

	/*
	 * Validate the data only now that it is in the buffer.  The segment
	 * existed when we opened it, but it may have been recycled or removed
	 * while we were reading; read() still succeeds then, yet the bytes we
	 * got may already have been overwritten with new WAL records.
	 */
	XLogGetLastRemoved(&lastRemovedLog, &lastRemovedSeg);
	XLByteToSeg(origin, log, seg);
	if (log < lastRemovedLog ||
		(log == lastRemovedLog && seg <= lastRemovedSeg))
	{
		char		filename[MAXFNAMELEN];

		XLogFileName(filename, ThisTimeLineID, log, seg);
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("requested WAL segment %s has already been removed",
						filename)));
	}
}
/*
 * Write 'nbytes' bytes of received XLOG data from 'buf' to disk, starting
 * at WAL location 'recptr'.
 *
 * The data may span several segment files; the currently open one is
 * tracked in recvFile/recvId/recvSeg/recvOff.  LogstreamResult.Write is
 * advanced after every successful write.  Any I/O failure is a PANIC.
 */
static void
XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr)
{
	while (nbytes > 0)
	{
		int			startoff;
		int			segbytes;
		int			byteswritten;

		if (recvFile < 0 || !XLByteInSeg(recptr, recvId, recvSeg))
		{
			bool		use_existent = true;

			/*
			 * Flush and close the file we are leaving before moving on to
			 * the next one; otherwise it would have to be reopened later
			 * just to fsync it.
			 */
			if (recvFile >= 0)
			{
				XLogWalRcvFlush();

				/*
				 * The startup process will soon re-read XLOG segment files
				 * during recovery, so unlike XLogFileClose() we do not
				 * advise the OS to release the file's cache pages.
				 */
				if (close(recvFile) != 0)
					ereport(PANIC,
							(errcode_for_file_access(),
						errmsg("could not close log file %u, segment %u: %m",
							   recvId, recvSeg)));
			}
			recvFile = -1;

			/* Create the new log file, or use it if it already exists */
			XLByteToSeg(recptr, recvId, recvSeg);
			recvFile = XLogFileInit(recvId, recvSeg, &use_existent, true);
			recvOff = 0;
		}

		/* Where in the segment this data starts, and how much fits in it */
		startoff = recptr.xrecoff % XLogSegSize;
		segbytes = (startoff + nbytes > XLogSegSize)
			? (XLogSegSize - startoff)
			: nbytes;

		/* Reposition only when the tracked offset is not where we need it */
		if (recvOff != startoff)
		{
			if (lseek(recvFile, (off_t) startoff, SEEK_SET) < 0)
				ereport(PANIC,
						(errcode_for_file_access(),
						 errmsg("could not seek in log file %u, "
								"segment %u to offset %u: %m",
								recvId, recvSeg, startoff)));
			recvOff = startoff;
		}

		/* Clear errno first so a failure that leaves it unset is detectable */
		errno = 0;

		byteswritten = write(recvFile, buf, segbytes);
		if (byteswritten <= 0)
		{
			/* if write didn't set errno, assume no disk space */
			if (errno == 0)
				errno = ENOSPC;
			ereport(PANIC,
					(errcode_for_file_access(),
					 errmsg("could not write to log file %u, segment %u "
							"at offset %u, length %lu: %m",
							recvId, recvSeg,
							recvOff, (unsigned long) segbytes)));
		}

		/* Account for what was actually written (may be a short write) */
		XLByteAdvance(recptr, byteswritten);
		recvOff += byteswritten;
		nbytes -= byteswritten;
		buf += byteswritten;

		LogstreamResult.Write = recptr;
	}
}