/* * open xlog file */ static void openXlogEnd(XLogRecPtr *endLocation) { char path[MAXPGPATH]; uint32 logid; uint32 seg; char *xlogDir = NULL; XLByteToSeg(*endLocation, logid, seg); XLogFileName(xlogfilename, ThisTimeLineID, logid, seg); elog((Debug_print_qd_mirroring ? LOG : DEBUG5), "QDSYNC: opening logid %d seg %d file %s", logid, seg, xlogfilename); xlogDir = makeRelativeToTxnFilespace(XLOGDIR); /* the first time or file is changed */ if (snprintf(path, MAXPGPATH, "%s/%s", xlogDir, xlogfilename) >= MAXPGPATH) { ereport(ERROR, (errcode_for_file_access(), errmsg("QDSYNC: could not create xlog file \"%s/%s\"", xlogDir, xlogfilename))); } if (xlogfilefd >= 0) { close(xlogfilefd); xlogfilefd = -1; xlogfileoffset = -1; } xlogfilefd = open(path, O_RDWR, 0); if (xlogfilefd < 0) { elog((Debug_print_qd_mirroring ? LOG : DEBUG5), "QDSYNC: creating xlog file %s", xlogfilename); xlogfilefd = open(path, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); if (xlogfilefd < 0) { ereport(ERROR, (errcode_for_file_access(), errmsg("QDSYNC: could not create xlog file \"%s\"", path))); } } xlogid = logid; xseg = seg; xlogfileoffset = endLocation->xrecoff % XLogSegSize; elog((Debug_print_qd_mirroring ? LOG : DEBUG5),"QDSYNC: opened '%s' offset 0x%X", xlogfilename, xlogfileoffset); pfree(xlogDir); }
Datum pg_control_checkpoint(PG_FUNCTION_ARGS) { Datum values[19]; bool nulls[19]; TupleDesc tupdesc; HeapTuple htup; ControlFileData *ControlFile; XLogSegNo segno; char xlogfilename[MAXFNAMELEN]; /* * Construct a tuple descriptor for the result row. This must match this * function's pg_proc entry! */ tupdesc = CreateTemplateTupleDesc(19, false); TupleDescInitEntry(tupdesc, (AttrNumber) 1, "checkpoint_location", LSNOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 2, "prior_location", LSNOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 3, "redo_location", LSNOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 4, "redo_wal_file", TEXTOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 5, "timeline_id", INT4OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 6, "prev_timeline_id", INT4OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 7, "full_page_writes", BOOLOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 8, "next_xid", TEXTOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 9, "next_oid", OIDOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 10, "next_multixact_id", XIDOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 11, "next_multi_offset", XIDOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 12, "oldest_xid", XIDOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 13, "oldest_xid_dbid", OIDOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 14, "oldest_active_xid", XIDOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 15, "oldest_multi_xid", XIDOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 16, "oldest_multi_dbid", OIDOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 17, "oldest_commit_ts_xid", XIDOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 18, "newest_commit_ts_xid", XIDOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 19, "checkpoint_time", TIMESTAMPTZOID, -1, 0); tupdesc = BlessTupleDesc(tupdesc); /* Read the control file. */ ControlFile = get_controlfile(DataDir, NULL); /* * Calculate name of the WAL file containing the latest checkpoint's REDO * start point. */ XLByteToSeg(ControlFile->checkPointCopy.redo, segno); XLogFileName(xlogfilename, ControlFile->checkPointCopy.ThisTimeLineID, segno); /* Populate the values and null arrays */ values[0] = LSNGetDatum(ControlFile->checkPoint); nulls[0] = false; values[1] = LSNGetDatum(ControlFile->prevCheckPoint); nulls[1] = false; values[2] = LSNGetDatum(ControlFile->checkPointCopy.redo); nulls[2] = false; values[3] = CStringGetTextDatum(xlogfilename); nulls[3] = false; values[4] = Int32GetDatum(ControlFile->checkPointCopy.ThisTimeLineID); nulls[4] = false; values[5] = Int32GetDatum(ControlFile->checkPointCopy.PrevTimeLineID); nulls[5] = false; values[6] = BoolGetDatum(ControlFile->checkPointCopy.fullPageWrites); nulls[6] = false; values[7] = CStringGetTextDatum(psprintf("%u:%u", ControlFile->checkPointCopy.nextXidEpoch, ControlFile->checkPointCopy.nextXid)); nulls[7] = false; values[8] = ObjectIdGetDatum(ControlFile->checkPointCopy.nextOid); nulls[8] = false; values[9] = TransactionIdGetDatum(ControlFile->checkPointCopy.nextMulti); nulls[9] = false; values[10] = TransactionIdGetDatum(ControlFile->checkPointCopy.nextMultiOffset); nulls[10] = false; values[11] = TransactionIdGetDatum(ControlFile->checkPointCopy.oldestXid); nulls[11] = false; values[12] = ObjectIdGetDatum(ControlFile->checkPointCopy.oldestXidDB); nulls[12] = false; values[13] = TransactionIdGetDatum(ControlFile->checkPointCopy.oldestActiveXid); nulls[13] = false; values[14] = TransactionIdGetDatum(ControlFile->checkPointCopy.oldestMulti); nulls[14] = false; values[15] = ObjectIdGetDatum(ControlFile->checkPointCopy.oldestMultiDB); nulls[15] = false; values[16] = TransactionIdGetDatum(ControlFile->checkPointCopy.oldestCommitTsXid); nulls[16] = false; values[17] = TransactionIdGetDatum(ControlFile->checkPointCopy.newestCommitTsXid); nulls[17] = false; values[18] = TimestampTzGetDatum( time_t_to_timestamptz(ControlFile->checkPointCopy.time)); nulls[18] = false; htup = heap_form_tuple(tupdesc, values, nulls); PG_RETURN_DATUM(HeapTupleGetDatum(htup)); }
/* * Read 'count' bytes from WAL into 'buf', starting at location 'startptr' * * XXX probably this should be improved to suck data directly from the * WAL buffers when possible. * * Will open, and keep open, one WAL segment stored in the global file * descriptor sendFile. This means if XLogRead is used once, there will * always be one descriptor left open until the process ends, but never * more than one. */ void XLogRead(char *buf, XLogRecPtr startptr, Size count) { char *p; XLogRecPtr recptr; Size nbytes; uint32 lastRemovedLog; uint32 lastRemovedSeg; uint32 log; uint32 seg; retry: p = buf; recptr = startptr; nbytes = count; while (nbytes > 0) { uint32 startoff; int segbytes; int readbytes; startoff = recptr.xrecoff % XLogSegSize; if (sendFile < 0 || !XLByteInSeg(recptr, sendId, sendSeg)) { char path[MAXPGPATH]; /* Switch to another logfile segment */ if (sendFile >= 0) close(sendFile); XLByteToSeg(recptr, sendId, sendSeg); XLogFilePath(path, ThisTimeLineID, sendId, sendSeg); sendFile = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0); if (sendFile < 0) { /* * If the file is not found, assume it's because the standby * asked for a too old WAL segment that has already been * removed or recycled. */ if (errno == ENOENT) { char filename[MAXFNAMELEN]; XLogFileName(filename, ThisTimeLineID, sendId, sendSeg); ereport(ERROR, (errcode_for_file_access(), errmsg("requested WAL segment %s has already been removed", filename))); } else ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\" (log file %u, segment %u): %m", path, sendId, sendSeg))); } sendOff = 0; } /* Need to seek in the file? */ if (sendOff != startoff) { if (lseek(sendFile, (off_t) startoff, SEEK_SET) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not seek in log file %u, segment %u to offset %u: %m", sendId, sendSeg, startoff))); sendOff = startoff; } /* How many bytes are within this segment? */ if (nbytes > (XLogSegSize - startoff)) segbytes = XLogSegSize - startoff; else segbytes = nbytes; readbytes = read(sendFile, p, segbytes); if (readbytes <= 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not read from log file %u, segment %u, offset %u, " "length %lu: %m", sendId, sendSeg, sendOff, (unsigned long) segbytes))); /* Update state for read */ XLByteAdvance(recptr, readbytes); sendOff += readbytes; nbytes -= readbytes; p += readbytes; } /* * After reading into the buffer, check that what we read was valid. We do * this after reading, because even though the segment was present when we * opened it, it might get recycled or removed while we read it. The * read() succeeds in that case, but the data we tried to read might * already have been overwritten with new WAL records. */ XLogGetLastRemoved(&lastRemovedLog, &lastRemovedSeg); XLByteToSeg(startptr, log, seg); if (log < lastRemovedLog || (log == lastRemovedLog && seg <= lastRemovedSeg)) { char filename[MAXFNAMELEN]; XLogFileName(filename, ThisTimeLineID, log, seg); ereport(ERROR, (errcode_for_file_access(), errmsg("requested WAL segment %s has already been removed", filename))); } /* * During recovery, the currently-open WAL file might be replaced with * the file of the same name retrieved from archive. So we always need * to check what we read was valid after reading into the buffer. If it's * invalid, we try to open and read the file again. */ if (am_cascading_walsender) { /* use volatile pointer to prevent code rearrangement */ volatile WalSnd *walsnd = MyWalSnd; bool reload; SpinLockAcquire(&walsnd->mutex); reload = walsnd->needreload; walsnd->needreload = false; SpinLockRelease(&walsnd->mutex); if (reload && sendFile >= 0) { close(sendFile); sendFile = -1; goto retry; } } }
/* * Reserve WAL for the currently active slot. * * Compute and set restart_lsn in a manner that's appropriate for the type of * the slot and concurrency safe. */ void ReplicationSlotReserveWal(void) { ReplicationSlot *slot = MyReplicationSlot; Assert(slot != NULL); Assert(slot->data.restart_lsn == InvalidXLogRecPtr); /* * The replication slot mechanism is used to prevent removal of required * WAL. As there is no interlock between this routine and checkpoints, WAL * segments could concurrently be removed when a now stale return value of * ReplicationSlotsComputeRequiredLSN() is used. In the unlikely case that * this happens we'll just retry. */ while (true) { XLogSegNo segno; /* * For logical slots log a standby snapshot and start logical decoding * at exactly that position. That allows the slot to start up more * quickly. * * That's not needed (or indeed helpful) for physical slots as they'll * start replay at the last logged checkpoint anyway. Instead return * the location of the last redo LSN. While that slightly increases * the chance that we have to retry, it's where a base backup has to * start replay at. */ if (!RecoveryInProgress() && SlotIsLogical(slot)) { XLogRecPtr flushptr; /* start at current insert position */ slot->data.restart_lsn = GetXLogInsertRecPtr(); /* make sure we have enough information to start */ flushptr = LogStandbySnapshot(); /* and make sure it's fsynced to disk */ XLogFlush(flushptr); } else { slot->data.restart_lsn = GetRedoRecPtr(); } /* prevent WAL removal as fast as possible */ ReplicationSlotsComputeRequiredLSN(); /* * If all required WAL is still there, great, otherwise retry. The * slot should prevent further removal of WAL, unless there's a * concurrent ReplicationSlotsComputeRequiredLSN() after we've written * the new restart_lsn above, so normally we should never need to loop * more than twice. */ XLByteToSeg(slot->data.restart_lsn, segno, wal_segment_size); if (XLogGetLastRemovedSegno() < segno) break; } }
/* * Read 'nbytes' bytes from WAL into 'buf', starting at location 'recptr' * * XXX probably this should be improved to suck data directly from the * WAL buffers when possible. * * Will open, and keep open, one WAL segment stored in the global file * descriptor sendFile. This means if XLogRead is used once, there will * always be one descriptor left open until the process ends, but never * more than one. */ void XLogRead(char *buf, XLogRecPtr recptr, Size nbytes) { XLogRecPtr startRecPtr = recptr; char path[MAXPGPATH]; uint32 lastRemovedLog; uint32 lastRemovedSeg; uint32 log; uint32 seg; while (nbytes > 0) { uint32 startoff; int segbytes; int readbytes; startoff = recptr.xrecoff % XLogSegSize; if (sendFile < 0 || !XLByteInSeg(recptr, sendId, sendSeg)) { /* Switch to another logfile segment */ if (sendFile >= 0) close(sendFile); XLByteToSeg(recptr, sendId, sendSeg); XLogFilePath(path, ThisTimeLineID, sendId, sendSeg); sendFile = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0); if (sendFile < 0) { /* * If the file is not found, assume it's because the standby * asked for a too old WAL segment that has already been * removed or recycled. */ if (errno == ENOENT) { char filename[MAXFNAMELEN]; XLogFileName(filename, ThisTimeLineID, sendId, sendSeg); ereport(ERROR, (errcode_for_file_access(), errmsg("requested WAL segment %s has already been removed", filename))); } else ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\" (log file %u, segment %u): %m", path, sendId, sendSeg))); } sendOff = 0; } /* Need to seek in the file? */ if (sendOff != startoff) { if (lseek(sendFile, (off_t) startoff, SEEK_SET) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not seek in log file %u, segment %u to offset %u: %m", sendId, sendSeg, startoff))); sendOff = startoff; } /* How many bytes are within this segment? */ if (nbytes > (XLogSegSize - startoff)) segbytes = XLogSegSize - startoff; else segbytes = nbytes; readbytes = read(sendFile, buf, segbytes); if (readbytes <= 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not read from log file %u, segment %u, offset %u, " "length %lu: %m", sendId, sendSeg, sendOff, (unsigned long) segbytes))); /* Update state for read */ XLByteAdvance(recptr, readbytes); sendOff += readbytes; nbytes -= readbytes; buf += readbytes; } /* * After reading into the buffer, check that what we read was valid. We do * this after reading, because even though the segment was present when we * opened it, it might get recycled or removed while we read it. The * read() succeeds in that case, but the data we tried to read might * already have been overwritten with new WAL records. */ XLogGetLastRemoved(&lastRemovedLog, &lastRemovedSeg); XLByteToSeg(startRecPtr, log, seg); if (log < lastRemovedLog || (log == lastRemovedLog && seg <= lastRemovedSeg)) { char filename[MAXFNAMELEN]; XLogFileName(filename, ThisTimeLineID, log, seg); ereport(ERROR, (errcode_for_file_access(), errmsg("requested WAL segment %s has already been removed", filename))); } }
static void WalSendServerDoRequest(WalSendRequest *walSendRequest) { bool successful; struct timeval standbyTimeout; WalSendServerGetStandbyTimeout(&standbyTimeout); switch (walSendRequest->command) { case PositionToEnd: elog((Debug_print_qd_mirroring ? LOG : DEBUG5), "PositionToEnd"); successful = write_position_to_end(&originalEndLocation, NULL, &walsend_shutdown_requested); if (successful) elog(LOG,"Standby master returned transaction log end location %s", XLogLocationToString(&originalEndLocation)); else { disableQDMirroring_ConnectionError( "Unable to connect to standby master and determine transaction log end location", GetStandbyErrorString()); disconnectMirrorQD_SendClose(); } break; case Catchup: elog((Debug_print_qd_mirroring ? LOG : DEBUG5), "Catchup"); if (isQDMirroringCatchingUp()) { bool tooFarBehind = false; elog(LOG,"Current master transaction log is flushed through location %s", XLogLocationToString(&walSendRequest->flushedLocation)); if (XLByteLT(originalEndLocation, walSendRequest->flushedLocation)) { /* * Standby master is behind the primary. Send catchup WAL. */ /* * Use a TRY block to catch errors from our attempt to read * the primary's WAL. Errors from sending to the standby * come up as a boolean return (successful). */ PG_TRY(); { successful = XLogCatchupQDMirror( &originalEndLocation, &walSendRequest->flushedLocation, &standbyTimeout, &walsend_shutdown_requested); } PG_CATCH(); { /* * Report the error related to reading the primary's WAL * to the server log */ /* * But first demote the error to something much less * scary. */ if (!elog_demote(WARNING)) { elog(LOG,"unable to demote error"); PG_RE_THROW(); } EmitErrorReport(); FlushErrorState(); successful = false; tooFarBehind = true; } PG_END_TRY(); if (successful) { elog((Debug_print_qd_mirroring ? LOG : DEBUG5), "catchup send from standby end %s through primary flushed location %s", XLogLocationToString(&originalEndLocation), XLogLocationToString2(&walSendRequest->flushedLocation)); } } else if (XLByteEQ(originalEndLocation, walSendRequest->flushedLocation)) { elog((Debug_print_qd_mirroring ? LOG : DEBUG5),"Mirror was already caught up"); successful = true; } else { elog(WARNING,"Standby master transaction log location %s is beyond the current master end location %s", XLogLocationToString(&originalEndLocation), XLogLocationToString2(&walSendRequest->flushedLocation)); successful = false; } if (successful) { char detail[200]; int count; count = snprintf( detail, sizeof(detail), "Transaction log copied from locations %s through %s to the standby master", XLogLocationToString(&originalEndLocation), XLogLocationToString2(&walSendRequest->flushedLocation)); if (count >= sizeof(detail)) { ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("format command string failure"))); } enableQDMirroring("Master mirroring is now synchronized", detail); currentEndLocation = walSendRequest->flushedLocation; periodicLen = 0; periodicLocation = currentEndLocation; } else { if (tooFarBehind) { disableQDMirroring_TooFarBehind( "The current master was unable to synchronize the standby master " "because the transaction logs on the current master were recycled. " "A gpinitstandby (at an appropriate time) will be necessary to copy " "over the whole master database to the standby master so it may be synchronized"); } else { disableQDMirroring_ConnectionError( "Connection to the standby master was lost during transaction log catchup", GetStandbyErrorString()); } disconnectMirrorQD_SendClose(); } } else if (isQDMirroringDisabled()) { elog((Debug_print_qd_mirroring ? LOG : DEBUG5), "Master Mirror Send: Master mirroring not catching-up (state is disabled)"); } else { elog(ERROR,"unexpected master mirroring state %s", QDMirroringStateString()); } break; case WriteWalPages: if (Debug_print_qd_mirroring) elog(LOG, "WriteWalPages"); if (isQDMirroringEnabled()) { char *from; Size nbytes; bool more= false; /* * For now, save copy of data until flush. This could be * optimized. */ if (saveBuffer == NULL) { uint32 totalBufferLen = XLOGbuffers * XLOG_BLCKSZ; saveBuffer = malloc(totalBufferLen); if (saveBuffer == NULL) elog(ERROR,"Could not allocate buffer for xlog data (%d bytes)", totalBufferLen); saveBufferLen = 0; } XLogGetBuffer(walSendRequest->startidx, walSendRequest->npages, &from, &nbytes); if (saveBufferLen == 0) { more = false; writeLogId = walSendRequest->logId; writeLogSeg = walSendRequest->logSeg; writeLogOff = walSendRequest->logOff; memcpy(saveBuffer, from, nbytes); saveBufferLen = nbytes; } else { more = true; memcpy(&saveBuffer[saveBufferLen], from, nbytes); saveBufferLen += nbytes; } if (Debug_print_qd_mirroring) elog(LOG, "Master Mirror Send: WriteWalPages (%s) startidx %d, npages %d, timeLineID %d, logId %u, logSeg %u, logOff 0x%X, nbytes 0x%X", (more ? "more" : "new"), walSendRequest->startidx, walSendRequest->npages, walSendRequest->timeLineID, walSendRequest->logId, walSendRequest->logSeg, walSendRequest->logOff, (int)nbytes); } case FlushWalPages: if (Debug_print_qd_mirroring) elog(LOG, "FlushWalPages"); if (isQDMirroringEnabled()) { char cmd[MAXFNAMELEN + 50]; if (saveBufferLen == 0) successful = true; else { if (snprintf(cmd, sizeof(cmd),"xlog %d %d %d %d", writeLogId, writeLogSeg, writeLogOff, (int)saveBufferLen) >= sizeof(cmd)) elog(ERROR,"could not create cmd for qd mirror logid %d seg %d", writeLogId, writeLogSeg); successful = write_qd_sync(cmd, saveBuffer, saveBufferLen, &standbyTimeout, &walsend_shutdown_requested); if (successful) { XLogRecPtr oldEndLocation; oldEndLocation = currentEndLocation; currentEndLocation.xlogid = writeLogId; currentEndLocation.xrecoff = writeLogSeg * XLogSegSize + writeLogOff; if (currentEndLocation.xrecoff >= XLogFileSize) { (currentEndLocation.xlogid)++; currentEndLocation.xrecoff = 0; } if (XLByteLT(oldEndLocation,currentEndLocation)) { periodicLen += saveBufferLen; if (periodicLen > periodicReportLen) { elog(LOG, "Master mirroring periodic report: %d bytes successfully send to standby master for locations %s through %s", periodicLen, XLogLocationToString(&periodicLocation), XLogLocationToString2(¤tEndLocation)); periodicLen = 0; periodicLocation = currentEndLocation; } } else { if (Debug_print_qd_mirroring) elog(LOG, "Send to Master mirror successful. New end location %s (old %s)", XLogLocationToString(¤tEndLocation), XLogLocationToString2(&oldEndLocation)); } } else { disableQDMirroring_ConnectionError( "Connection to the standby master was lost attempting to send new transaction log", GetStandbyErrorString()); disconnectMirrorQD_SendClose(); } /* * Reset so WriteWalPages can fill the buffer again. */ saveBufferLen = 0; writeLogId = 0; writeLogSeg = 0; writeLogOff = 0; } if (successful && walSendRequest->haveNewCheckpointLocation) { uint32 logid; uint32 seg; uint32 offset; elog((Debug_print_qd_mirroring ? LOG : DEBUG5),"New previous checkpoint location %s", XLogLocationToString(&walSendRequest->newCheckpointLocation)); XLByteToSeg(walSendRequest->newCheckpointLocation, logid, seg); offset = walSendRequest->newCheckpointLocation.xrecoff % XLogSegSize; if (snprintf(cmd, sizeof(cmd),"new_checkpoint_location %d %d %d", logid, seg, offset) >= sizeof(cmd)) elog(ERROR,"could not create cmd for qd mirror logid %d seg %d offset %d", logid, seg, offset); successful = write_qd_sync(cmd, NULL, 0, NULL, &walsend_shutdown_requested); if (successful) { elog((Debug_print_qd_mirroring ? LOG : DEBUG5),"Send of new checkpoint location to master mirror successful"); } else { disableQDMirroring_ConnectionError( "Connection to the standby master was lost attempting to send new checkpoint location", GetStandbyErrorString()); disconnectMirrorQD_SendClose(); } } } else if (isQDMirroringDisabled()) { elog((Debug_print_qd_mirroring ? LOG : DEBUG5), "Master Mirror Send: Master mirroring not enabled"); } else { elog(ERROR,"unexpected master mirroring state %s", QDMirroringStateString()); } break; case CloseForShutdown: if (Debug_print_qd_mirroring) elog(LOG, "CloseForShutdown"); /* * Do the work we would normally do when signaled to stop. */ WalSendServer_ServiceShutdown(); break; default: elog(ERROR, "Unknown WalSendRequestCommand %d", walSendRequest->command); } }
/* * Attempt to retrieve the specified file from off-line archival storage. * If successful, fill "path" with its complete path (note that this will be * a temp file name that doesn't follow the normal naming convention), and * return TRUE. * * If not successful, fill "path" with the name of the normal on-line file * (which may or may not actually exist, but we'll try to use it), and return * FALSE. * * For fixed-size files, the caller may pass the expected size as an * additional crosscheck on successful recovery. If the file size is not * known, set expectedSize = 0. * * When 'cleanupEnabled' is false, refrain from deleting any old WAL segments * in the archive. This is used when fetching the initial checkpoint record, * when we are not yet sure how far back we need the WAL. */ bool RestoreArchivedFile(char *path, const char *xlogfname, const char *recovername, off_t expectedSize, bool cleanupEnabled) { char xlogpath[MAXPGPATH]; char xlogRestoreCmd[MAXPGPATH]; char lastRestartPointFname[MAXPGPATH]; char *dp; char *endp; const char *sp; int rc; bool signaled; struct stat stat_buf; XLogSegNo restartSegNo; XLogRecPtr restartRedoPtr; TimeLineID restartTli; /* In standby mode, restore_command might not be supplied */ if (recoveryRestoreCommand == NULL) goto not_available; /* * When doing archive recovery, we always prefer an archived log file even * if a file of the same name exists in XLOGDIR. The reason is that the * file in XLOGDIR could be an old, un-filled or partly-filled version * that was copied and restored as part of backing up $PGDATA. * * We could try to optimize this slightly by checking the local copy * lastchange timestamp against the archived copy, but we have no API to * do this, nor can we guarantee that the lastchange timestamp was * preserved correctly when we copied to archive. Our aim is robustness, * so we elect not to do this. * * If we cannot obtain the log file from the archive, however, we will try * to use the XLOGDIR file if it exists. This is so that we can make use * of log segments that weren't yet transferred to the archive. * * Notice that we don't actually overwrite any files when we copy back * from archive because the restore_command may inadvertently restore * inappropriate xlogs, or they may be corrupt, so we may wish to fallback * to the segments remaining in current XLOGDIR later. The * copy-from-archive filename is always the same, ensuring that we don't * run out of disk space on long recoveries. */ snprintf(xlogpath, MAXPGPATH, XLOGDIR "/%s", recovername); /* * Make sure there is no existing file named recovername. */ if (stat(xlogpath, &stat_buf) != 0) { if (errno != ENOENT) ereport(FATAL, (errcode_for_file_access(), errmsg("could not stat file \"%s\": %m", xlogpath))); } else { if (unlink(xlogpath) != 0) ereport(FATAL, (errcode_for_file_access(), errmsg("could not remove file \"%s\": %m", xlogpath))); } /* * Calculate the archive file cutoff point for use during log shipping * replication. All files earlier than this point can be deleted from the * archive, though there is no requirement to do so. * * If cleanup is not enabled, initialise this with the filename of * InvalidXLogRecPtr, which will prevent the deletion of any WAL files * from the archive because of the alphabetic sorting property of WAL * filenames. * * Once we have successfully located the redo pointer of the checkpoint * from which we start recovery we never request a file prior to the redo * pointer of the last restartpoint. When redo begins we know that we have * successfully located it, so there is no need for additional status * flags to signify the point when we can begin deleting WAL files from * the archive. */ if (cleanupEnabled) { GetOldestRestartPoint(&restartRedoPtr, &restartTli); XLByteToSeg(restartRedoPtr, restartSegNo); XLogFileName(lastRestartPointFname, restartTli, restartSegNo); /* we shouldn't need anything earlier than last restart point */ Assert(strcmp(lastRestartPointFname, xlogfname) <= 0); } else XLogFileName(lastRestartPointFname, 0, 0L); /* * construct the command to be executed */ dp = xlogRestoreCmd; endp = xlogRestoreCmd + MAXPGPATH - 1; *endp = '\0'; for (sp = recoveryRestoreCommand; *sp; sp++) { if (*sp == '%') { switch (sp[1]) { case 'p': /* %p: relative path of target file */ sp++; StrNCpy(dp, xlogpath, endp - dp); make_native_path(dp); dp += strlen(dp); break; case 'f': /* %f: filename of desired file */ sp++; StrNCpy(dp, xlogfname, endp - dp); dp += strlen(dp); break; case 'r': /* %r: filename of last restartpoint */ sp++; StrNCpy(dp, lastRestartPointFname, endp - dp); dp += strlen(dp); break; case '%': /* convert %% to a single % */ sp++; if (dp < endp) *dp++ = *sp; break; default: /* otherwise treat the % as not special */ if (dp < endp) *dp++ = *sp; break; } } else { if (dp < endp) *dp++ = *sp; } } *dp = '\0'; ereport(DEBUG3, (errmsg_internal("executing restore command \"%s\"", xlogRestoreCmd))); /* * Check signals before restore command and reset afterwards. */ PreRestoreCommand(); /* * Copy xlog from archival storage to XLOGDIR */ rc = system(xlogRestoreCmd); PostRestoreCommand(); if (rc == 0) { /* * command apparently succeeded, but let's make sure the file is * really there now and has the correct size. */ if (stat(xlogpath, &stat_buf) == 0) { if (expectedSize > 0 && stat_buf.st_size != expectedSize) { int elevel; /* * If we find a partial file in standby mode, we assume it's * because it's just being copied to the archive, and keep * trying. * * Otherwise treat a wrong-sized file as FATAL to ensure the * DBA would notice it, but is that too strong? We could try * to plow ahead with a local copy of the file ... but the * problem is that there probably isn't one, and we'd * incorrectly conclude we've reached the end of WAL and we're * done recovering ... */ if (StandbyMode && stat_buf.st_size < expectedSize) elevel = DEBUG1; else elevel = FATAL; ereport(elevel, (errmsg("archive file \"%s\" has wrong size: %lu instead of %lu", xlogfname, (unsigned long) stat_buf.st_size, (unsigned long) expectedSize))); return false; } else { ereport(LOG, (errmsg("restored log file \"%s\" from archive", xlogfname))); strcpy(path, xlogpath); return true; } } else { /* stat failed */ if (errno != ENOENT) ereport(FATAL, (errcode_for_file_access(), errmsg("could not stat file \"%s\": %m", xlogpath))); } } /* * Remember, we rollforward UNTIL the restore fails so failure here is * just part of the process... that makes it difficult to determine * whether the restore failed because there isn't an archive to restore, * or because the administrator has specified the restore program * incorrectly. We have to assume the former. * * However, if the failure was due to any sort of signal, it's best to * punt and abort recovery. (If we "return false" here, upper levels will * assume that recovery is complete and start up the database!) It's * essential to abort on child SIGINT and SIGQUIT, because per spec * system() ignores SIGINT and SIGQUIT while waiting; if we see one of * those it's a good bet we should have gotten it too. * * On SIGTERM, assume we have received a fast shutdown request, and exit * cleanly. It's pure chance whether we receive the SIGTERM first, or the * child process. If we receive it first, the signal handler will call * proc_exit, otherwise we do it here. If we or the child process received * SIGTERM for any other reason than a fast shutdown request, postmaster * will perform an immediate shutdown when it sees us exiting * unexpectedly. * * Per the Single Unix Spec, shells report exit status > 128 when a called * command died on a signal. Also, 126 and 127 are used to report * problems such as an unfindable command; treat those as fatal errors * too. */ if (WIFSIGNALED(rc) && WTERMSIG(rc) == SIGTERM) proc_exit(1); signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125; ereport(signaled ? FATAL : DEBUG2, (errmsg("could not restore file \"%s\" from archive: %s", xlogfname, wait_result_to_str(rc)))); not_available: /* * if an archived file is not available, there might still be a version of * this file in XLOGDIR, so return that as the filename to open. * * In many recovery scenarios we expect this to fail also, but if so that * just means we've reached the end of WAL. */ snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname); return false; }
/* * Attempt to execute an external shell command during recovery. * * 'command' is the shell command to be executed, 'commandName' is a * human-readable name describing the command emitted in the logs. If * 'failOnSignal' is true and the command is killed by a signal, a FATAL * error is thrown. Otherwise a WARNING is emitted. * * This is currently used for recovery_end_command and archive_cleanup_command. */ void ExecuteRecoveryCommand(char *command, char *commandName, bool failOnSignal) { char xlogRecoveryCmd[MAXPGPATH]; char lastRestartPointFname[MAXPGPATH]; char *dp; char *endp; const char *sp; int rc; bool signaled; XLogSegNo restartSegNo; XLogRecPtr restartRedoPtr; TimeLineID restartTli; Assert(command && commandName); /* * Calculate the archive file cutoff point for use during log shipping * replication. All files earlier than this point can be deleted from the * archive, though there is no requirement to do so. */ GetOldestRestartPoint(&restartRedoPtr, &restartTli); XLByteToSeg(restartRedoPtr, restartSegNo); XLogFileName(lastRestartPointFname, restartTli, restartSegNo); /* * construct the command to be executed */ dp = xlogRecoveryCmd; endp = xlogRecoveryCmd + MAXPGPATH - 1; *endp = '\0'; for (sp = command; *sp; sp++) { if (*sp == '%') { switch (sp[1]) { case 'r': /* %r: filename of last restartpoint */ sp++; StrNCpy(dp, lastRestartPointFname, endp - dp); dp += strlen(dp); break; case '%': /* convert %% to a single % */ sp++; if (dp < endp) *dp++ = *sp; break; default: /* otherwise treat the % as not special */ if (dp < endp) *dp++ = *sp; break; } } else { if (dp < endp) *dp++ = *sp; } } *dp = '\0'; ereport(DEBUG3, (errmsg_internal("executing %s \"%s\"", commandName, command))); /* * execute the constructed command */ rc = system(xlogRecoveryCmd); if (rc != 0) { /* * If the failure was due to any sort of signal, it's best to punt and * abort recovery. See also detailed comments on signals in * RestoreArchivedFile(). */ signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125; ereport((signaled && failOnSignal) ? FATAL : WARNING, /*------ translator: First %s represents a recovery.conf parameter name like "recovery_end_command", the 2nd is the value of that parameter, the third an already translated error message. */ (errmsg("%s \"%s\": %s", commandName, command, wait_result_to_str(rc)))); } }
/* * Write XLOG data to disk. */ static void XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr) { int startoff; int byteswritten; while (nbytes > 0) { int segbytes; if (recvFile < 0 || !XLByteInSeg(recptr, recvId, recvSeg)) { bool use_existent; /* * fsync() and close current file before we switch to next one. We * would otherwise have to reopen this file to fsync it later */ if (recvFile >= 0) { XLogWalRcvFlush(); /* * XLOG segment files will be re-read by recovery in startup * process soon, so we don't advise the OS to release cache * pages associated with the file like XLogFileClose() does. */ if (close(recvFile) != 0) ereport(PANIC, (errcode_for_file_access(), errmsg("could not close log file %u, segment %u: %m", recvId, recvSeg))); } recvFile = -1; /* Create/use new log file */ XLByteToSeg(recptr, recvId, recvSeg); use_existent = true; recvFile = XLogFileInit(recvId, recvSeg, &use_existent, true); recvOff = 0; } /* Calculate the start offset of the received logs */ startoff = recptr.xrecoff % XLogSegSize; if (startoff + nbytes > XLogSegSize) segbytes = XLogSegSize - startoff; else segbytes = nbytes; /* Need to seek in the file? */ if (recvOff != startoff) { if (lseek(recvFile, (off_t) startoff, SEEK_SET) < 0) ereport(PANIC, (errcode_for_file_access(), errmsg("could not seek in log file %u, " "segment %u to offset %u: %m", recvId, recvSeg, startoff))); recvOff = startoff; } /* OK to write the logs */ errno = 0; byteswritten = write(recvFile, buf, segbytes); if (byteswritten <= 0) { /* if write didn't set errno, assume no disk space */ if (errno == 0) errno = ENOSPC; ereport(PANIC, (errcode_for_file_access(), errmsg("could not write to log file %u, segment %u " "at offset %u, length %lu: %m", recvId, recvSeg, recvOff, (unsigned long) segbytes))); } /* Update state for write */ XLByteAdvance(recptr, byteswritten); recvOff += byteswritten; nbytes -= byteswritten; buf += byteswritten; LogstreamResult.Write = recptr; } }
int do_delete(pgBackupRange *range) { int i; int ret; parray *backup_list; bool do_delete = false; XLogRecPtr oldest_lsn = InvalidXLogRecPtr; TimeLineID oldest_tli; /* DATE are always required */ if (!pgBackupRangeIsValid(range)) elog(ERROR, "required delete range option not specified: delete DATE"); /* Lock backup catalog */ ret = catalog_lock(); if (ret == -1) elog(ERROR, "can't lock backup catalog."); else if (ret == 1) elog(ERROR, "another pg_arman is running, stop delete."); /* Get complete list of backups */ backup_list = catalog_get_backup_list(NULL); if (!backup_list) elog(ERROR, "No backup list found, can't process any more."); /* Find backups to be deleted */ for (i = 0; i < parray_num(backup_list); i++) { pgBackup *backup = (pgBackup *) parray_get(backup_list, i); /* delete backup and update status to DELETED */ if (do_delete) { /* check for interrupt */ if (interrupted) elog(ERROR, "interrupted during delete backup"); pgBackupDeleteFiles(backup); continue; } /* Found the latest full backup */ if (backup->backup_mode >= BACKUP_MODE_FULL && backup->status == BACKUP_STATUS_OK && backup->start_time <= range->begin) { do_delete = true; oldest_lsn = backup->start_lsn; oldest_tli = backup->tli; } } /* release catalog lock */ catalog_unlock(); /* cleanup */ parray_walk(backup_list, pgBackupFree); parray_free(backup_list); /* * Delete in archive WAL segments that are not needed anymore. The oldest * segment to be kept is the first segment that the oldest full backup * found around needs to keep. */ if (!XLogRecPtrIsInvalid(oldest_lsn)) { XLogSegNo targetSegNo; char oldestSegmentNeeded[MAXFNAMELEN]; DIR *arcdir; struct dirent *arcde; char wal_file[MAXPGPATH]; int rc; XLByteToSeg(oldest_lsn, targetSegNo); XLogFileName(oldestSegmentNeeded, oldest_tli, targetSegNo); elog(LOG, "Removing segments older than %s", oldestSegmentNeeded); /* * Now is time to do the actual work and to remove all the segments * not needed anymore. */ if ((arcdir = opendir(arclog_path)) != NULL) { while (errno = 0, (arcde = readdir(arcdir)) != NULL) { /* * We ignore the timeline part of the XLOG segment identifiers in * deciding whether a segment is still needed. This ensures that * we won't prematurely remove a segment from a parent timeline. * We could probably be a little more proactive about removing * segments of non-parent timelines, but that would be a whole lot * more complicated. * * We use the alphanumeric sorting property of the filenames to * decide which ones are earlier than the exclusiveCleanupFileName * file. Note that this means files are not removed in the order * they were originally written, in case this worries you. */ if ((IsXLogFileName(arcde->d_name) || IsPartialXLogFileName(arcde->d_name)) && strcmp(arcde->d_name + 8, oldestSegmentNeeded + 8) < 0) { /* * Use the original file name again now, including any * extension that might have been chopped off before testing * the sequence. */ snprintf(wal_file, MAXPGPATH, "%s/%s", arclog_path, arcde->d_name); rc = unlink(wal_file); if (rc != 0) { elog(WARNING, "could not remove file \"%s\": %s", wal_file, strerror(errno)); break; } elog(LOG, "removed WAL segment \"%s\"", wal_file); } } if (errno) elog(WARNING, "could not read archive location \"%s\": %s", arclog_path, strerror(errno)); if (closedir(arcdir)) elog(WARNING, "could not close archive location \"%s\": %s", arclog_path, strerror(errno)); } else elog(WARNING, "could not open archive location \"%s\": %s", arclog_path, strerror(errno)); } return 0; }
/* * TODO: This is duplicate code with pg_xlogdump, similar to walsender.c, but * we currently don't have the infrastructure (elog!) to share it. */ static void XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count) { char *p; XLogRecPtr recptr; Size nbytes; static int sendFile = -1; static XLogSegNo sendSegNo = 0; static uint32 sendOff = 0; p = buf; recptr = startptr; nbytes = count; while (nbytes > 0) { uint32 startoff; int segbytes; int readbytes; startoff = recptr % XLogSegSize; if (sendFile < 0 || !XLByteInSeg(recptr, sendSegNo)) { char path[MAXPGPATH]; /* Switch to another logfile segment */ if (sendFile >= 0) close(sendFile); XLByteToSeg(recptr, sendSegNo); XLogFilePath(path, tli, sendSegNo); sendFile = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0); if (sendFile < 0) { if (errno == ENOENT) ereport(ERROR, (errcode_for_file_access(), errmsg("requested WAL segment %s has already been removed", path))); else ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", path))); } sendOff = 0; } /* Need to seek in the file? */ if (sendOff != startoff) { if (lseek(sendFile, (off_t) startoff, SEEK_SET) < 0) { char path[MAXPGPATH]; XLogFilePath(path, tli, sendSegNo); ereport(ERROR, (errcode_for_file_access(), errmsg("could not seek in log segment %s to offset %u: %m", path, startoff))); } sendOff = startoff; } /* How many bytes are within this segment? */ if (nbytes > (XLogSegSize - startoff)) segbytes = XLogSegSize - startoff; else segbytes = nbytes; readbytes = read(sendFile, p, segbytes); if (readbytes <= 0) { char path[MAXPGPATH]; XLogFilePath(path, tli, sendSegNo); ereport(ERROR, (errcode_for_file_access(), errmsg("could not read from log segment %s, offset %u, length %lu: %m", path, sendOff, (unsigned long) segbytes))); } /* Update state for read */ recptr += readbytes; sendOff += readbytes; nbytes -= readbytes; p += readbytes; } }
void cdb_perform_redo(XLogRecPtr *redoCheckPointLoc, CheckPoint *redoCheckPoint, XLogRecPtr *newCheckpointLoc) { CheckPoint oldRedoCheckpoint; uint32 logid; uint32 seg; int nsegsremoved; if (redoCheckPointLoc->xlogid == 0 && redoCheckPointLoc->xrecoff == 0) { XLogGetRecoveryStart("QDSYNC", "for redo apply", redoCheckPointLoc, redoCheckPoint); } XLogStandbyRecoverRange(redoCheckPointLoc, redoCheckPoint, newCheckpointLoc); /* * Sample the recovery start location now to see if appling redo * processed checkpoint records and moved the restart location forward. */ oldRedoCheckpoint = *redoCheckPoint; XLogGetRecoveryStart("QDSYNC", "for redo progress check", redoCheckPointLoc, redoCheckPoint); if (XLByteLT(oldRedoCheckpoint.redo,redoCheckPoint->redo)) { ereport(LOG, (errmsg("QDSYNC: transaction redo moved the restart location from %s to %s", XLogLocationToString(&oldRedoCheckpoint.redo), XLogLocationToString2(&redoCheckPoint->redo)))); } else { Assert(XLByteEQ(oldRedoCheckpoint.redo,redoCheckPoint->redo)); ereport(LOG, (errmsg("QDSYNC: transaction redo did not move the restart location %s forward this pass", XLogLocationToString(&oldRedoCheckpoint.redo)))); return; } XLByteToSeg(redoCheckPoint->redo, logid, seg); /* * Delete offline log files (those no longer needed even for previous * checkpoint). */ elog((Debug_print_qd_mirroring ? LOG : DEBUG5), "QDSYNC: keep log files as far back as (logid %d, seg %d)", logid, seg); if (logid || seg) { PrevLogSeg(logid, seg); elog((Debug_print_qd_mirroring ? LOG : DEBUG5), "QDSYNC: delete offline log files up to (logid %d, seg %d)", logid, seg); XLogRemoveStandbyLogs(logid, seg, &nsegsremoved); if (nsegsremoved > 0) { // Throw in extra new line to make log more readable. ereport(LOG, (errmsg("QDSYNC: %d logs removed through logid %d, seg %d\n", nsegsremoved, logid, seg))); } } // Throw in extra new line to make log more readable. elog(LOG,"--------------------------"); }