/* ---- * Manipulation of ondisk state of replication slots * * NB: none of the routines below should take any notice whether a slot is the * current one or not, that's all handled a layer above. * ---- */ static void CreateSlotOnDisk(ReplicationSlot *slot) { char tmppath[MAXPGPATH]; char path[MAXPGPATH]; struct stat st; /* * No need to take out the io_in_progress_lock, nobody else can see this * slot yet, so nobody else will write. We're reusing SaveSlotToPath which * takes out the lock, if we'd take the lock here, we'd deadlock. */ sprintf(path, "pg_replslot/%s", NameStr(slot->data.name)); sprintf(tmppath, "pg_replslot/%s.tmp", NameStr(slot->data.name)); /* * It's just barely possible that some previous effort to create or * drop a slot with this name left a temp directory lying around. * If that seems to be the case, try to remove it. If the rmtree() * fails, we'll error out at the mkdir() below, so we don't bother * checking success. */ if (stat(tmppath, &st) == 0 && S_ISDIR(st.st_mode)) rmtree(tmppath, true); /* Create and fsync the temporary slot directory. */ if (mkdir(tmppath, S_IRWXU) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not create directory \"%s\": %m", tmppath))); fsync_fname(tmppath, true); /* Write the actual state file. */ slot->dirty = true; /* signal that we really need to write */ SaveSlotToPath(slot, tmppath, ERROR); /* Rename the directory into place. */ if (rename(tmppath, path) != 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", tmppath, path))); /* * If we'd now fail - really unlikely - we wouldn't know whether this slot * would persist after an OS crash or not - so, force a restart. The * restart would try to fysnc this again till it works. */ START_CRIT_SECTION(); fsync_fname(path, true); fsync_fname("pg_replslot", true); END_CRIT_SECTION(); }
/* * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability * * Wrapper around rename, similar to the backend version. */ int durable_rename(const char *oldfile, const char *newfile, const char *progname) { int fd; /* * First fsync the old and target path (if it exists), to ensure that they * are properly persistent on disk. Syncing the target file is not * strictly necessary, but it makes it easier to reason about crashes; * because it's then guaranteed that either source or target file exists * after a crash. */ if (fsync_fname(oldfile, false, progname) != 0) return -1; fd = open(newfile, PG_BINARY | O_RDWR, 0); if (fd < 0) { if (errno != ENOENT) { fprintf(stderr, _("%s: could not open file \"%s\": %s\n"), progname, newfile, strerror(errno)); return -1; } } else { if (fsync(fd) != 0) { fprintf(stderr, _("%s: could not fsync file \"%s\": %s\n"), progname, newfile, strerror(errno)); close(fd); return -1; } close(fd); } /* Time to do the real deal... */ if (rename(oldfile, newfile) != 0) { fprintf(stderr, _("%s: could not rename file \"%s\" to \"%s\": %s\n"), progname, oldfile, newfile, strerror(errno)); return -1; } /* * To guarantee renaming the file is persistent, fsync the file with its * new name, and its containing directory. */ if (fsync_fname(newfile, false, progname) != 0) return -1; if (fsync_parent_path(newfile, progname) != 0) return -1; return 0; }
/* * Load all replication slots from disk into memory at server startup. This * needs to be run before we start crash recovery. */ void StartupReplicationSlots(XLogRecPtr checkPointRedo) { DIR *replication_dir; struct dirent *replication_de; ereport(DEBUG1, (errmsg("starting up replication slots"))); /* restore all slots by iterating over all on-disk entries */ replication_dir = AllocateDir("pg_replslot"); while ((replication_de = ReadDir(replication_dir, "pg_replslot")) != NULL) { struct stat statbuf; char path[MAXPGPATH]; if (strcmp(replication_de->d_name, ".") == 0 || strcmp(replication_de->d_name, "..") == 0) continue; snprintf(path, MAXPGPATH, "pg_replslot/%s", replication_de->d_name); /* we're only creating directories here, skip if it's not our's */ if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode)) continue; /* we crashed while a slot was being setup or deleted, clean up */ if (string_endswith(replication_de->d_name, ".tmp")) { if (!rmtree(path, true)) { ereport(WARNING, (errcode_for_file_access(), errmsg("could not remove directory \"%s\"", path))); continue; } fsync_fname("pg_replslot", true); continue; } /* looks like a slot in a normal state, restore */ RestoreSlotFromDisk(replication_de->d_name); } FreeDir(replication_dir); /* currently no slots exist, we're done. */ if (max_replication_slots <= 0) return; /* Now that we have recovered all the data, compute replication xmin */ ReplicationSlotsComputeRequiredXmin(); ReplicationSlotsComputeRequiredLSN(); }
/* * Perform a checkpoint --- either during shutdown, or on-the-fly */ void CheckPointCommitTs(void) { /* Flush dirty CommitTs pages to disk */ SimpleLruFlush(CommitTsCtl, true); /* * fsync pg_commit_ts to ensure that any files flushed previously are * durably on disk. */ fsync_fname("pg_commit_ts", true); }
/* * fsync_parent_path -- fsync the parent path of a file or directory * * This is aimed at making file operations persistent on disk in case of * an OS crash or power failure. */ int fsync_parent_path(const char *fname, const char *progname) { char parentpath[MAXPGPATH]; strlcpy(parentpath, fname, MAXPGPATH); get_parent_directory(parentpath); /* * get_parent_directory() returns an empty string if the input argument is * just a file name (see comments in path.c), so handle that as being the * current directory. */ if (strlen(parentpath) == 0) strlcpy(parentpath, ".", MAXPGPATH); if (fsync_fname(parentpath, true, progname) != 0) return -1; return 0; }
/* * Permanently drop the replication slot which will be released by the point * this function returns. */ static void ReplicationSlotDropPtr(ReplicationSlot *slot) { char path[MAXPGPATH]; char tmppath[MAXPGPATH]; /* * If some other backend ran this code concurrently with us, we might try * to delete a slot with a certain name while someone else was trying to * create a slot with the same name. */ LWLockAcquire(ReplicationSlotAllocationLock, LW_EXCLUSIVE); /* Generate pathnames. */ sprintf(path, "pg_replslot/%s", NameStr(slot->data.name)); sprintf(tmppath, "pg_replslot/%s.tmp", NameStr(slot->data.name)); /* * Rename the slot directory on disk, so that we'll no longer recognize * this as a valid slot. Note that if this fails, we've got to mark the * slot inactive before bailing out. If we're dropping an ephemeral or a * temporary slot, we better never fail hard as the caller won't expect * the slot to survive and this might get called during error handling. */ if (rename(path, tmppath) == 0) { /* * We need to fsync() the directory we just renamed and its parent to * make sure that our changes are on disk in a crash-safe fashion. If * fsync() fails, we can't be sure whether the changes are on disk or * not. For now, we handle that by panicking; * StartupReplicationSlots() will try to straighten it out after * restart. */ START_CRIT_SECTION(); fsync_fname(tmppath, true); fsync_fname("pg_replslot", true); END_CRIT_SECTION(); } else { bool fail_softly = slot->data.persistency != RS_PERSISTENT; SpinLockAcquire(&slot->mutex); slot->active_pid = 0; SpinLockRelease(&slot->mutex); /* wake up anyone waiting on this slot */ ConditionVariableBroadcast(&slot->active_cv); ereport(fail_softly ? WARNING : ERROR, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", path, tmppath))); } /* * The slot is definitely gone. Lock out concurrent scans of the array * long enough to kill it. It's OK to clear the active PID here without * grabbing the mutex because nobody else can be scanning the array here, * and nobody can be attached to this slot and thus access it without * scanning the array. * * Also wake up processes waiting for it. */ LWLockAcquire(ReplicationSlotControlLock, LW_EXCLUSIVE); slot->active_pid = 0; slot->in_use = false; LWLockRelease(ReplicationSlotControlLock); ConditionVariableBroadcast(&slot->active_cv); /* * Slot is dead and doesn't prevent resource removal anymore, recompute * limits. */ ReplicationSlotsComputeRequiredXmin(false); ReplicationSlotsComputeRequiredLSN(); /* * If removing the directory fails, the worst thing that will happen is * that the user won't be able to create a new slot with the same name * until the next server restart. We warn about it, but that's all. */ if (!rmtree(tmppath, true)) ereport(WARNING, (errcode_for_file_access(), errmsg("could not remove directory \"%s\"", tmppath))); /* * We release this at the very end, so that nobody starts trying to create * a slot while we're still cleaning up the detritus of the old one. */ LWLockRelease(ReplicationSlotAllocationLock); }
/* * Load a single slot from disk into memory. */ static void RestoreSlotFromDisk(const char *name) { ReplicationSlotOnDisk cp; int i; char path[MAXPGPATH + 22]; int fd; bool restored = false; int readBytes; pg_crc32c checksum; /* no need to lock here, no concurrent access allowed yet */ /* delete temp file if it exists */ sprintf(path, "pg_replslot/%s/state.tmp", name); if (unlink(path) < 0 && errno != ENOENT) ereport(PANIC, (errcode_for_file_access(), errmsg("could not remove file \"%s\": %m", path))); sprintf(path, "pg_replslot/%s/state", name); elog(DEBUG1, "restoring replication slot from \"%s\"", path); fd = OpenTransientFile(path, O_RDWR | PG_BINARY); /* * We do not need to handle this as we are rename()ing the directory into * place only after we fsync()ed the state file. */ if (fd < 0) ereport(PANIC, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", path))); /* * Sync state file before we're reading from it. We might have crashed * while it wasn't synced yet and we shouldn't continue on that basis. */ pgstat_report_wait_start(WAIT_EVENT_REPLICATION_SLOT_RESTORE_SYNC); if (pg_fsync(fd) != 0) { CloseTransientFile(fd); ereport(PANIC, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", path))); } pgstat_report_wait_end(); /* Also sync the parent directory */ START_CRIT_SECTION(); fsync_fname(path, true); END_CRIT_SECTION(); /* read part of statefile that's guaranteed to be version independent */ pgstat_report_wait_start(WAIT_EVENT_REPLICATION_SLOT_READ); readBytes = read(fd, &cp, ReplicationSlotOnDiskConstantSize); pgstat_report_wait_end(); if (readBytes != ReplicationSlotOnDiskConstantSize) { int saved_errno = errno; CloseTransientFile(fd); errno = saved_errno; ereport(PANIC, (errcode_for_file_access(), errmsg("could not read file \"%s\", read %d of %u: %m", path, readBytes, (uint32) ReplicationSlotOnDiskConstantSize))); } /* verify magic */ if (cp.magic != SLOT_MAGIC) ereport(PANIC, (errcode_for_file_access(), errmsg("replication slot file \"%s\" has wrong magic number: %u instead of %u", path, cp.magic, SLOT_MAGIC))); /* verify version */ if (cp.version != SLOT_VERSION) ereport(PANIC, (errcode_for_file_access(), errmsg("replication slot file \"%s\" has unsupported version %u", path, cp.version))); /* boundary check on length */ if (cp.length != ReplicationSlotOnDiskV2Size) ereport(PANIC, (errcode_for_file_access(), errmsg("replication slot file \"%s\" has corrupted length %u", path, cp.length))); /* Now that we know the size, read the entire file */ pgstat_report_wait_start(WAIT_EVENT_REPLICATION_SLOT_READ); readBytes = read(fd, (char *) &cp + ReplicationSlotOnDiskConstantSize, cp.length); pgstat_report_wait_end(); if (readBytes != cp.length) { int saved_errno = errno; CloseTransientFile(fd); errno = saved_errno; ereport(PANIC, (errcode_for_file_access(), errmsg("could not read file \"%s\", read %d of %u: %m", path, readBytes, cp.length))); } CloseTransientFile(fd); /* now verify the CRC */ INIT_CRC32C(checksum); COMP_CRC32C(checksum, (char *) &cp + SnapBuildOnDiskNotChecksummedSize, SnapBuildOnDiskChecksummedSize); FIN_CRC32C(checksum); if (!EQ_CRC32C(checksum, cp.checksum)) ereport(PANIC, (errmsg("checksum mismatch for replication slot file \"%s\": is %u, should be %u", path, checksum, cp.checksum))); /* * If we crashed with an ephemeral slot active, don't restore but delete * it. */ if (cp.slotdata.persistency != RS_PERSISTENT) { sprintf(path, "pg_replslot/%s", name); if (!rmtree(path, true)) { ereport(WARNING, (errcode_for_file_access(), errmsg("could not remove directory \"%s\"", path))); } fsync_fname("pg_replslot", true); return; } /* nothing can be active yet, don't lock anything */ for (i = 0; i < max_replication_slots; i++) { ReplicationSlot *slot; slot = &ReplicationSlotCtl->replication_slots[i]; if (slot->in_use) continue; /* restore the entire set of persistent data */ memcpy(&slot->data, &cp.slotdata, sizeof(ReplicationSlotPersistentData)); /* initialize in memory state */ slot->effective_xmin = cp.slotdata.xmin; slot->effective_catalog_xmin = cp.slotdata.catalog_xmin; slot->candidate_catalog_xmin = InvalidTransactionId; slot->candidate_xmin_lsn = InvalidXLogRecPtr; slot->candidate_restart_lsn = InvalidXLogRecPtr; slot->candidate_restart_valid = InvalidXLogRecPtr; slot->in_use = true; slot->active_pid = 0; restored = true; break; } if (!restored) ereport(PANIC, (errmsg("too many replication slots active before shutdown"), errhint("Increase max_replication_slots and try again."))); }
/* * Shared functionality between saving and creating a replication slot. */ static void SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel) { char tmppath[MAXPGPATH]; char path[MAXPGPATH]; int fd; ReplicationSlotOnDisk cp; bool was_dirty; /* first check whether there's something to write out */ SpinLockAcquire(&slot->mutex); was_dirty = slot->dirty; slot->just_dirtied = false; SpinLockRelease(&slot->mutex); /* and don't do anything if there's nothing to write */ if (!was_dirty) return; LWLockAcquire(&slot->io_in_progress_lock, LW_EXCLUSIVE); /* silence valgrind :( */ memset(&cp, 0, sizeof(ReplicationSlotOnDisk)); sprintf(tmppath, "%s/state.tmp", dir); sprintf(path, "%s/state", dir); fd = OpenTransientFile(tmppath, O_CREAT | O_EXCL | O_WRONLY | PG_BINARY); if (fd < 0) { ereport(elevel, (errcode_for_file_access(), errmsg("could not create file \"%s\": %m", tmppath))); return; } cp.magic = SLOT_MAGIC; INIT_CRC32C(cp.checksum); cp.version = SLOT_VERSION; cp.length = ReplicationSlotOnDiskV2Size; SpinLockAcquire(&slot->mutex); memcpy(&cp.slotdata, &slot->data, sizeof(ReplicationSlotPersistentData)); SpinLockRelease(&slot->mutex); COMP_CRC32C(cp.checksum, (char *) (&cp) + SnapBuildOnDiskNotChecksummedSize, SnapBuildOnDiskChecksummedSize); FIN_CRC32C(cp.checksum); pgstat_report_wait_start(WAIT_EVENT_REPLICATION_SLOT_WRITE); if ((write(fd, &cp, sizeof(cp))) != sizeof(cp)) { int save_errno = errno; pgstat_report_wait_end(); CloseTransientFile(fd); errno = save_errno; ereport(elevel, (errcode_for_file_access(), errmsg("could not write to file \"%s\": %m", tmppath))); return; } pgstat_report_wait_end(); /* fsync the temporary file */ pgstat_report_wait_start(WAIT_EVENT_REPLICATION_SLOT_SYNC); if (pg_fsync(fd) != 0) { int save_errno = errno; pgstat_report_wait_end(); CloseTransientFile(fd); errno = save_errno; ereport(elevel, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", tmppath))); return; } pgstat_report_wait_end(); CloseTransientFile(fd); /* rename to permanent file, fsync file and directory */ if (rename(tmppath, path) != 0) { ereport(elevel, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", tmppath, path))); return; } /* Check CreateSlot() for the reasoning of using a crit. section. */ START_CRIT_SECTION(); fsync_fname(path, false); fsync_fname(dir, true); fsync_fname("pg_replslot", true); END_CRIT_SECTION(); /* * Successfully wrote, unset dirty bit, unless somebody dirtied again * already. */ SpinLockAcquire(&slot->mutex); if (!slot->just_dirtied) slot->dirty = false; SpinLockRelease(&slot->mutex); LWLockRelease(&slot->io_in_progress_lock); }
/* * Load a single slot from disk into memory. */ static void RestoreSlotFromDisk(const char *name) { ReplicationSlotOnDisk cp; int i; char slotdir[MAXPGPATH + 12]; char path[MAXPGPATH + 22]; int fd; bool restored = false; int readBytes; pg_crc32 checksum; /* no need to lock here, no concurrent access allowed yet */ /* delete temp file if it exists */ sprintf(slotdir, "pg_replslot/%s", name); sprintf(path, "%s/state.tmp", slotdir); if (unlink(path) < 0 && errno != ENOENT) ereport(PANIC, (errcode_for_file_access(), errmsg("could not remove file \"%s\": %m", path))); sprintf(path, "%s/state", slotdir); elog(DEBUG1, "restoring replication slot from \"%s\"", path); fd = OpenTransientFile(path, O_RDWR | PG_BINARY, 0); /* * We do not need to handle this as we are rename()ing the directory into * place only after we fsync()ed the state file. */ if (fd < 0) ereport(PANIC, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", path))); /* * Sync state file before we're reading from it. We might have crashed * while it wasn't synced yet and we shouldn't continue on that basis. */ if (pg_fsync(fd) != 0) { int save_errno = errno; CloseTransientFile(fd); errno = save_errno; ereport(PANIC, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", path))); } /* Also sync the parent directory */ START_CRIT_SECTION(); fsync_fname(slotdir, true); END_CRIT_SECTION(); /* read part of statefile that's guaranteed to be version independent */ readBytes = read(fd, &cp, ReplicationSlotOnDiskConstantSize); if (readBytes != ReplicationSlotOnDiskConstantSize) { int saved_errno = errno; CloseTransientFile(fd); errno = saved_errno; ereport(PANIC, (errcode_for_file_access(), errmsg("could not read file \"%s\", read %d of %u: %m", path, readBytes, (uint32) ReplicationSlotOnDiskConstantSize))); } /* verify magic */ if (cp.magic != SLOT_MAGIC) ereport(PANIC, (errcode_for_file_access(), errmsg("replication slot file \"%s\" has wrong magic %u instead of %u", path, cp.magic, SLOT_MAGIC))); /* verify version */ if (cp.version != SLOT_VERSION) ereport(PANIC, (errcode_for_file_access(), errmsg("replication slot file \"%s\" has unsupported version %u", path, cp.version))); /* boundary check on length */ if (cp.length != ReplicationSlotOnDiskV2Size) ereport(PANIC, (errcode_for_file_access(), errmsg("replication slot file \"%s\" has corrupted length %u", path, cp.length))); /* Now that we know the size, read the entire file */ readBytes = read(fd, (char *) &cp + ReplicationSlotOnDiskConstantSize, cp.length); if (readBytes != cp.length) { int saved_errno = errno; CloseTransientFile(fd); errno = saved_errno; ereport(PANIC, (errcode_for_file_access(), errmsg("could not read file \"%s\", read %d of %u: %m", path, readBytes, cp.length))); } CloseTransientFile(fd); /* now verify the CRC */ INIT_CRC32C(checksum); COMP_CRC32C(checksum, (char *) &cp + SnapBuildOnDiskNotChecksummedSize, SnapBuildOnDiskChecksummedSize); FIN_CRC32C(checksum); if (!EQ_CRC32C(checksum, cp.checksum)) ereport(PANIC, (errmsg("replication slot file %s: checksum mismatch, is %u, should be %u", path, checksum, cp.checksum))); /* * If we crashed with an ephemeral slot active, don't restore but delete * it. */ if (cp.slotdata.persistency != RS_PERSISTENT) { if (!rmtree(slotdir, true)) { ereport(WARNING, (errcode_for_file_access(), errmsg("could not remove directory \"%s\"", slotdir))); } fsync_fname("pg_replslot", true); return; } /* * Verify that requirements for the specific slot type are met. That's * important because if these aren't met we're not guaranteed to retain * all the necessary resources for the slot. * * NB: We have to do so *after* the above checks for ephemeral slots, * because otherwise a slot that shouldn't exist anymore could prevent * restarts. * * NB: Changing the requirements here also requires adapting * CheckSlotRequirements() and CheckLogicalDecodingRequirements(). */ if (cp.slotdata.database != InvalidOid && wal_level < WAL_LEVEL_LOGICAL) ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("logical replication slot \"%s\" exists, but wal_level < logical", NameStr(cp.slotdata.name)), errhint("Change wal_level to be logical or higher."))); else if (wal_level < WAL_LEVEL_ARCHIVE) ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("physical replication slot \"%s\" exists, but wal_level < archive", NameStr(cp.slotdata.name)), errhint("Change wal_level to be archive or higher."))); /* nothing can be active yet, don't lock anything */ for (i = 0; i < max_replication_slots; i++) { ReplicationSlot *slot; slot = &ReplicationSlotCtl->replication_slots[i]; if (slot->in_use) continue; /* restore the entire set of persistent data */ memcpy(&slot->data, &cp.slotdata, sizeof(ReplicationSlotPersistentData)); /* initialize in memory state */ slot->effective_xmin = cp.slotdata.xmin; slot->effective_catalog_xmin = cp.slotdata.catalog_xmin; slot->candidate_catalog_xmin = InvalidTransactionId; slot->candidate_xmin_lsn = InvalidXLogRecPtr; slot->candidate_restart_lsn = InvalidXLogRecPtr; slot->candidate_restart_valid = InvalidXLogRecPtr; slot->in_use = true; slot->active = false; restored = true; break; } if (!restored) ereport(PANIC, (errmsg("too many replication slots active before shutdown"), errhint("Increase max_replication_slots and try again."))); }
/* * copydir: copy a directory * * If recurse is false, subdirectories are ignored. Anything that's not * a directory or a regular file is ignored. */ void copydir(char *fromdir, char *todir, bool recurse) { DIR *xldir; struct dirent *xlde; char fromfile[MAXPGPATH]; char tofile[MAXPGPATH]; if (mkdir(todir, S_IRWXU) != 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not create directory \"%s\": %m", todir))); xldir = AllocateDir(fromdir); if (xldir == NULL) ereport(ERROR, (errcode_for_file_access(), errmsg("could not open directory \"%s\": %m", fromdir))); while ((xlde = ReadDir(xldir, fromdir)) != NULL) { struct stat fst; /* If we got a cancel signal during the copy of the directory, quit */ CHECK_FOR_INTERRUPTS(); if (strcmp(xlde->d_name, ".") == 0 || strcmp(xlde->d_name, "..") == 0) continue; snprintf(fromfile, MAXPGPATH, "%s/%s", fromdir, xlde->d_name); snprintf(tofile, MAXPGPATH, "%s/%s", todir, xlde->d_name); if (lstat(fromfile, &fst) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not stat file \"%s\": %m", fromfile))); if (S_ISDIR(fst.st_mode)) { /* recurse to handle subdirectories */ if (recurse) copydir(fromfile, tofile, true); } else if (S_ISREG(fst.st_mode)) copy_file(fromfile, tofile); } FreeDir(xldir); /* * Be paranoid here and fsync all files to ensure the copy is really done. * But if fsync is disabled, we're done. */ if (!enableFsync) return; xldir = AllocateDir(todir); if (xldir == NULL) ereport(ERROR, (errcode_for_file_access(), errmsg("could not open directory \"%s\": %m", todir))); while ((xlde = ReadDir(xldir, todir)) != NULL) { struct stat fst; if (strcmp(xlde->d_name, ".") == 0 || strcmp(xlde->d_name, "..") == 0) continue; snprintf(tofile, MAXPGPATH, "%s/%s", todir, xlde->d_name); /* * We don't need to sync subdirectories here since the recursive * copydir will do it before it returns */ if (lstat(tofile, &fst) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not stat file \"%s\": %m", tofile))); if (S_ISREG(fst.st_mode)) fsync_fname(tofile, false); } FreeDir(xldir); /* * It's important to fsync the destination directory itself as individual * file fsyncs don't guarantee that the directory entry for the file is * synced. Recent versions of ext4 have made the window much wider but * it's been true for ext3 and other filesystems in the past. */ fsync_fname(todir, true); }
/* * Load a single slot from disk into memory. */ static void RestoreSlotFromDisk(const char *name) { ReplicationSlotOnDisk cp; int i; char path[MAXPGPATH]; int fd; bool restored = false; int readBytes; pg_crc32 checksum; /* no need to lock here, no concurrent access allowed yet */ /* delete temp file if it exists */ sprintf(path, "pg_replslot/%s/state.tmp", name); if (unlink(path) < 0 && errno != ENOENT) ereport(PANIC, (errcode_for_file_access(), errmsg("could not unlink file \"%s\": %m", path))); sprintf(path, "pg_replslot/%s/state", name); elog(DEBUG1, "restoring replication slot from \"%s\"", path); fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0); /* * We do not need to handle this as we are rename()ing the directory into * place only after we fsync()ed the state file. */ if (fd < 0) ereport(PANIC, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", path))); /* * Sync state file before we're reading from it. We might have crashed * while it wasn't synced yet and we shouldn't continue on that basis. */ if (pg_fsync(fd) != 0) { CloseTransientFile(fd); ereport(PANIC, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", path))); } /* Also sync the parent directory */ START_CRIT_SECTION(); fsync_fname(path, true); END_CRIT_SECTION(); /* read part of statefile that's guaranteed to be version independent */ readBytes = read(fd, &cp, ReplicationSlotOnDiskConstantSize); if (readBytes != ReplicationSlotOnDiskConstantSize) { int saved_errno = errno; CloseTransientFile(fd); errno = saved_errno; ereport(PANIC, (errcode_for_file_access(), errmsg("could not read file \"%s\", read %d of %u: %m", path, readBytes, (uint32) ReplicationSlotOnDiskConstantSize))); } /* verify magic */ if (cp.magic != SLOT_MAGIC) ereport(PANIC, (errcode_for_file_access(), errmsg("replication slot file \"%s\" has wrong magic %u instead of %u", path, cp.magic, SLOT_MAGIC))); /* verify version */ if (cp.version != SLOT_VERSION) ereport(PANIC, (errcode_for_file_access(), errmsg("replication slot file \"%s\" has unsupported version %u", path, cp.version))); /* boundary check on length */ if (cp.length != ReplicationSlotOnDiskDynamicSize) ereport(PANIC, (errcode_for_file_access(), errmsg("replication slot file \"%s\" has corrupted length %u", path, cp.length))); /* Now that we know the size, read the entire file */ readBytes = read(fd, (char *)&cp + ReplicationSlotOnDiskConstantSize, cp.length); if (readBytes != cp.length) { int saved_errno = errno; CloseTransientFile(fd); errno = saved_errno; ereport(PANIC, (errcode_for_file_access(), errmsg("could not read file \"%s\", read %d of %u: %m", path, readBytes, cp.length))); } CloseTransientFile(fd); /* now verify the CRC32 */ INIT_CRC32(checksum); COMP_CRC32(checksum, (char *)&cp + ReplicationSlotOnDiskConstantSize, ReplicationSlotOnDiskDynamicSize); if (!EQ_CRC32(checksum, cp.checksum)) ereport(PANIC, (errmsg("replication slot file %s: checksum mismatch, is %u, should be %u", path, checksum, cp.checksum))); /* nothing can be active yet, don't lock anything */ for (i = 0; i < max_replication_slots; i++) { ReplicationSlot *slot; slot = &ReplicationSlotCtl->replication_slots[i]; if (slot->in_use) continue; /* restore the entire set of persistent data */ memcpy(&slot->data, &cp.slotdata, sizeof(ReplicationSlotPersistentData)); /* initialize in memory state */ slot->effective_xmin = cp.slotdata.xmin; slot->in_use = true; slot->active = false; restored = true; break; } if (!restored) ereport(PANIC, (errmsg("too many replication slots active before shutdown"), errhint("Increase max_replication_slots and try again."))); }
/* * Permanently drop replication slot identified by the passed in name. */ void ReplicationSlotDrop(const char *name) { ReplicationSlot *slot = NULL; int i; bool active; char path[MAXPGPATH]; char tmppath[MAXPGPATH]; ReplicationSlotValidateName(name, ERROR); /* * If some other backend ran this code currently with us, we might both * try to free the same slot at the same time. Or we might try to delete * a slot with a certain name while someone else was trying to create a * slot with the same name. */ LWLockAcquire(ReplicationSlotAllocationLock, LW_EXCLUSIVE); /* Search for the named slot and mark it active if we find it. */ LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); for (i = 0; i < max_replication_slots; i++) { ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; if (s->in_use && strcmp(name, NameStr(s->data.name)) == 0) { volatile ReplicationSlot *vslot = s; SpinLockAcquire(&s->mutex); active = vslot->active; vslot->active = true; SpinLockRelease(&s->mutex); slot = s; break; } } LWLockRelease(ReplicationSlotControlLock); /* If we did not find the slot or it was already active, error out. */ if (slot == NULL) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("replication slot \"%s\" does not exist", name))); if (active) ereport(ERROR, (errcode(ERRCODE_OBJECT_IN_USE), errmsg("replication slot \"%s\" is already active", name))); /* Generate pathnames. */ sprintf(path, "pg_replslot/%s", NameStr(slot->data.name)); sprintf(tmppath, "pg_replslot/%s.tmp", NameStr(slot->data.name)); /* * Rename the slot directory on disk, so that we'll no longer recognize * this as a valid slot. Note that if this fails, we've got to mark the * slot inactive again before bailing out. */ if (rename(path, tmppath) != 0) { volatile ReplicationSlot *vslot = slot; SpinLockAcquire(&slot->mutex); vslot->active = false; SpinLockRelease(&slot->mutex); ereport(ERROR, (errcode_for_file_access(), errmsg("could not rename \"%s\" to \"%s\": %m", path, tmppath))); } /* * We need to fsync() the directory we just renamed and its parent to make * sure that our changes are on disk in a crash-safe fashion. If fsync() * fails, we can't be sure whether the changes are on disk or not. For * now, we handle that by panicking; StartupReplicationSlots() will * try to straighten it out after restart. */ START_CRIT_SECTION(); fsync_fname(tmppath, true); fsync_fname("pg_replslot", true); END_CRIT_SECTION(); /* * The slot is definitely gone. Lock out concurrent scans of the array * long enough to kill it. It's OK to clear the active flag here without * grabbing the mutex because nobody else can be scanning the array here, * and nobody can be attached to this slot and thus access it without * scanning the array. */ LWLockAcquire(ReplicationSlotControlLock, LW_EXCLUSIVE); slot->active = false; slot->in_use = false; LWLockRelease(ReplicationSlotControlLock); /* * Slot is dead and doesn't prevent resource removal anymore, recompute * limits. */ ReplicationSlotsComputeRequiredXmin(); ReplicationSlotsComputeRequiredLSN(); /* * If removing the directory fails, the worst thing that will happen is * that the user won't be able to create a new slot with the same name * until the next server restart. We warn about it, but that's all. */ if (!rmtree(tmppath, true)) ereport(WARNING, (errcode_for_file_access(), errmsg("could not remove directory \"%s\"", tmppath))); /* * We release this at the very end, so that nobody starts trying to create * a slot while we're still cleaning up the detritus of the old one. */ LWLockRelease(ReplicationSlotAllocationLock); }
/* * Process one per-dbspace directory for ResetUnloggedRelations */ static void ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) { DIR *dbspace_dir; struct dirent *de; char rm_path[MAXPGPATH * 2]; /* Caller must specify at least one operation. */ Assert((op & (UNLOGGED_RELATION_CLEANUP | UNLOGGED_RELATION_INIT)) != 0); /* * Cleanup is a two-pass operation. First, we go through and identify all * the files with init forks. Then, we go through again and nuke * everything with the same OID except the init fork. */ if ((op & UNLOGGED_RELATION_CLEANUP) != 0) { HTAB *hash; HASHCTL ctl; /* * It's possible that someone could create a ton of unlogged relations * in the same database & tablespace, so we'd better use a hash table * rather than an array or linked list to keep track of which files * need to be reset. Otherwise, this cleanup operation would be * O(n^2). */ memset(&ctl, 0, sizeof(ctl)); ctl.keysize = sizeof(unlogged_relation_entry); ctl.entrysize = sizeof(unlogged_relation_entry); hash = hash_create("unlogged hash", 32, &ctl, HASH_ELEM); /* Scan the directory. */ dbspace_dir = AllocateDir(dbspacedirname); while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) { ForkNumber forkNum; int oidchars; unlogged_relation_entry ent; /* Skip anything that doesn't look like a relation data file. */ if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, &forkNum)) continue; /* Also skip it unless this is the init fork. */ if (forkNum != INIT_FORKNUM) continue; /* * Put the OID portion of the name into the hash table, if it * isn't already. */ memset(ent.oid, 0, sizeof(ent.oid)); memcpy(ent.oid, de->d_name, oidchars); hash_search(hash, &ent, HASH_ENTER, NULL); } /* Done with the first pass. */ FreeDir(dbspace_dir); /* * If we didn't find any init forks, there's no point in continuing; * we can bail out now. */ if (hash_get_num_entries(hash) == 0) { hash_destroy(hash); return; } /* * Now, make a second pass and remove anything that matches. */ dbspace_dir = AllocateDir(dbspacedirname); while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) { ForkNumber forkNum; int oidchars; bool found; unlogged_relation_entry ent; /* Skip anything that doesn't look like a relation data file. */ if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, &forkNum)) continue; /* We never remove the init fork. */ if (forkNum == INIT_FORKNUM) continue; /* * See whether the OID portion of the name shows up in the hash * table. */ memset(ent.oid, 0, sizeof(ent.oid)); memcpy(ent.oid, de->d_name, oidchars); hash_search(hash, &ent, HASH_FIND, &found); /* If so, nuke it! */ if (found) { snprintf(rm_path, sizeof(rm_path), "%s/%s", dbspacedirname, de->d_name); if (unlink(rm_path) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not remove file \"%s\": %m", rm_path))); else elog(DEBUG2, "unlinked file \"%s\"", rm_path); } } /* Cleanup is complete. */ FreeDir(dbspace_dir); hash_destroy(hash); } /* * Initialization happens after cleanup is complete: we copy each init * fork file to the corresponding main fork file. Note that if we are * asked to do both cleanup and init, we may never get here: if the * cleanup code determines that there are no init forks in this dbspace, * it will return before we get to this point. */ if ((op & UNLOGGED_RELATION_INIT) != 0) { /* Scan the directory. */ dbspace_dir = AllocateDir(dbspacedirname); while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) { ForkNumber forkNum; int oidchars; char oidbuf[OIDCHARS + 1]; char srcpath[MAXPGPATH * 2]; char dstpath[MAXPGPATH]; /* Skip anything that doesn't look like a relation data file. */ if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, &forkNum)) continue; /* Also skip it unless this is the init fork. */ if (forkNum != INIT_FORKNUM) continue; /* Construct source pathname. */ snprintf(srcpath, sizeof(srcpath), "%s/%s", dbspacedirname, de->d_name); /* Construct destination pathname. */ memcpy(oidbuf, de->d_name, oidchars); oidbuf[oidchars] = '\0'; snprintf(dstpath, sizeof(dstpath), "%s/%s%s", dbspacedirname, oidbuf, de->d_name + oidchars + 1 + strlen(forkNames[INIT_FORKNUM])); /* OK, we're ready to perform the actual copy. */ elog(DEBUG2, "copying %s to %s", srcpath, dstpath); copy_file(srcpath, dstpath); } FreeDir(dbspace_dir); /* * copy_file() above has already called pg_flush_data() on the files * it created. Now we need to fsync those files, because a checkpoint * won't do it for us while we're in recovery. We do this in a * separate pass to allow the kernel to perform all the flushes * (especially the metadata ones) at once. */ dbspace_dir = AllocateDir(dbspacedirname); while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) { ForkNumber forkNum; int oidchars; char oidbuf[OIDCHARS + 1]; char mainpath[MAXPGPATH]; /* Skip anything that doesn't look like a relation data file. */ if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, &forkNum)) continue; /* Also skip it unless this is the init fork. */ if (forkNum != INIT_FORKNUM) continue; /* Construct main fork pathname. */ memcpy(oidbuf, de->d_name, oidchars); oidbuf[oidchars] = '\0'; snprintf(mainpath, sizeof(mainpath), "%s/%s%s", dbspacedirname, oidbuf, de->d_name + oidchars + 1 + strlen(forkNames[INIT_FORKNUM])); fsync_fname(mainpath, false); } FreeDir(dbspace_dir); /* * Lastly, fsync the database directory itself, ensuring the * filesystem remembers the file creations and deletions we've done. * We don't bother with this during a call that does only * UNLOGGED_RELATION_CLEANUP, because if recovery crashes before we * get to doing UNLOGGED_RELATION_INIT, we'll redo the cleanup step * too at the next startup attempt. */ fsync_fname(dbspacedirname, true); } }