/* * fsync a file * * Try to fsync directories but ignore errors that indicate the OS * just doesn't allow/require fsyncing directories. */ static void fsync_fname(char *fname, bool isdir) { int fd; int returncode; /* * Some OSs require directories to be opened read-only whereas other * systems don't allow us to fsync files opened read-only; so we need both * cases here */ if (!isdir) fd = OpenTransientFile(fname, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR); else fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY, S_IRUSR | S_IWUSR); /* * Some OSs don't allow us to open directories at all (Windows returns * EACCES) */ if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES)) return; else if (fd < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", fname))); returncode = pg_fsync(fd); /* Some OSs don't allow us to fsync directories at all */ if (returncode != 0 && isdir && errno == EBADF) { CloseTransientFile(fd); return; } if (returncode != 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", fname))); CloseTransientFile(fd); }
static Oid lo_import_internal(text *filename, Oid lobjOid) { int fd; int nbytes, tmp PG_USED_FOR_ASSERTS_ONLY; char buf[BUFSIZE]; char fnamebuf[MAXPGPATH]; LargeObjectDesc *lobj; Oid oid; #ifndef ALLOW_DANGEROUS_LO_FUNCTIONS if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser to use server-side lo_import()"), errhint("Anyone can use the client-side lo_import() provided by libpq."))); #endif CreateFSContext(); /* * open the file to be read in */ text_to_cstring_buffer(filename, fnamebuf, sizeof(fnamebuf)); fd = OpenTransientFile(fnamebuf, O_RDONLY | PG_BINARY, S_IRWXU); if (fd < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not open server file \"%s\": %m", fnamebuf))); /* * create an inversion object */ oid = inv_create(lobjOid); /* * read in from the filesystem and write to the inversion object */ lobj = inv_open(oid, INV_WRITE, fscxt); while ((nbytes = read(fd, buf, BUFSIZE)) > 0) { tmp = inv_write(lobj, buf, nbytes); Assert(tmp == nbytes); } if (nbytes < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not read server file \"%s\": %m", fnamebuf))); inv_close(lobj); CloseTransientFile(fd); return oid; }
/* * Flush dirty pages to disk during checkpoint or database shutdown */ void SimpleLruFlush(SlruCtl ctl, bool allow_redirtied) { SlruShared shared = ctl->shared; SlruFlushData fdata; int slotno; int pageno = 0; int i; bool ok; /* * Find and write dirty pages */ fdata.num_files = 0; LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); for (slotno = 0; slotno < shared->num_slots; slotno++) { SlruInternalWritePage(ctl, slotno, &fdata); /* * In some places (e.g. checkpoints), we cannot assert that the slot * is clean now, since another process might have re-dirtied it * already. That's okay. */ Assert(allow_redirtied || shared->page_status[slotno] == SLRU_PAGE_EMPTY || (shared->page_status[slotno] == SLRU_PAGE_VALID && !shared->page_dirty[slotno])); } LWLockRelease(shared->ControlLock); /* * Now fsync and close any files that were open */ ok = true; for (i = 0; i < fdata.num_files; i++) { if (ctl->do_fsync && pg_fsync(fdata.fd[i])) { slru_errcause = SLRU_FSYNC_FAILED; slru_errno = errno; pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT; ok = false; } if (CloseTransientFile(fdata.fd[i])) { slru_errcause = SLRU_CLOSE_FAILED; slru_errno = errno; pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT; ok = false; } } if (!ok) SlruReportIOError(ctl, pageno, InvalidTransactionId); }
static Oid lo_import_internal(text *filename, Oid lobjOid) { int fd; int nbytes, tmp PG_USED_FOR_ASSERTS_ONLY; char buf[BUFSIZE]; char fnamebuf[MAXPGPATH]; LargeObjectDesc *lobj; Oid oid; CreateFSContext(); /* * open the file to be read in */ text_to_cstring_buffer(filename, fnamebuf, sizeof(fnamebuf)); fd = OpenTransientFile(fnamebuf, O_RDONLY | PG_BINARY); if (fd < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not open server file \"%s\": %m", fnamebuf))); /* * create an inversion object */ oid = inv_create(lobjOid); /* * read in from the filesystem and write to the inversion object */ lobj = inv_open(oid, INV_WRITE, fscxt); while ((nbytes = read(fd, buf, BUFSIZE)) > 0) { tmp = inv_write(lobj, buf, nbytes); Assert(tmp == nbytes); } if (nbytes < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not read server file \"%s\": %m", fnamebuf))); inv_close(lobj); CloseTransientFile(fd); return oid; }
/* * Return whether the given page exists on disk. * * A false return means that either the file does not exist, or that it's not * large enough to contain the given page. */ bool SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int pageno) { int segno = pageno / SLRU_PAGES_PER_SEGMENT; int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; int offset = rpageno * BLCKSZ; char path[MAXPGPATH]; int fd; bool result; off_t endpos; SlruFileName(ctl, path, segno); fd = OpenTransientFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR); if (fd < 0) { /* expected: file doesn't exist */ if (errno == ENOENT) return false; /* report error normally */ slru_errcause = SLRU_OPEN_FAILED; slru_errno = errno; SlruReportIOError(ctl, pageno, 0); } if ((endpos = lseek(fd, 0, SEEK_END)) < 0) { slru_errcause = SLRU_OPEN_FAILED; slru_errno = errno; SlruReportIOError(ctl, pageno, 0); } result = endpos >= (off_t) (offset + BLCKSZ); CloseTransientFile(fd); return result; }
/* * Load a single slot from disk into memory. */ static void RestoreSlotFromDisk(const char *name) { ReplicationSlotOnDisk cp; int i; char path[MAXPGPATH + 22]; int fd; bool restored = false; int readBytes; pg_crc32c checksum; /* no need to lock here, no concurrent access allowed yet */ /* delete temp file if it exists */ sprintf(path, "pg_replslot/%s/state.tmp", name); if (unlink(path) < 0 && errno != ENOENT) ereport(PANIC, (errcode_for_file_access(), errmsg("could not remove file \"%s\": %m", path))); sprintf(path, "pg_replslot/%s/state", name); elog(DEBUG1, "restoring replication slot from \"%s\"", path); fd = OpenTransientFile(path, O_RDWR | PG_BINARY); /* * We do not need to handle this as we are rename()ing the directory into * place only after we fsync()ed the state file. */ if (fd < 0) ereport(PANIC, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", path))); /* * Sync state file before we're reading from it. We might have crashed * while it wasn't synced yet and we shouldn't continue on that basis. */ pgstat_report_wait_start(WAIT_EVENT_REPLICATION_SLOT_RESTORE_SYNC); if (pg_fsync(fd) != 0) { CloseTransientFile(fd); ereport(PANIC, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", path))); } pgstat_report_wait_end(); /* Also sync the parent directory */ START_CRIT_SECTION(); fsync_fname(path, true); END_CRIT_SECTION(); /* read part of statefile that's guaranteed to be version independent */ pgstat_report_wait_start(WAIT_EVENT_REPLICATION_SLOT_READ); readBytes = read(fd, &cp, ReplicationSlotOnDiskConstantSize); pgstat_report_wait_end(); if (readBytes != ReplicationSlotOnDiskConstantSize) { int saved_errno = errno; CloseTransientFile(fd); errno = saved_errno; ereport(PANIC, (errcode_for_file_access(), errmsg("could not read file \"%s\", read %d of %u: %m", path, readBytes, (uint32) ReplicationSlotOnDiskConstantSize))); } /* verify magic */ if (cp.magic != SLOT_MAGIC) ereport(PANIC, (errcode_for_file_access(), errmsg("replication slot file \"%s\" has wrong magic number: %u instead of %u", path, cp.magic, SLOT_MAGIC))); /* verify version */ if (cp.version != SLOT_VERSION) ereport(PANIC, (errcode_for_file_access(), errmsg("replication slot file \"%s\" has unsupported version %u", path, cp.version))); /* boundary check on length */ if (cp.length != ReplicationSlotOnDiskV2Size) ereport(PANIC, (errcode_for_file_access(), errmsg("replication slot file \"%s\" has corrupted length %u", path, cp.length))); /* Now that we know the size, read the entire file */ pgstat_report_wait_start(WAIT_EVENT_REPLICATION_SLOT_READ); readBytes = read(fd, (char *) &cp + ReplicationSlotOnDiskConstantSize, cp.length); pgstat_report_wait_end(); if (readBytes != cp.length) { int saved_errno = errno; CloseTransientFile(fd); errno = saved_errno; ereport(PANIC, (errcode_for_file_access(), errmsg("could not read file \"%s\", read %d of %u: %m", path, readBytes, cp.length))); } CloseTransientFile(fd); /* now verify the CRC */ INIT_CRC32C(checksum); COMP_CRC32C(checksum, (char *) &cp + SnapBuildOnDiskNotChecksummedSize, SnapBuildOnDiskChecksummedSize); FIN_CRC32C(checksum); if (!EQ_CRC32C(checksum, cp.checksum)) ereport(PANIC, (errmsg("checksum mismatch for replication slot file \"%s\": is %u, should be %u", path, checksum, cp.checksum))); /* * If we crashed with an ephemeral slot active, don't restore but delete * it. */ if (cp.slotdata.persistency != RS_PERSISTENT) { sprintf(path, "pg_replslot/%s", name); if (!rmtree(path, true)) { ereport(WARNING, (errcode_for_file_access(), errmsg("could not remove directory \"%s\"", path))); } fsync_fname("pg_replslot", true); return; } /* nothing can be active yet, don't lock anything */ for (i = 0; i < max_replication_slots; i++) { ReplicationSlot *slot; slot = &ReplicationSlotCtl->replication_slots[i]; if (slot->in_use) continue; /* restore the entire set of persistent data */ memcpy(&slot->data, &cp.slotdata, sizeof(ReplicationSlotPersistentData)); /* initialize in memory state */ slot->effective_xmin = cp.slotdata.xmin; slot->effective_catalog_xmin = cp.slotdata.catalog_xmin; slot->candidate_catalog_xmin = InvalidTransactionId; slot->candidate_xmin_lsn = InvalidXLogRecPtr; slot->candidate_restart_lsn = InvalidXLogRecPtr; slot->candidate_restart_valid = InvalidXLogRecPtr; slot->in_use = true; slot->active_pid = 0; restored = true; break; } if (!restored) ereport(PANIC, (errmsg("too many replication slots active before shutdown"), errhint("Increase max_replication_slots and try again."))); }
/* * Shared functionality between saving and creating a replication slot. */ static void SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel) { char tmppath[MAXPGPATH]; char path[MAXPGPATH]; int fd; ReplicationSlotOnDisk cp; bool was_dirty; /* first check whether there's something to write out */ SpinLockAcquire(&slot->mutex); was_dirty = slot->dirty; slot->just_dirtied = false; SpinLockRelease(&slot->mutex); /* and don't do anything if there's nothing to write */ if (!was_dirty) return; LWLockAcquire(&slot->io_in_progress_lock, LW_EXCLUSIVE); /* silence valgrind :( */ memset(&cp, 0, sizeof(ReplicationSlotOnDisk)); sprintf(tmppath, "%s/state.tmp", dir); sprintf(path, "%s/state", dir); fd = OpenTransientFile(tmppath, O_CREAT | O_EXCL | O_WRONLY | PG_BINARY); if (fd < 0) { ereport(elevel, (errcode_for_file_access(), errmsg("could not create file \"%s\": %m", tmppath))); return; } cp.magic = SLOT_MAGIC; INIT_CRC32C(cp.checksum); cp.version = SLOT_VERSION; cp.length = ReplicationSlotOnDiskV2Size; SpinLockAcquire(&slot->mutex); memcpy(&cp.slotdata, &slot->data, sizeof(ReplicationSlotPersistentData)); SpinLockRelease(&slot->mutex); COMP_CRC32C(cp.checksum, (char *) (&cp) + SnapBuildOnDiskNotChecksummedSize, SnapBuildOnDiskChecksummedSize); FIN_CRC32C(cp.checksum); pgstat_report_wait_start(WAIT_EVENT_REPLICATION_SLOT_WRITE); if ((write(fd, &cp, sizeof(cp))) != sizeof(cp)) { int save_errno = errno; pgstat_report_wait_end(); CloseTransientFile(fd); errno = save_errno; ereport(elevel, (errcode_for_file_access(), errmsg("could not write to file \"%s\": %m", tmppath))); return; } pgstat_report_wait_end(); /* fsync the temporary file */ pgstat_report_wait_start(WAIT_EVENT_REPLICATION_SLOT_SYNC); if (pg_fsync(fd) != 0) { int save_errno = errno; pgstat_report_wait_end(); CloseTransientFile(fd); errno = save_errno; ereport(elevel, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", tmppath))); return; } pgstat_report_wait_end(); CloseTransientFile(fd); /* rename to permanent file, fsync file and directory */ if (rename(tmppath, path) != 0) { ereport(elevel, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", tmppath, path))); return; } /* Check CreateSlot() for the reasoning of using a crit. section. */ START_CRIT_SECTION(); fsync_fname(path, false); fsync_fname(dir, true); fsync_fname("pg_replslot", true); END_CRIT_SECTION(); /* * Successfully wrote, unset dirty bit, unless somebody dirtied again * already. */ SpinLockAcquire(&slot->mutex); if (!slot->just_dirtied) slot->dirty = false; SpinLockRelease(&slot->mutex); LWLockRelease(&slot->io_in_progress_lock); }
/* * Physical write of a page from a buffer slot * * On failure, we cannot just ereport(ERROR) since caller has put state in * shared memory that must be undone. So, we return FALSE and save enough * info in static variables to let SlruReportIOError make the report. * * For now, assume it's not worth keeping a file pointer open across * independent read/write operations. We do batch operations during * SimpleLruFlush, though. * * fdata is NULL for a standalone write, pointer to open-file info during * SimpleLruFlush. */ static bool SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata) { SlruShared shared = ctl->shared; int segno = pageno / SLRU_PAGES_PER_SEGMENT; int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; int offset = rpageno * BLCKSZ; char path[MAXPGPATH]; int fd = -1; /* * Honor the write-WAL-before-data rule, if appropriate, so that we do not * write out data before associated WAL records. This is the same action * performed during FlushBuffer() in the main buffer manager. */ if (shared->group_lsn != NULL) { /* * We must determine the largest async-commit LSN for the page. This * is a bit tedious, but since this entire function is a slow path * anyway, it seems better to do this here than to maintain a per-page * LSN variable (which'd need an extra comparison in the * transaction-commit path). */ XLogRecPtr max_lsn; int lsnindex, lsnoff; lsnindex = slotno * shared->lsn_groups_per_page; max_lsn = shared->group_lsn[lsnindex++]; for (lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++) { XLogRecPtr this_lsn = shared->group_lsn[lsnindex++]; if (max_lsn < this_lsn) max_lsn = this_lsn; } if (!XLogRecPtrIsInvalid(max_lsn)) { /* * As noted above, elog(ERROR) is not acceptable here, so if * XLogFlush were to fail, we must PANIC. This isn't much of a * restriction because XLogFlush is just about all critical * section anyway, but let's make sure. */ START_CRIT_SECTION(); XLogFlush(max_lsn); END_CRIT_SECTION(); } } /* * During a Flush, we may already have the desired file open. */ if (fdata) { int i; for (i = 0; i < fdata->num_files; i++) { if (fdata->segno[i] == segno) { fd = fdata->fd[i]; break; } } } if (fd < 0) { /* * If the file doesn't already exist, we should create it. It is * possible for this to need to happen when writing a page that's not * first in its segment; we assume the OS can cope with that. (Note: * it might seem that it'd be okay to create files only when * SimpleLruZeroPage is called for the first page of a segment. * However, if after a crash and restart the REDO logic elects to * replay the log from a checkpoint before the latest one, then it's * possible that we will get commands to set transaction status of * transactions that have already been truncated from the commit log. * Easiest way to deal with that is to accept references to * nonexistent files here and in SlruPhysicalReadPage.) * * Note: it is possible for more than one backend to be executing this * code simultaneously for different pages of the same file. Hence, * don't use O_EXCL or O_TRUNC or anything like that. */ SlruFileName(ctl, path, segno); fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR); if (fd < 0) { slru_errcause = SLRU_OPEN_FAILED; slru_errno = errno; return false; } if (fdata) { if (fdata->num_files < MAX_FLUSH_BUFFERS) { fdata->fd[fdata->num_files] = fd; fdata->segno[fdata->num_files] = segno; fdata->num_files++; } else { /* * In the unlikely event that we exceed MAX_FLUSH_BUFFERS, * fall back to treating it as a standalone write. */ fdata = NULL; } } } if (lseek(fd, (off_t) offset, SEEK_SET) < 0) { slru_errcause = SLRU_SEEK_FAILED; slru_errno = errno; if (!fdata) CloseTransientFile(fd); return false; } errno = 0; pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE); if (write(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ) { pgstat_report_wait_end(); /* if write didn't set errno, assume problem is no disk space */ if (errno == 0) errno = ENOSPC; slru_errcause = SLRU_WRITE_FAILED; slru_errno = errno; if (!fdata) CloseTransientFile(fd); return false; } pgstat_report_wait_end(); /* * If not part of Flush, need to fsync now. We assume this happens * infrequently enough that it's not a performance issue. */ if (!fdata) { pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC); if (ctl->do_fsync && pg_fsync(fd)) { pgstat_report_wait_end(); slru_errcause = SLRU_FSYNC_FAILED; slru_errno = errno; CloseTransientFile(fd); return false; } pgstat_report_wait_end(); if (CloseTransientFile(fd)) { slru_errcause = SLRU_CLOSE_FAILED; slru_errno = errno; return false; } } return true; }
/* * Physical read of a (previously existing) page into a buffer slot * * On failure, we cannot just ereport(ERROR) since caller has put state in * shared memory that must be undone. So, we return FALSE and save enough * info in static variables to let SlruReportIOError make the report. * * For now, assume it's not worth keeping a file pointer open across * read/write operations. We could cache one virtual file pointer ... */ static bool SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno) { SlruShared shared = ctl->shared; int segno = pageno / SLRU_PAGES_PER_SEGMENT; int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; int offset = rpageno * BLCKSZ; char path[MAXPGPATH]; int fd; SlruFileName(ctl, path, segno); /* * In a crash-and-restart situation, it's possible for us to receive * commands to set the commit status of transactions whose bits are in * already-truncated segments of the commit log (see notes in * SlruPhysicalWritePage). Hence, if we are InRecovery, allow the case * where the file doesn't exist, and return zeroes instead. */ fd = OpenTransientFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR); if (fd < 0) { if (errno != ENOENT || !InRecovery) { slru_errcause = SLRU_OPEN_FAILED; slru_errno = errno; return false; } ereport(LOG, (errmsg("file \"%s\" doesn't exist, reading as zeroes", path))); MemSet(shared->page_buffer[slotno], 0, BLCKSZ); return true; } if (lseek(fd, (off_t) offset, SEEK_SET) < 0) { slru_errcause = SLRU_SEEK_FAILED; slru_errno = errno; CloseTransientFile(fd); return false; } errno = 0; pgstat_report_wait_start(WAIT_EVENT_SLRU_READ); if (read(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ) { pgstat_report_wait_end(); slru_errcause = SLRU_READ_FAILED; slru_errno = errno; CloseTransientFile(fd); return false; } pgstat_report_wait_end(); if (CloseTransientFile(fd)) { slru_errcause = SLRU_CLOSE_FAILED; slru_errno = errno; return false; } return true; }
/* * Write a page from a shared buffer, if necessary. * Does nothing if the specified slot is not dirty. * * NOTE: only one write attempt is made here. Hence, it is possible that * the page is still dirty at exit (if someone else re-dirtied it during * the write). However, we *do* attempt a fresh write even if the page * is already being written; this is for checkpoints. * * Control lock must be held at entry, and will be held at exit. */ static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruFlush fdata) { SlruShared shared = ctl->shared; int pageno = shared->page_number[slotno]; bool ok; /* If a write is in progress, wait for it to finish */ while (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS && shared->page_number[slotno] == pageno) { SimpleLruWaitIO(ctl, slotno); } /* * Do nothing if page is not dirty, or if buffer no longer contains the * same page we were called for. */ if (!shared->page_dirty[slotno] || shared->page_status[slotno] != SLRU_PAGE_VALID || shared->page_number[slotno] != pageno) return; /* * Mark the slot write-busy, and clear the dirtybit. After this point, a * transaction status update on this page will mark it dirty again. */ shared->page_status[slotno] = SLRU_PAGE_WRITE_IN_PROGRESS; shared->page_dirty[slotno] = false; /* Acquire per-buffer lock (cannot deadlock, see notes at top) */ LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE); /* Release control lock while doing I/O */ LWLockRelease(shared->ControlLock); /* Do the write */ ok = SlruPhysicalWritePage(ctl, pageno, slotno, fdata); /* If we failed, and we're in a flush, better close the files */ if (!ok && fdata) { int i; for (i = 0; i < fdata->num_files; i++) CloseTransientFile(fdata->fd[i]); } /* Re-acquire control lock and update page state */ LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); Assert(shared->page_number[slotno] == pageno && shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS); /* If we failed to write, mark the page dirty again */ if (!ok) shared->page_dirty[slotno] = true; shared->page_status[slotno] = SLRU_PAGE_VALID; LWLockRelease(&shared->buffer_locks[slotno].lock); /* Now it's okay to ereport if we failed */ if (!ok) SlruReportIOError(ctl, pageno, InvalidTransactionId); }
/* * Load a single slot from disk into memory. */ static void RestoreSlotFromDisk(const char *name) { ReplicationSlotOnDisk cp; int i; char slotdir[MAXPGPATH + 12]; char path[MAXPGPATH + 22]; int fd; bool restored = false; int readBytes; pg_crc32 checksum; /* no need to lock here, no concurrent access allowed yet */ /* delete temp file if it exists */ sprintf(slotdir, "pg_replslot/%s", name); sprintf(path, "%s/state.tmp", slotdir); if (unlink(path) < 0 && errno != ENOENT) ereport(PANIC, (errcode_for_file_access(), errmsg("could not remove file \"%s\": %m", path))); sprintf(path, "%s/state", slotdir); elog(DEBUG1, "restoring replication slot from \"%s\"", path); fd = OpenTransientFile(path, O_RDWR | PG_BINARY, 0); /* * We do not need to handle this as we are rename()ing the directory into * place only after we fsync()ed the state file. */ if (fd < 0) ereport(PANIC, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", path))); /* * Sync state file before we're reading from it. We might have crashed * while it wasn't synced yet and we shouldn't continue on that basis. */ if (pg_fsync(fd) != 0) { int save_errno = errno; CloseTransientFile(fd); errno = save_errno; ereport(PANIC, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", path))); } /* Also sync the parent directory */ START_CRIT_SECTION(); fsync_fname(slotdir, true); END_CRIT_SECTION(); /* read part of statefile that's guaranteed to be version independent */ readBytes = read(fd, &cp, ReplicationSlotOnDiskConstantSize); if (readBytes != ReplicationSlotOnDiskConstantSize) { int saved_errno = errno; CloseTransientFile(fd); errno = saved_errno; ereport(PANIC, (errcode_for_file_access(), errmsg("could not read file \"%s\", read %d of %u: %m", path, readBytes, (uint32) ReplicationSlotOnDiskConstantSize))); } /* verify magic */ if (cp.magic != SLOT_MAGIC) ereport(PANIC, (errcode_for_file_access(), errmsg("replication slot file \"%s\" has wrong magic %u instead of %u", path, cp.magic, SLOT_MAGIC))); /* verify version */ if (cp.version != SLOT_VERSION) ereport(PANIC, (errcode_for_file_access(), errmsg("replication slot file \"%s\" has unsupported version %u", path, cp.version))); /* boundary check on length */ if (cp.length != ReplicationSlotOnDiskV2Size) ereport(PANIC, (errcode_for_file_access(), errmsg("replication slot file \"%s\" has corrupted length %u", path, cp.length))); /* Now that we know the size, read the entire file */ readBytes = read(fd, (char *) &cp + ReplicationSlotOnDiskConstantSize, cp.length); if (readBytes != cp.length) { int saved_errno = errno; CloseTransientFile(fd); errno = saved_errno; ereport(PANIC, (errcode_for_file_access(), errmsg("could not read file \"%s\", read %d of %u: %m", path, readBytes, cp.length))); } CloseTransientFile(fd); /* now verify the CRC */ INIT_CRC32C(checksum); COMP_CRC32C(checksum, (char *) &cp + SnapBuildOnDiskNotChecksummedSize, SnapBuildOnDiskChecksummedSize); FIN_CRC32C(checksum); if (!EQ_CRC32C(checksum, cp.checksum)) ereport(PANIC, (errmsg("replication slot file %s: checksum mismatch, is %u, should be %u", path, checksum, cp.checksum))); /* * If we crashed with an ephemeral slot active, don't restore but delete * it. */ if (cp.slotdata.persistency != RS_PERSISTENT) { if (!rmtree(slotdir, true)) { ereport(WARNING, (errcode_for_file_access(), errmsg("could not remove directory \"%s\"", slotdir))); } fsync_fname("pg_replslot", true); return; } /* * Verify that requirements for the specific slot type are met. That's * important because if these aren't met we're not guaranteed to retain * all the necessary resources for the slot. * * NB: We have to do so *after* the above checks for ephemeral slots, * because otherwise a slot that shouldn't exist anymore could prevent * restarts. * * NB: Changing the requirements here also requires adapting * CheckSlotRequirements() and CheckLogicalDecodingRequirements(). */ if (cp.slotdata.database != InvalidOid && wal_level < WAL_LEVEL_LOGICAL) ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("logical replication slot \"%s\" exists, but wal_level < logical", NameStr(cp.slotdata.name)), errhint("Change wal_level to be logical or higher."))); else if (wal_level < WAL_LEVEL_ARCHIVE) ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("physical replication slot \"%s\" exists, but wal_level < archive", NameStr(cp.slotdata.name)), errhint("Change wal_level to be archive or higher."))); /* nothing can be active yet, don't lock anything */ for (i = 0; i < max_replication_slots; i++) { ReplicationSlot *slot; slot = &ReplicationSlotCtl->replication_slots[i]; if (slot->in_use) continue; /* restore the entire set of persistent data */ memcpy(&slot->data, &cp.slotdata, sizeof(ReplicationSlotPersistentData)); /* initialize in memory state */ slot->effective_xmin = cp.slotdata.xmin; slot->effective_catalog_xmin = cp.slotdata.catalog_xmin; slot->candidate_catalog_xmin = InvalidTransactionId; slot->candidate_xmin_lsn = InvalidXLogRecPtr; slot->candidate_restart_lsn = InvalidXLogRecPtr; slot->candidate_restart_valid = InvalidXLogRecPtr; slot->in_use = true; slot->active = false; restored = true; break; } if (!restored) ereport(PANIC, (errmsg("too many replication slots active before shutdown"), errhint("Increase max_replication_slots and try again."))); }
/* * copy one file */ void copy_file(char *fromfile, char *tofile) { char *buffer; int srcfd; int dstfd; int nbytes; off_t offset; /* Use palloc to ensure we get a maxaligned buffer */ #define COPY_BUF_SIZE (8 * BLCKSZ) buffer = palloc(COPY_BUF_SIZE); /* * Open the files */ srcfd = OpenTransientFile(fromfile, O_RDONLY | PG_BINARY, 0); if (srcfd < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", fromfile))); dstfd = OpenTransientFile(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, S_IRUSR | S_IWUSR); if (dstfd < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not create file \"%s\": %m", tofile))); /* * Do the data copying. */ for (offset = 0;; offset += nbytes) { /* If we got a cancel signal during the copy of the file, quit */ CHECK_FOR_INTERRUPTS(); nbytes = read(srcfd, buffer, COPY_BUF_SIZE); if (nbytes < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not read file \"%s\": %m", fromfile))); if (nbytes == 0) break; errno = 0; if ((int) write(dstfd, buffer, nbytes) != nbytes) { /* if write didn't set errno, assume problem is no disk space */ if (errno == 0) errno = ENOSPC; ereport(ERROR, (errcode_for_file_access(), errmsg("could not write to file \"%s\": %m", tofile))); } /* * We fsync the files later but first flush them to avoid spamming the * cache and hopefully get the kernel to start writing them out before * the fsync comes. Ignore any error, since it's only a hint. */ (void) pg_flush_data(dstfd, offset, nbytes); } if (CloseTransientFile(dstfd)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not close file \"%s\": %m", tofile))); CloseTransientFile(srcfd); pfree(buffer); }
/* * lo_export - * exports an (inversion) large object. */ Datum lo_export(PG_FUNCTION_ARGS) { Oid lobjId = PG_GETARG_OID(0); text *filename = PG_GETARG_TEXT_PP(1); int fd; int nbytes, tmp; char buf[BUFSIZE]; char fnamebuf[MAXPGPATH]; LargeObjectDesc *lobj; mode_t oumask; #ifndef ALLOW_DANGEROUS_LO_FUNCTIONS if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser to use server-side lo_export()"), errhint("Anyone can use the client-side lo_export() provided by libpq."))); #endif CreateFSContext(); /* * open the inversion object (no need to test for failure) */ lobj = inv_open(lobjId, INV_READ, fscxt); /* * open the file to be written to * * Note: we reduce backend's normal 077 umask to the slightly friendlier * 022. This code used to drop it all the way to 0, but creating * world-writable export files doesn't seem wise. */ text_to_cstring_buffer(filename, fnamebuf, sizeof(fnamebuf)); oumask = umask(S_IWGRP | S_IWOTH); fd = OpenTransientFile(fnamebuf, O_CREAT | O_WRONLY | O_TRUNC | PG_BINARY, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); umask(oumask); if (fd < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not create server file \"%s\": %m", fnamebuf))); /* * read in from the inversion file and write to the filesystem */ while ((nbytes = inv_read(lobj, buf, BUFSIZE)) > 0) { tmp = write(fd, buf, nbytes); if (tmp != nbytes) ereport(ERROR, (errcode_for_file_access(), errmsg("could not write server file \"%s\": %m", fnamebuf))); } CloseTransientFile(fd); inv_close(lobj); PG_RETURN_INT32(1); }
/* * Load a single slot from disk into memory. */ static void RestoreSlotFromDisk(const char *name) { ReplicationSlotOnDisk cp; int i; char path[MAXPGPATH]; int fd; bool restored = false; int readBytes; pg_crc32 checksum; /* no need to lock here, no concurrent access allowed yet */ /* delete temp file if it exists */ sprintf(path, "pg_replslot/%s/state.tmp", name); if (unlink(path) < 0 && errno != ENOENT) ereport(PANIC, (errcode_for_file_access(), errmsg("could not unlink file \"%s\": %m", path))); sprintf(path, "pg_replslot/%s/state", name); elog(DEBUG1, "restoring replication slot from \"%s\"", path); fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0); /* * We do not need to handle this as we are rename()ing the directory into * place only after we fsync()ed the state file. */ if (fd < 0) ereport(PANIC, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", path))); /* * Sync state file before we're reading from it. We might have crashed * while it wasn't synced yet and we shouldn't continue on that basis. */ if (pg_fsync(fd) != 0) { CloseTransientFile(fd); ereport(PANIC, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", path))); } /* Also sync the parent directory */ START_CRIT_SECTION(); fsync_fname(path, true); END_CRIT_SECTION(); /* read part of statefile that's guaranteed to be version independent */ readBytes = read(fd, &cp, ReplicationSlotOnDiskConstantSize); if (readBytes != ReplicationSlotOnDiskConstantSize) { int saved_errno = errno; CloseTransientFile(fd); errno = saved_errno; ereport(PANIC, (errcode_for_file_access(), errmsg("could not read file \"%s\", read %d of %u: %m", path, readBytes, (uint32) ReplicationSlotOnDiskConstantSize))); } /* verify magic */ if (cp.magic != SLOT_MAGIC) ereport(PANIC, (errcode_for_file_access(), errmsg("replication slot file \"%s\" has wrong magic %u instead of %u", path, cp.magic, SLOT_MAGIC))); /* verify version */ if (cp.version != SLOT_VERSION) ereport(PANIC, (errcode_for_file_access(), errmsg("replication slot file \"%s\" has unsupported version %u", path, cp.version))); /* boundary check on length */ if (cp.length != ReplicationSlotOnDiskDynamicSize) ereport(PANIC, (errcode_for_file_access(), errmsg("replication slot file \"%s\" has corrupted length %u", path, cp.length))); /* Now that we know the size, read the entire file */ readBytes = read(fd, (char *)&cp + ReplicationSlotOnDiskConstantSize, cp.length); if (readBytes != cp.length) { int saved_errno = errno; CloseTransientFile(fd); errno = saved_errno; ereport(PANIC, (errcode_for_file_access(), errmsg("could not read file \"%s\", read %d of %u: %m", path, readBytes, cp.length))); } CloseTransientFile(fd); /* now verify the CRC32 */ INIT_CRC32(checksum); COMP_CRC32(checksum, (char *)&cp + ReplicationSlotOnDiskConstantSize, ReplicationSlotOnDiskDynamicSize); if (!EQ_CRC32(checksum, cp.checksum)) ereport(PANIC, (errmsg("replication slot file %s: checksum mismatch, is %u, should be %u", path, checksum, cp.checksum))); /* nothing can be active yet, don't lock anything */ for (i = 0; i < max_replication_slots; i++) { ReplicationSlot *slot; slot = &ReplicationSlotCtl->replication_slots[i]; if (slot->in_use) continue; /* restore the entire set of persistent data */ memcpy(&slot->data, &cp.slotdata, sizeof(ReplicationSlotPersistentData)); /* initialize in memory state */ slot->effective_xmin = cp.slotdata.xmin; slot->in_use = true; slot->active = false; restored = true; break; } if (!restored) ereport(PANIC, (errmsg("too many replication slots active before shutdown"), errhint("Increase max_replication_slots and try again."))); }
/* * lo_export - * exports an (inversion) large object. */ Datum be_lo_export(PG_FUNCTION_ARGS) { Oid lobjId = PG_GETARG_OID(0); text *filename = PG_GETARG_TEXT_PP(1); int fd; int nbytes, tmp; char buf[BUFSIZE]; char fnamebuf[MAXPGPATH]; LargeObjectDesc *lobj; mode_t oumask; CreateFSContext(); /* * open the inversion object (no need to test for failure) */ lobj = inv_open(lobjId, INV_READ, fscxt); /* * open the file to be written to * * Note: we reduce backend's normal 077 umask to the slightly friendlier * 022. This code used to drop it all the way to 0, but creating * world-writable export files doesn't seem wise. */ text_to_cstring_buffer(filename, fnamebuf, sizeof(fnamebuf)); oumask = umask(S_IWGRP | S_IWOTH); PG_TRY(); { fd = OpenTransientFilePerm(fnamebuf, O_CREAT | O_WRONLY | O_TRUNC | PG_BINARY, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); } PG_CATCH(); { umask(oumask); PG_RE_THROW(); } PG_END_TRY(); umask(oumask); if (fd < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not create server file \"%s\": %m", fnamebuf))); /* * read in from the inversion file and write to the filesystem */ while ((nbytes = inv_read(lobj, buf, BUFSIZE)) > 0) { tmp = write(fd, buf, nbytes); if (tmp != nbytes) ereport(ERROR, (errcode_for_file_access(), errmsg("could not write server file \"%s\": %m", fnamebuf))); } CloseTransientFile(fd); inv_close(lobj); PG_RETURN_INT32(1); }
/* * load_relmap_file -- load data from the shared or local map file * * Because the map file is essential for access to core system catalogs, * failure to read it is a fatal error. * * Note that the local case requires DatabasePath to be set up. */ static void load_relmap_file(bool shared) { RelMapFile *map; char mapfilename[MAXPGPATH]; pg_crc32 crc; int fd; if (shared) { snprintf(mapfilename, sizeof(mapfilename), "global/%s", RELMAPPER_FILENAME); map = &shared_map; } else { snprintf(mapfilename, sizeof(mapfilename), "%s/%s", DatabasePath, RELMAPPER_FILENAME); map = &local_map; } /* Read data ... */ fd = OpenTransientFile(mapfilename, O_RDONLY | PG_BINARY, S_IRUSR | S_IWUSR); if (fd < 0) ereport(FATAL, (errcode_for_file_access(), errmsg("could not open relation mapping file \"%s\": %m", mapfilename))); /* * Note: we could take RelationMappingLock in shared mode here, but it * seems unnecessary since our read() should be atomic against any * concurrent updater's write(). If the file is updated shortly after we * look, the sinval signaling mechanism will make us re-read it before we * are able to access any relation that's affected by the change. */ if (read(fd, map, sizeof(RelMapFile)) != sizeof(RelMapFile)) ereport(FATAL, (errcode_for_file_access(), errmsg("could not read relation mapping file \"%s\": %m", mapfilename))); CloseTransientFile(fd); /* check for correct magic number, etc */ if (map->magic != RELMAPPER_FILEMAGIC || map->num_mappings < 0 || map->num_mappings > MAX_MAPPINGS) ereport(FATAL, (errmsg("relation mapping file \"%s\" contains invalid data", mapfilename))); /* verify the CRC */ INIT_CRC32(crc); COMP_CRC32(crc, (char *) map, offsetof(RelMapFile, crc)); FIN_CRC32(crc); if (!EQ_CRC32(crc, map->crc)) ereport(FATAL, (errmsg("relation mapping file \"%s\" contains incorrect checksum", mapfilename))); }
/* * Write out a new shared or local map file with the given contents. * * The magic number and CRC are automatically updated in *newmap. On * success, we copy the data to the appropriate permanent static variable. * * If write_wal is TRUE then an appropriate WAL message is emitted. * (It will be false for bootstrap and WAL replay cases.) * * If send_sinval is TRUE then a SI invalidation message is sent. * (This should be true except in bootstrap case.) * * If preserve_files is TRUE then the storage manager is warned not to * delete the files listed in the map. * * Because this may be called during WAL replay when MyDatabaseId, * DatabasePath, etc aren't valid, we require the caller to pass in suitable * values. The caller is also responsible for being sure no concurrent * map update could be happening. */ static void write_relmap_file(bool shared, RelMapFile *newmap, bool write_wal, bool send_sinval, bool preserve_files, Oid dbid, Oid tsid, const char *dbpath) { int fd; RelMapFile *realmap; char mapfilename[MAXPGPATH]; /* * Fill in the overhead fields and update CRC. */ newmap->magic = RELMAPPER_FILEMAGIC; if (newmap->num_mappings < 0 || newmap->num_mappings > MAX_MAPPINGS) elog(ERROR, "attempt to write bogus relation mapping"); INIT_CRC32(newmap->crc); COMP_CRC32(newmap->crc, (char *) newmap, offsetof(RelMapFile, crc)); FIN_CRC32(newmap->crc); /* * Open the target file. We prefer to do this before entering the * critical section, so that an open() failure need not force PANIC. */ if (shared) { snprintf(mapfilename, sizeof(mapfilename), "global/%s", RELMAPPER_FILENAME); realmap = &shared_map; } else { snprintf(mapfilename, sizeof(mapfilename), "%s/%s", dbpath, RELMAPPER_FILENAME); realmap = &local_map; } fd = OpenTransientFile(mapfilename, O_WRONLY | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR); if (fd < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not open relation mapping file \"%s\": %m", mapfilename))); if (write_wal) { xl_relmap_update xlrec; XLogRecData rdata[2]; XLogRecPtr lsn; /* now errors are fatal ... */ START_CRIT_SECTION(); xlrec.dbid = dbid; xlrec.tsid = tsid; xlrec.nbytes = sizeof(RelMapFile); rdata[0].data = (char *) (&xlrec); rdata[0].len = MinSizeOfRelmapUpdate; rdata[0].buffer = InvalidBuffer; rdata[0].next = &(rdata[1]); rdata[1].data = (char *) newmap; rdata[1].len = sizeof(RelMapFile); rdata[1].buffer = InvalidBuffer; rdata[1].next = NULL; lsn = XLogInsert(RM_RELMAP_ID, XLOG_RELMAP_UPDATE, rdata); /* As always, WAL must hit the disk before the data update does */ XLogFlush(lsn); } errno = 0; if (write(fd, newmap, sizeof(RelMapFile)) != sizeof(RelMapFile)) { /* if write didn't set errno, assume problem is no disk space */ if (errno == 0) errno = ENOSPC; ereport(ERROR, (errcode_for_file_access(), errmsg("could not write to relation mapping file \"%s\": %m", mapfilename))); } /* * We choose to fsync the data to disk before considering the task done. * It would be possible to relax this if it turns out to be a performance * issue, but it would complicate checkpointing --- see notes for * CheckPointRelationMap. */ if (pg_fsync(fd) != 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not fsync relation mapping file \"%s\": %m", mapfilename))); if (CloseTransientFile(fd)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not close relation mapping file \"%s\": %m", mapfilename))); /* * Now that the file is safely on disk, send sinval message to let other * backends know to re-read it. We must do this inside the critical * section: if for some reason we fail to send the message, we have to * force a database-wide PANIC. Otherwise other backends might continue * execution with stale mapping information, which would be catastrophic * as soon as others began to use the now-committed data. */ if (send_sinval) CacheInvalidateRelmap(dbid); /* * Make sure that the files listed in the map are not deleted if the outer * transaction aborts. This had better be within the critical section * too: it's not likely to fail, but if it did, we'd arrive at transaction * abort with the files still vulnerable. PANICing will leave things in a * good state on-disk. * * Note: we're cheating a little bit here by assuming that mapped files * are either in pg_global or the database's default tablespace. */ if (preserve_files) { int32 i; for (i = 0; i < newmap->num_mappings; i++) { RelFileNode rnode; rnode.spcNode = tsid; rnode.dbNode = dbid; rnode.relNode = newmap->mappings[i].mapfilenode; RelationPreserveStorage(rnode, false); } } /* Success, update permanent copy */ memcpy(realmap, newmap, sizeof(RelMapFile)); /* Critical section done */ if (write_wal) END_CRIT_SECTION(); }
/* * copy one file */ void copy_file(char *fromfile, char *tofile) { char *buffer; int srcfd; int dstfd; int nbytes; off_t offset; off_t flush_offset; /* Size of copy buffer (read and write requests) */ #define COPY_BUF_SIZE (8 * BLCKSZ) /* * Size of data flush requests. It seems beneficial on most platforms to * do this every 1MB or so. But macOS, at least with early releases of * APFS, is really unfriendly to small mmap/msync requests, so there do it * only every 32MB. */ #if defined(__darwin__) #define FLUSH_DISTANCE (32 * 1024 * 1024) #else #define FLUSH_DISTANCE (1024 * 1024) #endif /* Use palloc to ensure we get a maxaligned buffer */ buffer = palloc(COPY_BUF_SIZE); /* * Open the files */ srcfd = OpenTransientFile(fromfile, O_RDONLY | PG_BINARY, 0); if (srcfd < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", fromfile))); dstfd = OpenTransientFile(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, S_IRUSR | S_IWUSR); if (dstfd < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not create file \"%s\": %m", tofile))); /* * Do the data copying. */ flush_offset = 0; for (offset = 0;; offset += nbytes) { /* If we got a cancel signal during the copy of the file, quit */ CHECK_FOR_INTERRUPTS(); /* * We fsync the files later, but during the copy, flush them every so * often to avoid spamming the cache and hopefully get the kernel to * start writing them out before the fsync comes. */ if (offset - flush_offset >= FLUSH_DISTANCE) { pg_flush_data(dstfd, flush_offset, offset - flush_offset); flush_offset = offset; } nbytes = read(srcfd, buffer, COPY_BUF_SIZE); if (nbytes < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not read file \"%s\": %m", fromfile))); if (nbytes == 0) break; errno = 0; if ((int) write(dstfd, buffer, nbytes) != nbytes) { /* if write didn't set errno, assume problem is no disk space */ if (errno == 0) errno = ENOSPC; ereport(ERROR, (errcode_for_file_access(), errmsg("could not write to file \"%s\": %m", tofile))); } } if (offset > flush_offset) pg_flush_data(dstfd, flush_offset, offset - flush_offset); if (CloseTransientFile(dstfd)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not close file \"%s\": %m", tofile))); CloseTransientFile(srcfd); pfree(buffer); }