/*
 * Flush dirty pages to disk during checkpoint or database shutdown.
 *
 * Every slot is handed to SlruInternalWritePage() while ControlLock is held
 * exclusively; the segment files opened along the way are collected in a
 * local SlruFlushData.  After the lock is released those files are fsync'd
 * (only when ctl->do_fsync is set) and closed.  If any fsync or close
 * fails, the most recently recorded failure is passed on to
 * SlruReportIOError().
 */
void
SimpleLruFlush(SlruCtl ctl, bool checkpoint)
{
	SlruShared	shared = ctl->shared;
	SlruFlushData fdata;
	int			failed_pageno = 0;
	bool		all_ok = true;
	int			slotno;
	int			filenum;

	fdata.num_files = 0;

	/* Pass 1: write out the pages under the control lock */
	LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);

	slotno = 0;
	while (slotno < shared->num_slots)
	{
		SlruInternalWritePage(ctl, slotno, &fdata);

		/*
		 * During a checkpoint the slot may legitimately be dirty again by
		 * now, because another process could have re-dirtied it.  Only in
		 * the non-checkpoint case can we insist that it is clean or empty.
		 */
		Assert(checkpoint ||
			   shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
			   (shared->page_status[slotno] == SLRU_PAGE_VALID &&
				!shared->page_dirty[slotno]));

		slotno++;
	}

	LWLockRelease(shared->ControlLock);

	/* Pass 2: fsync and close every file that pass 1 left open */
	for (filenum = 0; filenum < fdata.num_files; filenum++)
	{
		int			fd = fdata.fd[filenum];
		int			segment_start = fdata.segno[filenum] * SLRU_PAGES_PER_SEGMENT;

		if (ctl->do_fsync && pg_fsync(fd))
		{
			slru_errcause = SLRU_FSYNC_FAILED;
			slru_errno = errno;
			failed_pageno = segment_start;
			all_ok = false;
		}

		/* close even after a failed fsync so the descriptor is not leaked */
		if (close(fd))
		{
			slru_errcause = SLRU_CLOSE_FAILED;
			slru_errno = errno;
			failed_pageno = segment_start;
			all_ok = false;
		}
	}

	if (!all_ok)
		SlruReportIOError(ctl, failed_pageno, InvalidTransactionId);
}
/*
 * Recreates a state file.  This is used in WAL replay.
 *
 * xid selects the file (via TwoPhaseFilePath); content/len are the payload.
 * A CRC of the payload is computed here and appended to the file.
 *
 * Note: content and len don't include CRC.
 *
 * Any failure is reported with ereport(ERROR).  The descriptor comes from
 * BasicOpenFile, so it is closed by hand on every failure path, and errno
 * is preserved across that close() so that %m reports the original cause.
 */
void
RecreateTwoPhaseFile(TransactionId xid, void *content, int len)
{
	char		path[MAXPGPATH];
	pg_crc32	statefile_crc;
	int			fd;

	/* Recompute CRC */
	INIT_CRC32(statefile_crc);
	COMP_CRC32(statefile_crc, content, len);
	FIN_CRC32(statefile_crc);

	TwoPhaseFilePath(path, xid);

	fd = BasicOpenFile(path,
					   O_CREAT | O_TRUNC | O_WRONLY | PG_BINARY,
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not recreate two-phase state file \"%s\": %m",
						path)));

	/* Write content and CRC */
	errno = 0;
	if (write(fd, content, len) != len)
	{
		int			save_errno = errno;

		close(fd);
		/* if write didn't set errno, assume problem is no disk space */
		errno = save_errno ? save_errno : ENOSPC;
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write two-phase state file: %m")));
	}

	errno = 0;
	if (write(fd, &statefile_crc, sizeof(pg_crc32)) != sizeof(pg_crc32))
	{
		int			save_errno = errno;

		close(fd);
		/* if write didn't set errno, assume problem is no disk space */
		errno = save_errno ? save_errno : ENOSPC;
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write two-phase state file: %m")));
	}

	/*
	 * We must fsync the file because the end-of-replay checkpoint will not do
	 * so, there being no GXACT in shared memory yet to tell it to.
	 */
	if (pg_fsync(fd) != 0)
	{
		int			save_errno = errno;

		close(fd);
		errno = save_errno;		/* close() may have overwritten it */
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not fsync two-phase state file: %m")));
	}

	if (close(fd) != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not close two-phase state file: %m")));
}
/**
 * @brief Sync the loader's data file to disk and close it.
 *
 * Does nothing when no data file is open (datafd == -1).  Failures of the
 * sync or the close are reported as WARNINGs only; the descriptor field is
 * reset to -1 in either case.
 *
 * @param loader [in] Direct Writer.
 * @return void
 */
static void
close_data_file(DirectWriter *loader)
{
	int			fd = loader->datafd;

	/* nothing open, nothing to do */
	if (fd == -1)
		return;

	if (pg_fsync(fd) != 0)
		ereport(WARNING,
				(errcode_for_file_access(),
				 errmsg("could not sync data file: %m")));

	if (close(fd) < 0)
		ereport(WARNING,
				(errcode_for_file_access(),
				 errmsg("could not close data file: %m")));

	loader->datafd = -1;
}
/* * fsync a file * * Try to fsync directories but ignore errors that indicate the OS * just doesn't allow/require fsyncing directories. */ static void fsync_fname(char *fname, bool isdir) { int fd; int returncode; /* * Some OSs require directories to be opened read-only whereas other * systems don't allow us to fsync files opened read-only; so we need both * cases here */ if (!isdir) fd = BasicOpenFile(fname, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR); else fd = BasicOpenFile(fname, O_RDONLY | PG_BINARY, S_IRUSR | S_IWUSR); /* * Some OSs don't allow us to open directories at all (Windows returns * EACCES) */ if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES)) return; else if (fd < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", fname))); returncode = pg_fsync(fd); /* Some OSs don't allow us to fsync directories at all */ if (returncode != 0 && isdir && errno == EBADF) { close(fd); return; } if (returncode != 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", fname))); close(fd); }
/* * syncWriteLog - sync log */ static void syncWriteLog(int fd, void *buf, int offset, int len) { int loffset = lseek(fd, offset, SEEK_SET); if (loffset != offset) { elog(ERROR, "QDSYNC: error lseek location: %d, offset: %d, filename '%s', errno: %d", loffset, offset, xlogfilename, errno); } write_with_ereport(fd, buf, len); if (pg_fsync(fd) != 0) ereport(ERROR, (errcode_for_file_access(), errmsg("QDSYNC: could not fsync file '%s': %m", xlogfilename))); }
/*
 * FileSync - fsync the descriptor behind a virtual file.
 *
 * Returns the (negative) result of FileAccess() if that fails; otherwise
 * returns whatever pg_fsync() returns for the file's descriptor.
 */
int
FileSync(File file)
{
	int			accessResult;

	Assert(FileIsValid(file));

	DO_DB(elog(LOG, "FileSync: %d (%s)",
			   file, VfdCache[file].fileName));

	accessResult = FileAccess(file);

	return (accessResult < 0) ? accessResult : pg_fsync(VfdCache[file].fd);
}
/*
 * FileSync - fsync the descriptor behind a virtual file, recording a gpmon
 * statistic for the fsync call.
 *
 * A gpmon record is opened up front: the primary-fsync statistic when
 * fileRepRole is FileRepPrimaryRole, the mirror-fsync statistic for every
 * other role.  The record is closed only when the fsync succeeds and the
 * role is primary or mirror.
 *
 * Returns the (negative) result of FileAccess() if that fails; otherwise
 * the result of pg_fsync().
 */
int
FileSync(File file)
{
	FileRepGpmonRecord_s gpmonRecord;
	FileRepGpmonStatType_e whichStat;
	int			returnCode;

	whichStat = (fileRepRole == FileRepPrimaryRole)
		? FileRepGpmonStatType_PrimaryFsyncSyscall
		: FileRepGpmonStatType_MirrorFsyncSyscall;
	FileRepGpmonStat_OpenRecord(whichStat, &gpmonRecord);

	Assert(FileIsValid(file));

	DO_DB(elog(LOG, "FileSync: %d (%s)",
			   file, VfdCache[file].fileName));

	returnCode = FileAccess(file);
	if (returnCode < 0)
		return returnCode;

#ifdef FAULT_INJECTOR
	FaultInjector_InjectFaultIfSet(
								   FileRepFlush,
								   DDLNotSpecified,
								   "",	/* databaseName */
								   "");	/* tableName */
#endif

	returnCode = pg_fsync(VfdCache[file].fd);
	if (returnCode < 0)
		return returnCode;

	/* only include stats if successful */
	if (fileRepRole == FileRepPrimaryRole ||
		fileRepRole == FileRepMirrorRole)
		FileRepGpmonStat_CloseRecord(whichStat, &gpmonRecord);

	return returnCode;
}
/**
 * @brief Update load status file.
 *
 * Adds num to the in-memory creation counter, rewrites the whole LoadStatus
 * structure at the start of the load status file, and fsyncs it.  A failed
 * seek, a failed or short write, or a failed fsync raises an ERROR.
 *
 * @param loader [in/out] Load status information
 * @param num [in] the number of blocks already written
 * @return void
 */
static void
UpdateLSF(DirectWriter *loader, BlockNumber num)
{
	int			ret;
	LoadStatus *ls = &loader->ls;

	ls->ls.create_cnt += num;

	/*
	 * The status block must go to offset 0.  If the seek fails, a write
	 * would land wherever the file position happens to be, so stop here.
	 */
	if (lseek(loader->lsf_fd, 0, SEEK_SET) < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not seek in \"%s\": %m",
						loader->lsf_path)));

	errno = 0;
	ret = write(loader->lsf_fd, ls, sizeof(LoadStatus));
	if (ret != sizeof(LoadStatus))
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write to \"%s\": %m",
						loader->lsf_path)));
	}

	if (pg_fsync(loader->lsf_fd) != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m",
						loader->lsf_path)));
}
/* Flushes, syncs, and closes the given file pointer and checks for errors. */ static void SyncAndCloseFile(FILE *file) { int flushResult = 0; int syncResult = 0; int errorResult = 0; int freeResult = 0; errno = 0; flushResult = fflush(file); if (flushResult != 0) { ereport(ERROR, (errcode_for_file_access(), errmsg("could not flush file: %m"))); } syncResult = pg_fsync(fileno(file)); if (syncResult != 0) { ereport(ERROR, (errcode_for_file_access(), errmsg("could not sync file: %m"))); } errorResult = ferror(file); if (errorResult != 0) { ereport(ERROR, (errcode_for_file_access(), errmsg("error in file: %m"))); } freeResult = FreeFile(file); if (freeResult != 0) { ereport(ERROR, (errcode_for_file_access(), errmsg("could not close file: %m"))); } }
/*
 * Write out a new shared or local map file with the given contents.
 *
 * The magic number and CRC are automatically updated in *newmap.  On
 * success, we copy the data to the appropriate permanent static variable.
 *
 * If write_wal is TRUE then an appropriate WAL message is emitted.
 * (It will be false for bootstrap and WAL replay cases.)
 *
 * If send_sinval is TRUE then a SI invalidation message is sent.
 * (This should be true except in bootstrap case.)
 *
 * If preserve_files is TRUE then the storage manager is warned not to
 * delete the files listed in the map.
 *
 * Because this may be called during WAL replay when MyDatabaseId,
 * DatabasePath, etc aren't valid, we require the caller to pass in suitable
 * values.  The caller is also responsible for being sure no concurrent
 * map update could be happening.
 *
 * Parameters:
 *	shared			- true: write "global/<map file>" and update shared_map;
 *					  false: write "<dbpath>/<map file>" and update local_map
 *	newmap			- map contents to write; magic and crc are filled in here
 *	write_wal		- emit and flush a WAL record before writing the file
 *	send_sinval		- call CacheInvalidateRelmap(dbid) once the file is synced
 *	preserve_files	- call RelationPreserveStorage() for each mapping
 *	dbid, tsid		- stored in the WAL record and used to build the
 *					  RelFileNode of each preserved file
 *	dbpath			- database directory; only used when shared is false
 *
 * Note that the critical section is entered only when write_wal is true, so
 * the "inside the critical section" remarks below apply to that case only.
 */
static void
write_relmap_file(bool shared, RelMapFile *newmap,
				  bool write_wal, bool send_sinval, bool preserve_files,
				  Oid dbid, Oid tsid, const char *dbpath)
{
	int			fd;
	RelMapFile *realmap;
	char		mapfilename[MAXPGPATH];

	/*
	 * Fill in the overhead fields and update CRC.
	 */
	newmap->magic = RELMAPPER_FILEMAGIC;
	if (newmap->num_mappings < 0 || newmap->num_mappings > MAX_MAPPINGS)
		elog(ERROR, "attempt to write bogus relation mapping");

	/* the CRC covers everything that precedes the crc field itself */
	INIT_CRC32(newmap->crc);
	COMP_CRC32(newmap->crc, (char *) newmap, offsetof(RelMapFile, crc));
	FIN_CRC32(newmap->crc);

	/*
	 * Open the target file.  We prefer to do this before entering the
	 * critical section, so that an open() failure need not force PANIC.
	 */
	if (shared)
	{
		snprintf(mapfilename, sizeof(mapfilename), "global/%s",
				 RELMAPPER_FILENAME);
		realmap = &shared_map;
	}
	else
	{
		snprintf(mapfilename, sizeof(mapfilename), "%s/%s",
				 dbpath, RELMAPPER_FILENAME);
		realmap = &local_map;
	}

	/* no O_TRUNC: the file is overwritten in place from offset 0 */
	fd = OpenTransientFile(mapfilename,
						   O_WRONLY | O_CREAT | PG_BINARY,
						   S_IRUSR | S_IWUSR);
	if (fd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open relation mapping file \"%s\": %m",
						mapfilename)));

	if (write_wal)
	{
		xl_relmap_update xlrec;
		XLogRecData rdata[2];
		XLogRecPtr	lsn;

		/* now errors are fatal ... */
		START_CRIT_SECTION();

		/* record header, followed by the complete new map as payload */
		xlrec.dbid = dbid;
		xlrec.tsid = tsid;
		xlrec.nbytes = sizeof(RelMapFile);

		rdata[0].data = (char *) (&xlrec);
		rdata[0].len = MinSizeOfRelmapUpdate;
		rdata[0].buffer = InvalidBuffer;
		rdata[0].next = &(rdata[1]);
		rdata[1].data = (char *) newmap;
		rdata[1].len = sizeof(RelMapFile);
		rdata[1].buffer = InvalidBuffer;
		rdata[1].next = NULL;

		lsn = XLogInsert(RM_RELMAP_ID, XLOG_RELMAP_UPDATE, rdata);

		/* As always, WAL must hit the disk before the data update does */
		XLogFlush(lsn);
	}

	/* clear errno so a short write that leaves it untouched can be detected */
	errno = 0;
	if (write(fd, newmap, sizeof(RelMapFile)) != sizeof(RelMapFile))
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write to relation mapping file \"%s\": %m",
						mapfilename)));
	}

	/*
	 * We choose to fsync the data to disk before considering the task done.
	 * It would be possible to relax this if it turns out to be a performance
	 * issue, but it would complicate checkpointing --- see notes for
	 * CheckPointRelationMap.
	 */
	if (pg_fsync(fd) != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not fsync relation mapping file \"%s\": %m",
						mapfilename)));

	if (CloseTransientFile(fd))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not close relation mapping file \"%s\": %m",
						mapfilename)));

	/*
	 * Now that the file is safely on disk, send sinval message to let other
	 * backends know to re-read it.  We must do this inside the critical
	 * section: if for some reason we fail to send the message, we have to
	 * force a database-wide PANIC.  Otherwise other backends might continue
	 * execution with stale mapping information, which would be catastrophic
	 * as soon as others began to use the now-committed data.
	 */
	if (send_sinval)
		CacheInvalidateRelmap(dbid);

	/*
	 * Make sure that the files listed in the map are not deleted if the outer
	 * transaction aborts.  This had better be within the critical section
	 * too: it's not likely to fail, but if it did, we'd arrive at transaction
	 * abort with the files still vulnerable.  PANICing will leave things in a
	 * good state on-disk.
	 *
	 * Note: we're cheating a little bit here by assuming that mapped files
	 * are either in pg_global or the database's default tablespace.
	 */
	if (preserve_files)
	{
		int32		i;

		for (i = 0; i < newmap->num_mappings; i++)
		{
			RelFileNode rnode;

			rnode.spcNode = tsid;
			rnode.dbNode = dbid;
			rnode.relNode = newmap->mappings[i].mapfilenode;
			RelationPreserveStorage(rnode, false);
		}
	}

	/* Success, update permanent copy */
	memcpy(realmap, newmap, sizeof(RelMapFile));

	/* Critical section done */
	if (write_wal)
		END_CRIT_SECTION();
}
/* * Load a single slot from disk into memory. */ static void RestoreSlotFromDisk(const char *name) { ReplicationSlotOnDisk cp; int i; char slotdir[MAXPGPATH + 12]; char path[MAXPGPATH + 22]; int fd; bool restored = false; int readBytes; pg_crc32 checksum; /* no need to lock here, no concurrent access allowed yet */ /* delete temp file if it exists */ sprintf(slotdir, "pg_replslot/%s", name); sprintf(path, "%s/state.tmp", slotdir); if (unlink(path) < 0 && errno != ENOENT) ereport(PANIC, (errcode_for_file_access(), errmsg("could not remove file \"%s\": %m", path))); sprintf(path, "%s/state", slotdir); elog(DEBUG1, "restoring replication slot from \"%s\"", path); fd = OpenTransientFile(path, O_RDWR | PG_BINARY, 0); /* * We do not need to handle this as we are rename()ing the directory into * place only after we fsync()ed the state file. */ if (fd < 0) ereport(PANIC, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", path))); /* * Sync state file before we're reading from it. We might have crashed * while it wasn't synced yet and we shouldn't continue on that basis. 
*/ if (pg_fsync(fd) != 0) { int save_errno = errno; CloseTransientFile(fd); errno = save_errno; ereport(PANIC, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", path))); } /* Also sync the parent directory */ START_CRIT_SECTION(); fsync_fname(slotdir, true); END_CRIT_SECTION(); /* read part of statefile that's guaranteed to be version independent */ readBytes = read(fd, &cp, ReplicationSlotOnDiskConstantSize); if (readBytes != ReplicationSlotOnDiskConstantSize) { int saved_errno = errno; CloseTransientFile(fd); errno = saved_errno; ereport(PANIC, (errcode_for_file_access(), errmsg("could not read file \"%s\", read %d of %u: %m", path, readBytes, (uint32) ReplicationSlotOnDiskConstantSize))); } /* verify magic */ if (cp.magic != SLOT_MAGIC) ereport(PANIC, (errcode_for_file_access(), errmsg("replication slot file \"%s\" has wrong magic %u instead of %u", path, cp.magic, SLOT_MAGIC))); /* verify version */ if (cp.version != SLOT_VERSION) ereport(PANIC, (errcode_for_file_access(), errmsg("replication slot file \"%s\" has unsupported version %u", path, cp.version))); /* boundary check on length */ if (cp.length != ReplicationSlotOnDiskV2Size) ereport(PANIC, (errcode_for_file_access(), errmsg("replication slot file \"%s\" has corrupted length %u", path, cp.length))); /* Now that we know the size, read the entire file */ readBytes = read(fd, (char *) &cp + ReplicationSlotOnDiskConstantSize, cp.length); if (readBytes != cp.length) { int saved_errno = errno; CloseTransientFile(fd); errno = saved_errno; ereport(PANIC, (errcode_for_file_access(), errmsg("could not read file \"%s\", read %d of %u: %m", path, readBytes, cp.length))); } CloseTransientFile(fd); /* now verify the CRC */ INIT_CRC32C(checksum); COMP_CRC32C(checksum, (char *) &cp + SnapBuildOnDiskNotChecksummedSize, SnapBuildOnDiskChecksummedSize); FIN_CRC32C(checksum); if (!EQ_CRC32C(checksum, cp.checksum)) ereport(PANIC, (errmsg("replication slot file %s: checksum mismatch, is %u, should be %u", 
path, checksum, cp.checksum))); /* * If we crashed with an ephemeral slot active, don't restore but delete * it. */ if (cp.slotdata.persistency != RS_PERSISTENT) { if (!rmtree(slotdir, true)) { ereport(WARNING, (errcode_for_file_access(), errmsg("could not remove directory \"%s\"", slotdir))); } fsync_fname("pg_replslot", true); return; } /* * Verify that requirements for the specific slot type are met. That's * important because if these aren't met we're not guaranteed to retain * all the necessary resources for the slot. * * NB: We have to do so *after* the above checks for ephemeral slots, * because otherwise a slot that shouldn't exist anymore could prevent * restarts. * * NB: Changing the requirements here also requires adapting * CheckSlotRequirements() and CheckLogicalDecodingRequirements(). */ if (cp.slotdata.database != InvalidOid && wal_level < WAL_LEVEL_LOGICAL) ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("logical replication slot \"%s\" exists, but wal_level < logical", NameStr(cp.slotdata.name)), errhint("Change wal_level to be logical or higher."))); else if (wal_level < WAL_LEVEL_ARCHIVE) ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("physical replication slot \"%s\" exists, but wal_level < archive", NameStr(cp.slotdata.name)), errhint("Change wal_level to be archive or higher."))); /* nothing can be active yet, don't lock anything */ for (i = 0; i < max_replication_slots; i++) { ReplicationSlot *slot; slot = &ReplicationSlotCtl->replication_slots[i]; if (slot->in_use) continue; /* restore the entire set of persistent data */ memcpy(&slot->data, &cp.slotdata, sizeof(ReplicationSlotPersistentData)); /* initialize in memory state */ slot->effective_xmin = cp.slotdata.xmin; slot->effective_catalog_xmin = cp.slotdata.catalog_xmin; slot->candidate_catalog_xmin = InvalidTransactionId; slot->candidate_xmin_lsn = InvalidXLogRecPtr; slot->candidate_restart_lsn = InvalidXLogRecPtr; 
slot->candidate_restart_valid = InvalidXLogRecPtr; slot->in_use = true; slot->active = false; restored = true; break; } if (!restored) ereport(PANIC, (errmsg("too many replication slots active before shutdown"), errhint("Increase max_replication_slots and try again."))); }
/*
 * Shared functionality between saving and creating a replication slot.
 *
 * Writes the slot's persistent data to "<dir>/state.tmp", fsyncs it, renames
 * it to "<dir>/state", and fsyncs the file and the directories involved.
 * Nothing is written when the slot is not dirty.
 *
 * slot   - slot whose data is written
 * dir    - directory that holds the slot's state file
 * elevel - level at which failures are reported
 *
 * elevel may be below ERROR, in which case ereport() returns and we return
 * to the caller.  io_in_progress_lock is therefore released explicitly on
 * every failure path before reporting; otherwise it would stay held.  errno
 * is saved around the cleanup calls so that %m reports the original failure.
 */
static void
SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel)
{
	char		tmppath[MAXPGPATH];
	char		path[MAXPGPATH];
	int			fd;
	ReplicationSlotOnDisk cp;
	bool		was_dirty;

	/* first check whether there's something to write out */
	SpinLockAcquire(&slot->mutex);
	was_dirty = slot->dirty;
	slot->just_dirtied = false;
	SpinLockRelease(&slot->mutex);

	/* and don't do anything if there's nothing to write */
	if (!was_dirty)
		return;

	LWLockAcquire(&slot->io_in_progress_lock, LW_EXCLUSIVE);

	/* silence valgrind :( */
	memset(&cp, 0, sizeof(ReplicationSlotOnDisk));

	snprintf(tmppath, sizeof(tmppath), "%s/state.tmp", dir);
	snprintf(path, sizeof(path), "%s/state", dir);

	fd = OpenTransientFile(tmppath, O_CREAT | O_EXCL | O_WRONLY | PG_BINARY);
	if (fd < 0)
	{
		int			save_errno = errno;

		LWLockRelease(&slot->io_in_progress_lock);
		errno = save_errno;
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m",
						tmppath)));
		return;
	}

	cp.magic = SLOT_MAGIC;
	INIT_CRC32C(cp.checksum);
	cp.version = SLOT_VERSION;
	cp.length = ReplicationSlotOnDiskV2Size;

	/* take a consistent copy of the persistent data under the spinlock */
	SpinLockAcquire(&slot->mutex);

	memcpy(&cp.slotdata, &slot->data, sizeof(ReplicationSlotPersistentData));

	SpinLockRelease(&slot->mutex);

	COMP_CRC32C(cp.checksum,
				(char *) (&cp) + SnapBuildOnDiskNotChecksummedSize,
				SnapBuildOnDiskChecksummedSize);
	FIN_CRC32C(cp.checksum);

	/* clear errno so a short write that leaves it untouched can be detected */
	errno = 0;
	pgstat_report_wait_start(WAIT_EVENT_REPLICATION_SLOT_WRITE);
	if ((write(fd, &cp, sizeof(cp))) != sizeof(cp))
	{
		int			save_errno = errno;

		pgstat_report_wait_end();
		CloseTransientFile(fd);
		LWLockRelease(&slot->io_in_progress_lock);

		/* if write didn't set errno, assume problem is no disk space */
		errno = save_errno ? save_errno : ENOSPC;
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not write to file \"%s\": %m",
						tmppath)));
		return;
	}
	pgstat_report_wait_end();

	/* fsync the temporary file */
	pgstat_report_wait_start(WAIT_EVENT_REPLICATION_SLOT_SYNC);
	if (pg_fsync(fd) != 0)
	{
		int			save_errno = errno;

		pgstat_report_wait_end();
		CloseTransientFile(fd);
		LWLockRelease(&slot->io_in_progress_lock);
		errno = save_errno;
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m",
						tmppath)));
		return;
	}
	pgstat_report_wait_end();

	if (CloseTransientFile(fd) != 0)
	{
		int			save_errno = errno;

		LWLockRelease(&slot->io_in_progress_lock);
		errno = save_errno;
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m",
						tmppath)));
		return;
	}

	/* rename to permanent file, fsync file and directory */
	if (rename(tmppath, path) != 0)
	{
		int			save_errno = errno;

		LWLockRelease(&slot->io_in_progress_lock);
		errno = save_errno;
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						tmppath, path)));
		return;
	}

	/* Check CreateSlot() for the reasoning of using a crit. section. */
	START_CRIT_SECTION();

	fsync_fname(path, false);
	fsync_fname(dir, true);
	fsync_fname("pg_replslot", true);

	END_CRIT_SECTION();

	/*
	 * Successfully wrote, unset dirty bit, unless somebody dirtied again
	 * already.
	 */
	SpinLockAcquire(&slot->mutex);
	if (!slot->just_dirtied)
		slot->dirty = false;
	SpinLockRelease(&slot->mutex);

	LWLockRelease(&slot->io_in_progress_lock);
}
/* * Load a single slot from disk into memory. */ static void RestoreSlotFromDisk(const char *name) { ReplicationSlotOnDisk cp; int i; char path[MAXPGPATH + 22]; int fd; bool restored = false; int readBytes; pg_crc32c checksum; /* no need to lock here, no concurrent access allowed yet */ /* delete temp file if it exists */ sprintf(path, "pg_replslot/%s/state.tmp", name); if (unlink(path) < 0 && errno != ENOENT) ereport(PANIC, (errcode_for_file_access(), errmsg("could not remove file \"%s\": %m", path))); sprintf(path, "pg_replslot/%s/state", name); elog(DEBUG1, "restoring replication slot from \"%s\"", path); fd = OpenTransientFile(path, O_RDWR | PG_BINARY); /* * We do not need to handle this as we are rename()ing the directory into * place only after we fsync()ed the state file. */ if (fd < 0) ereport(PANIC, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", path))); /* * Sync state file before we're reading from it. We might have crashed * while it wasn't synced yet and we shouldn't continue on that basis. 
*/ pgstat_report_wait_start(WAIT_EVENT_REPLICATION_SLOT_RESTORE_SYNC); if (pg_fsync(fd) != 0) { CloseTransientFile(fd); ereport(PANIC, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", path))); } pgstat_report_wait_end(); /* Also sync the parent directory */ START_CRIT_SECTION(); fsync_fname(path, true); END_CRIT_SECTION(); /* read part of statefile that's guaranteed to be version independent */ pgstat_report_wait_start(WAIT_EVENT_REPLICATION_SLOT_READ); readBytes = read(fd, &cp, ReplicationSlotOnDiskConstantSize); pgstat_report_wait_end(); if (readBytes != ReplicationSlotOnDiskConstantSize) { int saved_errno = errno; CloseTransientFile(fd); errno = saved_errno; ereport(PANIC, (errcode_for_file_access(), errmsg("could not read file \"%s\", read %d of %u: %m", path, readBytes, (uint32) ReplicationSlotOnDiskConstantSize))); } /* verify magic */ if (cp.magic != SLOT_MAGIC) ereport(PANIC, (errcode_for_file_access(), errmsg("replication slot file \"%s\" has wrong magic number: %u instead of %u", path, cp.magic, SLOT_MAGIC))); /* verify version */ if (cp.version != SLOT_VERSION) ereport(PANIC, (errcode_for_file_access(), errmsg("replication slot file \"%s\" has unsupported version %u", path, cp.version))); /* boundary check on length */ if (cp.length != ReplicationSlotOnDiskV2Size) ereport(PANIC, (errcode_for_file_access(), errmsg("replication slot file \"%s\" has corrupted length %u", path, cp.length))); /* Now that we know the size, read the entire file */ pgstat_report_wait_start(WAIT_EVENT_REPLICATION_SLOT_READ); readBytes = read(fd, (char *) &cp + ReplicationSlotOnDiskConstantSize, cp.length); pgstat_report_wait_end(); if (readBytes != cp.length) { int saved_errno = errno; CloseTransientFile(fd); errno = saved_errno; ereport(PANIC, (errcode_for_file_access(), errmsg("could not read file \"%s\", read %d of %u: %m", path, readBytes, cp.length))); } CloseTransientFile(fd); /* now verify the CRC */ INIT_CRC32C(checksum); COMP_CRC32C(checksum, (char 
*) &cp + SnapBuildOnDiskNotChecksummedSize, SnapBuildOnDiskChecksummedSize); FIN_CRC32C(checksum); if (!EQ_CRC32C(checksum, cp.checksum)) ereport(PANIC, (errmsg("checksum mismatch for replication slot file \"%s\": is %u, should be %u", path, checksum, cp.checksum))); /* * If we crashed with an ephemeral slot active, don't restore but delete * it. */ if (cp.slotdata.persistency != RS_PERSISTENT) { sprintf(path, "pg_replslot/%s", name); if (!rmtree(path, true)) { ereport(WARNING, (errcode_for_file_access(), errmsg("could not remove directory \"%s\"", path))); } fsync_fname("pg_replslot", true); return; } /* nothing can be active yet, don't lock anything */ for (i = 0; i < max_replication_slots; i++) { ReplicationSlot *slot; slot = &ReplicationSlotCtl->replication_slots[i]; if (slot->in_use) continue; /* restore the entire set of persistent data */ memcpy(&slot->data, &cp.slotdata, sizeof(ReplicationSlotPersistentData)); /* initialize in memory state */ slot->effective_xmin = cp.slotdata.xmin; slot->effective_catalog_xmin = cp.slotdata.catalog_xmin; slot->candidate_catalog_xmin = InvalidTransactionId; slot->candidate_xmin_lsn = InvalidXLogRecPtr; slot->candidate_restart_lsn = InvalidXLogRecPtr; slot->candidate_restart_valid = InvalidXLogRecPtr; slot->in_use = true; slot->active_pid = 0; restored = true; break; } if (!restored) ereport(PANIC, (errmsg("too many replication slots active before shutdown"), errhint("Increase max_replication_slots and try again."))); }
/*
 * Physical write of a page from a buffer slot
 *
 * Returns true on success.  On failure we cannot simply ereport(ERROR),
 * because the caller has put state into shared memory that must be undone
 * first; instead the cause and errno are stored in slru_errcause and
 * slru_errno for SlruReportIOError, and false is returned.
 *
 * fdata is NULL for a standalone write: the segment file is then opened,
 * written, fsync'd (if ctl->do_fsync) and closed right here.  During
 * SimpleLruFlush, fdata points at the open-file table; a file opened here
 * is recorded in that table and left open, with no fsync done here.
 */
static bool
SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
{
	SlruShared	shared = ctl->shared;
	int			segno = pageno / SLRU_PAGES_PER_SEGMENT;
	int			rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
	int			offset = rpageno * BLCKSZ;
	bool		standalone = !fdata;
	char		path[MAXPGPATH];
	int			fd = -1;

	/* During a flush the segment may already be open; look it up first. */
	if (!standalone)
	{
		int			k = 0;

		while (k < fdata->num_files && fdata->segno[k] != segno)
			k++;

		if (k < fdata->num_files)
			fd = fdata->fd[k];
	}

	if (fd < 0)
	{
		/*
		 * Open the segment, creating it if it does not exist.  Creation can
		 * be needed even for a page that is not the first of its segment;
		 * we assume the OS can cope with that.  Creating files only from
		 * SimpleLruZeroPage would not be enough: after a crash, REDO may
		 * replay from a checkpoint before the latest one and ask us to set
		 * the status of transactions whose commit-log segments have already
		 * been truncated away.  Accepting references to nonexistent files,
		 * here and in SlruPhysicalReadPage, is the easiest way to cope.
		 *
		 * Several backends may run this code at the same time for different
		 * pages of one file, so O_EXCL, O_TRUNC and the like are off limits.
		 */
		SlruFileName(ctl, path, segno);
		fd = BasicOpenFile(path, O_RDWR | O_CREAT | PG_BINARY,
						   S_IRUSR | S_IWUSR);
		if (fd < 0)
		{
			slru_errcause = SLRU_OPEN_FAILED;
			slru_errno = errno;
			return false;
		}

		/* hand the descriptor over to the flush bookkeeping */
		if (!standalone)
		{
			int			n = fdata->num_files;

			fdata->fd[n] = fd;
			fdata->segno[n] = segno;
			fdata->num_files = n + 1;
		}
	}

	if (lseek(fd, (off_t) offset, SEEK_SET) < 0)
	{
		slru_errcause = SLRU_SEEK_FAILED;
		slru_errno = errno;
		if (standalone)
			close(fd);
		return false;
	}

	errno = 0;
	if (write(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ)
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
		slru_errcause = SLRU_WRITE_FAILED;
		slru_errno = errno;
		if (standalone)
			close(fd);
		return false;
	}

	/* In a flush, fsync and close are left to whoever owns fdata. */
	if (!standalone)
		return true;

	/*
	 * Standalone write: fsync right away.  We assume this happens
	 * infrequently enough that it's not a performance issue.
	 */
	if (ctl->do_fsync && pg_fsync(fd))
	{
		slru_errcause = SLRU_FSYNC_FAILED;
		slru_errno = errno;
		close(fd);
		return false;
	}

	if (close(fd))
	{
		slru_errcause = SLRU_CLOSE_FAILED;
		slru_errno = errno;
		return false;
	}

	return true;
}
/*
 * Flush dirty pages to disk during checkpoint or database shutdown.
 *
 * All slots are written via SlruInternalWritePage() under an exclusive
 * ControlLock.  The files opened for those writes are then fsync'd (when
 * ctl->do_fsync is set) and closed with the lock released.  Any fsync or
 * close failure is remembered and reported once at the end through
 * SlruReportIOError().
 *
 * allow_redirtied disables the assertion that every slot is clean or empty
 * after being written.
 */
void
SimpleLruFlush(SlruCtl ctl, bool allow_redirtied)
{
	SlruShared	shared = ctl->shared;
	SlruFlushData fdata;
	int			error_pageno = 0;
	bool		had_error = false;
	int			slot;
	int			n;

	fdata.num_files = 0;

	/* Write the pages out while holding the control lock */
	LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);

	for (slot = 0; slot != shared->num_slots; ++slot)
	{
		SlruInternalWritePage(ctl, slot, &fdata);

		/*
		 * Callers such as checkpoints cannot rely on the slot being clean
		 * at this point, since another process may already have dirtied it
		 * again.  That is fine; they pass allow_redirtied.
		 */
		Assert(allow_redirtied ||
			   shared->page_status[slot] == SLRU_PAGE_EMPTY ||
			   (shared->page_status[slot] == SLRU_PAGE_VALID &&
				!shared->page_dirty[slot]));
	}

	LWLockRelease(shared->ControlLock);

	/* Then fsync and close whatever files were opened for the writes */
	n = 0;
	while (n < fdata.num_files)
	{
		pgstat_report_wait_start(WAIT_EVENT_SLRU_FLUSH_SYNC);
		if (ctl->do_fsync)
		{
			if (pg_fsync(fdata.fd[n]) != 0)
			{
				slru_errcause = SLRU_FSYNC_FAILED;
				slru_errno = errno;
				error_pageno = fdata.segno[n] * SLRU_PAGES_PER_SEGMENT;
				had_error = true;
			}
		}
		pgstat_report_wait_end();

		if (CloseTransientFile(fdata.fd[n]) != 0)
		{
			slru_errcause = SLRU_CLOSE_FAILED;
			slru_errno = errno;
			error_pageno = fdata.segno[n] * SLRU_PAGES_PER_SEGMENT;
			had_error = true;
		}

		n++;
	}

	if (had_error)
		SlruReportIOError(ctl, error_pageno, InvalidTransactionId);
}
/* * Most of this procedure is from XLogFileInit. */ static void createZeroFilledNewFile(char *path) { char tmppath[MAXPGPATH]; int fd; char zbuffer[XLOG_BLCKSZ]; int nbytes; char *xlogDir = NULL; /* * Initialize an empty (all zeroes) segment. */ xlogDir = makeRelativeToTxnFilespace(XLOGDIR); if (snprintf(tmppath, MAXPGPATH, "%s/xlogtemp.%d", xlogDir, (int) getpid()) > MAXPGPATH) { ereport(ERROR, (errmsg("cannot generate dir path %s/xlogtemp.%d", xlogDir, (int) getpid()))); } pfree(xlogDir); unlink(tmppath); /* do not use XLOG_SYNC_BIT here --- want to fsync only at end of fill */ fd = open(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, S_IRUSR | S_IWUSR); if (fd < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not create file \"%s\": %m", tmppath))); /* * Zero-fill the file. We have to do this the hard way to ensure that all * the file space has really been allocated --- on platforms that allow * "holes" in files, just seeking to the end doesn't allocate intermediate * space. This way, we know that we have all the space and (after the * fsync below) that all the indirect blocks are down on disk. Therefore, * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the * log file. */ MemSet(zbuffer, 0, sizeof(zbuffer)); for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(zbuffer)) { errno = 0; if ((int) write(fd, zbuffer, sizeof(zbuffer)) != (int) sizeof(zbuffer)) { int save_errno = errno; /* * If we fail to make the file, delete it to release disk space */ unlink(tmppath); /* if write didn't set errno, assume problem is no disk space */ errno = save_errno ? 
save_errno : ENOSPC; ereport(ERROR, (errcode_for_file_access(), errmsg("could not write to file \"%s\": %m", tmppath))); } } if (pg_fsync(fd) != 0) { ereport(ERROR, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", tmppath))); } if (close(fd)) { ereport(ERROR, (errcode_for_file_access(), errmsg("could not close file \"%s\": %m", tmppath))); } if (rename(tmppath, path) < 0) { ereport(ERROR, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m", tmppath, path, xlogid, xseg))); } /* * Re-open with different open flags. */ xlogfilefd = open(path, O_RDWR, 0); if (xlogfilefd < 0) { ereport(ERROR, (errcode_for_file_access(), errmsg("QDSYNC: could not create xlog file \"%s\"", path))); } elog((Debug_print_qd_mirroring ? LOG : DEBUG5), "QDSYNC: created zero-filled xlog file %s", xlogfilename); }
/* * Load a single slot from disk into memory. */ static void RestoreSlotFromDisk(const char *name) { ReplicationSlotOnDisk cp; int i; char path[MAXPGPATH]; int fd; bool restored = false; int readBytes; pg_crc32 checksum; /* no need to lock here, no concurrent access allowed yet */ /* delete temp file if it exists */ sprintf(path, "pg_replslot/%s/state.tmp", name); if (unlink(path) < 0 && errno != ENOENT) ereport(PANIC, (errcode_for_file_access(), errmsg("could not unlink file \"%s\": %m", path))); sprintf(path, "pg_replslot/%s/state", name); elog(DEBUG1, "restoring replication slot from \"%s\"", path); fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0); /* * We do not need to handle this as we are rename()ing the directory into * place only after we fsync()ed the state file. */ if (fd < 0) ereport(PANIC, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", path))); /* * Sync state file before we're reading from it. We might have crashed * while it wasn't synced yet and we shouldn't continue on that basis. 
*/ if (pg_fsync(fd) != 0) { CloseTransientFile(fd); ereport(PANIC, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", path))); } /* Also sync the parent directory */ START_CRIT_SECTION(); fsync_fname(path, true); END_CRIT_SECTION(); /* read part of statefile that's guaranteed to be version independent */ readBytes = read(fd, &cp, ReplicationSlotOnDiskConstantSize); if (readBytes != ReplicationSlotOnDiskConstantSize) { int saved_errno = errno; CloseTransientFile(fd); errno = saved_errno; ereport(PANIC, (errcode_for_file_access(), errmsg("could not read file \"%s\", read %d of %u: %m", path, readBytes, (uint32) ReplicationSlotOnDiskConstantSize))); } /* verify magic */ if (cp.magic != SLOT_MAGIC) ereport(PANIC, (errcode_for_file_access(), errmsg("replication slot file \"%s\" has wrong magic %u instead of %u", path, cp.magic, SLOT_MAGIC))); /* verify version */ if (cp.version != SLOT_VERSION) ereport(PANIC, (errcode_for_file_access(), errmsg("replication slot file \"%s\" has unsupported version %u", path, cp.version))); /* boundary check on length */ if (cp.length != ReplicationSlotOnDiskDynamicSize) ereport(PANIC, (errcode_for_file_access(), errmsg("replication slot file \"%s\" has corrupted length %u", path, cp.length))); /* Now that we know the size, read the entire file */ readBytes = read(fd, (char *)&cp + ReplicationSlotOnDiskConstantSize, cp.length); if (readBytes != cp.length) { int saved_errno = errno; CloseTransientFile(fd); errno = saved_errno; ereport(PANIC, (errcode_for_file_access(), errmsg("could not read file \"%s\", read %d of %u: %m", path, readBytes, cp.length))); } CloseTransientFile(fd); /* now verify the CRC32 */ INIT_CRC32(checksum); COMP_CRC32(checksum, (char *)&cp + ReplicationSlotOnDiskConstantSize, ReplicationSlotOnDiskDynamicSize); if (!EQ_CRC32(checksum, cp.checksum)) ereport(PANIC, (errmsg("replication slot file %s: checksum mismatch, is %u, should be %u", path, checksum, cp.checksum))); /* nothing can be active yet, 
don't lock anything */ for (i = 0; i < max_replication_slots; i++) { ReplicationSlot *slot; slot = &ReplicationSlotCtl->replication_slots[i]; if (slot->in_use) continue; /* restore the entire set of persistent data */ memcpy(&slot->data, &cp.slotdata, sizeof(ReplicationSlotPersistentData)); /* initialize in memory state */ slot->effective_xmin = cp.slotdata.xmin; slot->in_use = true; slot->active = false; restored = true; break; } if (!restored) ereport(PANIC, (errmsg("too many replication slots active before shutdown"), errhint("Increase max_replication_slots and try again."))); }
/*
 * Physical write of a page from a buffer slot
 *
 * On failure, we cannot just ereport(ERROR) since caller has put state in
 * shared memory that must be undone.  So, we return FALSE and save enough
 * info in static variables to let SlruReportIOError make the report.
 *
 * For now, assume it's not worth keeping a file pointer open across
 * independent read/write operations.  We do batch operations during
 * SimpleLruFlush, though.
 *
 * fdata is NULL for a standalone write, pointer to open-file info during
 * SimpleLruFlush.
 *
 * Returns true on success.  On failure, slru_errcause and slru_errno
 * describe the failing step; a descriptor opened here for a standalone
 * write is closed before returning, while descriptors recorded in fdata
 * are left open.
 */
static bool
SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
{
	SlruShared	shared = ctl->shared;
	int			segno = pageno / SLRU_PAGES_PER_SEGMENT;
	int			rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
	int			offset = rpageno * BLCKSZ;
	char		path[MAXPGPATH];
	int			fd = -1;

	/*
	 * Honor the write-WAL-before-data rule, if appropriate, so that we do not
	 * write out data before associated WAL records.  This is the same action
	 * performed during FlushBuffer() in the main buffer manager.
	 */
	if (shared->group_lsn != NULL)
	{
		/*
		 * We must determine the largest async-commit LSN for the page. This
		 * is a bit tedious, but since this entire function is a slow path
		 * anyway, it seems better to do this here than to maintain a per-page
		 * LSN variable (which'd need an extra comparison in the
		 * transaction-commit path).
		 */
		XLogRecPtr	max_lsn;
		int			lsnindex,
					lsnoff;

		lsnindex = slotno * shared->lsn_groups_per_page;
		max_lsn = shared->group_lsn[lsnindex++];
		for (lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++)
		{
			XLogRecPtr	this_lsn = shared->group_lsn[lsnindex++];

			if (XLByteLT(max_lsn, this_lsn))
				max_lsn = this_lsn;
		}

		if (!XLogRecPtrIsInvalid(max_lsn))
		{
			/*
			 * As noted above, elog(ERROR) is not acceptable here, so if
			 * XLogFlush were to fail, we must PANIC.  This isn't much of a
			 * restriction because XLogFlush is just about all critical
			 * section anyway, but let's make sure.
			 */
			START_CRIT_SECTION();
			XLogFlush(max_lsn);
			END_CRIT_SECTION();
		}
	}

	/*
	 * During a Flush, we may already have the desired file open.
	 */
	if (fdata)
	{
		int			i;

		for (i = 0; i < fdata->num_files; i++)
		{
			if (fdata->segno[i] == segno)
			{
				fd = fdata->fd[i];
				break;
			}
		}
	}

	if (fd < 0)
	{
		/*
		 * If the file doesn't already exist, we should create it.  It is
		 * possible for this to need to happen when writing a page that's not
		 * first in its segment; we assume the OS can cope with that. (Note:
		 * it might seem that it'd be okay to create files only when
		 * SimpleLruZeroPage is called for the first page of a segment.
		 * However, if after a crash and restart the REDO logic elects to
		 * replay the log from a checkpoint before the latest one, then it's
		 * possible that we will get commands to set transaction status of
		 * transactions that have already been truncated from the commit log.
		 * Easiest way to deal with that is to accept references to
		 * nonexistent files here and in SlruPhysicalReadPage.)
		 *
		 * Note: it is possible for more than one backend to be executing this
		 * code simultaneously for different pages of the same file. Hence,
		 * don't use O_EXCL or O_TRUNC or anything like that.
		 */
		SlruFileName(ctl, path, segno);
		fd = BasicOpenFile(path, O_RDWR | O_CREAT | PG_BINARY,
						   S_IRUSR | S_IWUSR);
		if (fd < 0)
		{
			slru_errcause = SLRU_OPEN_FAILED;
			slru_errno = errno;
			return false;
		}

		if (fdata)
		{
			if (fdata->num_files < MAX_FLUSH_BUFFERS)
			{
				fdata->fd[fdata->num_files] = fd;
				fdata->segno[fdata->num_files] = segno;
				fdata->num_files++;
			}
			else
			{
				/*
				 * In the unlikely event that we exceed MAX_FLUSH_BUFFERS,
				 * fall back to treating it as a standalone write.
				 */
				fdata = NULL;
			}
		}
	}

	if (lseek(fd, (off_t) offset, SEEK_SET) < 0)
	{
		slru_errcause = SLRU_SEEK_FAILED;
		slru_errno = errno;
		if (!fdata)
			close(fd);
		return false;
	}

	errno = 0;
	if (write(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ)
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
		slru_errcause = SLRU_WRITE_FAILED;
		slru_errno = errno;
		if (!fdata)
			close(fd);
		return false;
	}

#ifdef XP_TRACE_LRU_WRITE
	{
		struct timeval tv;

		/*
		 * path is only filled in above when we opened the file ourselves;
		 * when the descriptor came from fdata it would still be
		 * uninitialized, so (re)build it before printing.
		 */
		SlruFileName(ctl, path, segno);

		gettimeofday(&tv, NULL);
		ereport(TRACE_LEVEL,
				(errmsg("%ld.%ld:\tWRITE:\tSlruPhysicalWritePage:\tfile:%s",
						(long) tv.tv_sec, (long) tv.tv_usec, path)));
	}
#endif

	/*
	 * If not part of Flush, need to fsync now.  We assume this happens
	 * infrequently enough that it's not a performance issue.
	 */
	if (!fdata)
	{
		if (ctl->do_fsync && pg_fsync(fd))
		{
			slru_errcause = SLRU_FSYNC_FAILED;
			slru_errno = errno;
			close(fd);
			return false;
		}

		if (close(fd))
		{
			slru_errcause = SLRU_CLOSE_FAILED;
			slru_errno = errno;
			return false;
		}
	}

	return true;
}
/* * copy one file */ static void copy_file(char *fromfile, char *tofile) { char *buffer; int srcfd; int dstfd; int nbytes; /* Use palloc to ensure we get a maxaligned buffer */ #define COPY_BUF_SIZE (8 * BLCKSZ) buffer = palloc(COPY_BUF_SIZE); /* * Open the files */ srcfd = BasicOpenFile(fromfile, O_RDONLY | PG_BINARY, 0); if (srcfd < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", fromfile))); dstfd = BasicOpenFile(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, S_IRUSR | S_IWUSR); if (dstfd < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not create file \"%s\": %m", tofile))); /* * Do the data copying. */ for (;;) { /* If we got a cancel signal during the copy of the file, quit */ CHECK_FOR_INTERRUPTS(); nbytes = read(srcfd, buffer, COPY_BUF_SIZE); if (nbytes < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not read file \"%s\": %m", fromfile))); if (nbytes == 0) break; errno = 0; if ((int) write(dstfd, buffer, nbytes) != nbytes) { /* if write didn't set errno, assume problem is no disk space */ if (errno == 0) errno = ENOSPC; ereport(ERROR, (errcode_for_file_access(), errmsg("could not write to file \"%s\": %m", tofile))); } } /* * Be paranoid here to ensure we catch problems. */ if (pg_fsync(dstfd) != 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", tofile))); if (close(dstfd)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not close file \"%s\": %m", tofile))); close(srcfd); pfree(buffer); }
/* * Perform garbage collection (if required) of file * @param map_path path to file map file (*.map). */ static bool cfs_gc_file(char* map_path) { int md = open(map_path, O_RDWR|PG_BINARY, 0); FileMap* map; uint32 physSize; uint32 usedSize; uint32 virtSize; int suf = strlen(map_path)-4; int fd = -1, fd2 = -1, md2 = -1; bool succeed = true; if (md < 0) { elog(LOG, "Failed to open map file %s: %m", map_path); return false; } map = cfs_mmap(md); if (map == MAP_FAILED) { elog(LOG, "Failed to map file %s: %m", map_path); close(md); return false; } usedSize = pg_atomic_read_u32(&map->usedSize); physSize = pg_atomic_read_u32(&map->physSize); virtSize = pg_atomic_read_u32(&map->virtSize); if ((physSize - usedSize)*100 > physSize*cfs_gc_threshold) /* do we need to perform defragmentation? */ { long delay = CFS_LOCK_MIN_TIMEOUT; char* file_path = (char*)palloc(suf+1); char* map_bck_path = (char*)palloc(suf+10); char* file_bck_path = (char*)palloc(suf+5); FileMap* newMap = (FileMap*)palloc0(sizeof(FileMap)); uint32 newSize = 0; inode_t** inodes = (inode_t**)palloc(RELSEG_SIZE*sizeof(inode_t*)); bool remove_backups = true; int n_pages = virtSize / BLCKSZ; TimestampTz startTime, endTime; long secs; int usecs; int i; startTime = GetCurrentTimestamp(); memcpy(file_path, map_path, suf); file_path[suf] = '\0'; strcat(strcpy(map_bck_path, map_path), ".bck"); strcat(strcpy(file_bck_path, file_path), ".bck"); while (true) { uint32 access_count = 0; if (pg_atomic_compare_exchange_u32(&map->lock, &access_count, CFS_GC_LOCK)) { break; } if (access_count >= CFS_GC_LOCK) { /* Uhhh... looks like last GC was interrupted. 
* Try to recover file */ if (access(file_bck_path, R_OK) != 0) { /* There is no backup file: new map should be constructed */ md2 = open(map_bck_path, O_RDWR|PG_BINARY, 0); if (md2 >= 0) { /* Recover map */ if (!cfs_read_file(md2, newMap, sizeof(FileMap))) { elog(LOG, "Failed to read file %s: %m", map_bck_path); goto Cleanup; } close(md2); md2 = -1; newSize = pg_atomic_read_u32(&newMap->usedSize); remove_backups = false; goto ReplaceMap; } } else { /* Presence of backup file means that we still have unchanged data and map files. * Just remove backup files, grab lock and continue processing */ unlink(file_bck_path); unlink(map_bck_path); break; } } pg_usleep(delay); if (delay < CFS_LOCK_MAX_TIMEOUT) { delay *= 2; } } md2 = open(map_bck_path, O_CREAT|O_RDWR|PG_BINARY|O_TRUNC, 0600); if (md2 < 0) { goto Cleanup; } for (i = 0; i < n_pages; i++) { newMap->inodes[i] = map->inodes[i]; inodes[i] = &newMap->inodes[i]; } /* sort inodes by offset to improve read locality */ qsort(inodes, n_pages, sizeof(inode_t*), cfs_cmp_page_offs); fd = open(file_path, O_RDWR|PG_BINARY, 0); if (fd < 0) { goto Cleanup; } fd2 = open(file_bck_path, O_CREAT|O_RDWR|PG_BINARY|O_TRUNC, 0600); if (fd2 < 0) { goto Cleanup; } for (i = 0; i < n_pages; i++) { int size = CFS_INODE_SIZE(*inodes[i]); if (size != 0) { char block[BLCKSZ]; off_t rc PG_USED_FOR_ASSERTS_ONLY; uint32 offs = CFS_INODE_OFFS(*inodes[i]); Assert(size <= BLCKSZ); rc = lseek(fd, offs, SEEK_SET); Assert(rc == offs); if (!cfs_read_file(fd, block, size)) { elog(LOG, "Failed to read file %s: %m", file_path); goto Cleanup; } if (!cfs_write_file(fd2, block, size)) { elog(LOG, "Failed to write file %s: %m", file_bck_path); goto Cleanup; } offs = newSize; newSize += size; *inodes[i] = CFS_INODE(size, offs); } } pg_atomic_write_u32(&map->usedSize, newSize); if (close(fd) < 0) { elog(LOG, "Failed to close file %s: %m", file_path); goto Cleanup; } fd = -1; /* Persist copy of data file */ if (pg_fsync(fd2) < 0) { elog(LOG, "Failed to sync file 
%s: %m", file_bck_path); goto Cleanup; } if (close(fd2) < 0) { elog(LOG, "Failed to close file %s: %m", file_bck_path); goto Cleanup; } fd2 = -1; /* Persist copy of map file */ if (!cfs_write_file(md2, &newMap, sizeof(newMap))) { elog(LOG, "Failed to write file %s: %m", map_bck_path); goto Cleanup; } if (pg_fsync(md2) < 0) { elog(LOG, "Failed to sync file %s: %m", map_bck_path); goto Cleanup; } if (close(md2) < 0) { elog(LOG, "Failed to close file %s: %m", map_bck_path); goto Cleanup; } md2 = -1; /* Persist map with CFS_GC_LOCK set: in case of crash we will know that map may be changed by GC */ if (cfs_msync(map) < 0) { elog(LOG, "Failed to sync map %s: %m", map_path); goto Cleanup; } if (pg_fsync(md) < 0) { elog(LOG, "Failed to sync file %s: %m", map_path); goto Cleanup; } /* * Now all information necessary for recovery is stored. * We are ready to replace existed file with defragmented one. * Use rename and rely on file system to provide atomicity of this operation. */ remove_backups = false; if (rename(file_bck_path, file_path) < 0) { elog(LOG, "Failed to rename file %s: %m", file_path); goto Cleanup; } ReplaceMap: /* At this moment defragmented file version is stored. We can perfrom in-place update of map. 
* If crash happens at this point, map can be recovered from backup file */ memcpy(map->inodes, newMap->inodes, n_pages * sizeof(inode_t)); pg_atomic_write_u32(&map->usedSize, newSize); pg_atomic_write_u32(&map->physSize, newSize); map->generation += 1; /* force all backends to reopen the file */ /* Before removing backup files and releasing locks we need to flush updated map file */ if (cfs_msync(map) < 0) { elog(LOG, "Failed to sync map %s: %m", map_path); goto Cleanup; } if (pg_fsync(md) < 0) { elog(LOG, "Failed to sync file %s: %m", map_path); Cleanup: if (fd >= 0) close(fd); if (fd2 >= 0) close(fd2); if (md2 >= 0) close(md2); if (remove_backups) { unlink(file_bck_path); unlink(map_bck_path); remove_backups = false; } succeed = false; } else { remove_backups = true; /* now backups are not need any more */ } pg_atomic_fetch_sub_u32(&map->lock, CFS_GC_LOCK); /* release lock */ /* remove map backup file */ if (remove_backups && unlink(map_bck_path)) { elog(LOG, "Failed to unlink file %s: %m", map_bck_path); succeed = false; } endTime = GetCurrentTimestamp(); TimestampDifference(startTime, endTime, &secs, &usecs); elog(LOG, "%d: defragment file %s: old size %d, new size %d, logical size %d, used %d, compression ratio %f, time %ld usec", MyProcPid, file_path, physSize, newSize, virtSize, usedSize, (double)virtSize/newSize, secs*USECS_PER_SEC + usecs); pfree(file_path); pfree(file_bck_path); pfree(map_bck_path); pfree(inodes); pfree(newMap); if (cfs_gc_delay != 0) { int rc = WaitLatch(MyLatch, WL_TIMEOUT | WL_POSTMASTER_DEATH, cfs_gc_delay /* ms */ ); if (rc & WL_POSTMASTER_DEATH) { exit(1); } } } else if (cfs_state->max_iterations == 1) { elog(LOG, "%d: file %.*s: physical size %d, logical size %d, used %d, compression ratio %f", MyProcPid, suf, map_path, physSize, virtSize, usedSize, (double)virtSize/physSize); } if (cfs_munmap(map) < 0) { elog(LOG, "Failed to unmap file %s: %m", map_path); succeed = false; } if (close(md) < 0) { elog(LOG, "Failed to close file %s: 
%m", map_path); succeed = false; } return succeed; }
/*
 * CheckPointTwoPhase -- the two-phase-commit part of a checkpoint.
 *
 * Every GXACT that is valid and whose PREPARE LSN is <= redo_horizon must
 * have its state file fsync'ed by this checkpoint.  GXACTs that are not yet
 * valid, or whose LSN is later, are some other checkpoint's responsibility.
 *
 * This runs as late as possible in the checkpoint sequence on purpose:
 * GXACTs are usually short-lived, so many that were valid at checkpoint
 * start may already be gone by the time we get here.
 *
 * A GXACT that stays valid across several checkpoints is fsync'ed every
 * time.  That is considered rare enough not to deserve extra code, and the
 * repeated fsyncs should be cheap since they cause no I/O.
 */
void
CheckPointTwoPhase(XLogRecPtr redo_horizon)
{
	TransactionId *pending;
	int			npending;
	char		statefile[MAXPGPATH];
	int			n;

	/*
	 * I/O must not happen while TwoPhaseStateLock is held, so the lock is
	 * taken only long enough to collect the XIDs that need fsyncing; the
	 * files are processed afterwards.
	 *
	 * That leaves a window in which a GXACT can be deleted after the lock is
	 * released but before its state file is opened.  ENOENT is therefore
	 * special-cased below: if the GXACT turns out to be no longer valid, the
	 * failure is ignored.
	 */
	if (max_prepared_xacts <= 0)
		return;					/* nothing to do */

	TRACE_POSTGRESQL_TWOPHASE_CHECKPOINT_START();

	pending = (TransactionId *) palloc(max_prepared_xacts * sizeof(TransactionId));
	npending = 0;

	/* Phase 1: collect candidate XIDs under the shared lock */
	LWLockAcquire(TwoPhaseStateLock, LW_SHARED);

	for (n = 0; n < TwoPhaseState->numPrepXacts; n++)
	{
		GlobalTransaction gxact = TwoPhaseState->prepXacts[n];

		if (!gxact->valid)
			continue;
		if (!XLByteLE(gxact->prepare_lsn, redo_horizon))
			continue;

		pending[npending++] = gxact->proc.xid;
	}

	LWLockRelease(TwoPhaseStateLock);

	/* Phase 2: fsync each collected state file without holding the lock */
	for (n = 0; n < npending; n++)
	{
		TransactionId xid = pending[n];
		int			statefd;

		TwoPhaseFilePath(statefile, xid);

		statefd = BasicOpenFile(statefile, O_RDWR | PG_BINARY, 0);
		if (statefd < 0)
		{
			if (errno == ENOENT)
			{
				/* a vanished file is fine once the gxact is gone too */
				if (!TransactionIdIsPrepared(xid))
					continue;

				/* the check above may have changed errno; put it back */
				errno = ENOENT;
			}
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not open two-phase state file \"%s\": %m",
							statefile)));
		}

		if (pg_fsync(statefd) != 0)
		{
			close(statefd);
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not fsync two-phase state file \"%s\": %m",
							statefile)));
		}

		if (close(statefd) != 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not close two-phase state file \"%s\": %m",
							statefile)));
	}

	pfree(pending);

	TRACE_POSTGRESQL_TWOPHASE_CHECKPOINT_DONE();
}
/**
 * @brief Initialize a DirectWriter
 *
 * Opens the target relation with AccessExclusiveLock, verifies it, opens the
 * spooler, initializes the first page, obtains the transaction and command
 * IDs, and creates the load status file with its initial contents.
 *
 * Reports ereport(ERROR) when the load status file cannot be created
 * (it is opened with O_EXCL, so an existing file is an error) or when its
 * initial contents cannot be written and fsync'ed.
 *
 * @param self DirectWriter to initialize.
 */
static void
DirectWriterInit(DirectWriter *self)
{
	LoadStatus	   *ls;

	/*
	 * Set defaults to unspecified parameters.
	 */
	if (self->base.max_dup_errors < -1)
		self->base.max_dup_errors = DEFAULT_MAX_DUP_ERRORS;

	self->base.rel = heap_open(self->base.relid, AccessExclusiveLock);
	VerifyTarget(self->base.rel, self->base.max_dup_errors);

	self->base.desc = RelationGetDescr(self->base.rel);

	SpoolerOpen(&self->spooler, self->base.rel, false,
				self->base.on_duplicate, self->base.max_dup_errors,
				self->base.dup_badfile);
	self->base.context = GetPerTupleMemoryContext(self->spooler.estate);

	/* Verify DataDir/pg_bulkload directory */
	ValidateLSFDirectory(BULKLOAD_LSF_DIR);

	/* Initialize first block */
	PageInit(GetCurrentPage(self), BLCKSZ, 0);
	PageSetTLI(GetCurrentPage(self), ThisTimeLineID);

	/* Obtain transaction ID and command ID. */
	self->xid = GetCurrentTransactionId();
	self->cid = GetCurrentCommandId(true);

	/*
	 * Initialize load status information
	 */
	ls = &self->ls;
	ls->ls.relid = self->base.relid;
	ls->ls.rnode = self->base.rel->rd_node;
	ls->ls.exist_cnt = RelationGetNumberOfBlocks(self->base.rel);
	ls->ls.create_cnt = 0;

	/*
	 * Create a load status file and write the initial status for it.
	 * At the time, if we find any existing load status files, exit with
	 * error because recovery process haven't been executed after failing
	 * load to the same table.
	 */
	BULKLOAD_LSF_PATH(self->lsf_path, ls);
	self->lsf_fd = BasicOpenFile(self->lsf_path,
		O_CREAT | O_EXCL | O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
	if (self->lsf_fd == -1)
		ereport(ERROR, (errcode_for_file_access(),
			errmsg("could not create loadstatus file \"%s\": %m", self->lsf_path)));

	errno = 0;
	if (write(self->lsf_fd, ls, sizeof(LoadStatus)) != sizeof(LoadStatus) ||
		pg_fsync(self->lsf_fd) != 0)
	{
		/* if write didn't set errno, assume problem is no disk space */
		int		save_errno = errno ? errno : ENOSPC;

		UnlinkLSF(self);

		/* restore errno so that %m reports the write/fsync failure */
		errno = save_errno;
		ereport(ERROR, (errcode_for_file_access(),
			errmsg("could not write loadstatus file \"%s\": %m", self->lsf_path)));
	}

	self->base.tchecker = CreateTupleChecker(self->base.desc);
	self->base.tchecker->checker = (CheckerTupleProc) CoercionCheckerTuple;
}