/*
 * SimpleLruFlush
 *
 * Write out every dirty page held in the SLRU buffers, then fsync (when
 * ctl->do_fsync is set) and close each segment file that was opened while
 * doing so.  Intended for use at checkpoint time or database shutdown.
 *
 *	ctl			- SLRU control structure whose buffers are flushed
 *	checkpoint	- true when called for a checkpoint; relaxes the
 *				  "slot is clean afterwards" assertion
 *
 * If any fsync fails, the error is reported through SlruReportIOError()
 * after all files have been closed; only the last failing segment is named.
 */
void
SimpleLruFlush(SlruCtl ctl, bool checkpoint)
{
	SlruShared	shared = ctl->shared;
	SlruFlushData fdata;
	int			failed_pageno = 0;
	bool		all_synced;
	int			slot;
	int			fileidx;

	/*
	 * Pass 1: write each slot under the control lock.  Files opened by the
	 * writes are accumulated in fdata so they can be synced in one batch.
	 */
	fdata.num_files = 0;

	LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);

	for (slot = 0; slot < shared->num_slots; slot++)
	{
		SimpleLruWritePage(ctl, slot, &fdata);

		/*
		 * Outside of a checkpoint the slot must now be either empty or
		 * valid-and-clean.  During a checkpoint some other process may
		 * already have dirtied it again, which is acceptable, so the check
		 * is skipped in that case.
		 */
		Assert(checkpoint ||
			   shared->page_status[slot] == SLRU_PAGE_EMPTY ||
			   (shared->page_status[slot] == SLRU_PAGE_VALID &&
				!shared->page_dirty[slot]));
	}

	LWLockRelease(shared->ControlLock);

	/*
	 * Pass 2: sync and close everything pass 1 left open.  A failed fsync
	 * does not stop the loop; the remaining files still get closed.
	 */
	all_synced = true;

	for (fileidx = 0; fileidx < fdata.num_files; fileidx++)
	{
		if (ctl->do_fsync)
		{
			if (MirroredFlatFile_Flush(&fdata.mirroredOpens[fileidx],
									   /* suppressError */ true))
			{
				/* Remember what went wrong for SlruReportIOError */
				slru_errcause = SLRU_FSYNC_FAILED;
				slru_errno = errno;
				failed_pageno = fdata.segno[fileidx] * SLRU_PAGES_PER_SEGMENT;
				all_synced = false;
			}
		}

		/* UNDONE: We don't have a suppressError for close... */
		MirroredFlatFile_Close(&fdata.mirroredOpens[fileidx]);
	}

	if (!all_synced)
		SlruReportIOError(ctl, failed_pageno, InvalidTransactionId);
}
/*
 * set_short_version
 *
 * Write out the PG_VERSION file in the specified directory.  The file
 * contains the "major.minor" prefix of PG_VERSION followed by a newline.
 *
 *	path		- directory to write into; must be set when mirror is false
 *				  and must be NULL when mirror is true
 *	dbDirNode	- database directory node; must be set when mirror is true
 *				  and must be NULL when mirror is false
 *	mirror		- if true, create the file through the MirroredFlatFile
 *				  routines so the creation is mirrored to our segment mirror
 *
 * Errors are raised via ereport(ERROR) in the non-mirrored path; the
 * mirrored calls are made with suppressError = false.
 *
 * XXX: API is terrible, make it cleaner
 */
void
set_short_version(const char *path, DbDirNode *dbDirNode, bool mirror)
{
	char	   *short_version;
	size_t		version_len;
	bool		gotdot = false;
	int			end;
	char	   *fullname;
	FILE	   *version_file;

	/*
	 * Construct short version string (should match initdb.c).
	 *
	 * The buffer gets one byte more than a plain copy of PG_VERSION would
	 * need: below we append both '\n' and '\0' at position "end", and "end"
	 * can equal strlen(PG_VERSION) when the version string is exactly
	 * "major.minor" with nothing after it.  A buffer of only strlen + 1
	 * bytes would then be overrun by one byte.
	 */
	version_len = strlen(PG_VERSION);
	short_version = palloc(version_len + 2);
	memcpy(short_version, PG_VERSION, version_len + 1);

	for (end = 0; short_version[end] != '\0'; end++)
	{
		if (short_version[end] == '.')
		{
			Assert(end != 0);
			/* stop at the second dot; the first one is part of the result */
			if (gotdot)
				break;
			else
				gotdot = true;
		}
		else if (short_version[end] < '0' || short_version[end] > '9')
		{
			/* gone past digits and dots */
			break;
		}
	}
	Assert(end > 0 && short_version[end - 1] != '.' && gotdot);

	/* Terminate with newline; "end" now counts the newline as well */
	short_version[end++] = '\n';
	short_version[end] = '\0';

	if (mirror)
	{
		MirroredFlatFileOpen mirroredOpen;

		Insist(!PointerIsValid(path));
		Insist(PointerIsValid(dbDirNode));

		MirroredFlatFile_OpenInDbDir(&mirroredOpen, dbDirNode, "PG_VERSION",
									 O_CREAT | O_WRONLY | PG_BINARY,
									 S_IRUSR | S_IWUSR,
									 /* suppressError */ false);

		MirroredFlatFile_Append(&mirroredOpen, short_version, end,
								/* suppressError */ false);

		MirroredFlatFile_Flush(&mirroredOpen, /* suppressError */ false);
		MirroredFlatFile_Close(&mirroredOpen);
	}
	else
	{
		size_t		fullname_size;

		Insist(!PointerIsValid(dbDirNode));
		Insist(PointerIsValid(path));

		/*
		 * Now write the file.  sizeof() of the literal covers the separator,
		 * the file name and the terminating NUL.
		 */
		fullname_size = strlen(path) + sizeof("/PG_VERSION");
		fullname = palloc(fullname_size);
		snprintf(fullname, fullname_size, "%s/PG_VERSION", path);

		version_file = AllocateFile(fullname, PG_BINARY_W);
		if (version_file == NULL)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not write to file \"%s\": %m",
							fullname)));

		fprintf(version_file, "%s", short_version);

		/* FreeFile closes the stream; a nonzero result means the close failed */
		if (FreeFile(version_file))
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not write to file \"%s\": %m",
							fullname)));

		pfree(fullname);
	}

	pfree(short_version);
}
/*
 * write_database_file: regenerate the flat "global/pg_database" file
 *
 * Scans pg_database (via drel), emits one line per database with a legal
 * name, and writes the result through the MirroredFlatFile routines.
 *
 * As a side effect, the oldest datfrozenxid among connectable databases is
 * determined and, if one was found, used to set the XID wrap limit.
 *
 * NOTE(review): the "startup" argument is not referenced anywhere in this
 * function body; any startup-specific handling (such as clearing relcache
 * init files) is not performed here -- confirm against callers.
 */
static void
write_database_file(Relation drel, bool startup)
{
	StringInfoData filebuf;
	HeapScanDesc dbscan;
	HeapTuple	dbtup;
	NameData	oldest_name;
	TransactionId oldest_xid = InvalidTransactionId;
	MirroredFlatFileOpen flatfile;

	initStringInfo(&filebuf);

	MirroredFlatFile_Open(&flatfile,
						  "global",
						  "pg_database",
						  O_CREAT | O_TRUNC | O_WRONLY | PG_BINARY,
						  S_IRUSR | S_IWUSR,
						  /* suppressError */ false,
						  /* atomic operation */ true,
						  /*isMirrorRecovery */ false);

	/*
	 * Walk pg_database, accumulating the file contents in filebuf.
	 */
	dbscan = heap_beginscan(drel, SnapshotNow, 0, NULL);

	for (;;)
	{
		Form_pg_database form;
		char	   *name;
		Oid			dboid;
		Oid			tblspc;
		TransactionId frozenxid;

		dbtup = heap_getnext(dbscan, ForwardScanDirection);
		if (dbtup == NULL)
			break;

		form = (Form_pg_database) GETSTRUCT(dbtup);
		name = NameStr(form->datname);
		dboid = HeapTupleGetOid(dbtup);
		tblspc = form->dattablespace;
		frozenxid = form->datfrozenxid;

		/*
		 * Track the oldest datfrozenxid.  This must match the logic in
		 * vac_truncate_clog() in vacuum.c.
		 *
		 * MPP-20053: databases that do not allow connections are left out
		 * of the computation.
		 */
		if (form->datallowconn &&
			TransactionIdIsNormal(frozenxid) &&
			(oldest_xid == InvalidTransactionId ||
			 TransactionIdPrecedes(frozenxid, oldest_xid)))
		{
			oldest_xid = frozenxid;
			namestrcpy(&oldest_name, name);
		}

		/*
		 * Databases whose names contain illegal characters are logged and
		 * omitted from the file.
		 */
		if (!name_okay(name))
		{
			ereport(LOG,
					(errmsg("invalid database name \"%s\"", name)));
			continue;
		}

		/*
		 * Line format: "dbname" oid tablespace frozenxid
		 *
		 * Backend startup does not need the xid, but autovacuum uses it and
		 * it can help with forensics.
		 */
		sputs_quote(&filebuf, name);
		appendStringInfo(&filebuf, " %u %u %u\n",
						 dboid, tblspc, frozenxid);

		/*
		 * MPP-10111 - During database expansion a database must be brought
		 * up in order to correct filespace locations in the catalog, and at
		 * that point paths cannot be resolved for databases outside
		 * "pg_default" / "pg_global".  A special guc passed at startup
		 * bypasses logic involving non-system tablespaces; a similar check
		 * exists in PersistentTablespace_GetPrimaryAndMirrorFilespaces().
		 *
		 * Nothing follows this check in the loop body, so the skip
		 * currently has no further per-database work to bypass.
		 */
		if (gp_before_filespace_setup && !IsBuiltinTablespace(tblspc))
			continue;
	}

	heap_endscan(dbscan);

	/* Emit the accumulated contents in one append, then sync and close */
	MirroredFlatFile_Append(&flatfile,
							filebuf.data,
							filebuf.len,
							/* suppressError */ false);
	MirroredFlatFile_Flush(&flatfile,
						   /* suppressError */ false);
	MirroredFlatFile_Close(&flatfile);

	if (filebuf.maxlen > 0)
		pfree(filebuf.data);

	/*
	 * Set the transaction ID wrap limit using the oldest datfrozenxid,
	 * provided at least one qualifying database was seen.
	 */
	if (oldest_xid != InvalidTransactionId)
		SetTransactionIdLimit(oldest_xid, &oldest_name);
}
/*
 * SlruPhysicalWritePage
 *
 * Physically write the contents of buffer slot "slotno" as page "pageno"
 * of the SLRU's on-disk segment files.
 *
 * Returns true on success.  On failure returns false and leaves the cause
 * in the static variables slru_errcause / slru_errno, so that the caller
 * can first undo its shared-memory state and then have SlruReportIOError
 * produce the actual report; we must not ereport(ERROR) directly here.
 *
 * fdata is NULL for a standalone write.  During SimpleLruFlush it points at
 * the table of already-open segment files: a file opened here is then left
 * open and recorded in that table, and the fsync/close is deferred to the
 * flush.  For a standalone write the file is fsync'ed (if ctl->do_fsync)
 * and closed before returning; no file is kept open across independent
 * operations.
 */
static bool
SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
{
	SlruShared	shared = ctl->shared;
	int			segno = pageno / SLRU_PAGES_PER_SEGMENT;
	int			offset = (pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ;
	char		segFileName[MAXPGPATH];
	MirroredFlatFileOpen localOpen = MirroredFlatFileOpen_Init;
	MirroredFlatFileOpen *cachedOpen = NULL;
	MirroredFlatFileOpen *target = NULL;

	/*
	 * In a flush, the segment may already be open: look for the first table
	 * entry with a matching segment number.
	 */
	if (fdata != NULL)
	{
		int			idx = 0;

		while (idx < fdata->num_files && fdata->segno[idx] != segno)
			idx++;

		if (idx < fdata->num_files)
			cachedOpen = &fdata->mirroredOpens[idx];
	}

	if (cachedOpen != NULL && MirroredFlatFile_IsActive(cachedOpen))
	{
		target = cachedOpen;
	}
	else
	{
		/*
		 * Open the segment, creating it if it is not there yet.  Creation
		 * can be needed even for a page that is not the first of its
		 * segment; the OS is assumed to cope.  (Creating files only when
		 * the first page of a segment is zeroed would not be enough: after
		 * a crash, REDO may replay from a checkpoint older than the latest
		 * one and ask us to set status for transactions already truncated
		 * from the log.  Accepting references to nonexistent files, here
		 * and in SlruPhysicalReadPage, is the easiest way to handle that.)
		 *
		 * Several backends may run this code at once for different pages
		 * of one file, so O_EXCL, O_TRUNC and the like must not be used.
		 */
		SlruSimpleFileName(segFileName, segno);

		if (MirroredFlatFile_Open(&localOpen,
								  ctl->Dir,
								  segFileName,
								  O_RDWR | O_CREAT | PG_BINARY,
								  S_IRUSR | S_IWUSR,
								  /* suppressError */ true,
								  /* atomic operation */ false,
								  /*isMirrorRecovery */ false))
		{
			/* Nothing was opened, so there is nothing to close */
			slru_errcause = SLRU_OPEN_FAILED;
			slru_errno = errno;
			return false;
		}

		if (fdata != NULL && fdata->num_files < MAX_FLUSH_BUFFERS)
		{
			/* Hand the open file over to the flush table */
			int			entry = fdata->num_files;

			fdata->mirroredOpens[entry] = localOpen;
			fdata->segno[entry] = segno;
			fdata->num_files = entry + 1;
			target = &fdata->mirroredOpens[entry];
		}
		else
		{
			/*
			 * Either this is a standalone write, or (unlikely) the flush
			 * table is already full.  In both cases proceed as a
			 * standalone write from here on.
			 */
			fdata = NULL;
			target = &localOpen;
		}
	}

	Assert(target != NULL);

	if (MirroredFlatFile_SeekSet(target, offset) != offset)
	{
		slru_errcause = SLRU_SEEK_FAILED;
		goto io_failed;
	}

	if (MirroredFlatFile_Write(target,
							   offset,
							   shared->page_buffer[slotno],
							   BLCKSZ,
							   /* suppressError */ true))
	{
		slru_errcause = SLRU_WRITE_FAILED;
		goto io_failed;
	}

	/* Within a flush, fsync and close are done later by the caller */
	if (fdata != NULL)
		return true;

	/*
	 * Standalone write: fsync right away.  This is assumed to be rare
	 * enough not to matter for performance.
	 */
	if (ctl->do_fsync &&
		MirroredFlatFile_Flush(target, /* suppressError */ true))
	{
		slru_errcause = SLRU_FSYNC_FAILED;
		goto io_failed;
	}

	/* UNDONE: We don't have a suppressError for close... */
	MirroredFlatFile_Close(target);
	return true;

io_failed:
	/* Capture errno before the close below has a chance to disturb it */
	slru_errno = errno;
	if (fdata == NULL)
		MirroredFlatFile_Close(target);
	return false;
}
/*
 * SimpleLruWritePage
 *
 * Write out the page held in shared buffer slot "slotno" if it is dirty;
 * otherwise do nothing.
 *
 * Exactly one write attempt is made, so the page may still be dirty on
 * return if somebody re-dirtied it while the write was under way.  If a
 * write of the page is already in progress we wait for it and then still
 * consider writing again; checkpoints rely on that.
 *
 * fdata is passed through to SlruPhysicalWritePage: NULL for a standalone
 * write, or the open-file table when called from SimpleLruFlush.
 *
 * The control lock must be held on entry and is held again on exit; it is
 * released while the I/O is performed.
 */
void
SimpleLruWritePage(SlruCtl ctl, int slotno, SlruFlush fdata)
{
	SlruShared	shared = ctl->shared;
	int			target_page = shared->page_number[slotno];
	bool		written;

	/* Let any in-progress write of this same page finish first */
	for (;;)
	{
		if (shared->page_status[slotno] != SLRU_PAGE_WRITE_IN_PROGRESS)
			break;
		if (shared->page_number[slotno] != target_page)
			break;
		SimpleLruWaitIO(ctl, slotno);
	}

	/*
	 * Proceed only if the slot is dirty, valid, and still holds the page we
	 * were asked about.
	 */
	if (!(shared->page_dirty[slotno] &&
		  shared->page_status[slotno] == SLRU_PAGE_VALID &&
		  shared->page_number[slotno] == target_page))
		return;

	/*
	 * Flag the slot as being written and reset its dirty bit.  From here
	 * on, a transaction status update on this page sets the bit again.
	 */
	shared->page_status[slotno] = SLRU_PAGE_WRITE_IN_PROGRESS;
	shared->page_dirty[slotno] = false;

	/* Take the per-buffer lock (cannot deadlock, see notes at top) */
	LWLockAcquire(shared->buffer_locks[slotno], LW_EXCLUSIVE);

	/* The control lock is not held across the I/O */
	LWLockRelease(shared->ControlLock);

	written = SlruPhysicalWritePage(ctl, target_page, slotno, fdata);

	/* A failed write during a flush: close every file the flush has open */
	if (!written && fdata != NULL)
	{
		int			fileidx = 0;

		while (fileidx < fdata->num_files)
		{
			MirroredFlatFile_Close(&fdata->mirroredOpens[fileidx]);
			fileidx++;
		}
	}

	/* Back under the control lock, bring the slot state up to date */
	LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);

	Assert(shared->page_number[slotno] == target_page &&
		   shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS);

	/* The page did not reach disk, so it has to count as dirty again */
	if (!written)
		shared->page_dirty[slotno] = true;

	shared->page_status[slotno] = SLRU_PAGE_VALID;

	LWLockRelease(shared->buffer_locks[slotno]);

	/* Shared state is consistent again, so the failure can be reported */
	if (!written)
		SlruReportIOError(ctl, target_page, InvalidTransactionId);
}