/*
 * PageSetChecksumInplace
 *		Stamp a checksum directly into a page held in private memory.
 *
 * The checksum is written into the page header of the buffer passed in, so
 * this is only safe when no other process can be modifying that buffer.
 * Pages that are still new (uninitialized), and all pages when data
 * checksums are disabled, are left untouched.
 */
void
PageSetChecksumInplace(Page page, BlockNumber blkno)
{
	PageHeader	hdr = (PageHeader) page;

	/* Only initialized pages get a checksum, and only when enabled. */
	if (!PageIsNew(page) && DataChecksumsEnabled())
		hdr->pd_checksum = pg_checksum_page((char *) page, blkno);
}
/*
 * PageSetChecksumCopy
 *		Produce a checksummed image of a page that lives in shared buffers.
 *
 * When checksums are disabled, or the page is not yet initialized, the input
 * pointer is handed straight back.  Otherwise the page is first copied into
 * a private scratch buffer and the checksum is computed on that copy, so that
 * concurrent modifications of the shared page (e.g. hint-bit updates) cannot
 * invalidate the checksum we are about to write.  Whether the copy captured
 * a given hint bit or not is irrelevant; what matters is that the page image
 * written out and its checksum agree.
 *
 * Returns a pointer to BLCKSZ bytes that should be written out.  The scratch
 * buffer is a single static allocation reused by every call, so the caller
 * must write the result immediately and must not hold on to it.
 */
char *
PageSetChecksumCopy(Page page, BlockNumber blkno)
{
	static char *checksumScratch = NULL;
	PageHeader	copyHeader;

	/* Nothing to checksum: give the caller back its own data. */
	if (PageIsNew(page) || !DataChecksumsEnabled())
		return (char *) page;

	/*
	 * The scratch space is allocated lazily, once, and then reused.  It is
	 * palloc'd rather than declared as a static char array for two reasons:
	 * to get adequate alignment for the checksum code, and to avoid wasting
	 * the space in processes that never get here.
	 */
	if (checksumScratch == NULL)
		checksumScratch = MemoryContextAlloc(TopMemoryContext, BLCKSZ);

	memcpy(checksumScratch, (char *) page, BLCKSZ);

	copyHeader = (PageHeader) checksumScratch;
	copyHeader->pd_checksum = pg_checksum_page(checksumScratch, blkno);

	return checksumScratch;
}
/*
 * PageIsVerified
 *		Check that the page header and checksum (if any) appear valid.
 *
 * This is called when a page has just been read in from disk.  The idea is
 * to cheaply detect trashed pages before we go nuts following bogus item
 * pointers, testing invalid transaction identifiers, etc.
 *
 * It turns out to be necessary to allow zeroed pages here too.  Even though
 * this routine is *not* called when deliberately adding a page to a relation,
 * there are scenarios in which a zeroed page might be found in a table.
 * (Example: a backend extends a relation, then crashes before it can write
 * any WAL entry about the new page.  The kernel will already have the
 * zeroed page in the file, and it will stay that way after restart.)  So we
 * allow zeroed pages here, and are careful that the page access macros
 * treat such a page as empty and without free space.  Eventually, VACUUM
 * will clean up such a page and make it usable.
 *
 * page:  the block-sized page image to examine.
 * blkno: block number of the page, passed to the checksum computation.
 *
 * Returns true if the page is acceptable: either its header looks sane and
 * its checksum (when checksums are enabled) matches, or it is entirely
 * zeroes, or the checksum failed but the header is sane and
 * ignore_checksum_failure is set.  Returns false otherwise.  A WARNING is
 * reported whenever a checksum mismatch is found on a page that is not all
 * zeroes.
 */
bool
PageIsVerified(Page page, BlockNumber blkno)
{
	PageHeader	p = (PageHeader) page;
	size_t	   *pagebytes;
	size_t		i;
	bool		checksum_failure = false;
	bool		header_sane = false;
	bool		all_zeroes = false;
	uint16		checksum = 0;

	/*
	 * Don't verify page data unless the page passes basic non-zero test
	 */
	if (!PageIsNew(page))
	{
		if (DataChecksumsEnabled())
		{
			checksum = pg_checksum_page((char *) page, blkno);

			if (checksum != p->pd_checksum)
				checksum_failure = true;
		}

		/*
		 * The following checks don't prove the header is correct, only that
		 * it looks sane enough to allow into the buffer pool.  Later usage of
		 * the block can still reveal problems, which is why we offer the
		 * checksum option.
		 */
		if ((p->pd_flags & ~PD_VALID_FLAG_BITS) == 0 &&
			p->pd_lower <= p->pd_upper &&
			p->pd_upper <= p->pd_special &&
			p->pd_special <= BLCKSZ &&
			p->pd_special == MAXALIGN(p->pd_special))
			header_sane = true;

		if (header_sane && !checksum_failure)
			return true;
	}

	/*
	 * Check all-zeroes case.  Luckily BLCKSZ is guaranteed to always be a
	 * multiple of size_t - and it's much faster to compare memory using the
	 * native word size.
	 */
	StaticAssertStmt(BLCKSZ == (BLCKSZ / sizeof(size_t)) * sizeof(size_t),
					 "BLCKSZ has to be a multiple of sizeof(size_t)");

	all_zeroes = true;
	pagebytes = (size_t *) page;

	/* The index is a size_t so the comparison with the bound is unsigned. */
	for (i = 0; i < (BLCKSZ / sizeof(size_t)); i++)
	{
		if (pagebytes[i] != 0)
		{
			all_zeroes = false;
			break;
		}
	}

	if (all_zeroes)
		return true;

	/*
	 * Throw a WARNING if the checksum fails, but only after we've checked for
	 * the all-zeroes case.
	 */
	if (checksum_failure)
	{
		/*
		 * The SQLSTATE must be supplied through errcode(); a bare
		 * ERRCODE_xxx constant in the auxiliary list is merely the left
		 * operand of a comma operator and would be silently thrown away.
		 */
		ereport(WARNING,
				(errcode(ERRCODE_DATA_CORRUPTED),
				 errmsg("page verification failed, calculated checksum %u but expected %u",
						checksum, p->pd_checksum)));

		if (header_sane && ignore_checksum_failure)
			return true;
	}

	return false;
}
/* * visibilitymap_set - set a bit on a previously pinned page * * recptr is the LSN of the XLOG record we're replaying, if we're in recovery, * or InvalidXLogRecPtr in normal running. The page LSN is advanced to the * one provided; in normal running, we generate a new XLOG record and set the * page LSN to that value. cutoff_xid is the largest xmin on the page being * marked all-visible; it is needed for Hot Standby, and can be * InvalidTransactionId if the page contains no tuples. * * Caller is expected to set the heap page's PD_ALL_VISIBLE bit before calling * this function. Except in recovery, caller should also pass the heap * buffer. When checksums are enabled and we're not in recovery, we must add * the heap buffer to the WAL chain to protect it from being torn. * * You must pass a buffer containing the correct map page to this function. * Call visibilitymap_pin first to pin the right one. This function doesn't do * any I/O. */ void visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid) { BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk); uint8 mapBit = HEAPBLK_TO_MAPBIT(heapBlk); Page page; char *map; #ifdef TRACE_VISIBILITYMAP elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk); #endif Assert(InRecovery || XLogRecPtrIsInvalid(recptr)); Assert(InRecovery || BufferIsValid(heapBuf)); /* Check that we have the right heap page pinned, if present */ if (BufferIsValid(heapBuf) && BufferGetBlockNumber(heapBuf) != heapBlk) elog(ERROR, "wrong heap buffer passed to visibilitymap_set"); /* Check that we have the right VM page pinned */ if (!BufferIsValid(vmBuf) || BufferGetBlockNumber(vmBuf) != mapBlock) elog(ERROR, "wrong VM buffer passed to visibilitymap_set"); page = BufferGetPage(vmBuf); map = PageGetContents(page); LockBuffer(vmBuf, BUFFER_LOCK_EXCLUSIVE); if (!(map[mapByte] & (1 << mapBit))) { START_CRIT_SECTION(); map[mapByte] 
|= (1 << mapBit); MarkBufferDirty(vmBuf); if (RelationNeedsWAL(rel)) { if (XLogRecPtrIsInvalid(recptr)) { Assert(!InRecovery); recptr = log_heap_visible(rel->rd_node, heapBuf, vmBuf, cutoff_xid); /* * If data checksums are enabled, we need to protect the heap * page from being torn. */ if (DataChecksumsEnabled()) { Page heapPage = BufferGetPage(heapBuf); /* caller is expected to set PD_ALL_VISIBLE first */ Assert(PageIsAllVisible(heapPage)); PageSetLSN(heapPage, recptr); } } PageSetLSN(page, recptr); } END_CRIT_SECTION(); } LockBuffer(vmBuf, BUFFER_LOCK_UNLOCK); }
/**
 * @brief Write block buffer contents. The number of buffered blocks to be
 * written is derived from the loader's current block index.
 *
 * Flow:
 * <ol>
 *   <li>If no more space is available in the data file, switch to a new one.</li>
 *   <li>Compute the number of blocks which can be written to the current file.</li>
 *   <li>Save the last block number in the load status file.</li>
 *   <li>If data checksums are enabled, stamp each block about to be written.</li>
 *   <li>Write to the current file.</li>
 *   <li>If there are other data, write them too.</li>
 * </ol>
 *
 * Raises an ERROR if writing to the data file fails.
 *
 * @param loader [in] Direct Writer.
 * @return void
 */
static void
flush_pages(DirectWriter *loader)
{
	int			i;
	int			num;
	LoadStatus *ls = &loader->ls;

	/* Completed blocks, plus the current one if it holds anything. */
	num = loader->curblk;
	if (!PageIsEmpty(GetCurrentPage(loader)))
		num += 1;

	if (num <= 0)
		return;		/* no work */

	/*
	 * Add WAL entry (only the first page) to ensure the current xid will
	 * be recorded in xlog. We must flush some xlog records with XLogFlush()
	 * before writing any data blocks to follow the WAL protocol.
	 *
	 * If a postgres process, such as loader and COPY, is killed by "kill -9",
	 * the database will be rewound to the last checkpoint and recovery will
	 * be performed using WAL.
	 *
	 * After the recovery, if there are xids which have not been recorded
	 * to WAL, such xids will be reused.
	 *
	 * However, in the loader and COPY, the data file is actually updated and
	 * the xid must not be reused.
	 *
	 * A WAL entry with such an xid can be added using XLogInsert(). However,
	 * such entries are not really written to the disk immediately.
	 * WAL entries are flushed to the disk by XLogFlush(), typically
	 * when a transaction is committed. COPY prevents xid reuse by
	 * this method.
	 */
#if PG_VERSION_NUM >= 90100
	if (ls->ls.create_cnt == 0 &&
		!RELATION_IS_LOCAL(loader->base.rel) &&
		!(loader->base.rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED))
	{
		XLogRecPtr	recptr;

		recptr = log_newpage(&ls->ls.rnode, MAIN_FORKNUM,
							 ls->ls.exist_cnt, loader->blocks);
		XLogFlush(recptr);
	}
#else
	if (ls->ls.create_cnt == 0 && !RELATION_IS_LOCAL(loader->base.rel))
	{
		XLogRecPtr	recptr;

		recptr = log_newpage(&ls->ls.rnode, MAIN_FORKNUM,
							 ls->ls.exist_cnt, loader->blocks);
		XLogFlush(recptr);
	}
#endif

	/*
	 * Write blocks. We might need to write multiple files on boundary of
	 * relation segments.
	 */
	for (i = 0; i < num;)
	{
		char	   *buffer;
		int			total;
		int			written;
		int			flush_num;

		/*
		 * Relation size in blocks before this batch; this is therefore the
		 * block number at which the first page of the batch is written.
		 */
		BlockNumber	relblks = LS_TOTAL_CNT(ls);

		/* Switch to the next file if the current file has been filled up. */
		if (relblks % RELSEG_SIZE == 0)
			close_data_file(loader);
		if (loader->datafd == -1)
			loader->datafd = open_data_file(ls->ls.rnode,
											RELATION_IS_LOCAL(loader->base.rel),
											relblks);

		/* Number of blocks to be added to the current file. */
		flush_num = Min(num - i, RELSEG_SIZE - relblks % RELSEG_SIZE);
		Assert(flush_num > 0);

		/*
		 * Write the last block number to the load status file.
		 * NOTE(review): the loop relies on this call advancing the value
		 * reported by LS_TOTAL_CNT() by flush_num -- confirm in UpdateLSF.
		 */
		UpdateLSF(loader, flush_num);

		/* First page of this batch inside the block buffer. */
		buffer = loader->blocks + BLCKSZ * i;

#if PG_VERSION_NUM >= 90300
		/* If we need a checksum, add it */
		if (DataChecksumsEnabled())
		{
			int			j;

			/*
			 * Stamp exactly the pages that are about to be written, i.e.
			 * the flush_num pages starting at 'buffer'.  They are written
			 * in ascending order starting at block 'relblks', so page j of
			 * the batch must be checksummed as block relblks + j.
			 */
			for (j = 0; j < flush_num; j++)
			{
				Page		contained_page = (Page) (buffer + BLCKSZ * j);

				((PageHeader) contained_page)->pd_checksum =
					pg_checksum_page((char *) contained_page, relblks + j);
			}
		}
#endif

		/*
		 * Flush flush_num data blocks to the current file.  If this batch
		 * reaches the segment boundary, the current file now holds
		 * RELSEG_SIZE blocks.
		 */
		total = BLCKSZ * flush_num;
		written = 0;
		while (total > 0)
		{
			/* write() may be partial; keep going until all bytes are out */
			int			len = write(loader->datafd, buffer + written, total);

			if (len == -1)
			{
				/* fatal error, do not want to write blocks anymore */
				ereport(ERROR, (errcode_for_file_access(),
								errmsg("could not write to data file: %m")));
			}
			written += len;
			total -= len;
		}

		i += flush_num;
	}

	/*
	 * NOTICE: Be sure to reset curblk to 0 and reinitialize the recycled
	 * page if you will continue to use blocks.
	 */
}