/*
 * RelationCreateStorage
 *		Create physical storage for a relation.
 *
 * Creates the underlying disk file for the relation.  Only the main fork is
 * created here; additional forks are created lazily by the modules that
 * need them.
 *
 * This function is transactional.  For non-temporary relations the creation
 * is WAL-logged, and in every case the relation is queued on the
 * pendingDeletes list so that its storage is destroyed if the transaction
 * aborts later on.
 *
 * rnode:	physical identifier of the relation whose storage is created
 * istemp:	true for a temporary relation; no WAL record is written then
 */
void
RelationCreateStorage(RelFileNode rnode, bool istemp)
{
    PendingRelDelete *pending;
    SMgrRelation srel;

    srel = smgropen(rnode);
    smgrcreate(srel, MAIN_FORKNUM, false);

    if (!istemp)
    {
        XLogRecData rdata;
        xl_smgr_create xlrec;

        /*
         * Make an XLOG entry showing the file creation.  If we abort, the
         * file will be dropped at abort time.
         */
        xlrec.rnode = rnode;

        rdata.data = (char *) &xlrec;
        rdata.len = sizeof(xlrec);
        rdata.buffer = InvalidBuffer;
        rdata.next = NULL;

        /* The record's LSN is not needed here, so it is discarded. */
        (void) XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE, &rdata);
    }

    /* Add the relation to the list of stuff to delete at abort */
    pending = (PendingRelDelete *)
        MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
    pending->relnode = rnode;
    pending->isTemp = istemp;
    pending->atCommit = false;	/* delete if abort */
    pending->nestLevel = GetCurrentTransactionNestLevel();
    pending->next = pendingDeletes;
    pendingDeletes = pending;
}
/*
 * FileRepPrimary_ResyncBufferPoolIncrementalWrite
 *		Incrementally resynchronize buffer pool relations using the block
 *		list returned by change tracking.
 *
 * Repeatedly calls ChangeTracking_GetChanges(request).  For every returned
 * entry the block is read through the buffer pool and, when the page LSN is
 * not newer than the entry's change tracking end LSN, written out with
 * smgrwrite().  When the last entry of a relation has been processed (and
 * the result does not ask for more), the relation is synced, closed and its
 * resync entry is updated via FileRepResync_UpdateEntry().
 *
 * When a result has ask_for_more set, request->entries[0].lsn_start is
 * advanced to result->next_start_lsn and the request is issued again.
 *
 * The request is released with ChangeTracking_FreeRequest() before return.
 *
 * Returns STATUS_OK on success, the status reported by
 * FileRepResync_UpdateEntry() if that fails, or STATUS_ERROR when the
 * process state / data state check indicates the resync cannot continue.
 */
static int
FileRepPrimary_ResyncBufferPoolIncrementalWrite(ChangeTrackingRequest *request)
{
    int             status = STATUS_OK;
    Page            page;
    Buffer          buf;
    BlockNumber     numBlocks = 0;
    SMgrRelation    smgr_relation = NULL;
    char            relidstr[OIDCHARS + 1 + OIDCHARS + 1 + OIDCHARS + 1];
    int             ii;
    XLogRecPtr      loc;		/* LSN of the page read from the buffer pool */
    XLogRecPtr      loc1;		/* change tracking end LSN of the entry */
    int             count = 0;
    int             thresholdCount = 0;
    bool            mirrorDataLossOccurred = FALSE;
    int             NumberOfRelations = request->count;
    FileRepResyncHashEntry_s entry;
    ChangeTrackingResult *result = NULL;

    while (1)
    {
        /* allow flushing buffers from buffer pool during scan */
        FileRepResync_SetReadBufferRequest();
        if ((result = ChangeTracking_GetChanges(request)) != NULL)
        {
            FileRepResync_ResetReadBufferRequest();

            for (ii = 0; ii < result->count; ii++)
            {
                /* First entry of a relation: open it and cache its size. */
                if (smgr_relation == NULL)
                {
                    NumberOfRelations--;

                    smgr_relation = smgropen(result->entries[ii].relFileNode);

                    snprintf(relidstr, sizeof(relidstr), "%u/%u/%u",
                             smgr_relation->smgr_rnode.spcNode,
                             smgr_relation->smgr_rnode.dbNode,
                             smgr_relation->smgr_rnode.relNode);

                    numBlocks = smgrnblocks(smgr_relation);

                    if (Debug_filerep_print)
                        elog(LOG, "resynchronize buffer pool relation '%u/%u/%u' "
                             "number of blocks:'%u' ",
                             smgr_relation->smgr_rnode.spcNode,
                             smgr_relation->smgr_rnode.dbNode,
                             smgr_relation->smgr_rnode.relNode,
                             numBlocks);

                    thresholdCount = Min(numBlocks, 1024);
                }

                loc1 = result->entries[ii].lsn_end;

                /*
                 * if relation was truncated then block_num from change
                 * tracking can be beyond numBlocks
                 */
                if (result->entries[ii].block_num >= numBlocks)
                {
                    ereport(LOG,
                            (errmsg("could not resynchonize buffer pool relation '%s' block '%d' (maybe due to truncate), "
                                    "lsn change tracking '%s(%u/%u)' "
                                    "number of blocks '%d' ",
                                    relidstr,
                                    result->entries[ii].block_num,
                                    XLogLocationToString(&loc1),
                                    loc1.xlogid,
                                    loc1.xrecoff,
                                    numBlocks),
                             FileRep_errcontext()));

                    goto flush_check;
                }

                /* allow flushing buffers from buffer pool during scan */
                FileRepResync_SetReadBufferRequest();
                buf = ReadBuffer_Resync(smgr_relation,
                                        result->entries[ii].block_num,
                                        relidstr);
                FileRepResync_ResetReadBufferRequest();

                Assert(result->entries[ii].block_num < numBlocks);

                LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
                page = BufferGetPage(buf);

                loc = PageGetLSN(page);

                if (Debug_filerep_print)
                {
                    elog(LOG,
                         "incremental resync buffer pool identifier '%s' num blocks '%d' blkno '%d' lsn page '%s(%u/%u)' "
                         "lsn end change tracking '%s(%u/%u)' ",
                         relidstr,
                         numBlocks,
                         result->entries[ii].block_num,
                         XLogLocationToString(&loc),
                         loc.xlogid,
                         loc.xrecoff,
                         XLogLocationToString(&loc1),
                         result->entries[ii].lsn_end.xlogid,
                         result->entries[ii].lsn_end.xrecoff);
                }
                else
                {
                    char        tmpBuf[FILEREP_MAX_LOG_DESCRIPTION_LEN];

                    snprintf(tmpBuf, sizeof(tmpBuf),
                             "incremental resync buffer pool identifier '%s' num blocks '%d' blkno '%d' lsn page '%s(%u/%u)' ",
                             relidstr,
                             numBlocks,
                             result->entries[ii].block_num,
                             XLogLocationToString(&loc),
                             loc.xlogid,
                             loc.xrecoff);

                    FileRep_InsertConfigLogEntry(tmpBuf);

                    snprintf(tmpBuf, sizeof(tmpBuf),
                             "incremental resync buffer pool identifier '%s' lsn end change tracking '%s(%u/%u)' ",
                             relidstr,
                             XLogLocationToString(&loc1),
                             result->entries[ii].lsn_end.xlogid,
                             result->entries[ii].lsn_end.xrecoff);

                    FileRep_InsertConfigLogEntry(tmpBuf);
                }

                /*
                 * Write the page when its LSN is not newer than the change
                 * tracking end LSN.  (The operands were previously reversed,
                 * which skipped exactly the pages described below and
                 * contradicted the log message in the nested branch.)
                 */
                if (XLByteLE(PageGetLSN(page), result->entries[ii].lsn_end))
                {
                    if (!XLByteEQ(PageGetLSN(page), result->entries[ii].lsn_end))
                    {
                        /*
                         * Arguments are ordered to match the labels in the
                         * message: change tracking end LSN first, page LSN
                         * second.
                         */
                        ereport(LOG,
                                (errmsg("Resynchonize buffer pool relation '%s' block '%d' has page lsn less than CT lsn, "
                                        "lsn end change tracking '%s(%u/%u)' lsn page '%s(%u/%u)' "
                                        "number of blocks '%d'",
                                        relidstr,
                                        result->entries[ii].block_num,
                                        XLogLocationToString(&loc1),
                                        loc1.xlogid,
                                        loc1.xrecoff,
                                        XLogLocationToString(&loc),
                                        loc.xlogid,
                                        loc.xrecoff,
                                        numBlocks),
                                 FileRep_errcontext()));
                    }

                    /*
                     * It's safe and better to perform write of the page to
                     * mirror, for this case, as primary and mirror data pages
                     * should always be same.  So, we might do some extra work
                     * but definitely won't lose blocks, or error out and need
                     * to perform full recovery.
                     *
                     * Need to cover for this case as there are some known
                     * scenarios where the CT file can have extra records
                     * which should have been discarded, but cannot be because
                     * the xlog LSN information has been lost.  One such case
                     * is when CT_TRANSIENT being compacted to CT_COMPACT with
                     * a specific xlog LSN (to discard extra records) in CT
                     * mode gets interrupted by resync.  Compaction during
                     * resync collects all the CT records and doesn't have
                     * xlog LSN information to discard any extra records from
                     * CT_TRANSIENT.
                     */
                    smgrwrite(smgr_relation,
                              result->entries[ii].block_num,
                              (char *) BufferGetBlock(buf),
                              FALSE);
                }

#ifdef FAULT_INJECTOR
                FaultInjector_InjectFaultIfSet(
                                               FileRepResyncWorker,
                                               DDLNotSpecified,
                                               "",	/* databaseName */
                                               ""); /* tableName */
#endif

                UnlockReleaseBuffer(buf);

#ifdef FAULT_INJECTOR
                FaultInjector_InjectFaultIfSet(
                                               FileRepResyncWorker,
                                               DDLNotSpecified,
                                               "",	/* databaseName */
                                               ""); /* tableName */
#endif

flush_check:
                /*
                 * Last entry of the result, or the next entry belongs to a
                 * different relation: finish the current relation.
                 */
                if (((ii + 1) == result->count) ||
                    !(result->entries[ii].relFileNode.spcNode == result->entries[ii + 1].relFileNode.spcNode &&
                      result->entries[ii].relFileNode.dbNode == result->entries[ii + 1].relFileNode.dbNode &&
                      result->entries[ii].relFileNode.relNode == result->entries[ii + 1].relFileNode.relNode))
                {
                    if (result->ask_for_more == false)
                    {
                        smgrimmedsync(smgr_relation);
                        smgrclose(smgr_relation);

                        smgr_relation = NULL;

                        FileRep_GetRelationPath(
                                                entry.fileName,
                                                result->entries[ii].relFileNode,
                                                0 /* segment file number is always 0 for Buffer Pool */ );

                        status = FileRepResync_UpdateEntry(&entry);
                        if (status != STATUS_OK)
                        {
                            break;
                        }
                    }
                }

                /*
                 * Every thresholdCount blocks, process pending signals and
                 * verify that resync is still supposed to be running.
                 */
                if (count > thresholdCount)
                {
                    count = 0;
                    FileRepSubProcess_ProcessSignals();

                    if (!(FileRepSubProcess_GetState() == FileRepStateReady &&
                          dataState == DataStateInResync))
                    {
                        mirrorDataLossOccurred = TRUE;
                        break;
                    }
                }
                else
                    count++;
            }					/* for (ii = 0; ii < result->count; ii++) */
        }						/* if ((result = ChangeTracking_GetChanges(request)) != NULL) */

        FileRepResync_ResetReadBufferRequest();

        if (result != NULL && result->ask_for_more == true)
        {
            Assert(request->count == 1);
            request->entries[0].lsn_start = result->next_start_lsn;
        }
        else
        {
            break;
        }
    }							/* while (1) */

    ChangeTracking_FreeRequest(request);
    ChangeTracking_FreeResult(result);

    Insist(NumberOfRelations == 0);

    if (mirrorDataLossOccurred)
        status = STATUS_ERROR;

    return status;
}
/*
 * FileRepPrimary_ResyncWrite
 *		Resynchronize one relation (buffer pool) or one segment file
 *		(append-only) described by a resync hash entry.
 *
 * Buffer pool relations: every block from entry->mirrorBufpoolResyncCkptBlockNum
 * up to the current relation size is read through the buffer pool and
 * written with smgrwrite() when its page LSN lies between
 * entry->mirrorBufpoolResyncCkptLoc and the end-of-resync LSN (inclusive).
 * Unless the entry is a full copy, smgrtruncate() is then called with the
 * relation's current block count, and the relation is synced and closed.
 *
 * Append-only files: the byte range [mirrorAppendOnlyLossEof,
 * mirrorAppendOnlyNewEof) is read and appended in chunks of at most
 * 2*BLCKSZ bytes, then flushed and closed.
 *
 * Returns STATUS_OK, or STATUS_ERROR when mirror data loss was detected on
 * either path.  Primary-side open/read failures are raised with
 * ereport(ERROR).
 */
static int
FileRepPrimary_ResyncWrite(FileRepResyncHashEntry_s *entry)
{
    int             status = STATUS_OK;
    Page            page;
    Buffer          buf;
    BlockNumber     numBlocks;
    BlockNumber     blkno;
    SMgrRelation    smgr_relation;
    char            relidstr[OIDCHARS + 1 + OIDCHARS + 1 + OIDCHARS + 1];
    XLogRecPtr      loc;
    int             count = 0;
    int             thresholdCount = 0;
    bool            mirrorDataLossOccurred = FALSE;

    switch (entry->relStorageMgr)
    {
        case PersistentFileSysRelStorageMgr_BufferPool:

            switch (entry->mirrorDataSynchronizationState)
            {
                case MirroredRelDataSynchronizationState_BufferPoolScanIncremental:
                case MirroredRelDataSynchronizationState_FullCopy:

                    smgr_relation = smgropen(entry->relFileNode);

                    numBlocks = smgrnblocks(smgr_relation);

                    snprintf(relidstr, sizeof(relidstr), "%u/%u/%u",
                             smgr_relation->smgr_rnode.spcNode,
                             smgr_relation->smgr_rnode.dbNode,
                             smgr_relation->smgr_rnode.relNode);

                    if (Debug_filerep_print)
                        elog(LOG, "resync buffer pool relation '%s' number of blocks '%d' ",
                             relidstr, numBlocks);

                    thresholdCount = Min(numBlocks, 1024);

                    /*
                     * required in order to report how many blocks were
                     * synchronized if gp_persistent_relation_node does not
                     * return that information
                     */
                    if (entry->mirrorBufpoolResyncChangedPageCount == 0)
                    {
                        entry->mirrorBufpoolResyncChangedPageCount = numBlocks - entry->mirrorBufpoolResyncCkptBlockNum;
                    }

                    for (blkno = entry->mirrorBufpoolResyncCkptBlockNum; blkno < numBlocks; blkno++)
                    {
                        XLogRecPtr  endResyncLSN = (isFullResync() ?
                                                    FileRepResync_GetEndFullResyncLSN() :
                                                    FileRepResync_GetEndIncrResyncLSN());

#ifdef FAULT_INJECTOR
                        FaultInjector_InjectFaultIfSet(
                                                       FileRepResyncWorkerRead,
                                                       DDLNotSpecified,
                                                       "",	/* databaseName */
                                                       ""); /* tableName */
#endif

                        FileRepResync_SetReadBufferRequest();
                        buf = ReadBuffer_Resync(smgr_relation, blkno, relidstr);
                        FileRepResync_ResetReadBufferRequest();

                        LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
                        page = BufferGetPage(buf);

                        loc = PageGetLSN(page);

                        if (Debug_filerep_print)
                        {
                            elog(LOG,
                                 "full resync buffer pool identifier '%s' num blocks '%d' blkno '%d' lsn begin change tracking '%s(%u/%u)' "
                                 "lsn page '%s(%u/%u)' lsn end change tracking '%s(%u/%u)' ",
                                 relidstr,
                                 numBlocks,
                                 blkno,
                                 XLogLocationToString(&entry->mirrorBufpoolResyncCkptLoc),
                                 entry->mirrorBufpoolResyncCkptLoc.xlogid,
                                 entry->mirrorBufpoolResyncCkptLoc.xrecoff,
                                 XLogLocationToString(&loc),
                                 loc.xlogid,
                                 loc.xrecoff,
                                 XLogLocationToString(&endResyncLSN),
                                 endResyncLSN.xlogid,
                                 endResyncLSN.xrecoff);
                        }
                        else
                        {
                            char        tmpBuf[FILEREP_MAX_LOG_DESCRIPTION_LEN];

                            snprintf(tmpBuf, sizeof(tmpBuf),
                                     "full resync buffer pool identifier '%s' num blocks '%d' blkno '%d' lsn begin change tracking '%s(%u/%u)' ",
                                     relidstr,
                                     numBlocks,
                                     blkno,
                                     XLogLocationToString(&entry->mirrorBufpoolResyncCkptLoc),
                                     entry->mirrorBufpoolResyncCkptLoc.xlogid,
                                     entry->mirrorBufpoolResyncCkptLoc.xrecoff);

                            FileRep_InsertConfigLogEntry(tmpBuf);

                            snprintf(tmpBuf, sizeof(tmpBuf),
                                     "full resync buffer pool identifier '%s' lsn page '%s(%u/%u)' lsn end change tracking '%s(%u/%u)' ",
                                     relidstr,
                                     XLogLocationToString(&loc),
                                     loc.xlogid,
                                     loc.xrecoff,
                                     XLogLocationToString(&endResyncLSN),
                                     endResyncLSN.xlogid,
                                     endResyncLSN.xrecoff);

                            FileRep_InsertConfigLogEntry(tmpBuf);
                        }

                        /* Write only pages whose LSN is within the resync window. */
                        if (XLByteLE(PageGetLSN(page), endResyncLSN) &&
                            XLByteLE(entry->mirrorBufpoolResyncCkptLoc, PageGetLSN(page)))
                        {
                            smgrwrite(smgr_relation,
                                      blkno,
                                      (char *) BufferGetBlock(buf),
                                      FALSE);
                        }

#ifdef FAULT_INJECTOR
                        FaultInjector_InjectFaultIfSet(
                                                       FileRepResyncWorker,
                                                       DDLNotSpecified,
                                                       "",	/* databaseName */
                                                       ""); /* tableName */
#endif

                        UnlockReleaseBuffer(buf);

                        /*
                         * Every thresholdCount blocks, process pending
                         * signals and verify resync should still be running.
                         */
                        if (count > thresholdCount)
                        {
                            count = 0;
                            FileRepSubProcess_ProcessSignals();

                            if (!(FileRepSubProcess_GetState() == FileRepStateReady &&
                                  dataState == DataStateInResync))
                            {
                                mirrorDataLossOccurred = TRUE;
                                break;
                            }
                        }
                        else
                            count++;
                    }

                    if (mirrorDataLossOccurred)
                        break;

                    if (entry->mirrorDataSynchronizationState != MirroredRelDataSynchronizationState_FullCopy)
                    {
                        LockRelationForResyncExtension(&smgr_relation->smgr_rnode, ExclusiveLock);

                        numBlocks = smgrnblocks(smgr_relation);

                        smgrtruncate(smgr_relation,
                                     numBlocks,
                                     TRUE /* isTemp, TRUE means to not record in XLOG */ ,
                                     FALSE /* isLocalBuf */ ,
                                     &entry->persistentTid,
                                     entry->persistentSerialNum);

                        UnlockRelationForResyncExtension(&smgr_relation->smgr_rnode, ExclusiveLock);
                    }

                    smgrimmedsync(smgr_relation);
                    smgrclose(smgr_relation);

                    smgr_relation = NULL;
                    break;

                case MirroredRelDataSynchronizationState_None:
                case MirroredRelDataSynchronizationState_DataSynchronized:
                    break;

                default:
                    ereport(LOG,
                            (errmsg("could not resynchronize relation '%u/%u/%u' "
                                    "mirror synchronization state:'%s(%d)' ",
                                    entry->relFileNode.relNode,
                                    entry->relFileNode.spcNode,
                                    entry->relFileNode.dbNode,
                                    MirroredRelDataSynchronizationState_Name(entry->mirrorDataSynchronizationState),
                                    entry->mirrorDataSynchronizationState)));
                    break;
            }
            break;

        case PersistentFileSysRelStorageMgr_AppendOnly:
            {
                MirroredAppendOnlyOpen mirroredOpen;
                int         primaryError;
                char       *buffer = NULL;
                int64       endOffset = entry->mirrorAppendOnlyNewEof;
                int64       startOffset = entry->mirrorAppendOnlyLossEof;
                int32       bufferLen = 0;
                int         retval = 0;

                /*
                 * NOTE: this case deliberately uses the function-level
                 * mirrorDataLossOccurred flag.  A block-local declaration of
                 * the same name used to shadow it here, so data loss detected
                 * on this path never reached the status check at the bottom
                 * of the function.
                 */

                switch (entry->mirrorDataSynchronizationState)
                {
                    case MirroredRelDataSynchronizationState_AppendOnlyCatchup:
                    case MirroredRelDataSynchronizationState_FullCopy:

                        /*
                         * required in order to report how many blocks were
                         * synchronized if gp_persistent_relation_node does
                         * not return that information
                         */
                        if (entry->mirrorBufpoolResyncChangedPageCount == 0)
                        {
                            entry->mirrorBufpoolResyncChangedPageCount = (endOffset - startOffset) / BLCKSZ;
                        }

                        /*
                         * The MirroredAppendOnly_OpenResynchonize routine
                         * knows we are a resynch worker and will open BOTH,
                         * but write only the MIRROR!!!
                         */
                        MirroredAppendOnly_OpenResynchonize(
                                                            &mirroredOpen,
                                                            &entry->relFileNode,
                                                            entry->segmentFileNum,
                                                            startOffset,
                                                            &primaryError,
                                                            &mirrorDataLossOccurred);
                        if (primaryError != 0)
                        {
                            ereport(ERROR,
                                    (errcode_for_file_access(),
                                     errmsg("could not open file %u/%u/%u.%u : %s",
                                            entry->relFileNode.dbNode,
                                            entry->relFileNode.spcNode,
                                            entry->relFileNode.relNode,
                                            entry->segmentFileNum,
                                            strerror(primaryError))));
                            break;
                        }

                        if (mirrorDataLossOccurred)
                            break;

                        /* AO and CO Data Store writes 64k size by default */
                        bufferLen = (Size) Min(2 * BLCKSZ, endOffset - startOffset);
                        buffer = (char *) palloc(bufferLen);
                        if (buffer == NULL)
                            ereport(ERROR,
                                    (errcode(ERRCODE_OUT_OF_MEMORY),
                                     (errmsg("not enough memory for resynchronization"))));

                        MemSet(buffer, 0, bufferLen);

                        while (startOffset < endOffset)
                        {
                            retval = MirroredAppendOnly_Read(
                                                             &mirroredOpen,
                                                             buffer,
                                                             bufferLen);

                            if (retval != bufferLen)
                            {
                                ereport(ERROR,
                                        (errcode_for_file_access(),
                                         errmsg("could not read from position:" INT64_FORMAT " in file %u/%u/%u.%u : %m",
                                                startOffset,
                                                entry->relFileNode.dbNode,
                                                entry->relFileNode.spcNode,
                                                entry->relFileNode.relNode,
                                                entry->segmentFileNum)));
                                break;
                            }

                            MirroredAppendOnly_Append(
                                                      &mirroredOpen,
                                                      buffer,
                                                      bufferLen,
                                                      &primaryError,
                                                      &mirrorDataLossOccurred);

                            if (mirrorDataLossOccurred)
                                break;

                            Assert(primaryError == 0);	/* No primary writes as resync worker. */

                            startOffset += bufferLen;
                            /* AO and CO Data Store writes 64k size by default */
                            bufferLen = (Size) Min(2 * BLCKSZ, endOffset - startOffset);
                        }

                        if (buffer)
                        {
                            pfree(buffer);
                            buffer = NULL;
                        }

                        if (mirrorDataLossOccurred)
                            break;

                        /* Flush written data on Mirror */
                        MirroredAppendOnly_Flush(
                                                 &mirroredOpen,
                                                 &primaryError,
                                                 &mirrorDataLossOccurred);
                        if (mirrorDataLossOccurred)
                            break;

                        Assert(primaryError == 0);	/* Not flushed on primary as resync worker. */

                        /* Close Primary and Mirror */
                        MirroredAppendOnly_Close(
                                                 &mirroredOpen,
                                                 &mirrorDataLossOccurred);

                        break;

                    case MirroredRelDataSynchronizationState_None:
                    case MirroredRelDataSynchronizationState_DataSynchronized:
                        break;

                    default:
                        ereport(LOG,
                                (errmsg("could not resynchronize relation '%u/%u/%u' "
                                        "mirror synchronization state:'%s(%d)' ",
                                        entry->relFileNode.relNode,
                                        entry->relFileNode.spcNode,
                                        entry->relFileNode.dbNode,
                                        MirroredRelDataSynchronizationState_Name(entry->mirrorDataSynchronizationState),
                                        entry->mirrorDataSynchronizationState)));
                        break;
                }

                break;
            }					/* case PersistentFileSysRelStorageMgr_AppendOnly */

        default:
            Assert(0);
            break;
    }							/* switch (entry->relStorageMgr) */

    if (mirrorDataLossOccurred)
        status = STATUS_ERROR;

    return status;
}
/*
 * FinishPreparedTransaction
 *		Execute COMMIT PREPARED or ROLLBACK PREPARED.
 *
 * gid:		identifier of the prepared transaction to finish
 * isCommit: true to commit the prepared transaction, false to roll it back
 *
 * Raises ERROR if the two-phase state file cannot be read.
 */
void
FinishPreparedTransaction(const char *gid, bool isCommit)
{
    GlobalTransaction gxact;
    TransactionId xid;
    TransactionId latestXid;
    TwoPhaseFileHeader *hdr;
    char       *statebuf;
    char       *cursor;
    TransactionId *children;
    RelFileNode *commitrels;
    RelFileNode *abortrels;
    RelFileNode *delrels;
    int         ndelrels;
    int         relidx;

    /*
     * Validate the GID and lock the GXACT, so that two backends cannot try
     * to finish the same GID concurrently.
     */
    gxact = LockGXact(gid, GetUserId());
    xid = gxact->proc.xid;

    /* Load the state file; a NULL result means it failed validation. */
    statebuf = ReadTwoPhaseFile(xid);
    if (statebuf == NULL)
        ereport(ERROR,
                (errcode(ERRCODE_DATA_CORRUPTED),
                 errmsg("two-phase state file for transaction %u is corrupt",
                        xid)));

    /*
     * Walk the header area: the fixed header is followed by MAXALIGN'd
     * arrays of subtransaction XIDs, commit-time rels and abort-time rels.
     * Afterwards "cursor" points at the records handed to the callbacks.
     */
    hdr = (TwoPhaseFileHeader *) statebuf;
    Assert(TransactionIdEquals(hdr->xid, xid));

    cursor = statebuf + MAXALIGN(sizeof(TwoPhaseFileHeader));

    children = (TransactionId *) cursor;
    cursor += MAXALIGN(hdr->nsubxacts * sizeof(TransactionId));

    commitrels = (RelFileNode *) cursor;
    cursor += MAXALIGN(hdr->ncommitrels * sizeof(RelFileNode));

    abortrels = (RelFileNode *) cursor;
    cursor += MAXALIGN(hdr->nabortrels * sizeof(RelFileNode));

    /* Latest XID among the transaction and all of its children. */
    latestXid = TransactionIdLatest(xid, hdr->nsubxacts, children);

    /*
     * The order of operations from here on is critical: first make the XLOG
     * entry for commit or abort and mark the transaction committed or
     * aborted in pg_clog, then remove its PGPROC from the global ProcArray
     * (after which TransactionIdIsInProgress stops reporting the prepared
     * xact as in progress), and only then run the post-commit or post-abort
     * callbacks, which release the locks the transaction held.
     */
    if (isCommit)
        RecordTransactionCommitPrepared(xid,
                                        hdr->nsubxacts, children,
                                        hdr->ncommitrels, commitrels);
    else
        RecordTransactionAbortPrepared(xid,
                                       hdr->nsubxacts, children,
                                       hdr->nabortrels, abortrels);

    ProcArrayRemove(&gxact->proc, latestXid);

    /*
     * In case a callback fails, mark the gxact invalid now so that nobody
     * else tries to commit/rollback it and it can be recycled properly
     * later.  It stays locked by our XID, so it won't go away yet.
     *
     * (We assume it's safe to do this without taking TwoPhaseStateLock.)
     */
    gxact->valid = false;

    /*
     * Remove any files that were supposed to be dropped.  For consistency
     * with the regular xact.c code paths this must happen before locks are
     * released, hence before the callbacks run.
     *
     * NB: this code knows that we couldn't be dropping any temp rels ...
     */
    delrels = isCommit ? commitrels : abortrels;
    ndelrels = isCommit ? hdr->ncommitrels : hdr->nabortrels;

    for (relidx = 0; relidx < ndelrels; relidx++)
    {
        SMgrRelation srel = smgropen(delrels[relidx]);
        ForkNumber  forknum;

        for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
        {
            if (!smgrexists(srel, forknum))
                continue;
            smgrdounlink(srel, forknum, false, false);
        }
        smgrclose(srel);
    }

    /* Run the post-commit or post-abort callbacks. */
    if (isCommit)
        ProcessRecords(cursor, xid, twophase_postcommit_callbacks);
    else
        ProcessRecords(cursor, xid, twophase_postabort_callbacks);

    /* Count the prepared xact as committed or aborted */
    AtEOXact_PgStat(isCommit);

    /* Finally, clean up: state file, gxact entry and the in-memory copy. */
    RemoveTwoPhaseFile(xid, true);
    RemoveGXact(gxact);

    pfree(statebuf);
}
/* * LocalBufferAlloc - * Find or create a local buffer for the given page of the given relation. * * API is similar to bufmgr.c's BufferAlloc, except that we do not need * to do any locking since this is all local. Also, IO_IN_PROGRESS * does not get set. Lastly, we support only default access strategy * (hence, usage_count is always advanced). */ BufferDesc * LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, bool *foundPtr) { BufferTag newTag; /* identity of requested block */ LocalBufferLookupEnt *hresult; BufferDesc *bufHdr; int b; int trycounter; bool found; INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum); /* Initialize local buffers if first request in this session */ if (LocalBufHash == NULL) InitLocalBuffers(); /* See if the desired buffer already exists */ hresult = (LocalBufferLookupEnt *) hash_search(LocalBufHash, (void *) &newTag, HASH_FIND, NULL); if (hresult) { b = hresult->id; bufHdr = &LocalBufferDescriptors[b]; Assert(BUFFERTAGS_EQUAL(bufHdr->tag, newTag)); #ifdef LBDEBUG fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n", smgr->smgr_rnode.node.relNode, forkNum, blockNum, -b - 1); #endif /* this part is equivalent to PinBuffer for a shared buffer */ if (LocalRefCount[b] == 0) { if (bufHdr->usage_count < BM_MAX_USAGE_COUNT) bufHdr->usage_count++; } LocalRefCount[b]++; ResourceOwnerRememberBuffer(CurrentResourceOwner, BufferDescriptorGetBuffer(bufHdr)); if (bufHdr->flags & BM_VALID) *foundPtr = TRUE; else { /* Previous read attempt must have failed; try again */ *foundPtr = FALSE; } return bufHdr; } #ifdef LBDEBUG fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n", smgr->smgr_rnode.node.relNode, forkNum, blockNum, -nextFreeLocalBuf - 1); #endif /* * Need to get a new buffer. We use a clock sweep algorithm (essentially * the same as what freelist.c does now...) 
*/ trycounter = NLocBuffer; for (;;) { b = nextFreeLocalBuf; if (++nextFreeLocalBuf >= NLocBuffer) nextFreeLocalBuf = 0; bufHdr = &LocalBufferDescriptors[b]; if (LocalRefCount[b] == 0) { if (bufHdr->usage_count > 0) { bufHdr->usage_count--; trycounter = NLocBuffer; } else { /* Found a usable buffer */ LocalRefCount[b]++; ResourceOwnerRememberBuffer(CurrentResourceOwner, BufferDescriptorGetBuffer(bufHdr)); break; } } else if (--trycounter == 0) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("no empty local buffer available"))); } /* * this buffer is not referenced but it might still be dirty. if that's * the case, write it out before reusing it! */ if (bufHdr->flags & BM_DIRTY) { SMgrRelation oreln; /* Find smgr relation for buffer */ oreln = smgropen(bufHdr->tag.rnode, MyBackendId); /* And write... */ smgrwrite(oreln, bufHdr->tag.forkNum, bufHdr->tag.blockNum, (char *) LocalBufHdrGetBlock(bufHdr), false); /* Mark not-dirty now in case we error out below */ bufHdr->flags &= ~BM_DIRTY; pgBufferUsage.local_blks_written++; } /* * lazy memory allocation: allocate space on first use of a buffer. */ if (LocalBufHdrGetBlock(bufHdr) == NULL) { /* Set pointer for use by BufferGetBlock() macro */ LocalBufHdrGetBlock(bufHdr) = GetLocalBufferStorage(); } /* * Update the hash table: remove old entry, if any, and make new one. */ if (bufHdr->flags & BM_TAG_VALID) { hresult = (LocalBufferLookupEnt *) hash_search(LocalBufHash, (void *) &bufHdr->tag, HASH_REMOVE, NULL); if (!hresult) /* shouldn't happen */ elog(ERROR, "local buffer hash table corrupted"); /* mark buffer invalid just in case hash insert fails */ CLEAR_BUFFERTAG(bufHdr->tag); bufHdr->flags &= ~(BM_VALID | BM_TAG_VALID); } hresult = (LocalBufferLookupEnt *) hash_search(LocalBufHash, (void *) &newTag, HASH_ENTER, &found); if (found) /* shouldn't happen */ elog(ERROR, "local buffer hash table corrupted"); hresult->id = b; /* * it's all ours now. 
*/ bufHdr->tag = newTag; bufHdr->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR); bufHdr->flags |= BM_TAG_VALID; bufHdr->usage_count = 1; *foundPtr = FALSE; return bufHdr; }
/* * XLogReadBufferExtended * Read a page during XLOG replay * * This is functionally comparable to ReadBufferExtended. There's some * differences in the behavior wrt. the "mode" argument: * * In RBM_NORMAL mode, if the page doesn't exist, or contains all-zeroes, we * return InvalidBuffer. In this case the caller should silently skip the * update on this page. (In this situation, we expect that the page was later * dropped or truncated. If we don't see evidence of that later in the WAL * sequence, we'll complain at the end of WAL replay.) * * In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended * with all-zeroes pages up to the given block number. * * In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't * exist, and we don't check for all-zeroes. Thus, no log entry is made * to imply that the page should be dropped or truncated later. * * NB: A redo function should normally not call this directly. To get a page * to modify, use XLogReplayBuffer instead. It is important that all pages * modified by a WAL record are registered in the WAL records, or they will be * invisible to tools that that need to know which pages are modified. */ Buffer XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, BlockNumber blkno, ReadBufferMode mode) { BlockNumber lastblock; Buffer buffer; SMgrRelation smgr; Assert(blkno != P_NEW); /* Open the relation at smgr level */ smgr = smgropen(rnode, InvalidBackendId); /* * Create the target file if it doesn't already exist. This lets us cope * if the replay sequence contains writes to a relation that is later * deleted. (The original coding of this routine would instead suppress * the writes, but that seems like it risks losing valuable data if the * filesystem loses an inode during a crash. Better to write the data * until we are actually told to delete the file.) 
*/ smgrcreate(smgr, forknum, true); lastblock = smgrnblocks(smgr, forknum); if (blkno < lastblock) { /* page exists in file */ buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno, mode, NULL); } else { /* hm, page doesn't exist in file */ if (mode == RBM_NORMAL) { log_invalid_page(rnode, forknum, blkno, false); return InvalidBuffer; } if (mode == RBM_NORMAL_NO_LOG) return InvalidBuffer; /* OK to extend the file */ /* we do this in recovery only - no rel-extension lock needed */ Assert(InRecovery); buffer = InvalidBuffer; do { if (buffer != InvalidBuffer) { if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) LockBuffer(buffer, BUFFER_LOCK_UNLOCK); ReleaseBuffer(buffer); } buffer = ReadBufferWithoutRelcache(rnode, forknum, P_NEW, mode, NULL); } while (BufferGetBlockNumber(buffer) < blkno); /* Handle the corner case that P_NEW returns non-consecutive pages */ if (BufferGetBlockNumber(buffer) != blkno) { if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) LockBuffer(buffer, BUFFER_LOCK_UNLOCK); ReleaseBuffer(buffer); buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno, mode, NULL); } } if (mode == RBM_NORMAL) { /* check that page has been initialized */ Page page = (Page) BufferGetPage(buffer); /* * We assume that PageIsNew is safe without a lock. During recovery, * there should be no other backends that could modify the buffer at * the same time. */ if (PageIsNew(page)) { ReleaseBuffer(buffer); log_invalid_page(rnode, forknum, blkno, true); return InvalidBuffer; } } return buffer; }
/* * mdsync() -- Sync previous writes to stable storage. */ void mdsync(void) { static bool mdsync_in_progress = false; HASH_SEQ_STATUS hstat; PendingOperationEntry *entry; int absorb_counter; /* * This is only called during checkpoints, and checkpoints should only * occur in processes that have created a pendingOpsTable. */ if (!pendingOpsTable) elog(ERROR, "cannot sync without a pendingOpsTable"); /* * If we are in the bgwriter, the sync had better include all fsync * requests that were queued by backends up to this point. The tightest * race condition that could occur is that a buffer that must be written * and fsync'd for the checkpoint could have been dumped by a backend just * before it was visited by BufferSync(). We know the backend will have * queued an fsync request before clearing the buffer's dirtybit, so we * are safe as long as we do an Absorb after completing BufferSync(). */ AbsorbFsyncRequests(); /* * To avoid excess fsync'ing (in the worst case, maybe a never-terminating * checkpoint), we want to ignore fsync requests that are entered into the * hashtable after this point --- they should be processed next time, * instead. We use mdsync_cycle_ctr to tell old entries apart from new * ones: new ones will have cycle_ctr equal to the incremented value of * mdsync_cycle_ctr. * * In normal circumstances, all entries present in the table at this point * will have cycle_ctr exactly equal to the current (about to be old) * value of mdsync_cycle_ctr. However, if we fail partway through the * fsync'ing loop, then older values of cycle_ctr might remain when we * come back here to try again. Repeated checkpoint failures would * eventually wrap the counter around to the point where an old entry * might appear new, causing us to skip it, possibly allowing a checkpoint * to succeed that should not have. To forestall wraparound, any time the * previous mdsync() failed to complete, run through the table and * forcibly set cycle_ctr = mdsync_cycle_ctr. 
* * Think not to merge this loop with the main loop, as the problem is * exactly that that loop may fail before having visited all the entries. * From a performance point of view it doesn't matter anyway, as this path * will never be taken in a system that's functioning normally. */ if (mdsync_in_progress) { /* prior try failed, so update any stale cycle_ctr values */ hash_seq_init(&hstat, pendingOpsTable); while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) { entry->cycle_ctr = mdsync_cycle_ctr; } } /* Advance counter so that new hashtable entries are distinguishable */ mdsync_cycle_ctr++; /* Set flag to detect failure if we don't reach the end of the loop */ mdsync_in_progress = true; /* Now scan the hashtable for fsync requests to process */ absorb_counter = FSYNCS_PER_ABSORB; hash_seq_init(&hstat, pendingOpsTable); while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) { /* * If the entry is new then don't process it this time. Note that * "continue" bypasses the hash-remove call at the bottom of the loop. */ if (entry->cycle_ctr == mdsync_cycle_ctr) continue; /* Else assert we haven't missed it */ Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr); /* * If fsync is off then we don't have to bother opening the file at * all. (We delay checking until this point so that changing fsync on * the fly behaves sensibly.) Also, if the entry is marked canceled, * fall through to delete it. */ if (enableFsync && !entry->canceled) { int failures; /* * If in bgwriter, we want to absorb pending requests every so * often to prevent overflow of the fsync request queue. It is * unspecified whether newly-added entries will be visited by * hash_seq_search, but we don't care since we don't need to * process them anyway. 
*/ if (--absorb_counter <= 0) { AbsorbFsyncRequests(); absorb_counter = FSYNCS_PER_ABSORB; } /* * The fsync table could contain requests to fsync segments that * have been deleted (unlinked) by the time we get to them. Rather * than just hoping an ENOENT (or EACCES on Windows) error can be * ignored, what we do on error is absorb pending requests and * then retry. Since mdunlink() queues a "revoke" message before * actually unlinking, the fsync request is guaranteed to be * marked canceled after the absorb if it really was this case. * DROP DATABASE likewise has to tell us to forget fsync requests * before it starts deletions. */ for (failures = 0;; failures++) /* loop exits at "break" */ { SMgrRelation reln; MdfdVec *seg; char *path; /* * Find or create an smgr hash entry for this relation. This * may seem a bit unclean -- md calling smgr? But it's really * the best solution. It ensures that the open file reference * isn't permanently leaked if we get an error here. (You may * say "but an unreferenced SMgrRelation is still a leak!" Not * really, because the only case in which a checkpoint is done * by a process that isn't about to shut down is in the * bgwriter, and it will periodically do smgrcloseall(). This * fact justifies our not closing the reln in the success path * either, which is a good thing since in non-bgwriter cases * we couldn't safely do that.) Furthermore, in many cases * the relation will have been dirtied through this same smgr * relation, and so we can save a file open/close cycle. */ reln = smgropen(entry->tag.rnode); /* * It is possible that the relation has been dropped or * truncated since the fsync request was entered. Therefore, * allow ENOENT, but only if we didn't fail already on this * file. This applies both during _mdfd_getseg() and during * FileSync, since fd.c might have closed the file behind our * back. 
*/ seg = _mdfd_getseg(reln, entry->tag.forknum, entry->tag.segno * ((BlockNumber) RELSEG_SIZE), false, EXTENSION_RETURN_NULL); if (seg != NULL && FileSync(seg->mdfd_vfd) >= 0) break; /* success; break out of retry loop */ /* * XXX is there any point in allowing more than one retry? * Don't see one at the moment, but easy to change the test * here if so. */ path = _mdfd_segpath(reln, entry->tag.forknum, entry->tag.segno); if (!FILE_POSSIBLY_DELETED(errno) || failures > 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", path))); else ereport(DEBUG1, (errcode_for_file_access(), errmsg("could not fsync file \"%s\" but retrying: %m", path))); pfree(path); /* * Absorb incoming requests and check to see if canceled. */ AbsorbFsyncRequests(); absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */ if (entry->canceled) break; } /* end retry loop */ } /* * If we get here, either we fsync'd successfully, or we don't have to * because enableFsync is off, or the entry is (now) marked canceled. * Okay to delete it. */ if (hash_search(pendingOpsTable, &entry->tag, HASH_REMOVE, NULL) == NULL) elog(ERROR, "pendingOpsTable corrupted"); } /* end loop over hashtable entries */ /* Flag successful completion of mdsync */ mdsync_in_progress = false; }
/*
 * smgr_redo
 *		Replay a single storage-manager WAL record.
 *
 * Two record types are understood: XLOG_SMGR_CREATE and XLOG_SMGR_TRUNCATE.
 * Anything else is reported with elog(PANIC).  The lsn argument is not
 * consulted by this version of the routine.
 */
void
smgr_redo(XLogRecPtr lsn, XLogRecord *record)
{
	uint8		opcode = record->xl_info & ~XLR_INFO_MASK;

	if (opcode != XLOG_SMGR_CREATE && opcode != XLOG_SMGR_TRUNCATE)
		elog(PANIC, "smgr_redo: unknown op code %u", opcode);
	else if (opcode == XLOG_SMGR_CREATE)
	{
		xl_smgr_create *createRec = (xl_smgr_create *) XLogRecGetData(record);
		SMgrRelation srel = smgropen(createRec->rnode);

		smgrcreate(srel, false, true);
	}
	else
	{
		xl_smgr_truncate *truncRec = (xl_smgr_truncate *) XLogRecGetData(record);
		SMgrRelation srel = smgropen(truncRec->rnode);
		BlockNumber resultBlocks;

		/*
		 * The relation may be missing, which suggests it was dropped
		 * somewhere later in the WAL sequence.  Like XLogOpenRelation, we
		 * recreate it and keep replaying as best we can until the drop
		 * record shows up.
		 */
		smgrcreate(srel, false, true);

		/* smgrtruncate is off limits here because it would try to xlog */

		/*
		 * Have bufmgr discard any buffers it holds for the blocks about to
		 * disappear.  Otherwise later XLogReadBuffer operations would not
		 * re-extend the file properly.
		 */
		DropRelFileNodeBuffers(truncRec->rnode, false, truncRec->blkno);

		/*
		 * Make the free space map forget the doomed blocks so that it cannot
		 * hand out bogus block numbers afterwards.
		 */
		FreeSpaceMapTruncateRel(&srel->smgr_rnode, truncRec->blkno);

		/* Perform the truncation through the storage manager switch */
		resultBlocks = smgrsw[srel->smgr_which].smgr_truncate(srel,
															  truncRec->blkno,
															  false);
		if (resultBlocks == InvalidBlockNumber)
			ereport(WARNING,
					(errcode_for_file_access(),
					 errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
							srel->smgr_rnode.spcNode,
							srel->smgr_rnode.dbNode,
							srel->smgr_rnode.relNode,
							truncRec->blkno)));

		/* xlogutils.c needs to hear about the new length as well */
		XLogTruncateRelation(truncRec->rnode, truncRec->blkno);
	}
}
/*
 * smgr_redo
 *		Replay a single storage-manager WAL record (fork-aware variant).
 *
 * XLOG_SMGR_CREATE recreates the fork named in the record.
 * XLOG_SMGR_TRUNCATE truncates the main fork and, when they exist, the
 * free space map and visibility map forks.  Any other op code is reported
 * with elog(PANIC).
 */
void
smgr_redo(XLogRecPtr lsn, XLogRecord *record)
{
	uint8		opcode = record->xl_info & ~XLR_INFO_MASK;
	xl_smgr_truncate *truncRec;
	SMgrRelation srel;
	Relation	fakeRel;

	/* smgr records never carry backup blocks */
	Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));

	if (opcode == XLOG_SMGR_CREATE)
	{
		xl_smgr_create *createRec = (xl_smgr_create *) XLogRecGetData(record);

		srel = smgropen(createRec->rnode, InvalidBackendId);
		smgrcreate(srel, createRec->forkNum, true);
		return;
	}

	if (opcode != XLOG_SMGR_TRUNCATE)
	{
		elog(PANIC, "smgr_redo: unknown op code %u", opcode);
		return;
	}

	truncRec = (xl_smgr_truncate *) XLogRecGetData(record);
	srel = smgropen(truncRec->rnode, InvalidBackendId);

	/*
	 * The relation may be missing, which suggests it was dropped somewhere
	 * later in the WAL sequence.  Like XLogReadBuffer, we recreate it and
	 * keep replaying as best we can until the drop record shows up.
	 */
	smgrcreate(srel, MAIN_FORKNUM, true);

	/*
	 * Advance the minimum recovery point past this WAL record before
	 * touching the file: a truncation cannot be undone.  For ordinary page
	 * updates the buffer manager enforces the WAL-first rule, so the minimum
	 * recovery point always moves before the data file change reaches disk;
	 * here the same thing has to be done by hand.
	 *
	 * The price of flushing first is that a failing truncation keeps the
	 * system from starting, even after a restart, until the underlying
	 * problem is fixed.  Updating the minimum recovery point afterwards
	 * would avoid that, but would open a small window in which the
	 * WAL-first rule could be violated.
	 */
	XLogFlush(lsn);

	smgrtruncate(srel, MAIN_FORKNUM, truncRec->blkno);

	/* xlogutils.c needs to hear about the new length as well */
	XLogTruncateRelation(truncRec->rnode, MAIN_FORKNUM, truncRec->blkno);

	/* The FSM and VM forks are cut back too, if present */
	fakeRel = CreateFakeRelcacheEntry(truncRec->rnode);

	if (smgrexists(srel, FSM_FORKNUM))
		FreeSpaceMapTruncateRel(fakeRel, truncRec->blkno);
	if (smgrexists(srel, VISIBILITYMAP_FORKNUM))
		visibilitymap_truncate(fakeRel, truncRec->blkno);

	FreeFakeRelcacheEntry(fakeRel);
}
/* * mdsync() -- Sync previous writes to stable storage. * * This is only called during checkpoints, and checkpoints should only * occur in processes that have created a pendingOpsTable. */ bool mdsync(void) { HASH_SEQ_STATUS hstat; PendingOperationEntry *entry; if (!pendingOpsTable) return false; /* * If we are in the bgwriter, the sync had better include all fsync * requests that were queued by backends before the checkpoint REDO point * was determined. We go that a little better by accepting all requests * queued up to the point where we start fsync'ing. */ AbsorbFsyncRequests(); hash_seq_init(&hstat, pendingOpsTable); while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) { /* * If fsync is off then we don't have to bother opening the file at * all. (We delay checking until this point so that changing fsync on * the fly behaves sensibly.) */ if (enableFsync) { SMgrRelation reln; MdfdVec *seg; /* * Find or create an smgr hash entry for this relation. This may * seem a bit unclean -- md calling smgr? But it's really the * best solution. It ensures that the open file reference isn't * permanently leaked if we get an error here. (You may say "but * an unreferenced SMgrRelation is still a leak!" Not really, * because the only case in which a checkpoint is done by a * process that isn't about to shut down is in the bgwriter, and * it will periodically do smgrcloseall(). This fact justifies * our not closing the reln in the success path either, which is a * good thing since in non-bgwriter cases we couldn't safely do * that.) Furthermore, in many cases the relation will have been * dirtied through this same smgr relation, and so we can save a * file open/close cycle. */ reln = smgropen(entry->rnode); /* * It is possible that the relation has been dropped or truncated * since the fsync request was entered. Therefore, we have to * allow file-not-found errors. 
This applies both during * _mdfd_getseg() and during FileSync, since fd.c might have * closed the file behind our back. */ seg = _mdfd_getseg(reln, entry->segno * ((BlockNumber) RELSEG_SIZE), true); if (seg) { if (FileSync(seg->mdfd_vfd) < 0 && errno != ENOENT) { ereport(LOG, (errcode_for_file_access(), errmsg("could not fsync segment %u of relation %u/%u/%u: %m", entry->segno, entry->rnode.spcNode, entry->rnode.dbNode, entry->rnode.relNode))); return false; } } } /* Okay, delete this entry */ if (hash_search(pendingOpsTable, entry, HASH_REMOVE, NULL) == NULL) elog(ERROR, "pendingOpsTable corrupted"); } return true; }
static int64 PersistentBuild_TruncateAllGpRelationNode(void) { Relation pg_database; HeapScanDesc scan; HeapTuple tuple; int64 count; pg_database = heap_open( DatabaseRelationId, AccessShareLock); /* * Truncate gp_relation_node and its index in each database. */ scan = heap_beginscan(pg_database, SnapshotNow, 0, NULL); count = 0; while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL) { Form_pg_database form_pg_database = (Form_pg_database)GETSTRUCT(tuple); Oid dbOid; Oid dattablespace; RelFileNode relFileNode; SMgrRelation smgrRelation; Page btree_metapage; dbOid = HeapTupleGetOid(tuple); dattablespace = form_pg_database->dattablespace; if (dbOid == HcatalogDbOid) continue; if (Debug_persistent_print) elog(Persistent_DebugPrintLevel(), "PersistentBuild_TruncateAllGpRelationNode: dbOid %u, '%s'", dbOid, form_pg_database->datname.data); if (Debug_persistent_print) elog(Persistent_DebugPrintLevel(), "Truncating gp_relation_node %u/%u/%u in database oid %u ('%s')", relFileNode.spcNode, relFileNode.dbNode, relFileNode.relNode, dbOid, form_pg_database->datname.data); relFileNode.spcNode = dattablespace; relFileNode.dbNode = dbOid; relFileNode.relNode = GpRelfileNodeRelationId; /* * Truncate WITHOUT generating an XLOG record (i.e. pretend it is a temp relation). */ PersistentBuild_NonTransactionTruncate(&relFileNode); count++; /* * And, the index. Unfortunately, the relfilenode OID can change due to a * REINDEX {TABLE|INDEX} command. */ PersistentBuild_FindGpRelationNodeIndex( dbOid, dattablespace, &relFileNode); if (Debug_persistent_print) elog(Persistent_DebugPrintLevel(), "Truncating gp_relation_node_index %u/%u/%u in database oid %u ('%s'). relfilenode different %s, tablespace different %s", relFileNode.spcNode, relFileNode.dbNode, relFileNode.relNode, dbOid, form_pg_database->datname.data, ((relFileNode.relNode != GpRelfileNodeOidIndexId) ? "true" : "false"), ((relFileNode.spcNode != dattablespace) ? 
"true" : "false")); PersistentBuild_NonTransactionTruncate(&relFileNode); // The BTree needs an empty meta-data block. smgrRelation = smgropen(relFileNode); btree_metapage = (Page)palloc(BLCKSZ); _bt_initmetapage(btree_metapage, P_NONE, 0); smgrwrite( smgrRelation, /* blockNum */ 0, (char*)btree_metapage, /* isTemp */ false); smgrimmedsync(smgrRelation); pfree(btree_metapage); smgrclose(smgrRelation); count++; } heap_endscan(scan); heap_close(pg_database, AccessShareLock); return count; }
/* * smgrDoPendingDeletes() -- Take care of relation deletes at end of xact. * * This also runs when aborting a subxact; we want to clean up a failed * subxact immediately. * * Note: It's possible that we're being asked to remove a relation that has * no physical storage in any fork. In particular, it's possible that we're * cleaning up an old temporary relation for which RemovePgTempFiles has * already recovered the physical storage. */ void smgrDoPendingDeletes(bool isCommit) { int nestLevel = GetCurrentTransactionNestLevel(); PendingRelDelete *pending; PendingRelDelete *prev; PendingRelDelete *next; int nrels = 0, i = 0, maxrels = 0; SMgrRelation *srels = NULL; prev = NULL; for (pending = pendingDeletes; pending != NULL; pending = next) { next = pending->next; if (pending->nestLevel < nestLevel) { /* outer-level entries should not be processed yet */ prev = pending; } else { /* unlink list entry first, so we don't retry on failure */ if (prev) prev->next = next; else pendingDeletes = next; /* do deletion if called for */ if (pending->atCommit == isCommit) { SMgrRelation srel; srel = smgropen(pending->relnode, pending->backend); /* allocate the initial array, or extend it, if needed */ if (maxrels == 0) { maxrels = 8; srels = palloc(sizeof(SMgrRelation) * maxrels); } else if (maxrels <= nrels) { maxrels *= 2; srels = repalloc(srels, sizeof(SMgrRelation) * maxrels); } srels[nrels++] = srel; } /* must explicitly free the list entry */ pfree(pending); /* prev does not change */ } } if (nrels > 0) { smgrdounlinkall(srels, nrels, false); for (i = 0; i < nrels; i++) smgrclose(srels[i]); pfree(srels); } }