/*
 * Replay deletion of tuples from the SP-GiST root (head) page.
 *
 * The WAL payload is a spgxlogVacuumRoot header immediately followed by
 * an array of the offset numbers to delete (already in sorted order).
 */
static void
spgRedoVacuumRoot(XLogRecPtr lsn, XLogRecord *record)
{
    char       *payload = XLogRecGetData(record);
    spgxlogVacuumRoot *xldata = (spgxlogVacuumRoot *) payload;
    OffsetNumber *victims;
    Buffer      buf;
    Page        pg;

    /* Offset array sits right after the fixed-size record header. */
    victims = (OffsetNumber *) (payload + sizeof(spgxlogVacuumRoot));

    /* A full-page backup block restores the page; nothing to replay. */
    if (record->xl_info & XLR_BKP_BLOCK_1)
        return;

    buf = XLogReadBuffer(xldata->node, SPGIST_HEAD_BLKNO, false);
    if (!BufferIsValid(buf))
        return;

    pg = BufferGetPage(buf);

    /* Apply the change only if the page hasn't already seen this LSN. */
    if (!XLByteLE(lsn, PageGetLSN(pg)))
    {
        /* The tuple numbers are in order */
        PageIndexMultiDelete(pg, victims, xldata->nDelete);

        PageSetLSN(pg, lsn);
        PageSetTLI(pg, ThisTimeLineID);
        MarkBufferDirty(buf);
    }
    UnlockReleaseBuffer(buf);
}
/* * Returns the replication apply delay in ms */ int GetReplicationApplyDelay(void) { /* use volatile pointer to prevent code rearrangement */ volatile WalRcvData *walrcv = WalRcv; XLogRecPtr receivePtr; XLogRecPtr replayPtr; long secs; int usecs; SpinLockAcquire(&walrcv->mutex); receivePtr = walrcv->receivedUpto; SpinLockRelease(&walrcv->mutex); replayPtr = GetXLogReplayRecPtr(NULL); if (XLByteLE(receivePtr, replayPtr)) return 0; TimestampDifference(GetCurrentChunkReplayStartTime(), GetCurrentTimestamp(), &secs, &usecs); return (((int) secs * 1000) + (usecs / 1000)); }
/*
 * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk,
 * but not yet sent to the client, and buffer it in the libpq output
 * buffer.
 *
 * msgbuf is a work area in which the output message is constructed.  It's
 * passed in just so we can avoid re-palloc'ing the buffer on each cycle.
 * It must be of size 1 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE.
 *
 * If there is no unsent WAL remaining, *caughtup is set to true, otherwise
 * *caughtup is set to false.
 */
static void
XLogSend(char *msgbuf, bool *caughtup)
{
    XLogRecPtr  SendRqstPtr;
    XLogRecPtr  startptr;
    XLogRecPtr  endptr;
    Size        nbytes;
    WalDataMessageHeader msghdr;

    /*
     * Attempt to send all data that's already been written out and fsync'd
     * to disk.  We cannot go further than what's been written out given the
     * current implementation of XLogRead().  And in any case it's unsafe to
     * send WAL that is not securely down to disk on the master: if the
     * master subsequently crashes and restarts, slaves must not have applied
     * any WAL that gets lost on the master.
     */
    SendRqstPtr = am_cascading_walsender ? GetStandbyFlushRecPtr() : GetFlushRecPtr();

    /* Quick exit if nothing to do */
    if (XLByteLE(SendRqstPtr, sentPtr))
    {
        *caughtup = true;
        return;
    }

    /*
     * Figure out how much to send in one message. If there's no more than
     * MAX_SEND_SIZE bytes to send, send everything. Otherwise send
     * MAX_SEND_SIZE bytes, but round back to logfile or page boundary.
     *
     * The rounding is not only for performance reasons. Walreceiver relies
     * on the fact that we never split a WAL record across two messages.
     * Since a long WAL record is split at page boundary into continuation
     * records, page boundary is always a safe cut-off point. We also assume
     * that SendRqstPtr never points to the middle of a WAL record.
     */
    startptr = sentPtr;
    if (startptr.xrecoff >= XLogFileSize)
    {
        /*
         * crossing a logid boundary, skip the non-existent last log segment
         * in previous logical log file.
         */
        startptr.xlogid += 1;
        startptr.xrecoff = 0;
    }

    endptr = startptr;
    XLByteAdvance(endptr, MAX_SEND_SIZE);
    if (endptr.xlogid != startptr.xlogid)
    {
        /* Don't cross a logfile boundary within one message */
        Assert(endptr.xlogid == startptr.xlogid + 1);
        endptr.xlogid = startptr.xlogid;
        endptr.xrecoff = XLogFileSize;
    }

    /* if we went beyond SendRqstPtr, back off */
    if (XLByteLE(SendRqstPtr, endptr))
    {
        endptr = SendRqstPtr;
        *caughtup = true;
    }
    else
    {
        /* round down to page boundary. */
        endptr.xrecoff -= (endptr.xrecoff % XLOG_BLCKSZ);
        *caughtup = false;
    }

    /*
     * startptr and endptr are guaranteed to lie in the same logical log
     * file at this point, so the subtraction of the xrecoff fields is the
     * full byte count of the slice.
     */
    nbytes = endptr.xrecoff - startptr.xrecoff;
    Assert(nbytes <= MAX_SEND_SIZE);

    /*
     * OK to read and send the slice.  'w' is the WAL-data message type in
     * the streaming-replication protocol.
     */
    msgbuf[0] = 'w';

    /*
     * Read the log directly into the output buffer to avoid extra memcpy
     * calls.
     */
    XLogRead(msgbuf + 1 + sizeof(WalDataMessageHeader), startptr, nbytes);

    /*
     * We fill the message header last so that the send timestamp is taken
     * as late as possible.
     */
    msghdr.dataStart = startptr;
    msghdr.walEnd = SendRqstPtr;
    msghdr.sendTime = GetCurrentTimestamp();

    memcpy(msgbuf + 1, &msghdr, sizeof(WalDataMessageHeader));

    pq_putmessage_noblock('d', msgbuf, 1 + sizeof(WalDataMessageHeader) + nbytes);

    /* Remember what we sent; only this process updates sentPtr directly. */
    sentPtr = endptr;

    /* Update shared memory status */
    {
        /* use volatile pointer to prevent code rearrangement */
        volatile WalSnd *walsnd = MyWalSnd;

        SpinLockAcquire(&walsnd->mutex);
        walsnd->sentPtr = sentPtr;
        SpinLockRelease(&walsnd->mutex);
    }

    /* Report progress of XLOG streaming in PS display */
    if (update_process_title)
    {
        char        activitymsg[50];

        snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%X",
                 sentPtr.xlogid, sentPtr.xrecoff);
        set_ps_display(activitymsg, false);
    }

    return;
}
/*
 * Incrementally resynchronize buffer-pool relations to the mirror, driven
 * by change-tracking records.  Repeatedly fetches batches of changed
 * (relation, block, lsn_end) entries via ChangeTracking_GetChanges() and
 * writes qualifying blocks to the mirror with smgrwrite().
 *
 * Returns STATUS_OK on success, STATUS_ERROR if mirror data loss was
 * detected or an entry update failed.
 */
static int
FileRepPrimary_ResyncBufferPoolIncrementalWrite(ChangeTrackingRequest *request)
{
    int         status = STATUS_OK;
    Page        page;
    Buffer      buf;
    BlockNumber numBlocks = 0;
    SMgrRelation smgr_relation = NULL;
    char        relidstr[OIDCHARS + 1 + OIDCHARS + 1 + OIDCHARS + 1];
    int         ii;
    XLogRecPtr  loc;            /* LSN currently on the page being examined */
    XLogRecPtr  loc1;           /* lsn_end recorded by change tracking */
    int         count = 0;      /* blocks processed since last signal check */
    int         thresholdCount = 0; /* check signals every this many blocks */
    bool        mirrorDataLossOccurred = FALSE;
    int         NumberOfRelations = request->count;
    FileRepResyncHashEntry_s entry;
    ChangeTrackingResult *result = NULL;

    while (1)
    {
        /* allow flushing buffers from buffer pool during scan */
        FileRepResync_SetReadBufferRequest();
        if ((result = ChangeTracking_GetChanges(request)) != NULL)
        {
            FileRepResync_ResetReadBufferRequest();

            for (ii = 0; ii < result->count; ii++)
            {
                /*
                 * Entries for one relation are contiguous; open the
                 * relation on the first entry of each run.
                 */
                if (smgr_relation == NULL)
                {
                    NumberOfRelations--;

                    smgr_relation = smgropen(result->entries[ii].relFileNode);

                    snprintf(relidstr, sizeof(relidstr), "%u/%u/%u",
                             smgr_relation->smgr_rnode.spcNode,
                             smgr_relation->smgr_rnode.dbNode,
                             smgr_relation->smgr_rnode.relNode);

                    numBlocks = smgrnblocks(smgr_relation);

                    if (Debug_filerep_print)
                        elog(LOG, "resynchronize buffer pool relation '%u/%u/%u' "
                             "number of blocks:'%u' ",
                             smgr_relation->smgr_rnode.spcNode,
                             smgr_relation->smgr_rnode.dbNode,
                             smgr_relation->smgr_rnode.relNode,
                             numBlocks);

                    thresholdCount = Min(numBlocks, 1024);
                }

                loc1 = result->entries[ii].lsn_end;

                /*
                 * if relation was truncated then block_num from change
                 * tracking can be beyond numBlocks
                 */
                if (result->entries[ii].block_num >= numBlocks)
                {
                    ereport(LOG,
                            (errmsg("could not resynchonize buffer pool relation '%s' block '%d' (maybe due to truncate), "
                                    "lsn change tracking '%s(%u/%u)' "
                                    "number of blocks '%d' ",
                                    relidstr,
                                    result->entries[ii].block_num,
                                    XLogLocationToString(&loc1),
                                    loc1.xlogid,
                                    loc1.xrecoff,
                                    numBlocks),
                             FileRep_errcontext()));
                    /* Skip the write but still run the per-relation flush check. */
                    goto flush_check;
                }

                /* allow flushing buffers from buffer pool during scan */
                FileRepResync_SetReadBufferRequest();
                buf = ReadBuffer_Resync(smgr_relation,
                                        result->entries[ii].block_num,
                                        relidstr);
                FileRepResync_ResetReadBufferRequest();

                Assert(result->entries[ii].block_num < numBlocks);

                LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
                page = BufferGetPage(buf);
                loc = PageGetLSN(page);

                if (Debug_filerep_print)
                {
                    elog(LOG, "incremental resync buffer pool identifier '%s' num blocks '%d' blkno '%d' lsn page '%s(%u/%u)' "
                         "lsn end change tracking '%s(%u/%u)' ",
                         relidstr,
                         numBlocks,
                         result->entries[ii].block_num,
                         XLogLocationToString(&loc),
                         loc.xlogid,
                         loc.xrecoff,
                         XLogLocationToString(&loc1),
                         result->entries[ii].lsn_end.xlogid,
                         result->entries[ii].lsn_end.xrecoff);
                }
                else
                {
                    char tmpBuf[FILEREP_MAX_LOG_DESCRIPTION_LEN];

                    snprintf(tmpBuf, sizeof(tmpBuf),
                             "incremental resync buffer pool identifier '%s' num blocks '%d' blkno '%d' lsn page '%s(%u/%u)' ",
                             relidstr,
                             numBlocks,
                             result->entries[ii].block_num,
                             XLogLocationToString(&loc),
                             loc.xlogid,
                             loc.xrecoff);
                    FileRep_InsertConfigLogEntry(tmpBuf);

                    snprintf(tmpBuf, sizeof(tmpBuf),
                             "incremental resync buffer pool identifier '%s' lsn end change tracking '%s(%u/%u)' ",
                             relidstr,
                             XLogLocationToString(&loc1),
                             result->entries[ii].lsn_end.xlogid,
                             result->entries[ii].lsn_end.xrecoff);
                    FileRep_InsertConfigLogEntry(tmpBuf);
                }

                /* Write the block if the page has advanced to/past the CT lsn_end. */
                if (XLByteLE(result->entries[ii].lsn_end, PageGetLSN(page)))
                {
                    if (! XLByteEQ(PageGetLSN(page), result->entries[ii].lsn_end))
                    {
                        /*
                         * NOTE(review): the condition above means the page
                         * LSN is GREATER than the CT lsn_end, but the text
                         * below says "less than"; the "lsn end change
                         * tracking"/"lsn page" labels also appear swapped
                         * relative to the loc/loc1 arguments — verify
                         * against upstream before relying on this message.
                         */
                        ereport(LOG,
                                (errmsg("Resynchonize buffer pool relation '%s' block '%d' has page lsn less than CT lsn, "
                                        "lsn end change tracking '%s(%u/%u)' lsn page '%s(%u/%u)' "
                                        "number of blocks '%d'",
                                        relidstr,
                                        result->entries[ii].block_num,
                                        XLogLocationToString(&loc),
                                        loc.xlogid,
                                        loc.xrecoff,
                                        XLogLocationToString(&loc1),
                                        loc1.xlogid,
                                        loc1.xrecoff,
                                        numBlocks),
                                 FileRep_errcontext()));
                    }

                    /*
                     * It's safe and better to perform write of the page to
                     * mirror, for this case, as primary and mirror data
                     * pages should always be same. So, we might do some
                     * extra work but definitely won't loose out blocks, or
                     * error out and need to perform full recovery.
                     * Need to cover for this case as there are some known
                     * scenarios where CT file can have extra records which
                     * should have been discarded, but as we loose out
                     * information of xlog LSN cannot be discarded.
                     * One such case is when CT_TRANSIENT being compacted to
                     * CT_COMPACT with specific xlog LSN (to discard extra
                     * records) in CT mode gets interrupted by resync.
                     * Compaction during Resync collects all the CT records
                     * and doesn't have xlog LSN information to discard any
                     * extra records from CT_TRANSIENT.
                     */
                    smgrwrite(smgr_relation,
                              result->entries[ii].block_num,
                              (char *)BufferGetBlock(buf),
                              FALSE);
                }

#ifdef FAULT_INJECTOR
                FaultInjector_InjectFaultIfSet(
                                               FileRepResyncWorker,
                                               DDLNotSpecified,
                                               "",  // databaseName
                                               ""); // tableName
#endif

                UnlockReleaseBuffer(buf);

#ifdef FAULT_INJECTOR
                FaultInjector_InjectFaultIfSet(
                                               FileRepResyncWorker,
                                               DDLNotSpecified,
                                               "",  // databaseName
                                               ""); // tableName
#endif

flush_check:
                /*
                 * On the last entry for this relation (end of batch, or the
                 * next entry is for a different relfilenode), sync and close
                 * the relation and update its resync hash entry.
                 */
                if (((ii + 1) == result->count) ||
                    ! (result->entries[ii].relFileNode.spcNode == result->entries[ii+1].relFileNode.spcNode &&
                       result->entries[ii].relFileNode.dbNode == result->entries[ii+1].relFileNode.dbNode &&
                       result->entries[ii].relFileNode.relNode == result->entries[ii+1].relFileNode.relNode))
                {
                    if (result->ask_for_more == false)
                    {
                        smgrimmedsync(smgr_relation);

                        smgrclose(smgr_relation);

                        smgr_relation = NULL;

                        FileRep_GetRelationPath(
                                                entry.fileName,
                                                result->entries[ii].relFileNode,
                                                0 /* segment file number is always 0 for Buffer Pool */);

                        status = FileRepResync_UpdateEntry(&entry);
                        if (status != STATUS_OK)
                        {
                            break;
                        }
                    }
                }

                /*
                 * Periodically re-check subprocess state so a transition out
                 * of resync is noticed; treat it as mirror data loss.
                 */
                if (count > thresholdCount)
                {
                    count = 0;
                    FileRepSubProcess_ProcessSignals();

                    if (! (FileRepSubProcess_GetState() == FileRepStateReady &&
                           dataState == DataStateInResync))
                    {
                        mirrorDataLossOccurred = TRUE;
                        break;
                    }
                }
                else
                    count++;
            } // for (ii = 0; ii < result->count; ii++)
        } // if ((result = ChangeTracking_GetChanges(request)) != NULL)

        FileRepResync_ResetReadBufferRequest();

        /* Continue from where the last batch left off, if more is available. */
        if (result != NULL && result->ask_for_more == true)
        {
            Assert(request->count == 1);
            request->entries[0].lsn_start = result->next_start_lsn;
        }
        else
        {
            break;
        }
    } // while(1)

    ChangeTracking_FreeRequest(request);
    ChangeTracking_FreeResult(result);

    Insist(NumberOfRelations == 0);

    if (mirrorDataLossOccurred)
        status = STATUS_ERROR;

    return status;
}
/*
 * Resynchronize one relation (described by its resync hash entry) to the
 * mirror.  Buffer-pool relations are copied block by block from the
 * checkpoint block onward; append-only relations are copied as a byte
 * range [mirrorAppendOnlyLossEof, mirrorAppendOnlyNewEof).
 *
 * Returns STATUS_OK, or STATUS_ERROR if mirror data loss was detected
 * (buffer-pool path only — see NOTE(review) below).
 */
static int
FileRepPrimary_ResyncWrite(FileRepResyncHashEntry_s *entry)
{
    int         status = STATUS_OK;
    Page        page;
    Buffer      buf;
    BlockNumber numBlocks;
    BlockNumber blkno;
    SMgrRelation smgr_relation;
    char        relidstr[OIDCHARS + 1 + OIDCHARS + 1 + OIDCHARS + 1];
    XLogRecPtr  loc;            /* LSN of the page being examined */
    int         count = 0;      /* blocks processed since last signal check */
    int         thresholdCount = 0; /* check signals every this many blocks */
    bool        mirrorDataLossOccurred = FALSE;

    switch (entry->relStorageMgr)
    {
        case PersistentFileSysRelStorageMgr_BufferPool:

            switch (entry->mirrorDataSynchronizationState)
            {
                case MirroredRelDataSynchronizationState_BufferPoolScanIncremental:
                case MirroredRelDataSynchronizationState_FullCopy:

                    smgr_relation = smgropen(entry->relFileNode);

                    numBlocks = smgrnblocks(smgr_relation);

                    snprintf(relidstr, sizeof(relidstr), "%u/%u/%u",
                             smgr_relation->smgr_rnode.spcNode,
                             smgr_relation->smgr_rnode.dbNode,
                             smgr_relation->smgr_rnode.relNode);

                    if (Debug_filerep_print)
                        elog(LOG, "resync buffer pool relation '%s' number of blocks '%d' ",
                             relidstr, numBlocks);

                    thresholdCount = Min(numBlocks, 1024);

                    /*
                     * required in order to report how many blocks were
                     * synchronized if gp_persistent_relation_node does not
                     * return that information
                     */
                    if (entry->mirrorBufpoolResyncChangedPageCount == 0)
                    {
                        entry->mirrorBufpoolResyncChangedPageCount = numBlocks - entry->mirrorBufpoolResyncCkptBlockNum;
                    }

                    for (blkno = entry->mirrorBufpoolResyncCkptBlockNum; blkno < numBlocks; blkno++)
                    {
                        XLogRecPtr  endResyncLSN = (isFullResync() ?
                                                    FileRepResync_GetEndFullResyncLSN() :
                                                    FileRepResync_GetEndIncrResyncLSN());

#ifdef FAULT_INJECTOR
                        FaultInjector_InjectFaultIfSet(
                                                       FileRepResyncWorkerRead,
                                                       DDLNotSpecified,
                                                       "",  //databaseName
                                                       ""); // tableName
#endif

                        FileRepResync_SetReadBufferRequest();
                        buf = ReadBuffer_Resync(smgr_relation, blkno, relidstr);
                        FileRepResync_ResetReadBufferRequest();

                        LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
                        page = BufferGetPage(buf);

                        loc = PageGetLSN(page);

                        if (Debug_filerep_print)
                        {
                            elog(LOG, "full resync buffer pool identifier '%s' num blocks '%d' blkno '%d' lsn begin change tracking '%s(%u/%u)' "
                                 "lsn page '%s(%u/%u)' lsn end change tracking '%s(%u/%u)' ",
                                 relidstr,
                                 numBlocks,
                                 blkno,
                                 XLogLocationToString(&entry->mirrorBufpoolResyncCkptLoc),
                                 entry->mirrorBufpoolResyncCkptLoc.xlogid,
                                 entry->mirrorBufpoolResyncCkptLoc.xrecoff,
                                 XLogLocationToString(&loc),
                                 loc.xlogid,
                                 loc.xrecoff,
                                 XLogLocationToString(&endResyncLSN),
                                 endResyncLSN.xlogid,
                                 endResyncLSN.xrecoff);
                        }
                        else
                        {
                            char tmpBuf[FILEREP_MAX_LOG_DESCRIPTION_LEN];

                            snprintf(tmpBuf, sizeof(tmpBuf),
                                     "full resync buffer pool identifier '%s' num blocks '%d' blkno '%d' lsn begin change tracking '%s(%u/%u)' ",
                                     relidstr,
                                     numBlocks,
                                     blkno,
                                     XLogLocationToString(&entry->mirrorBufpoolResyncCkptLoc),
                                     entry->mirrorBufpoolResyncCkptLoc.xlogid,
                                     entry->mirrorBufpoolResyncCkptLoc.xrecoff);
                            FileRep_InsertConfigLogEntry(tmpBuf);

                            snprintf(tmpBuf, sizeof(tmpBuf),
                                     "full resync buffer pool identifier '%s' lsn page '%s(%u/%u)' lsn end change tracking '%s(%u/%u)' ",
                                     relidstr,
                                     XLogLocationToString(&loc),
                                     loc.xlogid,
                                     loc.xrecoff,
                                     XLogLocationToString(&endResyncLSN),
                                     endResyncLSN.xlogid,
                                     endResyncLSN.xrecoff);
                            FileRep_InsertConfigLogEntry(tmpBuf);
                        }

                        /*
                         * Write only pages whose LSN falls inside the resync
                         * window [checkpoint loc, endResyncLSN].
                         */
                        if (XLByteLE(PageGetLSN(page), endResyncLSN) &&
                            XLByteLE(entry->mirrorBufpoolResyncCkptLoc, PageGetLSN(page)))
                        {
                            smgrwrite(smgr_relation,
                                      blkno,
                                      (char *)BufferGetBlock(buf),
                                      FALSE);
                        }

#ifdef FAULT_INJECTOR
                        FaultInjector_InjectFaultIfSet(
                                                       FileRepResyncWorker,
                                                       DDLNotSpecified,
                                                       "",  // databaseName
                                                       ""); // tableName
#endif

                        UnlockReleaseBuffer(buf);

                        /*
                         * Periodically re-check subprocess state; leaving
                         * resync is treated as mirror data loss.
                         */
                        if (count > thresholdCount)
                        {
                            count = 0;
                            FileRepSubProcess_ProcessSignals();

                            if (! (FileRepSubProcess_GetState() == FileRepStateReady &&
                                   dataState == DataStateInResync))
                            {
                                mirrorDataLossOccurred = TRUE;
                                break;
                            }
                        }
                        else
                            count++;
                    }

                    if (mirrorDataLossOccurred)
                        break;

                    /*
                     * For incremental resync, truncate the mirror copy to
                     * the primary's current length.
                     */
                    if (entry->mirrorDataSynchronizationState != MirroredRelDataSynchronizationState_FullCopy)
                    {
                        LockRelationForResyncExtension(&smgr_relation->smgr_rnode, ExclusiveLock);

                        numBlocks = smgrnblocks(smgr_relation);

                        smgrtruncate(smgr_relation,
                                     numBlocks,
                                     TRUE /* isTemp, TRUE means to not record in XLOG */,
                                     FALSE /* isLocalBuf */,
                                     &entry->persistentTid,
                                     entry->persistentSerialNum);

                        UnlockRelationForResyncExtension(&smgr_relation->smgr_rnode, ExclusiveLock);
                    }

                    smgrimmedsync(smgr_relation);

                    smgrclose(smgr_relation);

                    smgr_relation = NULL;
                    break;

                case MirroredRelDataSynchronizationState_None:
                case MirroredRelDataSynchronizationState_DataSynchronized:
                    break;

                default:
                    /*
                     * NOTE(review): the relfilenode fields are printed as
                     * relNode, spcNode, dbNode here, unlike the spc/db/rel
                     * order used elsewhere in this file — confirm intended.
                     */
                    ereport(LOG,
                            (errmsg("could not resynchronize relation '%u/%u/%u' "
                                    "mirror synchronization state:'%s(%d)' ",
                                    entry->relFileNode.relNode,
                                    entry->relFileNode.spcNode,
                                    entry->relFileNode.dbNode,
                                    MirroredRelDataSynchronizationState_Name(entry->mirrorDataSynchronizationState),
                                    entry->mirrorDataSynchronizationState)));
                    break;
            }
            break;

        case PersistentFileSysRelStorageMgr_AppendOnly:
        {
            MirroredAppendOnlyOpen mirroredOpen;
            int         primaryError;
            /*
             * NOTE(review): this declaration shadows the function-level
             * mirrorDataLossOccurred, so AO-path data loss never reaches the
             * final status check at the bottom of this function — verify
             * whether that is intended.
             */
            bool        mirrorDataLossOccurred;
            char       *buffer = NULL;
            int64       endOffset = entry->mirrorAppendOnlyNewEof;
            int64       startOffset = entry->mirrorAppendOnlyLossEof;
            int32       bufferLen = 0;
            int         retval = 0;

            switch (entry->mirrorDataSynchronizationState)
            {
                case MirroredRelDataSynchronizationState_AppendOnlyCatchup:
                case MirroredRelDataSynchronizationState_FullCopy:

                    /*
                     * required in order to report how many blocks were
                     * synchronized if gp_persistent_relation_node does not
                     * return that information
                     */
                    if (entry->mirrorBufpoolResyncChangedPageCount == 0)
                    {
                        entry->mirrorBufpoolResyncChangedPageCount = (endOffset - startOffset) / BLCKSZ;
                    }

                    /*
                     * The MirroredAppendOnly_OpenResynchonize routine knows
                     * we are a resynch worker and will open BOTH, but write
                     * only the MIRROR!!!
                     */
                    MirroredAppendOnly_OpenResynchonize(
                                                        &mirroredOpen,
                                                        &entry->relFileNode,
                                                        entry->segmentFileNum,
                                                        startOffset,
                                                        &primaryError,
                                                        &mirrorDataLossOccurred);
                    if (primaryError != 0)
                    {
                        ereport(ERROR,
                                (errcode_for_file_access(),
                                 errmsg("could not open file %u/%u/%u.%u : %s",
                                        entry->relFileNode.dbNode,
                                        entry->relFileNode.spcNode,
                                        entry->relFileNode.relNode,
                                        entry->segmentFileNum,
                                        strerror(primaryError))));

                        break;
                    }

                    if (mirrorDataLossOccurred)
                        break;

                    /* AO and CO Data Store writes 64k size by default */
                    bufferLen = (Size) Min(2*BLCKSZ, endOffset - startOffset);
                    buffer = (char*) palloc(bufferLen);
                    if (buffer == NULL)
                        ereport(ERROR,
                                (errcode(ERRCODE_OUT_OF_MEMORY),
                                 (errmsg("not enough memory for resynchronization"))));

                    MemSet(buffer, 0, bufferLen);

                    /* Copy [startOffset, endOffset) to the mirror in bufferLen chunks. */
                    while (startOffset < endOffset)
                    {
                        retval = MirroredAppendOnly_Read(
                                                         &mirroredOpen,
                                                         buffer,
                                                         bufferLen);

                        if (retval != bufferLen)
                        {
                            ereport(ERROR,
                                    (errcode_for_file_access(),
                                     errmsg("could not read from position:" INT64_FORMAT " in file %u/%u/%u.%u : %m",
                                            startOffset,
                                            entry->relFileNode.dbNode,
                                            entry->relFileNode.spcNode,
                                            entry->relFileNode.relNode,
                                            entry->segmentFileNum)));

                            break;
                        }

                        MirroredAppendOnly_Append(
                                                  &mirroredOpen,
                                                  buffer,
                                                  bufferLen,
                                                  &primaryError,
                                                  &mirrorDataLossOccurred);

                        if (mirrorDataLossOccurred)
                            break;

                        Assert(primaryError == 0);  // No primary writes as resync worker.

                        startOffset += bufferLen;
                        /* AO and CO Data Store writes 64k size by default */
                        bufferLen = (Size) Min(2*BLCKSZ, endOffset - startOffset);
                    }

                    if (buffer)
                    {
                        pfree(buffer);
                        buffer = NULL;
                    }

                    if (mirrorDataLossOccurred)
                        break;

                    /* Flush written data on Mirror */
                    MirroredAppendOnly_Flush(
                                             &mirroredOpen,
                                             &primaryError,
                                             &mirrorDataLossOccurred);
                    if (mirrorDataLossOccurred)
                        break;

                    Assert(primaryError == 0);  // Not flushed on primary as resync worker.

                    /* Close Primary and Mirror */
                    MirroredAppendOnly_Close(
                                             &mirroredOpen,
                                             &mirrorDataLossOccurred);

                    break;

                case MirroredRelDataSynchronizationState_None:
                case MirroredRelDataSynchronizationState_DataSynchronized:
                    break;

                default:
                    ereport(LOG,
                            (errmsg("could not resynchronize relation '%u/%u/%u' "
                                    "mirror synchronization state:'%s(%d)' ",
                                    entry->relFileNode.relNode,
                                    entry->relFileNode.spcNode,
                                    entry->relFileNode.dbNode,
                                    MirroredRelDataSynchronizationState_Name(entry->mirrorDataSynchronizationState),
                                    entry->mirrorDataSynchronizationState)));
                    break;
            }

            break;
        }   //case

        default:
            Assert(0);
            break;
    }   //switch

    if (mirrorDataLossOccurred)
        status = STATUS_ERROR;

    return status;
}
/*
 * CheckPointTwoPhase -- handle 2PC component of checkpointing.
 *
 * We must fsync the state file of any GXACT that is valid and has a PREPARE
 * LSN <= the checkpoint's redo horizon.  (If the gxact isn't valid yet or
 * has a later LSN, this checkpoint is not responsible for fsyncing it.)
 *
 * This is deliberately run as late as possible in the checkpoint sequence,
 * because GXACTs ordinarily have short lifespans, and so it is quite
 * possible that GXACTs that were valid at checkpoint start will no longer
 * exist if we wait a little bit.
 *
 * If a GXACT remains valid across multiple checkpoints, it'll be fsynced
 * each time; that redundancy is accepted as cheap (the repeat fsyncs cause
 * no real I/O).
 *
 * We collect the XIDs under TwoPhaseStateLock but perform all file I/O
 * after releasing it.  That opens a race: another backend may remove a
 * GXACT's state file before we open it.  An ENOENT open failure is
 * therefore tolerated iff the GXACT is no longer prepared.
 */
void
CheckPointTwoPhase(XLogRecPtr redo_horizon)
{
    TransactionId *pendingXids;
    int         numPending = 0;
    char        path[MAXPGPATH];
    int         idx;

    if (max_prepared_xacts <= 0)
        return;                 /* nothing to do */

    TRACE_POSTGRESQL_TWOPHASE_CHECKPOINT_START();

    pendingXids = (TransactionId *) palloc(max_prepared_xacts * sizeof(TransactionId));

    /* Pass 1: under shared lock, list the XIDs whose files need fsync. */
    LWLockAcquire(TwoPhaseStateLock, LW_SHARED);
    for (idx = 0; idx < TwoPhaseState->numPrepXacts; idx++)
    {
        GlobalTransaction gxact = TwoPhaseState->prepXacts[idx];

        if (gxact->valid && XLByteLE(gxact->prepare_lsn, redo_horizon))
            pendingXids[numPending++] = gxact->proc.xid;
    }
    LWLockRelease(TwoPhaseStateLock);

    /* Pass 2: open and fsync each state file, lock no longer held. */
    for (idx = 0; idx < numPending; idx++)
    {
        TransactionId xid = pendingXids[idx];
        int         fd;

        TwoPhaseFilePath(path, xid);

        fd = BasicOpenFile(path, O_RDWR | PG_BINARY, 0);
        if (fd < 0)
        {
            if (errno == ENOENT)
            {
                /* OK if gxact is no longer valid */
                if (!TransactionIdIsPrepared(xid))
                    continue;
                /* Restore errno in case it was changed */
                errno = ENOENT;
            }
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not open two-phase state file \"%s\": %m",
                            path)));
        }

        if (pg_fsync(fd) != 0)
        {
            close(fd);
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not fsync two-phase state file \"%s\": %m",
                            path)));
        }

        if (close(fd) != 0)
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not close two-phase state file \"%s\": %m",
                            path)));
    }

    pfree(pendingXids);

    TRACE_POSTGRESQL_TWOPHASE_CHECKPOINT_DONE();
}
/*
 * Wait for synchronous replication, if requested by user.
 *
 * Initially backends start in state SYNC_REP_NOT_WAITING and then
 * change that state to SYNC_REP_WAITING before adding ourselves
 * to the wait queue. During SyncRepWakeQueue() a WALSender changes
 * the state to SYNC_REP_WAIT_COMPLETE once replication is confirmed.
 * This backend then resets its state to SYNC_REP_NOT_WAITING.
 */
void
SyncRepWaitForLSN(XLogRecPtr XactCommitLSN)
{
    char       *new_status = NULL;
    const char *old_status;

    /*
     * Fast exit if user has not requested sync replication, or there are no
     * sync replication standby names defined. Note that those standbys
     * don't need to be connected.
     */
    if (!SyncRepRequested() || !SyncStandbysDefined())
        return;

    Assert(SHMQueueIsDetached(&(MyProc->syncRepLinks)));
    Assert(WalSndCtl != NULL);

    /* Reset the latch before adding ourselves to the queue. */
    ResetLatch(&MyProc->waitLatch);

    LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
    Assert(MyProc->syncRepState == SYNC_REP_NOT_WAITING);

    /*
     * We don't wait for sync rep if WalSndCtl->sync_standbys_defined is not
     * set. See SyncRepUpdateSyncStandbysDefined.
     *
     * Also check that the standby hasn't already replied. Unlikely race
     * condition but we'll be fetching that cache line anyway so its likely
     * to be a low cost check.
     */
    if (!WalSndCtl->sync_standbys_defined ||
        XLByteLE(XactCommitLSN, WalSndCtl->lsn))
    {
        LWLockRelease(SyncRepLock);
        return;
    }

    /*
     * Set our waitLSN so WALSender will know when to wake us, and add
     * ourselves to the queue.
     */
    MyProc->waitLSN = XactCommitLSN;
    MyProc->syncRepState = SYNC_REP_WAITING;
    SyncRepQueueInsert();
    Assert(SyncRepQueueIsOrderedByLSN());
    LWLockRelease(SyncRepLock);

    /* Alter ps display to show waiting for sync rep. */
    if (update_process_title)
    {
        int         len;

        old_status = get_ps_display(&len);
        new_status = (char *) palloc(len + 32 + 1);
        memcpy(new_status, old_status, len);
        sprintf(new_status + len, " waiting for %X/%X",
                XactCommitLSN.xlogid, XactCommitLSN.xrecoff);
        set_ps_display(new_status, false);
        /* Keep only the original prefix for the final reset below. */
        new_status[len] = '\0'; /* truncate off " waiting ..." */
    }

    /*
     * Wait for specified LSN to be confirmed.
     *
     * Each proc has its own wait latch, so we perform a normal latch
     * check/wait loop here.
     */
    for (;;)
    {
        int         syncRepState;

        /*
         * Wait on latch for up to 60 seconds. This allows us to check for
         * postmaster death regularly while waiting. Note that timeout here
         * does not necessarily release from loop.
         */
        WaitLatch(&MyProc->waitLatch, 60000000L);

        /* Must reset the latch before testing state. */
        ResetLatch(&MyProc->waitLatch);

        /*
         * Try checking the state without the lock first. There's no
         * guarantee that we'll read the most up-to-date value, so if it
         * looks like we're still waiting, recheck while holding the lock.
         * But if it looks like we're done, we must really be done, because
         * once walsender changes the state to SYNC_REP_WAIT_COMPLETE, it
         * will never update it again, so we can't be seeing a stale value
         * in that case.
         */
        syncRepState = MyProc->syncRepState;
        if (syncRepState == SYNC_REP_WAITING)
        {
            LWLockAcquire(SyncRepLock, LW_SHARED);
            syncRepState = MyProc->syncRepState;
            LWLockRelease(SyncRepLock);
        }
        if (syncRepState == SYNC_REP_WAIT_COMPLETE)
            break;

        /*
         * If a wait for synchronous replication is pending, we can neither
         * acknowledge the commit nor raise ERROR or FATAL. The latter would
         * lead the client to believe that that the transaction aborted,
         * which is not true: it's already committed locally. The former is
         * no good either: the client has requested synchronous replication,
         * and is entitled to assume that an acknowledged commit is also
         * replicated, which may not be true. So in this case we issue a
         * WARNING (which some clients may be able to interpret) and shut
         * off further output. We do NOT reset ProcDiePending, so that the
         * process will die after the commit is cleaned up.
         */
        if (ProcDiePending)
        {
            ereport(WARNING,
                    (errcode(ERRCODE_ADMIN_SHUTDOWN),
                     errmsg("canceling the wait for synchronous replication and terminating connection due to administrator command"),
                     errdetail("The transaction has already committed locally, but may not have been replicated to the standby.")));
            whereToSendOutput = DestNone;
            SyncRepCancelWait();
            break;
        }

        /*
         * It's unclear what to do if a query cancel interrupt arrives. We
         * can't actually abort at this point, but ignoring the interrupt
         * altogether is not helpful, so we just terminate the wait with a
         * suitable warning.
         */
        if (QueryCancelPending)
        {
            QueryCancelPending = false;
            ereport(WARNING,
                    (errmsg("canceling wait for synchronous replication due to user request"),
                     errdetail("The transaction has already committed locally, but may not have been replicated to the standby.")));
            SyncRepCancelWait();
            break;
        }

        /*
         * If the postmaster dies, we'll probably never get an
         * acknowledgement, because all the wal sender processes will exit.
         * So just bail out.
         */
        if (!PostmasterIsAlive(true))
        {
            ProcDiePending = true;
            whereToSendOutput = DestNone;
            SyncRepCancelWait();
            break;
        }
    }

    /*
     * WalSender has checked our LSN and has removed us from queue. Clean up
     * state and leave.  It's OK to reset these shared memory fields without
     * holding SyncRepLock, because any walsenders will ignore us anyway
     * when we're not on the queue.
     */
    Assert(SHMQueueIsDetached(&(MyProc->syncRepLinks)));
    MyProc->syncRepState = SYNC_REP_NOT_WAITING;
    MyProc->waitLSN.xlogid = 0;
    MyProc->waitLSN.xrecoff = 0;

    if (new_status)
    {
        /* Reset ps display */
        set_ps_display(new_status, false);
        pfree(new_status);
    }
}
/*
 * redo any page update (except page split)
 *
 * The record payload is a gistxlogPageUpdate header, then an array of
 * offset numbers to delete, then zero or more index tuples to (re)insert.
 */
static void
gistRedoPageUpdateRecord(XLogRecPtr lsn, XLogRecord *record)
{
    char       *begin = XLogRecGetData(record);
    gistxlogPageUpdate *xldata = (gistxlogPageUpdate *) begin;
    Buffer      buffer;
    Page        page;
    char       *data;

    /*
     * The follow-right flag on the left child must be cleared even when the
     * updated page itself was logged as a backup block.
     */
    if (BlockNumberIsValid(xldata->leftchild))
        gistRedoClearFollowRight(xldata->node, lsn, xldata->leftchild);

    /* nothing more to do if page was backed up (and no info to do it with) */
    if (record->xl_info & XLR_BKP_BLOCK_1)
        return;

    buffer = XLogReadBuffer(xldata->node, xldata->blkno, false);
    if (!BufferIsValid(buffer))
        return;
    page = (Page) BufferGetPage(buffer);

    /* Page already reflects this record; nothing to replay. */
    if (XLByteLE(lsn, PageGetLSN(page)))
    {
        UnlockReleaseBuffer(buffer);
        return;
    }

    data = begin + sizeof(gistxlogPageUpdate);

    /* Delete old tuples */
    if (xldata->ntodelete > 0)
    {
        int         i;
        OffsetNumber *todelete = (OffsetNumber *) data;

        data += sizeof(OffsetNumber) * xldata->ntodelete;

        for (i = 0; i < xldata->ntodelete; i++)
            PageIndexTupleDelete(page, todelete[i]);
        if (GistPageIsLeaf(page))
            GistMarkTuplesDeleted(page);
    }

    /* add tuples */
    if (data - begin < record->xl_len)
    {
        /* Append after the current last item (or at start if page is empty). */
        OffsetNumber off = (PageIsEmpty(page)) ? FirstOffsetNumber :
            OffsetNumberNext(PageGetMaxOffsetNumber(page));

        while (data - begin < record->xl_len)
        {
            IndexTuple  itup = (IndexTuple) data;
            Size        sz = IndexTupleSize(itup);
            OffsetNumber l;

            data += sz;

            l = PageAddItem(page, (Item) itup, sz, off, false, false);
            if (l == InvalidOffsetNumber)
                elog(ERROR, "failed to add item to GiST index page, size %d bytes",
                     (int) sz);
            off++;
        }
    }
    else
    {
        /*
         * special case: leafpage, nothing to insert, nothing to delete,
         * then vacuum marks page
         */
        if (GistPageIsLeaf(page) && xldata->ntodelete == 0)
            GistClearTuplesDeleted(page);
    }

    /*
     * all links on non-leaf root page was deleted by vacuum full, so root
     * page becomes a leaf
     */
    if (!GistPageIsLeaf(page) &&
        PageGetMaxOffsetNumber(page) == InvalidOffsetNumber &&
        xldata->blkno == GIST_ROOT_BLKNO)
        GistPageSetLeaf(page);

    GistPageGetOpaque(page)->rightlink = InvalidBlockNumber;
    PageSetLSN(page, lsn);
    PageSetTLI(page, ThisTimeLineID);
    MarkBufferDirty(buffer);
    UnlockReleaseBuffer(buffer);
}
/*
 * Replay insertion of a leaf tuple into an SP-GiST leaf page, including the
 * optional update of the parent inner tuple's downlink.
 */
static void
spgRedoAddLeaf(XLogRecPtr lsn, XLogRecord *record)
{
    char       *ptr = XLogRecGetData(record);
    spgxlogAddLeaf *xldata = (spgxlogAddLeaf *) ptr;
    SpGistLeafTuple leafTuple;
    Buffer      buffer;
    Page        page;

    /* we assume this is adequately aligned */
    ptr += sizeof(spgxlogAddLeaf);
    leafTuple = (SpGistLeafTuple) ptr;

    if (!(record->xl_info & XLR_BKP_BLOCK_1))
    {
        buffer = XLogReadBuffer(xldata->node, xldata->blknoLeaf,
                                xldata->newPage);
        if (BufferIsValid(buffer))
        {
            page = BufferGetPage(buffer);

            /* Initialize the page first if this record created it. */
            if (xldata->newPage)
                SpGistInitBuffer(buffer, SPGIST_LEAF);

            if (!XLByteLE(lsn, PageGetLSN(page)))
            {
                /* insert new tuple */
                if (xldata->offnumLeaf != xldata->offnumHeadLeaf)
                {
                    /* normal cases, tuple was added by SpGistPageAddNewItem */
                    addOrReplaceTuple(page, (Item) leafTuple, leafTuple->size,
                                      xldata->offnumLeaf);

                    /* update head tuple's chain link if needed */
                    if (xldata->offnumHeadLeaf != InvalidOffsetNumber)
                    {
                        SpGistLeafTuple head;

                        head = (SpGistLeafTuple) PageGetItem(page,
                                                             PageGetItemId(page, xldata->offnumHeadLeaf));
                        Assert(head->nextOffset == leafTuple->nextOffset);
                        head->nextOffset = xldata->offnumLeaf;
                    }
                }
                else
                {
                    /* replacing a DEAD tuple */
                    PageIndexTupleDelete(page, xldata->offnumLeaf);
                    if (PageAddItem(page,
                                    (Item) leafTuple, leafTuple->size,
                                    xldata->offnumLeaf, false, false) != xldata->offnumLeaf)
                        elog(ERROR, "failed to add item of size %u to SPGiST index page",
                             leafTuple->size);
                }

                PageSetLSN(page, lsn);
                PageSetTLI(page, ThisTimeLineID);
                MarkBufferDirty(buffer);
            }
            UnlockReleaseBuffer(buffer);
        }
    }

    /* update parent downlink if necessary */
    if (xldata->blknoParent != InvalidBlockNumber &&
        !(record->xl_info & XLR_BKP_BLOCK_2))
    {
        buffer = XLogReadBuffer(xldata->node, xldata->blknoParent, false);
        if (BufferIsValid(buffer))
        {
            page = BufferGetPage(buffer);
            if (!XLByteLE(lsn, PageGetLSN(page)))
            {
                SpGistInnerTuple tuple;

                tuple = (SpGistInnerTuple) PageGetItem(page,
                                                       PageGetItemId(page, xldata->offnumParent));

                /* Point node nodeI of the parent tuple at the new leaf. */
                spgUpdateNodeLink(tuple, xldata->nodeI,
                                  xldata->blknoLeaf, xldata->offnumLeaf);

                PageSetLSN(page, lsn);
                PageSetTLI(page, ThisTimeLineID);
                MarkBufferDirty(buffer);
            }
            UnlockReleaseBuffer(buffer);
        }
    }
}
/*
 * Replay vacuum's conversion of redirect tuples to placeholders, plus the
 * removal of any trailing run of placeholder tuples at the end of the page.
 */
static void
spgRedoVacuumRedirect(XLogRecPtr lsn, XLogRecord *record)
{
    char       *ptr = XLogRecGetData(record);
    spgxlogVacuumRedirect *xldata = (spgxlogVacuumRedirect *) ptr;
    OffsetNumber *itemToPlaceholder;
    Buffer      buffer;
    Page        page;

    /* Offset array follows the fixed-size record header. */
    ptr += sizeof(spgxlogVacuumRedirect);
    itemToPlaceholder = (OffsetNumber *) ptr;

    if (!(record->xl_info & XLR_BKP_BLOCK_1))
    {
        buffer = XLogReadBuffer(xldata->node, xldata->blkno, false);

        if (BufferIsValid(buffer))
        {
            page = BufferGetPage(buffer);
            if (!XLByteLE(lsn, PageGetLSN(page)))
            {
                SpGistPageOpaque opaque = SpGistPageGetOpaque(page);
                int         i;

                /* Convert redirect pointers to plain placeholders */
                for (i = 0; i < xldata->nToPlaceholder; i++)
                {
                    SpGistDeadTuple dt;

                    dt = (SpGistDeadTuple) PageGetItem(page,
                                                       PageGetItemId(page, itemToPlaceholder[i]));
                    Assert(dt->tupstate == SPGIST_REDIRECT);
                    dt->tupstate = SPGIST_PLACEHOLDER;
                    ItemPointerSetInvalid(&dt->pointer);
                }

                /* Keep the page's redirect/placeholder counters in step. */
                Assert(opaque->nRedirection >= xldata->nToPlaceholder);
                opaque->nRedirection -= xldata->nToPlaceholder;
                opaque->nPlaceholder += xldata->nToPlaceholder;

                /* Remove placeholder tuples at end of page */
                if (xldata->firstPlaceholder != InvalidOffsetNumber)
                {
                    int         max = PageGetMaxOffsetNumber(page);
                    OffsetNumber *toDelete;

                    toDelete = palloc(sizeof(OffsetNumber) * max);

                    /* Delete everything from firstPlaceholder to the end. */
                    for (i = xldata->firstPlaceholder; i <= max; i++)
                        toDelete[i - xldata->firstPlaceholder] = i;

                    i = max - xldata->firstPlaceholder + 1;
                    Assert(opaque->nPlaceholder >= i);
                    opaque->nPlaceholder -= i;

                    /* The array is sorted, so can use PageIndexMultiDelete */
                    PageIndexMultiDelete(page, toDelete, i);

                    pfree(toDelete);
                }

                PageSetLSN(page, lsn);
                PageSetTLI(page, ThisTimeLineID);
                MarkBufferDirty(buffer);
            }

            UnlockReleaseBuffer(buffer);
        }
    }
}
/*
 * Redo vacuum's changes to a leaf page (XLOG_SPGIST_VACUUM_LEAF).
 *
 * The payload is a spgxlogVacuumLeaf header followed by six offset-number
 * arrays: tuples to mark DEAD, tuples to mark PLACEHOLDER, source/dest pairs
 * of line pointers to swap (tuple "moves"), and source/dest pairs for chain
 * link fixups.  The arrays are unpacked in exactly the order they were
 * written; replay must mirror vacuumLeafPage() step for step.
 */
static void
spgRedoVacuumLeaf(XLogRecPtr lsn, XLogRecord *record)
{
	char	   *ptr = XLogRecGetData(record);
	spgxlogVacuumLeaf *xldata = (spgxlogVacuumLeaf *) ptr;
	OffsetNumber *toDead;
	OffsetNumber *toPlaceholder;
	OffsetNumber *moveSrc;
	OffsetNumber *moveDest;
	OffsetNumber *chainSrc;
	OffsetNumber *chainDest;
	SpGistState state;
	Buffer		buffer;
	Page		page;
	int			i;

	/* build a stripped-down SpGistState sufficient for replay */
	fillFakeState(&state, xldata->stateSrc);

	/* unpack the six variable-length arrays trailing the header */
	ptr += sizeof(spgxlogVacuumLeaf);
	toDead = (OffsetNumber *) ptr;
	ptr += sizeof(OffsetNumber) * xldata->nDead;
	toPlaceholder = (OffsetNumber *) ptr;
	ptr += sizeof(OffsetNumber) * xldata->nPlaceholder;
	moveSrc = (OffsetNumber *) ptr;
	ptr += sizeof(OffsetNumber) * xldata->nMove;
	moveDest = (OffsetNumber *) ptr;
	ptr += sizeof(OffsetNumber) * xldata->nMove;
	chainSrc = (OffsetNumber *) ptr;
	ptr += sizeof(OffsetNumber) * xldata->nChain;
	chainDest = (OffsetNumber *) ptr;

	if (!(record->xl_info & XLR_BKP_BLOCK_1))
	{
		buffer = XLogReadBuffer(xldata->node, xldata->blkno, false);
		if (BufferIsValid(buffer))
		{
			page = BufferGetPage(buffer);
			if (!XLByteLE(lsn, PageGetLSN(page)))
			{
				/* mark the listed tuples DEAD */
				spgPageIndexMultiDelete(&state, page,
										toDead, xldata->nDead,
										SPGIST_DEAD, SPGIST_DEAD,
										InvalidBlockNumber,
										InvalidOffsetNumber);

				/* mark the listed tuples PLACEHOLDER */
				spgPageIndexMultiDelete(&state, page,
										toPlaceholder, xldata->nPlaceholder,
										SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER,
										InvalidBlockNumber,
										InvalidOffsetNumber);

				/* see comments in vacuumLeafPage() */
				for (i = 0; i < xldata->nMove; i++)
				{
					ItemId		idSrc = PageGetItemId(page, moveSrc[i]);
					ItemId		idDest = PageGetItemId(page, moveDest[i]);
					ItemIdData	tmp;

					/* swap the two line pointers to "move" the tuple */
					tmp = *idSrc;
					*idSrc = *idDest;
					*idDest = tmp;
				}

				/* after the swap, the source slots become placeholders */
				spgPageIndexMultiDelete(&state, page,
										moveSrc, xldata->nMove,
										SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER,
										InvalidBlockNumber,
										InvalidOffsetNumber);

				/* relink leaf chains around the removed/moved tuples */
				for (i = 0; i < xldata->nChain; i++)
				{
					SpGistLeafTuple lt;

					lt = (SpGistLeafTuple) PageGetItem(page,
									PageGetItemId(page, chainSrc[i]));
					Assert(lt->tupstate == SPGIST_LIVE);
					lt->nextOffset = chainDest[i];
				}

				PageSetLSN(page, lsn);
				PageSetTLI(page, ThisTimeLineID);
				MarkBufferDirty(buffer);
			}
			UnlockReleaseBuffer(buffer);
		}
	}
}
/*
 * Redo a picksplit operation (XLOG_SPGIST_PICKSPLIT).
 *
 * This is the most complex SP-GiST redo: up to four pages can be involved
 * (source leaf page, destination leaf page, the page holding the new inner
 * tuple, and the parent page with the downlink).  Because any subset of them
 * may have been logged as full-page images, we maintain a running backup
 * block index (bbi) that is advanced in exactly the order the original
 * operation registered the buffers; do not reorder these sections.
 *
 * Page LSNs for the source/dest leaf pages are deliberately not set until
 * after all leaf tuples have been restored, since tuples can go to either
 * page and each page must be finalized exactly once.
 */
static void
spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record)
{
	char	   *ptr = XLogRecGetData(record);
	spgxlogPickSplit *xldata = (spgxlogPickSplit *) ptr;
	SpGistInnerTuple innerTuple;
	SpGistState state;
	OffsetNumber *toDelete;
	OffsetNumber *toInsert;
	uint8	   *leafPageSelect;
	Buffer		srcBuffer;
	Buffer		destBuffer;
	Page		page;
	int			bbi;
	int			i;

	fillFakeState(&state, xldata->stateSrc);

	/* unpack: inner tuple, delete list, insert list, page-choice flags */
	ptr += MAXALIGN(sizeof(spgxlogPickSplit));
	innerTuple = (SpGistInnerTuple) ptr;
	ptr += innerTuple->size;
	toDelete = (OffsetNumber *) ptr;
	ptr += MAXALIGN(sizeof(OffsetNumber) * xldata->nDelete);
	toInsert = (OffsetNumber *) ptr;
	ptr += MAXALIGN(sizeof(OffsetNumber) * xldata->nInsert);
	leafPageSelect = (uint8 *) ptr;
	ptr += MAXALIGN(sizeof(uint8) * xldata->nInsert);
	/* now ptr points to the list of leaf tuples */

	/*
	 * It's a bit tricky to identify which pages have been handled as
	 * full-page images, so we explicitly count each referenced buffer.
	 */
	bbi = 0;

	if (xldata->blknoSrc == SPGIST_HEAD_BLKNO)
	{
		/* when splitting root, we touch it only in the guise of new inner */
		srcBuffer = InvalidBuffer;
	}
	else if (xldata->initSrc)
	{
		/* just re-init the source page */
		srcBuffer = XLogReadBuffer(xldata->node, xldata->blknoSrc, true);
		Assert(BufferIsValid(srcBuffer));
		page = (Page) BufferGetPage(srcBuffer);
		SpGistInitBuffer(srcBuffer, SPGIST_LEAF);
		/* don't update LSN etc till we're done with it */
	}
	else
	{
		/* delete the specified tuples from source page */
		if (!(record->xl_info & XLR_SET_BKP_BLOCK(bbi)))
		{
			srcBuffer = XLogReadBuffer(xldata->node, xldata->blknoSrc, false);
			if (BufferIsValid(srcBuffer))
			{
				page = BufferGetPage(srcBuffer);
				if (!XLByteLE(lsn, PageGetLSN(page)))
				{
					/*
					 * We have it a bit easier here than in doPickSplit(),
					 * because we know the inner tuple's location already,
					 * so we can inject the correct redirection tuple now.
					 */
					if (!state.isBuild)
						spgPageIndexMultiDelete(&state, page,
												toDelete, xldata->nDelete,
												SPGIST_REDIRECT,
												SPGIST_PLACEHOLDER,
												xldata->blknoInner,
												xldata->offnumInner);
					else
						spgPageIndexMultiDelete(&state, page,
												toDelete, xldata->nDelete,
												SPGIST_PLACEHOLDER,
												SPGIST_PLACEHOLDER,
												InvalidBlockNumber,
												InvalidOffsetNumber);

					/* don't update LSN etc till we're done with it */
				}
			}
		}
		else
			srcBuffer = InvalidBuffer;
		/* this source page consumed a backup block slot */
		bbi++;
	}

	/* try to access dest page if any */
	if (xldata->blknoDest == InvalidBlockNumber)
	{
		destBuffer = InvalidBuffer;
	}
	else if (xldata->initDest)
	{
		/* just re-init the dest page */
		destBuffer = XLogReadBuffer(xldata->node, xldata->blknoDest, true);
		Assert(BufferIsValid(destBuffer));
		page = (Page) BufferGetPage(destBuffer);
		SpGistInitBuffer(destBuffer, SPGIST_LEAF);
		/* don't update LSN etc till we're done with it */
	}
	else
	{
		if (!(record->xl_info & XLR_SET_BKP_BLOCK(bbi)))
			destBuffer = XLogReadBuffer(xldata->node, xldata->blknoDest, false);
		else
			destBuffer = InvalidBuffer;
		/* this dest page consumed a backup block slot */
		bbi++;
	}

	/* restore leaf tuples to src and/or dest page */
	for (i = 0; i < xldata->nInsert; i++)
	{
		SpGistLeafTuple lt = (SpGistLeafTuple) ptr;
		Buffer		leafBuffer;

		ptr += lt->size;

		/* leafPageSelect says which page each tuple belongs on */
		leafBuffer = leafPageSelect[i] ? destBuffer : srcBuffer;
		if (!BufferIsValid(leafBuffer))
			continue;			/* no need to touch this page */
		page = BufferGetPage(leafBuffer);

		if (!XLByteLE(lsn, PageGetLSN(page)))
		{
			addOrReplaceTuple(page, (Item) lt, lt->size, toInsert[i]);
		}
	}

	/* Now update src and dest page LSNs */
	if (BufferIsValid(srcBuffer))
	{
		page = BufferGetPage(srcBuffer);
		if (!XLByteLE(lsn, PageGetLSN(page)))
		{
			PageSetLSN(page, lsn);
			PageSetTLI(page, ThisTimeLineID);
			MarkBufferDirty(srcBuffer);
		}
		UnlockReleaseBuffer(srcBuffer);
	}
	if (BufferIsValid(destBuffer))
	{
		page = BufferGetPage(destBuffer);
		if (!XLByteLE(lsn, PageGetLSN(page)))
		{
			PageSetLSN(page, lsn);
			PageSetTLI(page, ThisTimeLineID);
			MarkBufferDirty(destBuffer);
		}
		UnlockReleaseBuffer(destBuffer);
	}

	/* restore new inner tuple */
	if (!(record->xl_info & XLR_SET_BKP_BLOCK(bbi)))
	{
		Buffer		buffer = XLogReadBuffer(xldata->node, xldata->blknoInner,
											xldata->initInner);

		if (BufferIsValid(buffer))
		{
			page = BufferGetPage(buffer);
			if (xldata->initInner)
				SpGistInitBuffer(buffer, 0);

			if (!XLByteLE(lsn, PageGetLSN(page)))
			{
				addOrReplaceTuple(page, (Item) innerTuple, innerTuple->size,
								  xldata->offnumInner);

				/* if inner is also parent, update link while we're here */
				if (xldata->blknoInner == xldata->blknoParent)
				{
					SpGistInnerTuple parent;

					parent = (SpGistInnerTuple) PageGetItem(page,
								PageGetItemId(page, xldata->offnumParent));
					spgUpdateNodeLink(parent, xldata->nodeI,
									  xldata->blknoInner, xldata->offnumInner);
				}

				PageSetLSN(page, lsn);
				PageSetTLI(page, ThisTimeLineID);
				MarkBufferDirty(buffer);
			}
			UnlockReleaseBuffer(buffer);
		}
	}
	bbi++;

	/* update parent downlink, unless we did it above */
	if (xldata->blknoParent == InvalidBlockNumber)
	{
		/* no parent cause we split the root */
		Assert(xldata->blknoInner == SPGIST_HEAD_BLKNO);
	}
	else if (xldata->blknoInner != xldata->blknoParent)
	{
		if (!(record->xl_info & XLR_SET_BKP_BLOCK(bbi)))
		{
			Buffer		buffer = XLogReadBuffer(xldata->node,
												xldata->blknoParent, false);

			if (BufferIsValid(buffer))
			{
				page = BufferGetPage(buffer);
				if (!XLByteLE(lsn, PageGetLSN(page)))
				{
					SpGistInnerTuple parent;

					parent = (SpGistInnerTuple) PageGetItem(page,
								PageGetItemId(page, xldata->offnumParent));
					spgUpdateNodeLink(parent, xldata->nodeI,
									  xldata->blknoInner, xldata->offnumInner);

					PageSetLSN(page, lsn);
					PageSetTLI(page, ThisTimeLineID);
					MarkBufferDirty(buffer);
				}
				UnlockReleaseBuffer(buffer);
			}
		}
	}
}
static void spgRedoSplitTuple(XLogRecPtr lsn, XLogRecord *record) { char *ptr = XLogRecGetData(record); spgxlogSplitTuple *xldata = (spgxlogSplitTuple *) ptr; SpGistInnerTuple prefixTuple; SpGistInnerTuple postfixTuple; Buffer buffer; Page page; /* we assume this is adequately aligned */ ptr += sizeof(spgxlogSplitTuple); prefixTuple = (SpGistInnerTuple) ptr; ptr += prefixTuple->size; postfixTuple = (SpGistInnerTuple) ptr; /* insert postfix tuple first to avoid dangling link */ if (xldata->blknoPostfix != xldata->blknoPrefix && !(record->xl_info & XLR_BKP_BLOCK_2)) { buffer = XLogReadBuffer(xldata->node, xldata->blknoPostfix, xldata->newPage); if (BufferIsValid(buffer)) { page = BufferGetPage(buffer); if (xldata->newPage) SpGistInitBuffer(buffer, 0); if (!XLByteLE(lsn, PageGetLSN(page))) { addOrReplaceTuple(page, (Item) postfixTuple, postfixTuple->size, xldata->offnumPostfix); PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); MarkBufferDirty(buffer); } UnlockReleaseBuffer(buffer); } } /* now handle the original page */ if (!(record->xl_info & XLR_BKP_BLOCK_1)) { buffer = XLogReadBuffer(xldata->node, xldata->blknoPrefix, false); if (BufferIsValid(buffer)) { page = BufferGetPage(buffer); if (!XLByteLE(lsn, PageGetLSN(page))) { PageIndexTupleDelete(page, xldata->offnumPrefix); if (PageAddItem(page, (Item) prefixTuple, prefixTuple->size, xldata->offnumPrefix, false, false) != xldata->offnumPrefix) elog(ERROR, "failed to add item of size %u to SPGiST index page", prefixTuple->size); if (xldata->blknoPostfix == xldata->blknoPrefix) addOrReplaceTuple(page, (Item) postfixTuple, postfixTuple->size, xldata->offnumPostfix); PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); MarkBufferDirty(buffer); } UnlockReleaseBuffer(buffer); } } }
/*
 * Redo addition of a node to an inner tuple (XLOG_SPGIST_ADD_NODE).
 *
 * Two cases: if blknoNew is invalid the enlarged tuple was updated in place
 * on its original page; otherwise the tuple moved to a new page, the old
 * location got a redirect (or placeholder during index build) tuple, and the
 * parent's downlink was repointed.  The new tuple is installed before the
 * old one is replaced so the redirect never points at garbage.
 */
static void
spgRedoAddNode(XLogRecPtr lsn, XLogRecord *record)
{
	char	   *ptr = XLogRecGetData(record);
	spgxlogAddNode *xldata = (spgxlogAddNode *) ptr;
	SpGistInnerTuple innerTuple;
	SpGistState state;
	Buffer		buffer;
	Page		page;
	int			bbi;

	/* we assume this is adequately aligned */
	ptr += sizeof(spgxlogAddNode);
	innerTuple = (SpGistInnerTuple) ptr;

	fillFakeState(&state, xldata->stateSrc);

	if (xldata->blknoNew == InvalidBlockNumber)
	{
		/* update in place */
		Assert(xldata->blknoParent == InvalidBlockNumber);
		if (!(record->xl_info & XLR_BKP_BLOCK_1))
		{
			buffer = XLogReadBuffer(xldata->node, xldata->blkno, false);
			if (BufferIsValid(buffer))
			{
				page = BufferGetPage(buffer);
				if (!XLByteLE(lsn, PageGetLSN(page)))
				{
					/* delete-then-readd keeps the same offset number */
					PageIndexTupleDelete(page, xldata->offnum);
					if (PageAddItem(page, (Item) innerTuple, innerTuple->size,
									xldata->offnum, false, false) != xldata->offnum)
						elog(ERROR, "failed to add item of size %u to SPGiST index page",
							 innerTuple->size);

					PageSetLSN(page, lsn);
					PageSetTLI(page, ThisTimeLineID);
					MarkBufferDirty(buffer);
				}
				UnlockReleaseBuffer(buffer);
			}
		}
	}
	else
	{
		/* Install new tuple first so redirect is valid */
		if (!(record->xl_info & XLR_BKP_BLOCK_2))
		{
			buffer = XLogReadBuffer(xldata->node, xldata->blknoNew,
									xldata->newPage);
			if (BufferIsValid(buffer))
			{
				page = BufferGetPage(buffer);
				if (xldata->newPage)
					SpGistInitBuffer(buffer, 0);
				if (!XLByteLE(lsn, PageGetLSN(page)))
				{
					addOrReplaceTuple(page, (Item) innerTuple,
									  innerTuple->size, xldata->offnumNew);

					PageSetLSN(page, lsn);
					PageSetTLI(page, ThisTimeLineID);
					MarkBufferDirty(buffer);
				}
				UnlockReleaseBuffer(buffer);
			}
		}

		/* Delete old tuple, replacing it with redirect or placeholder tuple */
		if (!(record->xl_info & XLR_BKP_BLOCK_1))
		{
			buffer = XLogReadBuffer(xldata->node, xldata->blkno, false);
			if (BufferIsValid(buffer))
			{
				page = BufferGetPage(buffer);
				if (!XLByteLE(lsn, PageGetLSN(page)))
				{
					SpGistDeadTuple dt;

					/* during build no concurrent scans exist, so no redirect */
					if (state.isBuild)
						dt = spgFormDeadTuple(&state, SPGIST_PLACEHOLDER,
											  InvalidBlockNumber,
											  InvalidOffsetNumber);
					else
						dt = spgFormDeadTuple(&state, SPGIST_REDIRECT,
											  xldata->blknoNew,
											  xldata->offnumNew);

					PageIndexTupleDelete(page, xldata->offnum);
					if (PageAddItem(page, (Item) dt, dt->size,
									xldata->offnum, false, false) != xldata->offnum)
						elog(ERROR, "failed to add item of size %u to SPGiST index page",
							 dt->size);

					/* keep the page's tuple-state counters in sync */
					if (state.isBuild)
						SpGistPageGetOpaque(page)->nPlaceholder++;
					else
						SpGistPageGetOpaque(page)->nRedirection++;

					PageSetLSN(page, lsn);
					PageSetTLI(page, ThisTimeLineID);
					MarkBufferDirty(buffer);
				}
				UnlockReleaseBuffer(buffer);
			}
		}

		/*
		 * Update parent downlink.  Since parent could be in either of the
		 * previous two buffers, it's a bit tricky to determine which BKP bit
		 * applies.
		 */
		if (xldata->blknoParent == xldata->blkno)
			bbi = 0;
		else if (xldata->blknoParent == xldata->blknoNew)
			bbi = 1;
		else
			bbi = 2;

		if (!(record->xl_info & XLR_SET_BKP_BLOCK(bbi)))
		{
			buffer = XLogReadBuffer(xldata->node, xldata->blknoParent, false);
			if (BufferIsValid(buffer))
			{
				page = BufferGetPage(buffer);
				if (!XLByteLE(lsn, PageGetLSN(page)))
				{
					SpGistInnerTuple innerTuple;

					innerTuple = (SpGistInnerTuple) PageGetItem(page,
								PageGetItemId(page, xldata->offnumParent));

					/* repoint the parent's node link at the relocated tuple */
					spgUpdateNodeLink(innerTuple, xldata->nodeI,
									  xldata->blknoNew, xldata->offnumNew);

					PageSetLSN(page, lsn);
					PageSetTLI(page, ThisTimeLineID);
					MarkBufferDirty(buffer);
				}
				UnlockReleaseBuffer(buffer);
			}
		}
	}
}
/*
 * Redo moving a chain of leaf tuples to another page
 * (XLOG_SPGIST_MOVE_LEAFS).
 *
 * The payload is a spgxlogMoveLeafs header, the source offsets to delete,
 * the destination offsets to insert at, and finally the leaf tuple images.
 * Destination insertions happen before source deletions so the redirection
 * tuple left at the source always points at valid data.
 */
static void
spgRedoMoveLeafs(XLogRecPtr lsn, XLogRecord *record)
{
	char	   *ptr = XLogRecGetData(record);
	spgxlogMoveLeafs *xldata = (spgxlogMoveLeafs *) ptr;
	SpGistState state;
	OffsetNumber *toDelete;
	OffsetNumber *toInsert;
	int			nInsert;
	Buffer		buffer;
	Page		page;

	fillFakeState(&state, xldata->stateSrc);

	/* when replacing a DEAD tuple only one insertion happens */
	nInsert = xldata->replaceDead ? 1 : xldata->nMoves + 1;

	/* unpack delete list, insert list; ptr then points at tuple images */
	ptr += MAXALIGN(sizeof(spgxlogMoveLeafs));
	toDelete = (OffsetNumber *) ptr;
	ptr += MAXALIGN(sizeof(OffsetNumber) * xldata->nMoves);
	toInsert = (OffsetNumber *) ptr;
	ptr += MAXALIGN(sizeof(OffsetNumber) * nInsert);

	/* now ptr points to the list of leaf tuples */

	/* Insert tuples on the dest page (do first, so redirect is valid) */
	if (!(record->xl_info & XLR_BKP_BLOCK_2))
	{
		buffer = XLogReadBuffer(xldata->node, xldata->blknoDst,
								xldata->newPage);
		if (BufferIsValid(buffer))
		{
			page = BufferGetPage(buffer);
			if (xldata->newPage)
				SpGistInitBuffer(buffer, SPGIST_LEAF);
			if (!XLByteLE(lsn, PageGetLSN(page)))
			{
				int			i;

				for (i = 0; i < nInsert; i++)
				{
					SpGistLeafTuple lt = (SpGistLeafTuple) ptr;

					addOrReplaceTuple(page, (Item) lt, lt->size, toInsert[i]);
					ptr += lt->size;
				}

				PageSetLSN(page, lsn);
				PageSetTLI(page, ThisTimeLineID);
				MarkBufferDirty(buffer);
			}
			UnlockReleaseBuffer(buffer);
		}
	}

	/* Delete tuples from the source page, inserting a redirection pointer */
	if (!(record->xl_info & XLR_BKP_BLOCK_1))
	{
		buffer = XLogReadBuffer(xldata->node, xldata->blknoSrc, false);
		if (BufferIsValid(buffer))
		{
			page = BufferGetPage(buffer);
			if (!XLByteLE(lsn, PageGetLSN(page)))
			{
				/*
				 * During index build use plain placeholders; otherwise leave
				 * a redirect pointing at the chain head on the dest page.
				 */
				spgPageIndexMultiDelete(&state, page, toDelete, xldata->nMoves,
										state.isBuild ? SPGIST_PLACEHOLDER : SPGIST_REDIRECT,
										SPGIST_PLACEHOLDER,
										xldata->blknoDst,
										toInsert[nInsert - 1]);

				PageSetLSN(page, lsn);
				PageSetTLI(page, ThisTimeLineID);
				MarkBufferDirty(buffer);
			}
			UnlockReleaseBuffer(buffer);
		}
	}

	/* And update the parent downlink */
	if (!(record->xl_info & XLR_BKP_BLOCK_3))
	{
		buffer = XLogReadBuffer(xldata->node, xldata->blknoParent, false);
		if (BufferIsValid(buffer))
		{
			page = BufferGetPage(buffer);
			if (!XLByteLE(lsn, PageGetLSN(page)))
			{
				SpGistInnerTuple tuple;

				tuple = (SpGistInnerTuple) PageGetItem(page,
								PageGetItemId(page, xldata->offnumParent));

				/* downlink now targets the chain head on the dest page */
				spgUpdateNodeLink(tuple, xldata->nodeI,
								  xldata->blknoDst, toInsert[nInsert - 1]);

				PageSetLSN(page, lsn);
				PageSetTLI(page, ThisTimeLineID);
				MarkBufferDirty(buffer);
			}
			UnlockReleaseBuffer(buffer);
		}
	}
}
static void do_failover(void) { PGresult *res1; PGresult *res2; char sqlquery[8192]; int total_nodes = 0; int visible_nodes = 0; bool find_best = false; int i; int r; int node; char nodeConninfo[MAXLEN]; unsigned int uxlogid; unsigned int uxrecoff; char last_wal_standby_applied[MAXLEN]; PGconn *nodeConn = NULL; /* * will get info about until 50 nodes, * which seems to be large enough for most scenarios */ nodeInfo nodes[50]; nodeInfo best_candidate; /* first we get info about this node, and update shared memory */ sprintf(sqlquery, "SELECT pg_last_xlog_replay_location()"); res1 = PQexec(myLocalConn, sqlquery); if (PQresultStatus(res1) != PGRES_TUPLES_OK) { log_err(_("PQexec failed: %s.\nReport an invalid value to not be considered as new primary and exit.\n"), PQerrorMessage(myLocalConn)); PQclear(res1); sprintf(last_wal_standby_applied, "'%X/%X'", 0, 0); update_shared_memory(last_wal_standby_applied); exit(ERR_DB_QUERY); } /* write last location in shared memory */ update_shared_memory(PQgetvalue(res1, 0, 0)); /* * we sleep the monitor time + one second * we bet it should be enough for other repmgrd to update their own data */ sleep(SLEEP_MONITOR + 1); /* get a list of standby nodes, including myself */ sprintf(sqlquery, "SELECT id, conninfo " " FROM %s.repl_nodes " " WHERE id IN (SELECT standby_node FROM %s.repl_status) " " AND cluster = '%s' " " ORDER BY priority ", repmgr_schema, repmgr_schema, local_options.cluster_name); res1 = PQexec(myLocalConn, sqlquery); if (PQresultStatus(res1) != PGRES_TUPLES_OK) { log_err(_("Can't get nodes info: %s\n"), PQerrorMessage(myLocalConn)); PQclear(res1); PQfinish(myLocalConn); exit(ERR_DB_QUERY); } /* ask for the locations */ for (i = 0; i < PQntuples(res1); i++) { node = atoi(PQgetvalue(res1, i, 0)); /* Initialize on false so if we can't reach this node we know that later */ nodes[i].is_ready = false; strncpy(nodeConninfo, PQgetvalue(res1, i, 1), MAXLEN); nodeConn = establishDBConnection(nodeConninfo, false); /* if we can't 
see the node just skip it */ if (PQstatus(nodeConn) != CONNECTION_OK) continue; sqlquery_snprintf(sqlquery, "SELECT repmgr_get_last_standby_location()"); res2 = PQexec(nodeConn, sqlquery); if (PQresultStatus(res2) != PGRES_TUPLES_OK) { log_info(_("Can't get node's last standby location: %s\n"), PQerrorMessage(nodeConn)); log_info(_("Connection details: %s\n"), nodeConninfo); PQclear(res2); PQfinish(nodeConn); continue; } visible_nodes++; if (sscanf(PQgetvalue(res2, 0, 0), "%X/%X", &uxlogid, &uxrecoff) != 2) log_info(_("could not parse transaction log location \"%s\"\n"), PQgetvalue(res2, 0, 0)); nodes[i].nodeId = node; nodes[i].xlog_location.xlogid = uxlogid; nodes[i].xlog_location.xrecoff = uxrecoff; nodes[i].is_ready = true; PQclear(res2); PQfinish(nodeConn); } PQclear(res1); /* Close the connection to this server */ PQfinish(myLocalConn); /* * total nodes that are registered, include master which is a node but was * not counted because it's not a standby */ total_nodes = i + 1; /* * am i on the group that should keep alive? 
* if i see less than half of total_nodes then i should do nothing */ if (visible_nodes < (total_nodes / 2.0)) { log_err(_("Can't reach most of the nodes.\n" "Let the other standby servers decide which one will be the primary.\n" "Manual action will be needed to readd this node to the cluster.\n")); exit(ERR_FAILOVER_FAIL); } /* * determine which one is the best candidate to promote to primary */ for (i = 0; i < total_nodes - 1; i++) { if (!nodes[i].is_ready) continue; else if (!find_best) { /* start with the first ready node, and then move on to the next one */ best_candidate.nodeId = nodes[i].nodeId; best_candidate.xlog_location.xlogid = nodes[i].xlog_location.xlogid; best_candidate.xlog_location.xrecoff = nodes[i].xlog_location.xrecoff; best_candidate.is_ready = nodes[i].is_ready; find_best = true; } /* we use the macros provided by xlogdefs.h to compare XLogPtr */ /* * Nodes are retrieved ordered by priority, so if the current * best candidate is lower or equal to the next node's wal location * then assign next node as the new best candidate. */ if (XLByteLE(best_candidate.xlog_location, nodes[i].xlog_location)) { best_candidate.nodeId = nodes[i].nodeId; best_candidate.xlog_location.xlogid = nodes[i].xlog_location.xlogid; best_candidate.xlog_location.xrecoff = nodes[i].xlog_location.xrecoff; best_candidate.is_ready = nodes[i].is_ready; } } /* once we know who is the best candidate, promote it */ if (find_best && (best_candidate.nodeId == local_options.node)) { if (verbose) log_info(_("%s: This node is the best candidate to be the new primary, promoting...\n"), progname); log_debug(_("promote command is: \"%s\"\n"), local_options.promote_command); r = system(local_options.promote_command); if (r != 0) { log_err(_("%s: promote command failed. 
You could check and try it manually.\n"), progname); exit(ERR_BAD_CONFIG); } } else if (find_best) { if (verbose) log_info(_("%s: Node %d is the best candidate to be the new primary, we should follow it...\n"), progname, best_candidate.nodeId); log_debug(_("follow command is: \"%s\"\n"), local_options.follow_command); /* * New Primary need some time to be promoted. * The follow command should take care of that. */ r = system(local_options.follow_command); if (r != 0) { log_err(_("%s: follow command failed. You could check and try it manually.\n"), progname); exit(ERR_BAD_CONFIG); } } else { log_err(_("%s: Did not find candidates. You should check and try manually.\n"), progname); exit(ERR_FAILOVER_FAIL); } /* and reconnect to the local database */ myLocalConn = establishDBConnection(local_options.conninfo, true); }