/* * Returns the replication apply delay in ms */ int GetReplicationApplyDelay(void) { /* use volatile pointer to prevent code rearrangement */ volatile WalRcvData *walrcv = WalRcv; XLogRecPtr receivePtr; XLogRecPtr replayPtr; long secs; int usecs; SpinLockAcquire(&walrcv->mutex); receivePtr = walrcv->receivedUpto; SpinLockRelease(&walrcv->mutex); replayPtr = GetXLogReplayRecPtr(NULL); if (XLByteEQ(receivePtr, replayPtr)) return 0; TimestampDifference(GetCurrentChunkReplayStartTime(), GetCurrentTimestamp(), &secs, &usecs); return (((int) secs * 1000) + (usecs / 1000)); }
/* * Send reply message to primary, indicating our current XLOG positions and * the current time. */ static void XLogWalRcvSendReply(void) { char buf[sizeof(StandbyReplyMessage) + 1]; TimestampTz now; /* * If the user doesn't want status to be reported to the master, be sure * to exit before doing anything at all. */ if (wal_receiver_status_interval <= 0) return; /* Get current timestamp. */ now = GetCurrentTimestamp(); /* * We can compare the write and flush positions to the last message we * sent without taking any lock, but the apply position requires a spin * lock, so we don't check that unless something else has changed or 10 * seconds have passed. This means that the apply log position will * appear, from the master's point of view, to lag slightly, but since * this is only for reporting purposes and only on idle systems, that's * probably OK. */ if (XLByteEQ(reply_message.write, LogstreamResult.Write) && XLByteEQ(reply_message.flush, LogstreamResult.Flush) && !TimestampDifferenceExceeds(reply_message.sendTime, now, wal_receiver_status_interval * 1000)) return; /* Construct a new message. */ reply_message.write = LogstreamResult.Write; reply_message.flush = LogstreamResult.Flush; reply_message.apply = GetXLogReplayRecPtr(); reply_message.sendTime = now; elog(DEBUG2, "sending write %X/%X flush %X/%X apply %X/%X", reply_message.write.xlogid, reply_message.write.xrecoff, reply_message.flush.xlogid, reply_message.flush.xrecoff, reply_message.apply.xlogid, reply_message.apply.xrecoff); /* Prepend with the message type and send it. */ buf[0] = 'r'; memcpy(&buf[1], &reply_message, sizeof(StandbyReplyMessage)); walrcv_send(buf, sizeof(StandbyReplyMessage) + 1); }
/*
 * Mark the index tuple pointed to by iptr as dead on the scan's current
 * buffer. If the page changed since the scan last saw it (LSN mismatch),
 * fall back to searching the page for the matching heap pointer.
 */
static void
killtuple(Relation r, GISTScanOpaque so, ItemPointer iptr)
{
	MIRROREDLOCK_BUFMGR_DECLARE;

	Page		page;
	OffsetNumber off;

	// -------- MirroredLock ----------
	MIRROREDLOCK_BUFMGR_LOCK;

	LockBuffer(so->curbuf, GIST_SHARE);
	gistcheckpage(r, so->curbuf);
	page = (Page) BufferGetPage(so->curbuf);

	if (XLByteEQ(so->stack->lsn, PageGetLSN(page)))
	{
		/* Page unchanged since the scan saw it: the saved offset is valid. */
		off = ItemPointerGetOffsetNumber(iptr);
		ItemIdMarkDead(PageGetItemId(page, off));
		SetBufferCommitInfoNeedsSave(so->curbuf);
	}
	else
	{
		/* Page changed: scan it for the item with the matching heap TID. */
		OffsetNumber lastOff = PageGetMaxOffsetNumber(page);

		for (off = FirstOffsetNumber; off <= lastOff; off = OffsetNumberNext(off))
		{
			IndexTuple	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));

			if (ItemPointerEquals(&(itup->t_tid), iptr))
			{
				/* found the tuple; mark it and stop looking */
				ItemIdMarkDead(PageGetItemId(page, off));
				SetBufferCommitInfoNeedsSave(so->curbuf);
				break;
			}
		}
	}

	LockBuffer(so->curbuf, GIST_UNLOCK);

	MIRROREDLOCK_BUFMGR_UNLOCK;
	// -------- MirroredLock ----------
}
/*
 * Incrementally resynchronize buffer-pool (heap) relations to the mirror by
 * replaying the blocks recorded in the change-tracking log.
 *
 * For each change-tracking entry we read the block from the buffer pool and,
 * if the page LSN shows the tracked change has been applied, write the block
 * to the mirror via smgrwrite(). After the last block of each relation the
 * relation is synced, closed, and its resync hash entry updated.
 *
 * Returns STATUS_OK on success, STATUS_ERROR if mirror data loss occurred
 * (e.g. the mirror dropped out of resync state mid-scan), or the error
 * status from FileRepResync_UpdateEntry().
 */
static int
FileRepPrimary_ResyncBufferPoolIncrementalWrite(ChangeTrackingRequest *request)
{
	int			status = STATUS_OK;
	Page		page;
	Buffer		buf;
	BlockNumber numBlocks = 0;
	SMgrRelation smgr_relation = NULL;
	/* room for "spcNode/dbNode/relNode" plus terminator */
	char		relidstr[OIDCHARS + 1 + OIDCHARS + 1 + OIDCHARS + 1];
	int			ii;
	XLogRecPtr	loc;		/* page LSN of the block just read */
	XLogRecPtr	loc1;		/* end LSN recorded by change tracking */
	int			count = 0;			/* blocks processed since last signal check */
	int			thresholdCount = 0; /* check signals after this many blocks */
	bool		mirrorDataLossOccurred = FALSE;
	int			NumberOfRelations = request->count;
	FileRepResyncHashEntry_s entry;
	ChangeTrackingResult *result = NULL;

	/*
	 * Outer loop: keep asking change tracking for batches of changed blocks
	 * until it reports there are no more (result->ask_for_more == false).
	 */
	while (1)
	{
		/* allow flushing buffers from buffer pool during scan */
		FileRepResync_SetReadBufferRequest();
		if ((result = ChangeTracking_GetChanges(request)) != NULL)
		{
			FileRepResync_ResetReadBufferRequest();

			for (ii = 0; ii < result->count; ii++)
			{
				/*
				 * Entries for the same relation are contiguous; open the
				 * relation on the first entry of each run.
				 */
				if (smgr_relation == NULL)
				{
					NumberOfRelations--;

					smgr_relation = smgropen(result->entries[ii].relFileNode);

					snprintf(relidstr, sizeof(relidstr), "%u/%u/%u",
							 smgr_relation->smgr_rnode.spcNode,
							 smgr_relation->smgr_rnode.dbNode,
							 smgr_relation->smgr_rnode.relNode);

					numBlocks = smgrnblocks(smgr_relation);

					if (Debug_filerep_print)
						elog(LOG, "resynchronize buffer pool relation '%u/%u/%u' "
							 "number of blocks:'%u' ",
							 smgr_relation->smgr_rnode.spcNode,
							 smgr_relation->smgr_rnode.dbNode,
							 smgr_relation->smgr_rnode.relNode,
							 numBlocks);

					/* check for interrupts at least every 1024 blocks */
					thresholdCount = Min(numBlocks, 1024);
				}

				loc1 = result->entries[ii].lsn_end;

				/*
				 * if relation was truncated then block_num from change
				 * tracking can be beyond numBlocks
				 */
				if (result->entries[ii].block_num >= numBlocks)
				{
					ereport(LOG,
							(errmsg("could not resynchonize buffer pool relation '%s' block '%d' (maybe due to truncate), "
									"lsn change tracking '%s(%u/%u)' "
									"number of blocks '%d' ",
									relidstr,
									result->entries[ii].block_num,
									XLogLocationToString(&loc1),
									loc1.xlogid,
									loc1.xrecoff,
									numBlocks),
							 FileRep_errcontext()));

					/* skip the write but still run end-of-relation handling */
					goto flush_check;
				}

				/* allow flushing buffers from buffer pool during scan */
				FileRepResync_SetReadBufferRequest();
				buf = ReadBuffer_Resync(smgr_relation,
										result->entries[ii].block_num,
										relidstr);
				FileRepResync_ResetReadBufferRequest();

				Assert(result->entries[ii].block_num < numBlocks);

				LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
				page = BufferGetPage(buf);

				loc = PageGetLSN(page);

				if (Debug_filerep_print)
				{
					elog(LOG, "incremental resync buffer pool identifier '%s' num blocks '%d' blkno '%d' lsn page '%s(%u/%u)' "
						 "lsn end change tracking '%s(%u/%u)' ",
						 relidstr,
						 numBlocks,
						 result->entries[ii].block_num,
						 XLogLocationToString(&loc),
						 loc.xlogid,
						 loc.xrecoff,
						 XLogLocationToString(&loc1),
						 result->entries[ii].lsn_end.xlogid,
						 result->entries[ii].lsn_end.xrecoff);
				}
				else
				{
					char	tmpBuf[FILEREP_MAX_LOG_DESCRIPTION_LEN];

					snprintf(tmpBuf, sizeof(tmpBuf),
							 "incremental resync buffer pool identifier '%s' num blocks '%d' blkno '%d' lsn page '%s(%u/%u)' ",
							 relidstr,
							 numBlocks,
							 result->entries[ii].block_num,
							 XLogLocationToString(&loc),
							 loc.xlogid,
							 loc.xrecoff);

					FileRep_InsertConfigLogEntry(tmpBuf);

					snprintf(tmpBuf, sizeof(tmpBuf),
							 "incremental resync buffer pool identifier '%s' lsn end change tracking '%s(%u/%u)' ",
							 relidstr,
							 XLogLocationToString(&loc1),
							 result->entries[ii].lsn_end.xlogid,
							 result->entries[ii].lsn_end.xrecoff);

					FileRep_InsertConfigLogEntry(tmpBuf);
				}

				/* Only ship the block if the page already carries the tracked change. */
				if (XLByteLE(result->entries[ii].lsn_end, PageGetLSN(page)))
				{
					if (! XLByteEQ(PageGetLSN(page), result->entries[ii].lsn_end))
					{
						/*
						 * NOTE(review): this message fires when the page LSN
						 * is GREATER than the CT lsn_end, yet the text says
						 * "less than", and the argument order (loc before
						 * loc1) looks swapped relative to the labels
						 * "lsn end change tracking" / "lsn page" — confirm
						 * against the original source before relying on it.
						 */
						ereport(LOG,
								(errmsg("Resynchonize buffer pool relation '%s' block '%d' has page lsn less than CT lsn, "
										"lsn end change tracking '%s(%u/%u)' lsn page '%s(%u/%u)' "
										"number of blocks '%d'",
										relidstr,
										result->entries[ii].block_num,
										XLogLocationToString(&loc),
										loc.xlogid,
										loc.xrecoff,
										XLogLocationToString(&loc1),
										loc1.xlogid,
										loc1.xrecoff,
										numBlocks),
								 FileRep_errcontext()));
					}

					/*
					 * It's safe and better to perform write of the page to mirror,
					 * for this case, as primary and mirror data pages should always
					 * be same. So, we might do some extra work but definitely won't
					 * loose out blocks, or error out and need to perform full recovery.
					 * Need to cover for this case as there are some known scenarios where
					 * CT file can have extra records which should have been discarded,
					 * but as we loose out information of xlog LSN cannot be discarded.
					 * One such case is when CT_TRANSIENT being compacted to CT_COMPACT
					 * with specific xlog LSN (to discard extra records) in CT mode gets
					 * interrupted by resync. Compaction during Resync collects all the
					 * CT records and doesn't have xlog LSN information to discard any
					 * extra records from CT_TRANSIENT.
					 */
					smgrwrite(smgr_relation,
							  result->entries[ii].block_num,
							  (char *) BufferGetBlock(buf),
							  FALSE);
				}

#ifdef FAULT_INJECTOR
				FaultInjector_InjectFaultIfSet(
											   FileRepResyncWorker,
											   DDLNotSpecified,
											   "",	// databaseName
											   "");	// tableName
#endif

				UnlockReleaseBuffer(buf);

#ifdef FAULT_INJECTOR
				FaultInjector_InjectFaultIfSet(
											   FileRepResyncWorker,
											   DDLNotSpecified,
											   "",	// databaseName
											   "");	// tableName
#endif

		flush_check:
				/*
				 * End of this relation's run of entries (last entry overall,
				 * or the next entry belongs to a different relfilenode):
				 * sync and close the relation and record its resync progress.
				 */
				if (((ii + 1) == result->count) ||
					! (result->entries[ii].relFileNode.spcNode == result->entries[ii+1].relFileNode.spcNode &&
					   result->entries[ii].relFileNode.dbNode == result->entries[ii+1].relFileNode.dbNode &&
					   result->entries[ii].relFileNode.relNode == result->entries[ii+1].relFileNode.relNode))
				{
					if (result->ask_for_more == false)
					{
						smgrimmedsync(smgr_relation);

						smgrclose(smgr_relation);

						smgr_relation = NULL;

						FileRep_GetRelationPath(
												entry.fileName,
												result->entries[ii].relFileNode,
												0 /* segment file number is always 0 for Buffer Pool */);

						status = FileRepResync_UpdateEntry(&entry);
						if (status != STATUS_OK)
						{
							break;
						}
					}
				}

				/*
				 * Periodically process signals and verify the mirror is still
				 * in resync; bail out flagging data loss otherwise.
				 */
				if (count > thresholdCount)
				{
					count = 0;
					FileRepSubProcess_ProcessSignals();

					if (! (FileRepSubProcess_GetState() == FileRepStateReady &&
						   dataState == DataStateInResync))
					{
						mirrorDataLossOccurred = TRUE;
						break;
					}
				}
				else
					count++;
			} // for (ii = 0; ii < result->count; ii++)
		} // if ((result = ChangeTracking_GetChanges(request)) != NULL)

		FileRepResync_ResetReadBufferRequest();

		/* More changes pending: restart the request from the next LSN. */
		if (result != NULL && result->ask_for_more == true)
		{
			Assert(request->count == 1);
			request->entries[0].lsn_start = result->next_start_lsn;
		}
		else
		{
			break;
		}
	} // while(1)

	ChangeTracking_FreeRequest(request);
	ChangeTracking_FreeResult(result);

	Insist(NumberOfRelations == 0);

	if (mirrorDataLossOccurred)
		status = STATUS_ERROR;

	return status;
}
/*
 * Print one WAL record, honoring the global rmid/xid filters, and dispatch
 * to the resource-manager-specific printer. Also accumulates per-rmgr
 * statistics in xlogstats. With header_only, just note that the record may
 * continue into the next segment.
 */
static void
dumpXLogRecord(XLogRecord *record, bool header_only)
{
	uint8		recInfo = record->xl_info & ~XLR_INFO_MASK;

	/* Honor the user's resource-manager filter, if any. */
	if (rmid >= 0 && record->xl_rmid != rmid)
		return;

	/* Honor the user's transaction-id filter, if any. */
	if (xid != InvalidTransactionId && xid != record->xl_xid)
		return;

#ifdef NOT_USED
	printf("%u/%08X: prv %u/%08X", curRecPtr.xlogid, curRecPtr.xrecoff, record->xl_prev.xlogid, record->xl_prev.xrecoff);
	if (!XLByteEQ(record->xl_prev, prevRecPtr))
		printf("(?)");
	printf("; xid %u; ", record->xl_xid);
	if (record->xl_rmid <= RM_MAX_ID)
		printf("%s", RM_names[record->xl_rmid]);
	else
		printf("RM %2d", record->xl_rmid);
	printf(" info %02X len %u tot_len %u\n", record->xl_info, record->xl_len, record->xl_tot_len);
#endif

	if (header_only)
	{
		printf(" ** maybe continues to next segment **\n");
		return;
	}

	/*
	 * See rmgr.h for more details about the built-in resource managers.
	 */
	xlogstats.rmgr_count[record->xl_rmid]++;
	xlogstats.rmgr_len[record->xl_rmid] += record->xl_len;

	/* Dispatch to the per-resource-manager pretty-printer. */
	switch (record->xl_rmid)
	{
		case RM_XLOG_ID:
			print_rmgr_xlog(curRecPtr, record, recInfo, hideTimestamps);
			break;
		case RM_XACT_ID:
			print_rmgr_xact(curRecPtr, record, recInfo, hideTimestamps);
			break;
		case RM_SMGR_ID:
			print_rmgr_smgr(curRecPtr, record, recInfo);
			break;
		case RM_CLOG_ID:
			print_rmgr_clog(curRecPtr, record, recInfo);
			break;
		case RM_DBASE_ID:
			print_rmgr_dbase(curRecPtr, record, recInfo);
			break;
		case RM_TBLSPC_ID:
			print_rmgr_tblspc(curRecPtr, record, recInfo);
			break;
		case RM_MULTIXACT_ID:
			print_rmgr_multixact(curRecPtr, record, recInfo);
			break;
#if PG_VERSION_NUM >= 90000
		case RM_RELMAP_ID:
			print_rmgr_relmap(curRecPtr, record, recInfo);
			break;
		case RM_STANDBY_ID:
			print_rmgr_standby(curRecPtr, record, recInfo);
			break;
#endif
		case RM_HEAP2_ID:
			print_rmgr_heap2(curRecPtr, record, recInfo);
			break;
		case RM_HEAP_ID:
			print_rmgr_heap(curRecPtr, record, recInfo, statements);
			break;
		case RM_BTREE_ID:
			print_rmgr_btree(curRecPtr, record, recInfo);
			break;
		case RM_HASH_ID:
			print_rmgr_hash(curRecPtr, record, recInfo);
			break;
		case RM_GIN_ID:
			print_rmgr_gin(curRecPtr, record, recInfo);
			break;
		case RM_GIST_ID:
			print_rmgr_gist(curRecPtr, record, recInfo);
			break;
		case RM_SEQ_ID:
			print_rmgr_seq(curRecPtr, record, recInfo);
			break;
		default:
			fprintf(stderr, "Unknown RMID %d.\n", record->xl_rmid);
			break;
	}

	/*
	 * print info about backup blocks.
	 */
	print_backup_blocks(curRecPtr, record);
}
/*
 * Execute one request received by the WAL-send server for master mirroring.
 *
 * Handles the commands:
 *   PositionToEnd    - ask the standby master for its transaction-log end
 *                      location (stored in originalEndLocation).
 *   Catchup          - ship WAL from the standby's end location up to the
 *                      primary's flushed location, then enable mirroring.
 *   WriteWalPages    - buffer a copy of newly written xlog pages; note that
 *                      control then falls through into FlushWalPages.
 *   FlushWalPages    - send the buffered xlog to the standby and, when
 *                      present, the new checkpoint location.
 *   CloseForShutdown - perform normal shutdown processing.
 *
 * Fix vs. previous revision: the two occurrences of "¤tEndLocation"
 * (an HTML-entity mangling of "&curren..." ) are restored to
 * "&currentEndLocation" so the file compiles again.
 */
static void
WalSendServerDoRequest(WalSendRequest *walSendRequest)
{
	bool		successful;
	struct timeval standbyTimeout;

	WalSendServerGetStandbyTimeout(&standbyTimeout);

	switch (walSendRequest->command)
	{
		case PositionToEnd:
			elog((Debug_print_qd_mirroring ? LOG : DEBUG5), "PositionToEnd");

			successful = write_position_to_end(&originalEndLocation,
											   NULL, &walsend_shutdown_requested);
			if (successful)
				elog(LOG, "Standby master returned transaction log end location %s",
					 XLogLocationToString(&originalEndLocation));
			else
			{
				disableQDMirroring_ConnectionError(
					"Unable to connect to standby master and determine transaction log end location",
					GetStandbyErrorString());
				disconnectMirrorQD_SendClose();
			}
			break;

		case Catchup:
			elog((Debug_print_qd_mirroring ? LOG : DEBUG5), "Catchup");

			if (isQDMirroringCatchingUp())
			{
				bool		tooFarBehind = false;

				elog(LOG, "Current master transaction log is flushed through location %s",
					 XLogLocationToString(&walSendRequest->flushedLocation));

				if (XLByteLT(originalEndLocation, walSendRequest->flushedLocation))
				{
					/*
					 * Standby master is behind the primary. Send catchup WAL.
					 */

					/*
					 * Use a TRY block to catch errors from our attempt to read
					 * the primary's WAL. Errors from sending to the standby
					 * come up as a boolean return (successful).
					 */
					PG_TRY();
					{
						successful = XLogCatchupQDMirror(
														 &originalEndLocation,
														 &walSendRequest->flushedLocation,
														 &standbyTimeout,
														 &walsend_shutdown_requested);
					}
					PG_CATCH();
					{
						/*
						 * Report the error related to reading the primary's
						 * WAL to the server log, demoted to something less
						 * scary than the original severity.
						 */
						if (!elog_demote(WARNING))
						{
							elog(LOG, "unable to demote error");
							PG_RE_THROW();
						}
						EmitErrorReport();
						FlushErrorState();

						successful = false;
						tooFarBehind = true;
					}
					PG_END_TRY();

					if (successful)
					{
						elog((Debug_print_qd_mirroring ? LOG : DEBUG5),
							 "catchup send from standby end %s through primary flushed location %s",
							 XLogLocationToString(&originalEndLocation),
							 XLogLocationToString2(&walSendRequest->flushedLocation));
					}
				}
				else if (XLByteEQ(originalEndLocation, walSendRequest->flushedLocation))
				{
					elog((Debug_print_qd_mirroring ? LOG : DEBUG5), "Mirror was already caught up");
					successful = true;
				}
				else
				{
					elog(WARNING, "Standby master transaction log location %s is beyond the current master end location %s",
						 XLogLocationToString(&originalEndLocation),
						 XLogLocationToString2(&walSendRequest->flushedLocation));
					successful = false;
				}

				if (successful)
				{
					char		detail[200];
					int			count;

					count = snprintf(
									 detail, sizeof(detail),
									 "Transaction log copied from locations %s through %s to the standby master",
									 XLogLocationToString(&originalEndLocation),
									 XLogLocationToString2(&walSendRequest->flushedLocation));
					if (count >= sizeof(detail))
					{
						ereport(ERROR,
								(errcode(ERRCODE_INTERNAL_ERROR),
								 errmsg("format command string failure")));
					}

					enableQDMirroring("Master mirroring is now synchronized", detail);

					currentEndLocation = walSendRequest->flushedLocation;

					periodicLen = 0;
					periodicLocation = currentEndLocation;
				}
				else
				{
					if (tooFarBehind)
					{
						disableQDMirroring_TooFarBehind(
							"The current master was unable to synchronize the standby master "
							"because the transaction logs on the current master were recycled. "
							"A gpinitstandby (at an appropriate time) will be necessary to copy "
							"over the whole master database to the standby master so it may be synchronized");
					}
					else
					{
						disableQDMirroring_ConnectionError(
							"Connection to the standby master was lost during transaction log catchup",
							GetStandbyErrorString());
					}
					disconnectMirrorQD_SendClose();
				}
			}
			else if (isQDMirroringDisabled())
			{
				elog((Debug_print_qd_mirroring ? LOG : DEBUG5),
					 "Master Mirror Send: Master mirroring not catching-up (state is disabled)");
			}
			else
			{
				elog(ERROR, "unexpected master mirroring state %s",
					 QDMirroringStateString());
			}
			break;

		case WriteWalPages:
			if (Debug_print_qd_mirroring)
				elog(LOG, "WriteWalPages");

			if (isQDMirroringEnabled())
			{
				char	   *from;
				Size		nbytes;
				bool		more = false;

				/*
				 * For now, save copy of data until flush. This could be
				 * optimized.
				 */
				if (saveBuffer == NULL)
				{
					uint32		totalBufferLen = XLOGbuffers * XLOG_BLCKSZ;

					saveBuffer = malloc(totalBufferLen);
					if (saveBuffer == NULL)
						elog(ERROR, "Could not allocate buffer for xlog data (%d bytes)",
							 totalBufferLen);

					saveBufferLen = 0;
				}

				XLogGetBuffer(walSendRequest->startidx, walSendRequest->npages,
							  &from, &nbytes);

				if (saveBufferLen == 0)
				{
					/* first chunk since last flush: remember its position */
					more = false;
					writeLogId = walSendRequest->logId;
					writeLogSeg = walSendRequest->logSeg;
					writeLogOff = walSendRequest->logOff;

					memcpy(saveBuffer, from, nbytes);
					saveBufferLen = nbytes;
				}
				else
				{
					/* append to previously buffered data */
					more = true;
					memcpy(&saveBuffer[saveBufferLen], from, nbytes);
					saveBufferLen += nbytes;
				}

				if (Debug_print_qd_mirroring)
					elog(LOG, "Master Mirror Send: WriteWalPages (%s) startidx %d, npages %d, timeLineID %d, logId %u, logSeg %u, logOff 0x%X, nbytes 0x%X",
						 (more ? "more" : "new"),
						 walSendRequest->startidx,
						 walSendRequest->npages,
						 walSendRequest->timeLineID,
						 walSendRequest->logId,
						 walSendRequest->logSeg,
						 walSendRequest->logOff,
						 (int) nbytes);
			}

			/*
			 * NOTE(review): there is no break here, so WriteWalPages falls
			 * through into FlushWalPages. Preserved as-is — confirm against
			 * the original source whether the fall-through is intended.
			 */
		case FlushWalPages:
			if (Debug_print_qd_mirroring)
				elog(LOG, "FlushWalPages");

			if (isQDMirroringEnabled())
			{
				char		cmd[MAXFNAMELEN + 50];

				if (saveBufferLen == 0)
					successful = true;
				else
				{
					if (snprintf(cmd, sizeof(cmd), "xlog %d %d %d %d",
								 writeLogId, writeLogSeg, writeLogOff,
								 (int) saveBufferLen) >= sizeof(cmd))
						elog(ERROR, "could not create cmd for qd mirror logid %d seg %d",
							 writeLogId, writeLogSeg);

					successful = write_qd_sync(cmd, saveBuffer, saveBufferLen,
											   &standbyTimeout,
											   &walsend_shutdown_requested);
					if (successful)
					{
						XLogRecPtr	oldEndLocation;

						oldEndLocation = currentEndLocation;

						/* advance our notion of the standby's end location */
						currentEndLocation.xlogid = writeLogId;
						currentEndLocation.xrecoff = writeLogSeg * XLogSegSize + writeLogOff;
						if (currentEndLocation.xrecoff >= XLogFileSize)
						{
							(currentEndLocation.xlogid)++;
							currentEndLocation.xrecoff = 0;
						}

						if (XLByteLT(oldEndLocation, currentEndLocation))
						{
							periodicLen += saveBufferLen;
							if (periodicLen > periodicReportLen)
							{
								elog(LOG, "Master mirroring periodic report: %d bytes successfully send to standby master for locations %s through %s",
									 periodicLen,
									 XLogLocationToString(&periodicLocation),
									 XLogLocationToString2(&currentEndLocation));

								periodicLen = 0;
								periodicLocation = currentEndLocation;
							}
						}
						else
						{
							if (Debug_print_qd_mirroring)
								elog(LOG, "Send to Master mirror successful. New end location %s (old %s)",
									 XLogLocationToString(&currentEndLocation),
									 XLogLocationToString2(&oldEndLocation));
						}
					}
					else
					{
						disableQDMirroring_ConnectionError(
							"Connection to the standby master was lost attempting to send new transaction log",
							GetStandbyErrorString());
						disconnectMirrorQD_SendClose();
					}

					/*
					 * Reset so WriteWalPages can fill the buffer again.
					 */
					saveBufferLen = 0;
					writeLogId = 0;
					writeLogSeg = 0;
					writeLogOff = 0;
				}

				if (successful && walSendRequest->haveNewCheckpointLocation)
				{
					uint32		logid;
					uint32		seg;
					uint32		offset;

					elog((Debug_print_qd_mirroring ? LOG : DEBUG5),
						 "New previous checkpoint location %s",
						 XLogLocationToString(&walSendRequest->newCheckpointLocation));

					XLByteToSeg(walSendRequest->newCheckpointLocation, logid, seg);
					offset = walSendRequest->newCheckpointLocation.xrecoff % XLogSegSize;

					if (snprintf(cmd, sizeof(cmd), "new_checkpoint_location %d %d %d",
								 logid, seg, offset) >= sizeof(cmd))
						elog(ERROR, "could not create cmd for qd mirror logid %d seg %d offset %d",
							 logid, seg, offset);

					successful = write_qd_sync(cmd, NULL, 0, NULL,
											   &walsend_shutdown_requested);
					if (successful)
					{
						elog((Debug_print_qd_mirroring ? LOG : DEBUG5),
							 "Send of new checkpoint location to master mirror successful");
					}
					else
					{
						disableQDMirroring_ConnectionError(
							"Connection to the standby master was lost attempting to send new checkpoint location",
							GetStandbyErrorString());
						disconnectMirrorQD_SendClose();
					}
				}
			}
			else if (isQDMirroringDisabled())
			{
				elog((Debug_print_qd_mirroring ? LOG : DEBUG5),
					 "Master Mirror Send: Master mirroring not enabled");
			}
			else
			{
				elog(ERROR, "unexpected master mirroring state %s",
					 QDMirroringStateString());
			}
			break;

		case CloseForShutdown:
			if (Debug_print_qd_mirroring)
				elog(LOG, "CloseForShutdown");

			/*
			 * Do the work we would normally do when signaled to stop.
			 */
			WalSendServer_ServiceShutdown();
			break;

		default:
			elog(ERROR, "Unknown WalSendRequestCommand %d", walSendRequest->command);
	}
}
/*
 * Re-locate the parent entry (childoffnum) for 'child' after the parent
 * page may have been changed or split concurrently.
 *
 * If the parent page is unchanged (LSN matches the one recorded in the
 * stack and childoffnum is set), nothing needs to be done. Otherwise scan
 * the parent page and its right-link chain for the downlink pointing at
 * child->blkno. As a last resort (e.g. after a root split), re-derive the
 * whole parent chain with gistFindPath() and recurse.
 *
 * On return, parent->childoffnum identifies the downlink and the parent
 * buffer is locked exclusively.
 */
static void
gistFindCorrectParent(Relation r, GISTInsertStack *child)
{
	GISTInsertStack *parent = child->parent;

	MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD;

	LockBuffer(parent->buffer, GIST_EXCLUSIVE);
	gistcheckpage(r, parent->buffer);
	parent->page = (Page) BufferGetPage(parent->buffer);

	/* here we don't need to distinguish between split and page update */
	if (parent->childoffnum == InvalidOffsetNumber ||
		!XLByteEQ(parent->lsn, PageGetLSN(parent->page)))
	{
		/* parent is changed, look child in right links until found */
		OffsetNumber i,
					maxoff;
		ItemId		iid;
		IndexTuple	idxtuple;
		GISTInsertStack *ptr;

		while (true)
		{
			/* scan current page for a downlink to the child block */
			maxoff = PageGetMaxOffsetNumber(parent->page);
			for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
			{
				iid = PageGetItemId(parent->page, i);
				idxtuple = (IndexTuple) PageGetItem(parent->page, iid);
				if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno)
				{
					/* yes!!, found */
					parent->childoffnum = i;
					return;
				}
			}

			/* not here; follow the right-link to the next page in the chain */
			parent->blkno = GistPageGetOpaque(parent->page)->rightlink;
			UnlockReleaseBuffer(parent->buffer);
			if (parent->blkno == InvalidBlockNumber)

				/*
				 * end of chain and still didn't found parent, It's very-very
				 * rare situation when root splited
				 */
				break;
			parent->buffer = ReadBuffer(r, parent->blkno);
			LockBuffer(parent->buffer, GIST_EXCLUSIVE);
			gistcheckpage(r, parent->buffer);
			parent->page = (Page) BufferGetPage(parent->buffer);
		}

		/*
		 * awful!!, we need search tree to find parent ... , but before we
		 * should release all old parent
		 */

		ptr = child->parent->parent;	/* child->parent already released
										 * above */
		while (ptr)
		{
			ReleaseBuffer(ptr->buffer);
			ptr = ptr->parent;
		}

		/* ok, find new path */
		ptr = parent = gistFindPath(r, child->blkno);
		Assert(ptr != NULL);

		/* read all buffers as expected by caller */
		/* note we don't lock them or gistcheckpage them here! */
		while (ptr)
		{
			ptr->buffer = ReadBuffer(r, ptr->blkno);
			ptr->page = (Page) BufferGetPage(ptr->buffer);
			ptr = ptr->parent;
		}

		/* install new chain of parents to stack */
		child->parent = parent;
		parent->child = child;

		/* make recursive call to normal processing */
		gistFindCorrectParent(r, child);
	}

	return;
}
/*
 * Workhouse routine for doing insertion into a GiST index. Note that
 * this routine assumes it is invoked in a short-lived memory context,
 * so it does not bother releasing palloc'd allocations.
 *
 * Descends from the root along the minimum-penalty path, adjusting parent
 * keys on the way down, and inserts 'itup' on the chosen leaf. Uses
 * optimistic shared locks that are upgraded to exclusive only when a page
 * must be modified; concurrent splits are detected via page LSN / NSN
 * comparisons and handled by backing up to the parent and retrying.
 */
static void
gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
{
	ItemId		iid;
	IndexTuple	idxtuple;
	GISTInsertStack firststack;
	GISTInsertStack *stack;
	GISTInsertState state;
	bool		xlocked = false;	/* true while we hold GIST_EXCLUSIVE */

	memset(&state, 0, sizeof(GISTInsertState));
	state.freespace = freespace;
	state.r = r;

	/* Start from the root */
	firststack.blkno = GIST_ROOT_BLKNO;
	firststack.lsn.xrecoff = 0;
	firststack.parent = NULL;
	state.stack = stack = &firststack;

	/*
	 * Walk down along the path of smallest penalty, updating the parent
	 * pointers with the key we're inserting as we go. If we crash in the
	 * middle, the tree is consistent, although the possible parent updates
	 * were a waste.
	 */
	for (;;)
	{
		/* an invalid LSN marks a stack entry whose buffer isn't read yet */
		if (XLogRecPtrIsInvalid(stack->lsn))
			stack->buffer = ReadBuffer(state.r, stack->blkno);

		/*
		 * Be optimistic and grab shared lock first. Swap it for an exclusive
		 * lock later if we need to update the page.
		 */
		if (!xlocked)
		{
			LockBuffer(stack->buffer, GIST_SHARE);
			gistcheckpage(state.r, stack->buffer);
		}

		stack->page = (Page) BufferGetPage(stack->buffer);
		stack->lsn = PageGetLSN(stack->page);
		Assert(!RelationNeedsWAL(state.r) || !XLogRecPtrIsInvalid(stack->lsn));

		/*
		 * If this page was split but the downlink was never inserted to the
		 * parent because the inserting backend crashed before doing that,
		 * fix that now.
		 */
		if (GistFollowRight(stack->page))
		{
			if (!xlocked)
			{
				LockBuffer(stack->buffer, GIST_UNLOCK);
				LockBuffer(stack->buffer, GIST_EXCLUSIVE);
				xlocked = true;
				/* someone might've completed the split when we unlocked */
				if (!GistFollowRight(stack->page))
					continue;
			}
			gistfixsplit(&state, giststate);

			UnlockReleaseBuffer(stack->buffer);
			xlocked = false;
			state.stack = stack = stack->parent;
			continue;
		}

		if (stack->blkno != GIST_ROOT_BLKNO &&
			XLByteLT(stack->parent->lsn, GistPageGetOpaque(stack->page)->nsn))
		{
			/*
			 * Concurrent split detected. There's no guarantee that the
			 * downlink for this page is consistent with the tuple we're
			 * inserting anymore, so go back to parent and rechoose the best
			 * child.
			 */
			UnlockReleaseBuffer(stack->buffer);
			xlocked = false;
			state.stack = stack = stack->parent;
			continue;
		}

		if (!GistPageIsLeaf(stack->page))
		{
			/*
			 * This is an internal page so continue to walk down the tree.
			 * Find the child node that has the minimum insertion penalty.
			 */
			BlockNumber childblkno;
			IndexTuple	newtup;
			GISTInsertStack *item;

			stack->childoffnum = gistchoose(state.r, stack->page, itup, giststate);
			iid = PageGetItemId(stack->page, stack->childoffnum);
			idxtuple = (IndexTuple) PageGetItem(stack->page, iid);
			childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));

			/*
			 * Check that it's not a leftover invalid tuple from pre-9.1
			 */
			if (GistTupleIsInvalid(idxtuple))
				ereport(ERROR,
						(errmsg("index \"%s\" contains an inner tuple marked as invalid",
								RelationGetRelationName(r)),
						 errdetail("This is caused by an incomplete page split at crash recovery before upgrading to 9.1."),
						 errhint("Please REINDEX it.")));

			/*
			 * Check that the key representing the target child node is
			 * consistent with the key we're inserting. Update it if it's
			 * not.
			 */
			newtup = gistgetadjusted(state.r, idxtuple, itup, giststate);
			if (newtup)
			{
				/*
				 * Swap shared lock for an exclusive one. Beware, the page
				 * may change while we unlock/lock the page...
				 */
				if (!xlocked)
				{
					LockBuffer(stack->buffer, GIST_UNLOCK);
					LockBuffer(stack->buffer, GIST_EXCLUSIVE);
					xlocked = true;
					stack->page = (Page) BufferGetPage(stack->buffer);

					if (!XLByteEQ(PageGetLSN(stack->page), stack->lsn))
					{
						/* the page was changed while we unlocked it, retry */
						continue;
					}
				}

				/*
				 * Update the tuple.
				 *
				 * gistinserthere() might have to split the page to make the
				 * updated tuple fit. It will adjust the stack so that after
				 * the call, we'll be holding a lock on the page containing
				 * the tuple, which might have moved right.
				 *
				 * Except if this causes a root split, gistinserthere()
				 * returns 'true'. In that case, stack only holds the new
				 * root, and the child page was released. Have to start all
				 * over.
				 */
				if (gistinserttuples(&state, stack, giststate, &newtup, 1,
									 stack->childoffnum, InvalidBuffer))
				{
					UnlockReleaseBuffer(stack->buffer);
					xlocked = false;
					state.stack = stack = stack->parent;
					continue;
				}
			}
			LockBuffer(stack->buffer, GIST_UNLOCK);
			xlocked = false;

			/* descend to the chosen child */
			item = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
			item->blkno = childblkno;
			item->parent = stack;
			state.stack = stack = item;
		}
		else
		{
			/*
			 * Leaf page. Insert the new key. We've already updated all the
			 * parents on the way down, but we might have to split the page
			 * if it doesn't fit. gistinserthere() will take care of that.
			 */

			/*
			 * Swap shared lock for an exclusive one. Be careful, the page
			 * may change while we unlock/lock the page...
			 */
			if (!xlocked)
			{
				LockBuffer(stack->buffer, GIST_UNLOCK);
				LockBuffer(stack->buffer, GIST_EXCLUSIVE);
				xlocked = true;
				stack->page = (Page) BufferGetPage(stack->buffer);
				stack->lsn = PageGetLSN(stack->page);

				if (stack->blkno == GIST_ROOT_BLKNO)
				{
					/*
					 * the only page that can become inner instead of leaf is
					 * the root page, so for root we should recheck it
					 */
					if (!GistPageIsLeaf(stack->page))
					{
						/*
						 * very rare situation: during unlock/lock index with
						 * number of pages = 1 was increased
						 */
						LockBuffer(stack->buffer, GIST_UNLOCK);
						xlocked = false;
						continue;
					}

					/*
					 * we don't need to check root split, because checking
					 * leaf/inner is enough to recognize split for root
					 */
				}
				else if (GistFollowRight(stack->page) ||
						 XLByteLT(stack->parent->lsn,
								  GistPageGetOpaque(stack->page)->nsn))
				{
					/*
					 * The page was split while we momentarily unlocked the
					 * page. Go back to parent.
					 */
					UnlockReleaseBuffer(stack->buffer);
					xlocked = false;
					state.stack = stack = stack->parent;
					continue;
				}
			}

			/* now state.stack->(page, buffer and blkno) points to leaf page */

			gistinserttuples(&state, stack, giststate, &itup, 1,
							 InvalidOffsetNumber, InvalidBuffer);
			LockBuffer(stack->buffer, GIST_UNLOCK);

			/* Release any pins we might still hold before exiting */
			for (; stack; stack = stack->parent)
				ReleaseBuffer(stack->buffer);
			break;
		}
	}
}
/*
 * Apply a pass of standby redo for QD mirroring.
 *
 * Recovers WAL starting from *redoCheckPointLoc (discovering the recovery
 * start first if the caller passed 0/0), then checks whether redo advanced
 * the restart location. When it did, offline log segments older than the
 * new restart location are removed; otherwise the function just logs that
 * no progress was made and returns.
 */
void
cdb_perform_redo(XLogRecPtr *redoCheckPointLoc, CheckPoint *redoCheckPoint, XLogRecPtr *newCheckpointLoc)
{
	CheckPoint	priorCheckpoint;
	uint32		logId;
	uint32		segNo;
	int			removedCount;

	/* Caller passed an unset location: discover the recovery start point. */
	if (redoCheckPointLoc->xlogid == 0 && redoCheckPointLoc->xrecoff == 0)
	{
		XLogGetRecoveryStart("QDSYNC", "for redo apply",
							 redoCheckPointLoc, redoCheckPoint);
	}

	XLogStandbyRecoverRange(redoCheckPointLoc, redoCheckPoint, newCheckpointLoc);

	/*
	 * Sample the recovery start location now to see if appling redo
	 * processed checkpoint records and moved the restart location forward.
	 */
	priorCheckpoint = *redoCheckPoint;

	XLogGetRecoveryStart("QDSYNC", "for redo progress check",
						 redoCheckPointLoc, redoCheckPoint);

	if (!XLByteLT(priorCheckpoint.redo, redoCheckPoint->redo))
	{
		/* No forward progress this pass: report and stop here. */
		Assert(XLByteEQ(priorCheckpoint.redo, redoCheckPoint->redo));
		ereport(LOG,
				(errmsg("QDSYNC: transaction redo did not move the restart location %s forward this pass",
						XLogLocationToString(&priorCheckpoint.redo))));
		return;
	}

	ereport(LOG,
			(errmsg("QDSYNC: transaction redo moved the restart location from %s to %s",
					XLogLocationToString(&priorCheckpoint.redo),
					XLogLocationToString2(&redoCheckPoint->redo))));

	XLByteToSeg(redoCheckPoint->redo, logId, segNo);

	/*
	 * Delete offline log files (those no longer needed even for previous
	 * checkpoint).
	 */
	elog((Debug_print_qd_mirroring ? LOG : DEBUG5),
		 "QDSYNC: keep log files as far back as (logid %d, seg %d)",
		 logId, segNo);

	if (logId || segNo)
	{
		PrevLogSeg(logId, segNo);
		elog((Debug_print_qd_mirroring ? LOG : DEBUG5),
			 "QDSYNC: delete offline log files up to (logid %d, seg %d)",
			 logId, segNo);
		XLogRemoveStandbyLogs(logId, segNo, &removedCount);

		if (removedCount > 0)
		{
			/* extra newline keeps the log more readable */
			ereport(LOG,
					(errmsg("QDSYNC: %d logs removed through logid %d, seg %d\n",
							removedCount, logId, segNo)));
		}
	}

	/* extra newline keeps the log more readable */
	elog(LOG, "--------------------------");
}
/* * Update the LSNs on each queue based upon our latest state. This * implements a simple policy of first-valid-standby-releases-waiter. * * Other policies are possible, which would change what we do here and what * perhaps also which information we store as well. */ void SyncRepReleaseWaiters(void) { volatile WalSndCtlData *walsndctl = WalSndCtl; volatile WalSnd *syncWalSnd = NULL; int numwrite = 0; int numflush = 0; int priority = 0; int i; /* * If this WALSender is serving a standby that is not on the list of * potential standbys then we have nothing to do. If we are still starting * up, still running base backup or the current flush position is still * invalid, then leave quickly also. */ if (MyWalSnd->sync_standby_priority == 0 || MyWalSnd->state < WALSNDSTATE_STREAMING || XLByteEQ(MyWalSnd->flush, InvalidXLogRecPtr)) return; /* * We're a potential sync standby. Release waiters if we are the highest * priority standby. If there are multiple standbys with same priorities * then we use the first mentioned standby. If you change this, also * change pg_stat_get_wal_senders(). */ LWLockAcquire(SyncRepLock, LW_EXCLUSIVE); for (i = 0; i < max_wal_senders; i++) { /* use volatile pointer to prevent code rearrangement */ volatile WalSnd *walsnd = &walsndctl->walsnds[i]; if (walsnd->pid != 0 && walsnd->state == WALSNDSTATE_STREAMING && walsnd->sync_standby_priority > 0 && (priority == 0 || priority > walsnd->sync_standby_priority) && !XLByteEQ(walsnd->flush, InvalidXLogRecPtr)) { priority = walsnd->sync_standby_priority; syncWalSnd = walsnd; } } /* * We should have found ourselves at least. */ Assert(syncWalSnd); /* * If we aren't managing the highest priority standby then just leave. */ if (syncWalSnd != MyWalSnd) { LWLockRelease(SyncRepLock); announce_next_takeover = true; return; } /* * Set the lsn first so that when we wake backends they will release up to * this location. 
*/ if (XLByteLT(walsndctl->lsn[SYNC_REP_WAIT_WRITE], MyWalSnd->write)) { walsndctl->lsn[SYNC_REP_WAIT_WRITE] = MyWalSnd->write; numwrite = SyncRepWakeQueue(false, SYNC_REP_WAIT_WRITE); } if (XLByteLT(walsndctl->lsn[SYNC_REP_WAIT_FLUSH], MyWalSnd->flush)) { walsndctl->lsn[SYNC_REP_WAIT_FLUSH] = MyWalSnd->flush; numflush = SyncRepWakeQueue(false, SYNC_REP_WAIT_FLUSH); } LWLockRelease(SyncRepLock); elog(DEBUG3, "released %d procs up to write %X/%X, %d procs up to flush %X/%X", numwrite, MyWalSnd->write.xlogid, MyWalSnd->write.xrecoff, numflush, MyWalSnd->flush.xlogid, MyWalSnd->flush.xrecoff); /* * If we are managing the highest priority standby, though we weren't * prior to this, then announce we are now the sync standby. */ if (announce_next_takeover) { announce_next_takeover = false; ereport(LOG, (errmsg("standby \"%s\" is now the synchronous standby with priority %u", application_name, MyWalSnd->sync_standby_priority))); } }