/*
 * Replay the clearing of the F_FOLLOW_RIGHT flag on block 'leftblkno'.
 *
 * node:      identifies the relation the block belongs to
 * lsn:       LSN of the WAL record being replayed
 * leftblkno: block whose follow-right flag is cleared
 *
 * Nothing happens when XLogReadBuffer hands back an invalid buffer.
 */
static void
gistRedoClearFollowRight(RelFileNode node, XLogRecPtr lsn, BlockNumber leftblkno)
{
	Buffer		leftbuf = XLogReadBuffer(node, leftblkno, false);
	Page		leftpage;

	if (!BufferIsValid(leftbuf))
		return;

	leftpage = (Page) BufferGetPage(leftbuf);

	/*
	 * The page is skipped only when its LSN is strictly newer than this
	 * record.  A page whose LSN equals the record's LSN is still updated,
	 * because the updated NSN is not included in the full page image.
	 */
	if (XLByteLT(lsn, PageGetLSN(leftpage)))
	{
		UnlockReleaseBuffer(leftbuf);
		return;
	}

	GistPageGetOpaque(leftpage)->nsn = lsn;
	GistClearFollowRight(leftpage);

	PageSetLSN(leftpage, lsn);
	PageSetTLI(leftpage, ThisTimeLineID);
	MarkBufferDirty(leftbuf);

	UnlockReleaseBuffer(leftbuf);
}
/* * Insert MyProc into SyncRepQueue, maintaining sorted invariant. * * Usually we will go at tail of queue, though it's possible that we arrive * here out of order, so start at tail and work back to insertion point. */ static void SyncRepQueueInsert(void) { PGPROC *proc; proc = (PGPROC *) SHMQueuePrev(&(WalSndCtl->SyncRepQueue), &(WalSndCtl->SyncRepQueue), offsetof(PGPROC, syncRepLinks)); while (proc) { /* * Stop at the queue element that we should after to * ensure the queue is ordered by LSN. */ if (XLByteLT(proc->waitLSN, MyProc->waitLSN)) break; proc = (PGPROC *) SHMQueuePrev(&(WalSndCtl->SyncRepQueue), &(proc->syncRepLinks), offsetof(PGPROC, syncRepLinks)); } if (proc) SHMQueueInsertAfter(&(proc->syncRepLinks), &(MyProc->syncRepLinks)); else SHMQueueInsertAfter(&(WalSndCtl->SyncRepQueue), &(MyProc->syncRepLinks)); }
/* * GetUndoRecPtr -- returns oldest PGPROC->logRec. */ XLogRecPtr GetUndoRecPtr(void) { SISeg *segP = shmInvalBuffer; ProcState *stateP = segP->procState; XLogRecPtr urec = {0, 0}; XLogRecPtr tempr; int index; LWLockAcquire(SInvalLock, LW_SHARED); for (index = 0; index < segP->lastBackend; index++) { SHMEM_OFFSET pOffset = stateP[index].procStruct; if (pOffset != INVALID_OFFSET) { PGPROC *proc = (PGPROC *) MAKE_PTR(pOffset); tempr = proc->logRec; if (tempr.xrecoff == 0) continue; if (urec.xrecoff != 0 && XLByteLT(urec, tempr)) continue; urec = tempr; } } LWLockRelease(SInvalLock); return (urec); }
/*
 *	visibilitymap_set - set a bit on a previously pinned page
 *
 * recptr is the LSN of the heap page.  The visibility map page's LSN is
 * advanced to it (never moved backwards), so that the map page is not
 * flushed to disk before the heap page update that made all tuples visible.
 *
 * This function is opportunistic: it does nothing unless *buf is a valid
 * buffer holding the map block that contains the bit for heapBlk.  Call
 * visibilitymap_pin first to pin the right map page.  No I/O is done here.
 */
void
visibilitymap_set(Relation rel, BlockNumber heapBlk, XLogRecPtr recptr,
				  Buffer *buf)
{
	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
	uint32		byteIdx = HEAPBLK_TO_MAPBYTE(heapBlk);
	uint8		bitIdx = HEAPBLK_TO_MAPBIT(heapBlk);
	int			bitMask = 1 << bitIdx;
	Page		vmPage;
	char	   *bits;

#ifdef TRACE_VISIBILITYMAP
	elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk);
#endif

	/* Bail out unless the caller has the right map page pinned */
	if (!BufferIsValid(*buf))
		return;
	if (BufferGetBlockNumber(*buf) != mapBlock)
		return;

	vmPage = BufferGetPage(*buf);
	bits = PageGetContents(vmPage);

	LockBuffer(*buf, BUFFER_LOCK_EXCLUSIVE);

	/* Only touch (and dirty) the page if the bit is not already set */
	if ((bits[byteIdx] & bitMask) == 0)
	{
		bits[byteIdx] |= bitMask;

		if (XLByteLT(PageGetLSN(vmPage), recptr))
			PageSetLSN(vmPage, recptr);
		PageSetTLI(vmPage, ThisTimeLineID);
		MarkBufferDirty(*buf);
	}

	LockBuffer(*buf, BUFFER_LOCK_UNLOCK);
}
/* * Insert MyProc into the specified SyncRepQueue, maintaining sorted invariant. * * Usually we will go at tail of queue, though it's possible that we arrive * here out of order, so start at tail and work back to insertion point. */ static void SyncRepQueueInsert(int mode) { PGPROC *proc; Assert(mode >= 0 && mode < NUM_SYNC_REP_WAIT_MODE); proc = (PGPROC *) SHMQueuePrev(&(WalSndCtl->SyncRepQueue[mode]), &(WalSndCtl->SyncRepQueue[mode]), offsetof(PGPROC, syncRepLinks)); while (proc) { /* * Stop at the queue element that we should after to ensure the queue * is ordered by LSN. */ if (XLByteLT(proc->waitLSN, MyProc->waitLSN)) break; proc = (PGPROC *) SHMQueuePrev(&(WalSndCtl->SyncRepQueue[mode]), &(proc->syncRepLinks), offsetof(PGPROC, syncRepLinks)); } if (proc) SHMQueueInsertAfter(&(proc->syncRepLinks), &(MyProc->syncRepLinks)); else SHMQueueInsertAfter(&(WalSndCtl->SyncRepQueue[mode]), &(MyProc->syncRepLinks)); }
/*
 * Record a new checkpoint location for the QD mirror in ftsQDMirrorInfo.
 *
 * The saved location is what gets sent to the QD mirror on the WAL send
 * request.
 *
 * The request is ignored when the mirror state is not
 * QDMIRROR_STATE_SYNCHRONIZED, and also when a location has already been
 * saved and the new one is strictly less than it.  All checks and the
 * update happen while holding ftsQDMirrorLock exclusively.
 */
void
FtsQDMirroringNewCheckpointLoc(XLogRecPtr *newCheckpointLocation)
{
	Assert(newCheckpointLocation != NULL);

	LWLockAcquire(ftsQDMirrorLock, LW_EXCLUSIVE);

	if (ftsQDMirrorInfo->state == QDMIRROR_STATE_SYNCHRONIZED &&
		!(ftsQDMirrorInfo->haveNewCheckpointLocation &&
		  XLByteLT(*newCheckpointLocation, ftsQDMirrorInfo->newCheckpointLocation)))
	{
		ftsQDMirrorInfo->haveNewCheckpointLocation = true;
		ftsQDMirrorInfo->newCheckpointLocation = *newCheckpointLocation;
	}

	LWLockRelease(ftsQDMirrorLock);
}
/* Flush the log to disk */ static void XLogWalRcvFlush(void) { if (XLByteLT(LogstreamResult.Flush, LogstreamResult.Write)) { /* use volatile pointer to prevent code rearrangement */ volatile WalRcvData *walrcv = WalRcv; issue_xlog_fsync(recvFile, recvId, recvSeg); LogstreamResult.Flush = LogstreamResult.Write; /* Update shared-memory status */ SpinLockAcquire(&walrcv->mutex); walrcv->latestChunkStart = walrcv->receivedUpto; walrcv->receivedUpto = LogstreamResult.Flush; SpinLockRelease(&walrcv->mutex); /* Report XLOG streaming progress in PS display */ if (update_process_title) { char activitymsg[50]; snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%X", LogstreamResult.Write.xlogid, LogstreamResult.Write.xrecoff); set_ps_display(activitymsg, false); } } }
/* * Returns the oldest Send position among walsenders. Or InvalidXLogRecPtr * if none. */ XLogRecPtr GetOldestWALSendPointer(void) { XLogRecPtr oldest = {0, 0}; int i; bool found = false; for (i = 0; i < max_wal_senders; i++) { /* use volatile pointer to prevent code rearrangement */ volatile WalSnd *walsnd = &WalSndCtl->walsnds[i]; XLogRecPtr recptr; if (walsnd->pid == 0) continue; SpinLockAcquire(&walsnd->mutex); recptr = walsnd->sentPtr; SpinLockRelease(&walsnd->mutex); if (recptr.xlogid == 0 && recptr.xrecoff == 0) continue; if (!found || XLByteLT(recptr, oldest)) oldest = recptr; found = true; } return oldest; }
/*
 * Flush the log to disk.
 *
 * Does nothing unless LogstreamResult.Write is ahead of
 * LogstreamResult.Flush.
 *
 * dying: if we're in the midst of dying, it's unwise to do anything that
 * might throw an error, so the reply and hot-standby feedback to the master
 * are skipped in that case.
 */
static void
XLogWalRcvFlush(bool dying)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile WalRcvData *walrcv = WalRcv;

	/* Everything written so far has already been flushed */
	if (!XLByteLT(LogstreamResult.Flush, LogstreamResult.Write))
		return;

	issue_xlog_fsync(recvFile, recvId, recvSeg);
	LogstreamResult.Flush = LogstreamResult.Write;

	/* Publish the new position in shared memory, but never move it back */
	SpinLockAcquire(&walrcv->mutex);
	if (XLByteLT(walrcv->receivedUpto, LogstreamResult.Flush))
	{
		walrcv->latestChunkStart = walrcv->receivedUpto;
		walrcv->receivedUpto = LogstreamResult.Flush;
	}
	SpinLockRelease(&walrcv->mutex);

	/* Tell the startup process and walsender that new WAL has arrived */
	WakeupRecovery();
	if (AllowCascadeReplication())
		WalSndWakeup();

	/* Show XLOG streaming progress in the PS display */
	if (update_process_title)
	{
		char		psmsg[50];

		snprintf(psmsg, sizeof(psmsg), "streaming %X/%X",
				 LogstreamResult.Write.xlogid,
				 LogstreamResult.Write.xrecoff);
		set_ps_display(psmsg, false);
	}

	/* Also let the master know that we made some progress */
	if (dying)
		return;

	XLogWalRcvSendReply();
	XLogWalRcvSendHSFeedback();
}
/*
 * Replay an update of the "last words" fields of one LOV item.
 *
 * redo:   must be true; undo is not implemented and raises PANIC
 * lsn:    LSN of the record being replayed
 * record: WAL record whose data is an xl_bm_bitmap_lastwords
 *
 * Returns silently if the relation cannot be opened.  The LOV page is only
 * modified when its LSN is strictly less than lsn.
 */
static void
bitmap_xlog_insert_bitmap_lastwords(bool redo, XLogRecPtr lsn, XLogRecord* record)
{
	xl_bm_bitmap_lastwords *xlrec;
	Relation	reln;
	Buffer		lovBuffer;
	Page		lovPage;
	BMLOVItem	target;

	xlrec = (xl_bm_bitmap_lastwords *) XLogRecGetData(record);

	reln = XLogOpenRelation(xlrec->bm_node);
	if (!RelationIsValid(reln))
		return;

	if (!redo)
	{
		elog(PANIC, "bm_insert_undo: not implemented.");
		return;
	}

#ifdef BM_DEBUG
	ereport(LOG, (errcode(LOG), errmsg("call bitmap_xlog_insert_bitmap_lastwords: redo=%d\n", redo)));
#endif

	lovBuffer = XLogReadBuffer(false, reln, xlrec->bm_lov_blkno);
	if (!BufferIsValid(lovBuffer))
		elog(PANIC, "bm_insert_redo: block unfound: %d", xlrec->bm_lov_blkno);

	lovPage = BufferGetPage(lovBuffer);

	/* Page already reflects this record (or a later one): just release it */
	if (!XLByteLT(PageGetLSN(lovPage), lsn))
	{
		_bitmap_relbuf(lovBuffer);
		return;
	}

	target = (BMLOVItem) PageGetItem(lovPage,
									 PageGetItemId(lovPage, xlrec->bm_lov_offset));

	target->bm_last_compword = xlrec->bm_last_compword;
	target->bm_last_word = xlrec->bm_last_word;
	target->bm_last_two_headerbits = xlrec->bm_last_two_headerbits;

	PageSetLSN(lovPage, lsn);
	PageSetTLI(lovPage, ThisTimeLineID);
	_bitmap_wrtbuf(lovBuffer);
}
/*
 * If 'page' (the page for stack->blkno) looks like it was split after its
 * parent was visited, link a new stack item for its right sibling directly
 * after 'stack'.
 *
 * A split is assumed when all of the following hold: the page is not the
 * root, stack->parentlsn is valid, stack->parentlsn is strictly less than
 * the page's NSN, and the page has a right link.  The new item inherits
 * stack->parentlsn.
 */
static void
pushStackIfSplited(Page page, GistBDItem *stack)
{
	GISTPageOpaque opaque = GistPageGetOpaque(page);
	GistBDItem *sibling;

	if (stack->blkno == GIST_ROOT_BLKNO)
		return;
	if (XLogRecPtrIsInvalid(stack->parentlsn))
		return;
	if (!XLByteLT(stack->parentlsn, opaque->nsn))
		return;
	if (opaque->rightlink == InvalidBlockNumber)	/* sanity check */
		return;

	/* split page detected, install right link to the stack */
	sibling = (GistBDItem *) palloc(sizeof(GistBDItem));

	sibling->blkno = opaque->rightlink;
	sibling->parentlsn = stack->parentlsn;
	sibling->next = stack->next;
	stack->next = sibling;
}
/*
 * Replay a bitmap index metapage record.
 *
 * redo:   must be true; undo is not implemented and raises PANIC
 * lsn:    LSN of the record being replayed
 * record: WAL record whose data is an xl_bm_metapage
 *
 * Returns silently if the relation cannot be opened.  When the metapage's
 * LSN is strictly less than lsn, the page LSN/TLI are stamped and the buffer
 * is written; no other page content is changed here.
 */
static void
bitmap_xlog_insert_meta(bool redo, XLogRecPtr lsn, XLogRecord* record)
{
	xl_bm_metapage *xlrec;
	Relation	reln;
	Buffer		metaBuffer;
	BMMetaPage	meta;

	xlrec = (xl_bm_metapage *) XLogRecGetData(record);

	reln = XLogOpenRelation(xlrec->bm_node);
	if (!RelationIsValid(reln))
		return;

	if (!redo)
	{
		elog(PANIC, "bm_insert_undo: not implemented.");
		return;
	}

#ifdef BM_DEBUG
	ereport(LOG, (errcode(LOG), errmsg("call bitmap_xlog_insert_meta: redo=%d\n", redo)));
#endif

	metaBuffer = XLogReadBuffer(false, reln, BM_METAPAGE);
	if (!BufferIsValid(metaBuffer))
		elog(PANIC, "bm_insert_redo: block unfound: %d", BM_METAPAGE);

	/* restore the page */
	meta = (BMMetaPage) BufferGetPage(metaBuffer);

	/* Page already reflects this record (or a later one): just release it */
	if (!XLByteLT(PageGetLSN(meta), lsn))
	{
		_bitmap_relbuf(metaBuffer);
		return;
	}

	PageSetLSN(meta, lsn);
	PageSetTLI(meta, ThisTimeLineID);
	_bitmap_wrtbuf(metaBuffer);
}
/*
 * Scan all items on the GiST index page identified by *pageItem, and insert
 * them into the queue (or directly to output areas)
 *
 * scan: index scan we are executing
 * pageItem: search queue item identifying an index page to scan
 * myDistances: distances array associated with pageItem, or NULL at the root
 * tbm: if not NULL, gistgetbitmap's output bitmap
 * ntids: if not NULL, gistgetbitmap's output tuple counter
 *
 * If tbm/ntids aren't NULL, we are doing an amgetbitmap scan, and heap
 * tuples should be reported directly into the bitmap.  If they are NULL,
 * we're doing a plain or ordered indexscan.  For a plain indexscan, heap
 * tuple TIDs are returned into so->pageData[].  For an ordered indexscan,
 * heap tuple TIDs are pushed into individual search queue items.
 *
 * If we detect that the index page has split since we saw its downlink
 * in the parent, we push its new right sibling onto the queue so the
 * sibling will be processed next.
 *
 * The page is read, share-locked and checked on entry, and unlocked and
 * released before returning.  so->nPageData and so->curPageData are reset
 * to zero before the page's tuples are examined.
 */
static void
gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances, TIDBitmap *tbm, int64 *ntids)
{
	GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
	Buffer		buffer;
	Page		page;
	GISTPageOpaque opaque;
	OffsetNumber maxoff;
	OffsetNumber i;
	GISTSearchTreeItem *tmpItem = so->tmpTreeItem;	/* scratch node reused for every rb_insert below */
	bool		isNew;
	MemoryContext oldcxt;

	/* pageItem must describe an index page, not a heap tuple */
	Assert(!GISTSearchItemIsHeap(*pageItem));

	buffer = ReadBuffer(scan->indexRelation, pageItem->blkno);
	LockBuffer(buffer, GIST_SHARE);
	gistcheckpage(scan->indexRelation, buffer);
	page = BufferGetPage(buffer);
	opaque = GistPageGetOpaque(page);

	/*
	 * Check if we need to follow the rightlink. We need to follow it if the
	 * page was concurrently split since we visited the parent (in which case
	 * parentlsn < nsn), or if the system crashed after a page split but
	 * before the downlink was inserted into the parent.
	 */
	if (!XLogRecPtrIsInvalid(pageItem->data.parentlsn) &&
		(GistFollowRight(page) ||
		 XLByteLT(pageItem->data.parentlsn, opaque->nsn)) &&
		opaque->rightlink != InvalidBlockNumber /* sanity check */ )
	{
		/* There was a page split, follow right link to add pages */
		GISTSearchItem *item;

		/* This can't happen when starting at the root */
		Assert(myDistances != NULL);

		/* queue items are allocated in the queue's own memory context */
		oldcxt = MemoryContextSwitchTo(so->queueCxt);

		/* Create new GISTSearchItem for the right sibling index page */
		item = palloc(sizeof(GISTSearchItem));
		item->next = NULL;
		item->blkno = opaque->rightlink;
		/* the sibling is judged against the same parent LSN as this page */
		item->data.parentlsn = pageItem->data.parentlsn;

		/* Insert it into the queue using same distances as for this page */
		tmpItem->head = item;
		tmpItem->lastHeap = NULL;
		memcpy(tmpItem->distances, myDistances,
			   sizeof(double) * scan->numberOfOrderBys);

		(void) rb_insert(so->queue, (RBNode *) tmpItem, &isNew);

		MemoryContextSwitchTo(oldcxt);
	}

	/* start with an empty per-page result array */
	so->nPageData = so->curPageData = 0;

	/*
	 * check all tuples on page
	 */
	maxoff = PageGetMaxOffsetNumber(page);
	for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
	{
		IndexTuple	it = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
		bool		match;
		bool		recheck;

		/*
		 * Must call gistindex_keytest in tempCxt, and clean up any leftover
		 * junk afterward.
		 */
		oldcxt = MemoryContextSwitchTo(so->giststate->tempCxt);

		match = gistindex_keytest(scan, it, page, i, &recheck);

		MemoryContextSwitchTo(oldcxt);
		MemoryContextReset(so->giststate->tempCxt);

		/* Ignore tuple if it doesn't match */
		if (!match)
			continue;

		if (tbm && GistPageIsLeaf(page))
		{
			/*
			 * getbitmap scan, so just push heap tuple TIDs into the bitmap
			 * without worrying about ordering
			 */
			tbm_add_tuples(tbm, &it->t_tid, 1, recheck);
			(*ntids)++;
		}
		else if (scan->numberOfOrderBys == 0 && GistPageIsLeaf(page))
		{
			/*
			 * Non-ordered scan, so report heap tuples in so->pageData[]
			 */
			so->pageData[so->nPageData].heapPtr = it->t_tid;
			so->pageData[so->nPageData].recheck = recheck;
			so->nPageData++;
		}
		else
		{
			/*
			 * Must push item into search queue.  We get here for any lower
			 * index page, and also for heap tuples if doing an ordered
			 * search.
			 */
			GISTSearchItem *item;

			/* queue items are allocated in the queue's own memory context */
			oldcxt = MemoryContextSwitchTo(so->queueCxt);

			/* Create new GISTSearchItem for this item */
			item = palloc(sizeof(GISTSearchItem));
			item->next = NULL;

			if (GistPageIsLeaf(page))
			{
				/* Creating heap-tuple GISTSearchItem */
				item->blkno = InvalidBlockNumber;
				item->data.heap.heapPtr = it->t_tid;
				item->data.heap.recheck = recheck;
			}
			else
			{
				/* Creating index-page GISTSearchItem */
				item->blkno = ItemPointerGetBlockNumber(&it->t_tid);
				/* lsn of current page is lsn of parent page for child */
				item->data.parentlsn = PageGetLSN(page);
			}

			/* Insert it into the queue using new distance data */
			tmpItem->head = item;
			tmpItem->lastHeap = GISTSearchItemIsHeap(*item) ? item : NULL;
			memcpy(tmpItem->distances, so->distances,
				   sizeof(double) * scan->numberOfOrderBys);

			(void) rb_insert(so->queue, (RBNode *) tmpItem, &isNew);

			MemoryContextSwitchTo(oldcxt);
		}
	}

	UnlockReleaseBuffer(buffer);
}
/*
 * Physical write of a page from a buffer slot
 *
 * On failure, we cannot just ereport(ERROR) since caller has put state in
 * shared memory that must be undone.  So, we return FALSE and save enough
 * info in static variables to let SlruReportIOError make the report.
 *
 * For now, assume it's not worth keeping a file pointer open across
 * independent read/write operations.  We do batch operations during
 * SimpleLruFlush, though.
 *
 * ctl:    SLRU control structure
 * pageno: page number to write; selects the segment file and offset in it
 * slotno: buffer slot holding the page image
 * fdata:  NULL for a standalone write, pointer to open-file info during
 *         SimpleLruFlush
 *
 * Returns true on success.  On failure returns false with slru_errcause and
 * slru_errno set; the file descriptor is closed here unless it is being
 * tracked in fdata.
 */
static bool
SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
{
	SlruShared	shared = ctl->shared;
	int			segno = pageno / SLRU_PAGES_PER_SEGMENT;
	int			rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
	int			offset = rpageno * BLCKSZ;
	char		path[MAXPGPATH];
	int			fd = -1;

	/*
	 * Honor the write-WAL-before-data rule, if appropriate, so that we do not
	 * write out data before associated WAL records.  This is the same action
	 * performed during FlushBuffer() in the main buffer manager.
	 */
	if (shared->group_lsn != NULL)
	{
		/*
		 * We must determine the largest async-commit LSN for the page. This
		 * is a bit tedious, but since this entire function is a slow path
		 * anyway, it seems better to do this here than to maintain a per-page
		 * LSN variable (which'd need an extra comparison in the
		 * transaction-commit path).
		 */
		XLogRecPtr	max_lsn;
		int			lsnindex,
					lsnoff;

		lsnindex = slotno * shared->lsn_groups_per_page;
		max_lsn = shared->group_lsn[lsnindex++];
		for (lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++)
		{
			XLogRecPtr	this_lsn = shared->group_lsn[lsnindex++];

			if (XLByteLT(max_lsn, this_lsn))
				max_lsn = this_lsn;
		}

		if (!XLogRecPtrIsInvalid(max_lsn))
		{
			/*
			 * As noted above, elog(ERROR) is not acceptable here, so if
			 * XLogFlush were to fail, we must PANIC.  This isn't much of a
			 * restriction because XLogFlush is just about all critical
			 * section anyway, but let's make sure.
			 */
			START_CRIT_SECTION();
			XLogFlush(max_lsn);
			END_CRIT_SECTION();
		}
	}

	/*
	 * During a Flush, we may already have the desired file open.
	 */
	if (fdata)
	{
		int			i;

		for (i = 0; i < fdata->num_files; i++)
		{
			if (fdata->segno[i] == segno)
			{
				fd = fdata->fd[i];
				break;
			}
		}
	}

	if (fd < 0)
	{
		/*
		 * If the file doesn't already exist, we should create it.  It is
		 * possible for this to need to happen when writing a page that's not
		 * first in its segment; we assume the OS can cope with that. (Note:
		 * it might seem that it'd be okay to create files only when
		 * SimpleLruZeroPage is called for the first page of a segment.
		 * However, if after a crash and restart the REDO logic elects to
		 * replay the log from a checkpoint before the latest one, then it's
		 * possible that we will get commands to set transaction status of
		 * transactions that have already been truncated from the commit log.
		 * Easiest way to deal with that is to accept references to
		 * nonexistent files here and in SlruPhysicalReadPage.)
		 *
		 * Note: it is possible for more than one backend to be executing this
		 * code simultaneously for different pages of the same file. Hence,
		 * don't use O_EXCL or O_TRUNC or anything like that.
		 */
		SlruFileName(ctl, path, segno);
		fd = BasicOpenFile(path, O_RDWR | O_CREAT | PG_BINARY,
						   S_IRUSR | S_IWUSR);
		if (fd < 0)
		{
			slru_errcause = SLRU_OPEN_FAILED;
			slru_errno = errno;
			return false;
		}

		if (fdata)
		{
			if (fdata->num_files < MAX_FLUSH_BUFFERS)
			{
				fdata->fd[fdata->num_files] = fd;
				fdata->segno[fdata->num_files] = segno;
				fdata->num_files++;
			}
			else
			{
				/*
				 * In the unlikely event that we exceed MAX_FLUSH_BUFFERS,
				 * fall back to treating it as a standalone write.
				 */
				fdata = NULL;
			}
		}
	}

	if (lseek(fd, (off_t) offset, SEEK_SET) < 0)
	{
		slru_errcause = SLRU_SEEK_FAILED;
		slru_errno = errno;
		if (!fdata)
			close(fd);
		return false;
	}

	/* clear errno so a short write that leaves it untouched can be detected */
	errno = 0;
	if (write(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ)
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
		slru_errcause = SLRU_WRITE_FAILED;
		slru_errno = errno;
		if (!fdata)
			close(fd);
		return false;
	}

#ifdef XP_TRACE_LRU_WRITE
	{
		struct timeval tv;

		/*
		 * path is only filled in above when we had to open the file
		 * ourselves.  Rebuild it here so the trace never prints an
		 * uninitialized buffer when the descriptor came from fdata.
		 */
		SlruFileName(ctl, path, segno);

		gettimeofday(&tv, NULL);
		ereport(TRACE_LEVEL,
				(errmsg("%ld.%ld:\tWRITE:\tSlruPhysicalWritePage:\tfile:%s",
						(long) tv.tv_sec, (long) tv.tv_usec, path)));
	}
#endif

	/*
	 * If not part of Flush, need to fsync now.  We assume this happens
	 * infrequently enough that it's not a performance issue.
	 */
	if (!fdata)
	{
		if (ctl->do_fsync && pg_fsync(fd))
		{
			slru_errcause = SLRU_FSYNC_FAILED;
			slru_errno = errno;
			close(fd);
			return false;
		}

		if (close(fd))
		{
			slru_errcause = SLRU_CLOSE_FAILED;
			slru_errno = errno;
			return false;
		}
	}

	return true;
}
/*
 * Apply one pass of redo and, if the restart location advanced, remove log
 * files that are no longer needed.
 *
 * redoCheckPointLoc: in/out; if it is {0, 0} on entry it is first filled in
 *                    via XLogGetRecoveryStart
 * redoCheckPoint:    in/out checkpoint record matching redoCheckPointLoc
 * newCheckpointLoc:  passed through to XLogStandbyRecoverRange
 *
 * After the redo pass the recovery start is sampled again.  If its redo
 * pointer did not move forward the function logs that and returns without
 * removing anything.
 */
void
cdb_perform_redo(XLogRecPtr *redoCheckPointLoc, CheckPoint *redoCheckPoint, XLogRecPtr *newCheckpointLoc)
{
	CheckPoint	before;
	uint32		logid;
	uint32		seg;
	int			nsegsremoved;

	/* No starting point supplied yet: look it up */
	if (redoCheckPointLoc->xlogid == 0 && redoCheckPointLoc->xrecoff == 0)
		XLogGetRecoveryStart("QDSYNC", "for redo apply",
							 redoCheckPointLoc, redoCheckPoint);

	XLogStandbyRecoverRange(redoCheckPointLoc, redoCheckPoint, newCheckpointLoc);

	/*
	 * Sample the recovery start location now to see if applying redo
	 * processed checkpoint records and moved the restart location forward.
	 */
	before = *redoCheckPoint;

	XLogGetRecoveryStart("QDSYNC", "for redo progress check",
						 redoCheckPointLoc, redoCheckPoint);

	if (!XLByteLT(before.redo,redoCheckPoint->redo))
	{
		/* not moving forward is expected to mean "did not move at all" */
		Assert(XLByteEQ(before.redo,redoCheckPoint->redo));

		ereport(LOG,
				(errmsg("QDSYNC: transaction redo did not move the restart location %s forward this pass",
						XLogLocationToString(&before.redo))));
		return;
	}

	ereport(LOG,
			(errmsg("QDSYNC: transaction redo moved the restart location from %s to %s",
					XLogLocationToString(&before.redo),
					XLogLocationToString2(&redoCheckPoint->redo))));

	XLByteToSeg(redoCheckPoint->redo, logid, seg);

	/*
	 * Delete offline log files (those no longer needed even for previous
	 * checkpoint).
	 */
	elog((Debug_print_qd_mirroring ? LOG : DEBUG5),
		 "QDSYNC: keep log files as far back as (logid %d, seg %d)",
		 logid, seg);

	/* Segment (0, 0) has no predecessor, so there is nothing to remove */
	if (logid != 0 || seg != 0)
	{
		PrevLogSeg(logid, seg);

		elog((Debug_print_qd_mirroring ? LOG : DEBUG5),
			 "QDSYNC: delete offline log files up to (logid %d, seg %d)",
			 logid, seg);

		XLogRemoveStandbyLogs(logid, seg, &nsegsremoved);

		/* The trailing newline in the message makes the log more readable */
		if (nsegsremoved > 0)
			ereport(LOG,
					(errmsg("QDSYNC: %d logs removed through logid %d, seg %d\n",
							nsegsremoved, logid, seg)));
	}

	/* Separator line to make the log more readable */
	elog(LOG,"--------------------------");
}
/*
 * Execute one request sent to the WAL send server.
 *
 * walSendRequest->command selects the action:
 *
 *	PositionToEnd    - ask the standby for its transaction log end location
 *					   (stored in originalEndLocation).
 *	Catchup          - if mirroring is in the catching-up state, send the WAL
 *					   between originalEndLocation and the request's
 *					   flushedLocation, then enable mirroring on success or
 *					   disable it on failure.
 *	WriteWalPages    - append the requested WAL buffer pages to saveBuffer.
 *	FlushWalPages    - send saveBuffer to the standby, then (optionally) send
 *					   a new checkpoint location.
 *	CloseForShutdown - perform the normal shutdown work.
 *
 * Any other command raises ERROR.  Whenever talking to the standby fails,
 * mirroring is disabled and the connection is closed.
 */
static void
WalSendServerDoRequest(WalSendRequest *walSendRequest)
{
	bool		successful;
	struct timeval standbyTimeout;

	WalSendServerGetStandbyTimeout(&standbyTimeout);

	switch (walSendRequest->command)
	{
		case PositionToEnd:
			elog((Debug_print_qd_mirroring ? LOG : DEBUG5), "PositionToEnd");

			successful = write_position_to_end(&originalEndLocation,
											   NULL, &walsend_shutdown_requested);
			if (successful)
				elog(LOG,"Standby master returned transaction log end location %s",
					 XLogLocationToString(&originalEndLocation));
			else
			{
				disableQDMirroring_ConnectionError(
					"Unable to connect to standby master and determine transaction log end location",
					GetStandbyErrorString());
				disconnectMirrorQD_SendClose();
			}
			break;

		case Catchup:
			elog((Debug_print_qd_mirroring ? LOG : DEBUG5), "Catchup");

			if (isQDMirroringCatchingUp())
			{
				/* set when reading the primary's own WAL failed */
				bool		tooFarBehind = false;

				elog(LOG,"Current master transaction log is flushed through location %s",
					 XLogLocationToString(&walSendRequest->flushedLocation));

				if (XLByteLT(originalEndLocation, walSendRequest->flushedLocation))
				{
					/*
					 * Standby master is behind the primary.  Send catchup WAL.
					 */

					/*
					 * Use a TRY block to catch errors from our attempt to read
					 * the primary's WAL.  Errors from sending to the standby
					 * come up as a boolean return (successful).
					 */
					PG_TRY();
					{
						successful = XLogCatchupQDMirror(
											&originalEndLocation,
											&walSendRequest->flushedLocation,
											&standbyTimeout,
											&walsend_shutdown_requested);
					}
					PG_CATCH();
					{
						/*
						 * Report the error related to reading the primary's WAL
						 * to the server log
						 */

						/*
						 * But first demote the error to something much less
						 * scary.
						 */
						if (!elog_demote(WARNING))
						{
							elog(LOG,"unable to demote error");
							PG_RE_THROW();
						}

						EmitErrorReport();
						FlushErrorState();

						successful = false;
						tooFarBehind = true;
					}
					PG_END_TRY();

					if (successful)
					{
						elog((Debug_print_qd_mirroring ? LOG : DEBUG5),
							 "catchup send from standby end %s through primary flushed location %s",
							 XLogLocationToString(&originalEndLocation),
							 XLogLocationToString2(&walSendRequest->flushedLocation));
					}
				}
				else if (XLByteEQ(originalEndLocation, walSendRequest->flushedLocation))
				{
					elog((Debug_print_qd_mirroring ? LOG : DEBUG5),"Mirror was already caught up");
					successful = true;
				}
				else
				{
					elog(WARNING,"Standby master transaction log location %s is beyond the current master end location %s",
						 XLogLocationToString(&originalEndLocation),
						 XLogLocationToString2(&walSendRequest->flushedLocation));
					successful = false;
				}

				if (successful)
				{
					char		detail[200];
					int			count;

					count = snprintf(
								 detail, sizeof(detail),
								 "Transaction log copied from locations %s through %s to the standby master",
								 XLogLocationToString(&originalEndLocation),
								 XLogLocationToString2(&walSendRequest->flushedLocation));
					if (count >= sizeof(detail))
					{
						ereport(ERROR,
								(errcode(ERRCODE_INTERNAL_ERROR),
								 errmsg("format command string failure")));
					}

					enableQDMirroring("Master mirroring is now synchronized", detail);

					/* start tracking progress from the caught-up position */
					currentEndLocation = walSendRequest->flushedLocation;

					periodicLen = 0;
					periodicLocation = currentEndLocation;
				}
				else
				{
					if (tooFarBehind)
					{
						disableQDMirroring_TooFarBehind(
							"The current master was unable to synchronize the standby master "
							"because the transaction logs on the current master were recycled.  "
							"A gpinitstandby (at an appropriate time) will be necessary to copy "
							"over the whole master database to the standby master so it may be synchronized");
					}
					else
					{
						disableQDMirroring_ConnectionError(
							"Connection to the standby master was lost during transaction log catchup",
							GetStandbyErrorString());
					}
					disconnectMirrorQD_SendClose();
				}
			}
			else if (isQDMirroringDisabled())
			{
				elog((Debug_print_qd_mirroring ? LOG : DEBUG5), "Master Mirror Send: Master mirroring not catching-up (state is disabled)");
			}
			else
			{
				elog(ERROR,"unexpected master mirroring state %s",
					 QDMirroringStateString());
			}
			break;

		case WriteWalPages:
			if (Debug_print_qd_mirroring)
				elog(LOG, "WriteWalPages");

			if (isQDMirroringEnabled())
			{
				char	   *from;
				Size		nbytes;
				bool		more= false;

				/*
				 * For now, save copy of data until flush.  This could be
				 * optimized.
				 */
				if (saveBuffer == NULL)
				{
					uint32		totalBufferLen = XLOGbuffers * XLOG_BLCKSZ;

					saveBuffer = malloc(totalBufferLen);
					if (saveBuffer == NULL)
						elog(ERROR,"Could not allocate buffer for xlog data (%d bytes)",
							 totalBufferLen);

					saveBufferLen = 0;
				}

				XLogGetBuffer(walSendRequest->startidx, walSendRequest->npages,
							  &from, &nbytes);

				if (saveBufferLen == 0)
				{
					/* first chunk: remember where in the log it belongs */
					more = false;
					writeLogId = walSendRequest->logId;
					writeLogSeg = walSendRequest->logSeg;
					writeLogOff = walSendRequest->logOff;

					memcpy(saveBuffer, from, nbytes);
					saveBufferLen = nbytes;
				}
				else
				{
					/* later chunk: append after what is already saved */
					more = true;
					memcpy(&saveBuffer[saveBufferLen], from, nbytes);
					saveBufferLen += nbytes;
				}

				if (Debug_print_qd_mirroring)
					elog(LOG,
						 "Master Mirror Send: WriteWalPages (%s) startidx %d, npages %d, timeLineID %d, logId %u, logSeg %u, logOff 0x%X, nbytes 0x%X",
						 (more ? "more" : "new"),
						 walSendRequest->startidx,
						 walSendRequest->npages,
						 walSendRequest->timeLineID,
						 walSendRequest->logId,
						 walSendRequest->logSeg,
						 walSendRequest->logOff,
						 (int)nbytes);
			}

			/*
			 * FALLTHROUGH
			 *
			 * NOTE(review): there is no "break" here, so a WriteWalPages
			 * request continues straight into the FlushWalPages handling
			 * below.  Behavior is kept as found; confirm this is intended
			 * before changing it, since the appends above do not check
			 * against the size of saveBuffer.
			 */

		case FlushWalPages:
			if (Debug_print_qd_mirroring)
				elog(LOG, "FlushWalPages");

			if (isQDMirroringEnabled())
			{
				char		cmd[MAXFNAMELEN + 50];

				if (saveBufferLen == 0)
					successful = true;	/* nothing buffered, nothing to send */
				else
				{
					if (snprintf(cmd, sizeof(cmd),"xlog %d %d %d %d",
								 writeLogId, writeLogSeg, writeLogOff,
								 (int)saveBufferLen) >= sizeof(cmd))
						elog(ERROR,"could not create cmd for qd mirror logid %d seg %d",
							 writeLogId, writeLogSeg);

					successful = write_qd_sync(cmd, saveBuffer, saveBufferLen,
											   &standbyTimeout,
											   &walsend_shutdown_requested);
					if (successful)
					{
						XLogRecPtr	oldEndLocation;

						oldEndLocation = currentEndLocation;

						/*
						 * New end location is derived from the position the
						 * buffered data started at.
						 */
						currentEndLocation.xlogid = writeLogId;
						currentEndLocation.xrecoff = writeLogSeg * XLogSegSize + writeLogOff;
						if (currentEndLocation.xrecoff >= XLogFileSize)
						{
							/* rolled over into the next log file */
							(currentEndLocation.xlogid)++;
							currentEndLocation.xrecoff = 0;
						}

						if (XLByteLT(oldEndLocation,currentEndLocation))
						{
							periodicLen += saveBufferLen;
							if (periodicLen > periodicReportLen)
							{
								elog(LOG,
									 "Master mirroring periodic report: %d bytes successfully send to standby master for locations %s through %s",
									 periodicLen,
									 XLogLocationToString(&periodicLocation),
									 XLogLocationToString2(&currentEndLocation));

								periodicLen = 0;
								periodicLocation = currentEndLocation;
							}
						}
						else
						{
							if (Debug_print_qd_mirroring)
								elog(LOG,
									 "Send to Master mirror successful.  New end location %s (old %s)",
									 XLogLocationToString(&currentEndLocation),
									 XLogLocationToString2(&oldEndLocation));
						}
					}
					else
					{
						disableQDMirroring_ConnectionError(
							"Connection to the standby master was lost attempting to send new transaction log",
							GetStandbyErrorString());
						disconnectMirrorQD_SendClose();
					}

					/*
					 * Reset so WriteWalPages can fill the buffer again.
					 */
					saveBufferLen = 0;
					writeLogId = 0;
					writeLogSeg = 0;
					writeLogOff = 0;
				}

				if (successful && walSendRequest->haveNewCheckpointLocation)
				{
					uint32		logid;
					uint32		seg;
					uint32		offset;

					elog((Debug_print_qd_mirroring ? LOG : DEBUG5),"New previous checkpoint location %s",
						 XLogLocationToString(&walSendRequest->newCheckpointLocation));

					/* split the location into log id, segment and offset within the segment */
					XLByteToSeg(walSendRequest->newCheckpointLocation, logid, seg);
					offset = walSendRequest->newCheckpointLocation.xrecoff % XLogSegSize;

					if (snprintf(cmd, sizeof(cmd),"new_checkpoint_location %d %d %d",
								 logid, seg, offset) >= sizeof(cmd))
						elog(ERROR,"could not create cmd for qd mirror logid %d seg %d offset %d",
							 logid, seg, offset);

					successful = write_qd_sync(cmd, NULL, 0, NULL, &walsend_shutdown_requested);
					if (successful)
					{
						elog((Debug_print_qd_mirroring ? LOG : DEBUG5),"Send of new checkpoint location to master mirror successful");
					}
					else
					{
						disableQDMirroring_ConnectionError(
							"Connection to the standby master was lost attempting to send new checkpoint location",
							GetStandbyErrorString());
						disconnectMirrorQD_SendClose();
					}
				}
			}
			else if (isQDMirroringDisabled())
			{
				elog((Debug_print_qd_mirroring ? LOG : DEBUG5), "Master Mirror Send: Master mirroring not enabled");
			}
			else
			{
				elog(ERROR,"unexpected master mirroring state %s",
					 QDMirroringStateString());
			}
			break;

		case CloseForShutdown:
			if (Debug_print_qd_mirroring)
				elog(LOG, "CloseForShutdown");

			/*
			 * Do the work we would normally do when signaled to stop.
			 */
			WalSendServer_ServiceShutdown();
			break;

		default:
			elog(ERROR, "Unknown WalSendRequestCommand %d", walSendRequest->command);
	}
}
/*
 * Traverse the tree to find path from root page to specified "child" block.
 *
 * r:     the GiST index relation
 * child: block number whose parent chain is wanted
 *
 * The tree is walked breadth-first starting at the root: every page that is
 * visited appends stack entries for the blocks its downlinks point to (and,
 * when a split is detected, for its right sibling) to the end of a list
 * linked through ->next, and pages are then processed in list order.
 *
 * On success the returned entry describes the page that holds the downlink
 * to "child"; its ->parent chain leads back to the root, ->child links are
 * filled in along that chain, and each entry's childoffnum is the offset of
 * the downlink that leads to the next page down.  Returns NULL when a leaf
 * page is reached or the list is exhausted without finding "child".
 *
 * To prevent deadlocks, this should lock only one page simultaneously.
 */
GISTInsertStack *
gistFindPath(Relation r, BlockNumber child)
{
	Page		page;
	Buffer		buffer;
	OffsetNumber i,
				maxoff;
	ItemId		iid;
	IndexTuple	idxtuple;
	GISTInsertStack *top,
			   *tail,
			   *ptr;
	BlockNumber blkno;

	MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD;

	/* the work list starts out holding just the root page */
	top = tail = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
	top->blkno = GIST_ROOT_BLKNO;

	while (top && top->blkno != child)
	{
		buffer = ReadBuffer(r, top->blkno);
		LockBuffer(buffer, GIST_SHARE);
		gistcheckpage(r, buffer);
		page = (Page) BufferGetPage(buffer);

		if (GistPageIsLeaf(page))
		{
			/* we can safely stop here: only leaf pages follow */
			UnlockReleaseBuffer(buffer);
			return NULL;
		}

		top->lsn = PageGetLSN(page);

		if (top->parent && XLByteLT(top->parent->lsn, GistPageGetOpaque(page)->nsn) &&
			GistPageGetOpaque(page)->rightlink != InvalidBlockNumber /* sanity check */ )
		{
			/*
			 * The page was split after we read its parent; queue the right
			 * sibling as well.  Note that it is queued with this page as
			 * its parent.
			 */
			ptr = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
			ptr->blkno = GistPageGetOpaque(page)->rightlink;
			ptr->childoffnum = InvalidOffsetNumber;
			ptr->parent = top;
			ptr->next = NULL;
			tail->next = ptr;
			tail = ptr;
		}

		maxoff = PageGetMaxOffsetNumber(page);

		for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
		{
			iid = PageGetItemId(page, i);
			idxtuple = (IndexTuple) PageGetItem(page, iid);
			blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
			if (blkno == child)
			{
				OffsetNumber poff = InvalidOffsetNumber;

				/* fill in the ->child links from here back up to the root */
				ptr = top;
				while (ptr->parent)
				{
					/* set child link */
					ptr->parent->child = ptr;

					/*
					 * Shift childoffnum up one level: while queued, an
					 * entry stored the offset of its own downlink within
					 * its parent; from now on the parent entry holds that
					 * value instead.
					 */
					if (ptr == top)
					{
						/* first iteration */
						poff = ptr->parent->childoffnum;
						ptr->parent->childoffnum = ptr->childoffnum;
					}
					else
					{
						OffsetNumber tmp = ptr->parent->childoffnum;

						ptr->parent->childoffnum = poff;
						poff = tmp;
					}
					ptr = ptr->parent;
				}
				/* finally, record where in this page the downlink to child is */
				top->childoffnum = i;
				UnlockReleaseBuffer(buffer);
				return top;
			}
			else
			{
				/* Install next inner page to the end of stack */
				ptr = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
				ptr->blkno = blkno;
				ptr->childoffnum = i;	/* for now, the offset of this
										 * entry's own downlink within its
										 * parent page */
				ptr->parent = top;
				ptr->next = NULL;
				tail->next = ptr;
				tail = ptr;
			}
		}

		UnlockReleaseBuffer(buffer);
		top = top->next;
	}

	return NULL;
}
/*
 * bitmap_xlog_insert_lovmeta
 *		Replay a WAL record that rewrites the LOV meta page.
 *
 * The meta items are stored in the record right after the fixed-size
 * xl_bm_lovmetapage header and are copied over the page contents of block
 * BM_LOV_STARTPAGE-1, unless the page LSN shows the change is already there.
 *
 * Undo is not implemented and raises PANIC.
 */
static void
bitmap_xlog_insert_lovmeta(bool redo, XLogRecPtr lsn, XLogRecord* record)
{
	xl_bm_lovmetapage *xlrec = (xl_bm_lovmetapage*)XLogRecGetData(record);
	Relation	reln = XLogOpenRelation(xlrec->bm_node);
	Buffer		metaBuf;
	Page		metaPage;
	BMLOVMetaItem pageItems;
	BMLOVMetaItem loggedItems;

	if (!RelationIsValid(reln))
		return;

	if (!redo)
	{
		elog(PANIC, "bm_insert_undo: not implemented.");
		return;
	}

#ifdef BM_DEBUG
	ereport(LOG, (errcode(LOG),
			errmsg("call bitmap_xlog_insert_lovmeta: redo=%d\n", redo)));
#endif

	metaBuf = XLogReadBuffer(false, reln, BM_LOV_STARTPAGE-1);
	if (!BufferIsValid(metaBuf))
		elog(PANIC, "bm_insert_redo: block unfound: %d -- at (%d,%d,%d)",
			 BM_LOV_STARTPAGE-1, xlrec->bm_node.spcNode,
			 xlrec->bm_node.dbNode, xlrec->bm_node.relNode);

	metaPage = BufferGetPage(metaBuf);

	/* Page LSN is not older than this record: nothing to replay */
	if (!XLByteLT(PageGetLSN(metaPage), lsn))
	{
		_bitmap_relbuf(metaBuf);
		return;
	}

	/* Overwrite the on-page meta items with the logged copy */
	pageItems = (BMLOVMetaItem) PageGetContents(metaPage);
	loggedItems = (BMLOVMetaItem) ((char*)xlrec + sizeof(xl_bm_lovmetapage));
	memcpy(pageItems, loggedItems,
		   xlrec->bm_num_of_attrs * sizeof(BMLOVMetaItemData));

#ifdef BM_DEBUG
	{
		uint32		attno;

		for (attno = 0; attno < xlrec->bm_num_of_attrs; attno++)
			elog(LOG, "metaItems=%d, %d, %d",
				 pageItems[attno].bm_lov_heapId,
				 pageItems[attno].bm_lov_indexId,
				 pageItems[attno].bm_lov_lastpage);
	}
#endif

	PageSetLSN(metaPage, lsn);
	PageSetTLI(metaPage, ThisTimeLineID);
	_bitmap_wrtbuf(metaBuf);
}
/*
 * bitmap_xlog_insert_lovitem
 *		Replay a WAL record that adds or overwrites one LOV item.
 *
 * When xlrec->bm_isNewItem is set, the item is appended to the LOV page and
 * must land at the logged offset; otherwise the existing item at that offset
 * is overwritten in place.  The page is left untouched if its LSN shows the
 * change has already been applied.
 *
 * Undo is not implemented and raises PANIC.
 */
static void
bitmap_xlog_insert_lovitem(bool redo, XLogRecPtr lsn, XLogRecord* record)
{
	xl_bm_lovitem *xlrec = (xl_bm_lovitem*) XLogRecGetData(record);
	Relation	reln = XLogOpenRelation(xlrec->bm_node);
	Buffer		lovBuf;
	Page		page;

	if (!RelationIsValid(reln))
		return;

	if (!redo)
	{
		elog(PANIC, "bm_insert_undo: not implemented.");
		return;
	}

#ifdef BM_DEBUG
	ereport(LOG, (errcode(LOG),
			errmsg("call bitmap_xlog_insert_lovitem: redo=%d, blkno=%d\n",
				   redo, xlrec->bm_lov_blkno)));
#endif

	lovBuf = XLogReadBuffer(false, reln, xlrec->bm_lov_blkno);
	if (!BufferIsValid(lovBuf))
		elog(PANIC, "bm_insert_redo: block unfound: %d",
			 xlrec->bm_lov_blkno);

	page = BufferGetPage(lovBuf);

	/* Page LSN is not older than this record: nothing to replay */
	if (!XLByteLT(PageGetLSN(page), lsn))
	{
		_bitmap_relbuf(lovBuf);
		return;
	}

	if (!xlrec->bm_isNewItem)
	{
		/* Overwrite the existing item at the logged offset */
		BMLOVItem	target;

		target = (BMLOVItem)
			PageGetItem(page, PageGetItemId(page, xlrec->bm_lov_offset));
		memcpy(target, &(xlrec->bm_lovItem), sizeof(BMLOVItemData));
	}
	else
	{
		/* Append a new item; it must end up at the logged offset */
		OffsetNumber appendAt,
					nbytes;

		appendAt = OffsetNumberNext(PageGetMaxOffsetNumber(page));
		if (appendAt != xlrec->bm_lov_offset)
			elog(PANIC,
				 "bm_insert_redo: LOV item is not inserted in pos %d(requested %d)",
				 appendAt, xlrec->bm_lov_offset);

		nbytes = sizeof(BMLOVItemData);
		if (nbytes > PageGetFreeSpace(page))
			elog(PANIC,
				 "bm_insert_redo: not enough space in LOV page %d",
				 xlrec->bm_lov_blkno);

		if (PageAddItem(page, (Item)&(xlrec->bm_lovItem), nbytes,
						appendAt, LP_USED) == InvalidOffsetNumber)
			ereport(ERROR,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg("failed to add LOV item to \"%s\"",
							RelationGetRelationName(reln))));
	}

	PageSetLSN(page, lsn);
	PageSetTLI(page, ThisTimeLineID);
	_bitmap_wrtbuf(lovBuf);
}
/*
 * Workhorse routine for doing insertion into a GiST index. Note that
 * this routine assumes it is invoked in a short-lived memory context,
 * so it does not bother releasing palloc'd allocations.
 *
 *	r			index relation to insert into
 *	itup		index tuple to insert
 *	freespace	copied into state.freespace for the insertion helpers
 *	giststate	GiST support state passed to gistchoose(),
 *				gistgetadjusted(), gistfixsplit() and gistinserttuples()
 *
 * The function keeps a stack of visited pages (linked through ->parent)
 * and tracks in "xlocked" whether the current page's buffer is held with
 * an exclusive lock rather than a shared one.
 */
static void
gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
{
	ItemId		iid;
	IndexTuple	idxtuple;
	GISTInsertStack firststack;
	GISTInsertStack *stack;
	GISTInsertState state;
	bool		xlocked = false;

	memset(&state, 0, sizeof(GISTInsertState));
	state.freespace = freespace;
	state.r = r;

	/*
	 * Start from the root.  The zeroed lsn marks the entry as "buffer not
	 * read yet" for the XLogRecPtrIsInvalid() test at the top of the loop.
	 */
	firststack.blkno = GIST_ROOT_BLKNO;
	firststack.lsn.xrecoff = 0;
	firststack.parent = NULL;
	state.stack = stack = &firststack;

	/*
	 * Walk down along the path of smallest penalty, updating the parent
	 * pointers with the key we're inserting as we go. If we crash in the
	 * middle, the tree is consistent, although the possible parent updates
	 * were a waste.
	 */
	for (;;)
	{
		/* Read the page only the first time this stack entry is visited */
		if (XLogRecPtrIsInvalid(stack->lsn))
			stack->buffer = ReadBuffer(state.r, stack->blkno);

		/*
		 * Be optimistic and grab shared lock first. Swap it for an
		 * exclusive lock later if we need to update the page.
		 */
		if (!xlocked)
		{
			LockBuffer(stack->buffer, GIST_SHARE);
			gistcheckpage(state.r, stack->buffer);
		}

		stack->page = (Page) BufferGetPage(stack->buffer);
		stack->lsn = PageGetLSN(stack->page);
		Assert(!RelationNeedsWAL(state.r) || !XLogRecPtrIsInvalid(stack->lsn));

		/*
		 * If this page was split but the downlink was never inserted to
		 * the parent because the inserting backend crashed before doing
		 * that, fix that now.
		 */
		if (GistFollowRight(stack->page))
		{
			if (!xlocked)
			{
				LockBuffer(stack->buffer, GIST_UNLOCK);
				LockBuffer(stack->buffer, GIST_EXCLUSIVE);
				xlocked = true;
				/* someone might've completed the split when we unlocked */
				if (!GistFollowRight(stack->page))
					continue;
			}
			gistfixsplit(&state, giststate);

			/* Drop this page and retry from its parent */
			UnlockReleaseBuffer(stack->buffer);
			xlocked = false;
			state.stack = stack = stack->parent;
			continue;
		}

		if (stack->blkno != GIST_ROOT_BLKNO &&
			XLByteLT(stack->parent->lsn,
					 GistPageGetOpaque(stack->page)->nsn))
		{
			/*
			 * Concurrent split detected. There's no guarantee that the
			 * downlink for this page is consistent with the tuple we're
			 * inserting anymore, so go back to parent and rechoose the
			 * best child.
			 */
			UnlockReleaseBuffer(stack->buffer);
			xlocked = false;
			state.stack = stack = stack->parent;
			continue;
		}

		if (!GistPageIsLeaf(stack->page))
		{
			/*
			 * This is an internal page so continue to walk down the tree.
			 * Find the child node that has the minimum insertion penalty.
			 */
			BlockNumber childblkno;
			IndexTuple	newtup;
			GISTInsertStack *item;

			stack->childoffnum = gistchoose(state.r, stack->page, itup, giststate);
			iid = PageGetItemId(stack->page, stack->childoffnum);
			idxtuple = (IndexTuple) PageGetItem(stack->page, iid);
			childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));

			/*
			 * Check that it's not a leftover invalid tuple from pre-9.1
			 */
			if (GistTupleIsInvalid(idxtuple))
				ereport(ERROR,
						(errmsg("index \"%s\" contains an inner tuple marked as invalid",
								RelationGetRelationName(r)),
						 errdetail("This is caused by an incomplete page split at crash recovery before upgrading to 9.1."),
						 errhint("Please REINDEX it.")));

			/*
			 * Check that the key representing the target child node is
			 * consistent with the key we're inserting. Update it if it's not.
			 * A NULL result from gistgetadjusted() is treated as "no update
			 * needed".
			 */
			newtup = gistgetadjusted(state.r, idxtuple, itup, giststate);
			if (newtup)
			{
				/*
				 * Swap shared lock for an exclusive one. Beware, the page
				 * may change while we unlock/lock the page...
				 */
				if (!xlocked)
				{
					LockBuffer(stack->buffer, GIST_UNLOCK);
					LockBuffer(stack->buffer, GIST_EXCLUSIVE);
					xlocked = true;
					stack->page = (Page) BufferGetPage(stack->buffer);

					if (!XLByteEQ(PageGetLSN(stack->page), stack->lsn))
					{
						/* the page was changed while we unlocked it, retry */
						continue;
					}
				}

				/*
				 * Update the tuple.
				 *
				 * gistinserttuples() might have to split the page to make
				 * the updated tuple fit. It will adjust the stack so that
				 * after the call, we'll be holding a lock on the page
				 * containing the tuple, which might have moved right.
				 *
				 * Except if this causes a root split, in which case
				 * gistinserttuples() returns 'true'. Then stack only holds
				 * the new root, and the child page was released. Have to
				 * start all over.
				 *
				 * NOTE(review): if 'true' can be returned while "stack" is
				 * the root entry, stack->parent is NULL (see firststack
				 * above) and the next loop iteration would dereference a
				 * NULL stack -- confirm against gistinserttuples().
				 */
				if (gistinserttuples(&state, stack, giststate, &newtup, 1,
									 stack->childoffnum, InvalidBuffer))
				{
					UnlockReleaseBuffer(stack->buffer);
					xlocked = false;
					state.stack = stack = stack->parent;
					continue;
				}
			}
			/* Keep the pin on this page but release the lock */
			LockBuffer(stack->buffer, GIST_UNLOCK);
			xlocked = false;

			/* descend to the chosen child */
			item = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
			item->blkno = childblkno;
			item->parent = stack;
			state.stack = stack = item;
		}
		else
		{
			/*
			 * Leaf page. Insert the new key. We've already updated all the
			 * parents on the way down, but we might have to split the page
			 * if it doesn't fit. gistinserttuples() will take care of that.
			 */

			/*
			 * Swap shared lock for an exclusive one. Be careful, the page
			 * may change while we unlock/lock the page...
			 */
			if (!xlocked)
			{
				LockBuffer(stack->buffer, GIST_UNLOCK);
				LockBuffer(stack->buffer, GIST_EXCLUSIVE);
				xlocked = true;
				stack->page = (Page) BufferGetPage(stack->buffer);
				stack->lsn = PageGetLSN(stack->page);

				if (stack->blkno == GIST_ROOT_BLKNO)
				{
					/*
					 * the only page that can become inner instead of leaf
					 * is the root page, so for root we should recheck it
					 */
					if (!GistPageIsLeaf(stack->page))
					{
						/*
						 * very rare situation: during unlock/lock index with
						 * number of pages = 1 was increased
						 */
						LockBuffer(stack->buffer, GIST_UNLOCK);
						xlocked = false;
						continue;
					}

					/*
					 * we don't need to check root split, because checking
					 * leaf/inner is enough to recognize split for root
					 */
				}
				else if (GistFollowRight(stack->page) ||
						 XLByteLT(stack->parent->lsn,
								  GistPageGetOpaque(stack->page)->nsn))
				{
					/*
					 * The page was split while we momentarily unlocked the
					 * page. Go back to parent.
					 */
					UnlockReleaseBuffer(stack->buffer);
					xlocked = false;
					state.stack = stack = stack->parent;
					continue;
				}
			}

			/* now state.stack->(page, buffer and blkno) points to leaf page */

			gistinserttuples(&state, stack, giststate, &itup, 1,
							 InvalidOffsetNumber, InvalidBuffer);
			LockBuffer(stack->buffer, GIST_UNLOCK);

			/* Release any pins we might still hold before exiting */
			for (; stack; stack = stack->parent)
				ReleaseBuffer(stack->buffer);
			break;
		}
	}
}
static void do_failover(void) { PGresult *res; char sqlquery[QUERY_STR_LEN]; int total_nodes = 0; int visible_nodes = 0; int ready_nodes = 0; bool find_best = false; int i; int r; uint32 uxlogid; uint32 uxrecoff; XLogRecPtr xlog_recptr; char last_wal_standby_applied[MAXLEN]; PGconn *node_conn = NULL; /* * will get info about until 50 nodes, which seems to be large enough for * most scenarios */ t_node_info nodes[50]; /* initialize to keep compiler quiet */ t_node_info best_candidate = {-1, "", InvalidXLogRecPtr, false, false, false}; /* get a list of standby nodes, including myself */ sprintf(sqlquery, "SELECT id, conninfo, witness " " FROM %s.repl_nodes " " WHERE cluster = '%s' " " ORDER BY priority, id ", repmgr_schema, local_options.cluster_name); res = PQexec(my_local_conn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_err(_("Can't get nodes' info: %s\n"), PQerrorMessage(my_local_conn)); PQclear(res); terminate(ERR_DB_QUERY); } /* * total nodes that are registered */ total_nodes = PQntuples(res); log_debug(_("%s: there are %d nodes registered\n"), progname, total_nodes); /* * Build an array with the nodes and indicate which ones are visible and * ready */ for (i = 0; i < total_nodes; i++) { nodes[i].node_id = atoi(PQgetvalue(res, i, 0)); strncpy(nodes[i].conninfo_str, PQgetvalue(res, i, 1), MAXLEN); nodes[i].is_witness = (strcmp(PQgetvalue(res, i, 2), "t") == 0) ? true : false; /* * Initialize on false so if we can't reach this node we know that * later */ nodes[i].is_visible = false; nodes[i].is_ready = false; XLAssignValue(nodes[i].xlog_location, 0, 0); log_debug(_("%s: node=%d conninfo=\"%s\" witness=%s\n"), progname, nodes[i].node_id, nodes[i].conninfo_str, (nodes[i].is_witness) ? 
"true" : "false"); node_conn = establish_db_connection(nodes[i].conninfo_str, false); /* if we can't see the node just skip it */ if (PQstatus(node_conn) != CONNECTION_OK) { if (node_conn != NULL) PQfinish(node_conn); continue; } visible_nodes++; nodes[i].is_visible = true; PQfinish(node_conn); } PQclear(res); log_debug(_("Total nodes counted: registered=%d, visible=%d\n"), total_nodes, visible_nodes); /* * am i on the group that should keep alive? if i see less than half of * total_nodes then i should do nothing */ if (visible_nodes < (total_nodes / 2.0)) { log_err(_("Can't reach most of the nodes.\n" "Let the other standby servers decide which one will be the primary.\n" "Manual action will be needed to readd this node to the cluster.\n")); terminate(ERR_FAILOVER_FAIL); } /* Query all the nodes to determine which ones are ready */ for (i = 0; i < total_nodes; i++) { /* if the node is not visible, skip it */ if (!nodes[i].is_visible) continue; if (nodes[i].is_witness) continue; node_conn = establish_db_connection(nodes[i].conninfo_str, false); /* * XXX This shouldn't happen, if this happens it means this is a major * problem maybe network outages? 
anyway, is better for a human to * react */ if (PQstatus(node_conn) != CONNECTION_OK) { log_err(_("It seems new problems are arising, manual intervention is needed\n")); terminate(ERR_FAILOVER_FAIL); } uxlogid = 0; uxrecoff = 0; sqlquery_snprintf(sqlquery, "SELECT pg_last_xlog_receive_location()"); res = PQexec(node_conn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_info(_("Can't get node's last standby location: %s\n"), PQerrorMessage(node_conn)); log_info(_("Connection details: %s\n"), nodes[i].conninfo_str); PQclear(res); PQfinish(node_conn); terminate(ERR_FAILOVER_FAIL); } if (sscanf(PQgetvalue(res, 0, 0), "%X/%X", &uxlogid, &uxrecoff) != 2) log_info(_("could not parse transaction log location \"%s\"\n"), PQgetvalue(res, 0, 0)); log_debug("XLog position of node %d: log id=%u (%X), offset=%u (%X)\n", nodes[i].node_id, uxlogid, uxlogid, uxrecoff, uxrecoff); /* If position is 0/0, error */ if (uxlogid == 0 && uxrecoff == 0) { PQclear(res); PQfinish(node_conn); log_info(_("InvalidXLogRecPtr detected in a standby\n")); terminate(ERR_FAILOVER_FAIL); } XLAssignValue(nodes[i].xlog_location, uxlogid, uxrecoff); PQclear(res); PQfinish(node_conn); } /* last we get info about this node, and update shared memory */ sprintf(sqlquery, "SELECT pg_last_xlog_receive_location()"); res = PQexec(my_local_conn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_err(_("PQexec failed: %s.\nReport an invalid value to not be " " considered as new primary and exit.\n"), PQerrorMessage(my_local_conn)); PQclear(res); sprintf(last_wal_standby_applied, "'%X/%X'", 0, 0); update_shared_memory(last_wal_standby_applied); terminate(ERR_DB_QUERY); } /* write last location in shared memory */ update_shared_memory(PQgetvalue(res, 0, 0)); PQclear(res); for (i = 0; i < total_nodes; i++) { while (!nodes[i].is_ready) { /* * the witness will always be masked as ready if it's still not * marked that way and avoid a useless query */ if (nodes[i].is_witness) { if 
(!nodes[i].is_ready) { nodes[i].is_ready = true; ready_nodes++; } break; } /* if the node is not visible, skip it */ if (!nodes[i].is_visible) break; /* if the node is ready there is nothing to check, skip it too */ if (nodes[i].is_ready) break; node_conn = establish_db_connection(nodes[i].conninfo_str, false); /* * XXX This shouldn't happen, if this happens it means this is a * major problem maybe network outages? anyway, is better for a * human to react */ if (PQstatus(node_conn) != CONNECTION_OK) { /* XXX */ log_info(_("At this point, it could be some race conditions " "that are acceptable, assume the node is restarting " "and starting failover procedure\n")); break; } uxlogid = 0; uxrecoff = 0; sqlquery_snprintf(sqlquery, "SELECT %s.repmgr_get_last_standby_location()", repmgr_schema); res = PQexec(node_conn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_err(_("PQexec failed: %s.\nReport an invalid value to not" "be considered as new primary and exit.\n"), PQerrorMessage(node_conn)); PQclear(res); PQfinish(node_conn); terminate(ERR_DB_QUERY); } if (sscanf(PQgetvalue(res, 0, 0), "%X/%X", &uxlogid, &uxrecoff) != 2) { log_info(_("could not parse transaction log location \"%s\"\n"), PQgetvalue(res, 0, 0)); /* we can't do anything but fail at this point... 
*/ if (*PQgetvalue(res, 0, 0) == '\0') { log_crit("Whoops, seems as if shared_preload_libraries=repmgr_funcs is not set!\n"); exit(ERR_BAD_CONFIG); } } PQclear(res); PQfinish(node_conn); /* If position is 0/0, keep checking */ if (uxlogid == 0 && uxrecoff == 0) continue; XLAssignValue(xlog_recptr, uxlogid, uxrecoff); if (XLByteLT(nodes[i].xlog_location, xlog_recptr)) { XLAssignValue(nodes[i].xlog_location, uxlogid, uxrecoff); } log_debug("Last XLog position of node %d: log id=%u (%X), offset=%u (%X)\n", nodes[i].node_id, uxlogid, uxlogid, uxrecoff, uxrecoff); ready_nodes++; nodes[i].is_ready = true; } } /* Close the connection to this server */ PQfinish(my_local_conn); my_local_conn = NULL; /* * determine which one is the best candidate to promote to primary */ for (i = 0; i < total_nodes; i++) { /* witness is never a good candidate */ if (nodes[i].is_witness) continue; if (!nodes[i].is_ready || !nodes[i].is_visible) continue; if (!find_best) { /* * start with the first ready node, and then move on to the next * one */ best_candidate.node_id = nodes[i].node_id; XLAssign(best_candidate.xlog_location, nodes[i].xlog_location); best_candidate.is_ready = nodes[i].is_ready; best_candidate.is_witness = nodes[i].is_witness; find_best = true; } /* we use the macros provided by xlogdefs.h to compare XLogRecPtr */ /* * Nodes are retrieved ordered by priority, so if the current best * candidate is lower than the next node's wal location then assign * next node as the new best candidate. */ if (XLByteLT(best_candidate.xlog_location, nodes[i].xlog_location)) { best_candidate.node_id = nodes[i].node_id; XLAssign(best_candidate.xlog_location, nodes[i].xlog_location); best_candidate.is_ready = nodes[i].is_ready; best_candidate.is_witness = nodes[i].is_witness; } } /* once we know who is the best candidate, promote it */ if (find_best && (best_candidate.node_id == local_options.node)) { if (best_candidate.is_witness) { log_err(_("%s: Node selected as new master is a witness. 
Can't be promoted.\n"), progname); terminate(ERR_FAILOVER_FAIL); } /* wait */ sleep(5); if (verbose) log_info(_("%s: This node is the best candidate to be the new primary, promoting...\n"), progname); log_debug(_("promote command is: \"%s\"\n"), local_options.promote_command); if (log_type == REPMGR_STDERR && *local_options.logfile) { fflush(stderr); } r = system(local_options.promote_command); if (r != 0) { log_err(_("%s: promote command failed. You could check and try it manually.\n"), progname); terminate(ERR_BAD_CONFIG); } } else if (find_best) { /* wait */ sleep(10); if (verbose) log_info(_("%s: Node %d is the best candidate to be the new primary, we should follow it...\n"), progname, best_candidate.node_id); log_debug(_("follow command is: \"%s\"\n"), local_options.follow_command); /* * New Primary need some time to be promoted. The follow command * should take care of that. */ if (log_type == REPMGR_STDERR && *local_options.logfile) { fflush(stderr); } r = system(local_options.follow_command); if (r != 0) { log_err(_("%s: follow command failed. You could check and try it manually.\n"), progname); terminate(ERR_BAD_CONFIG); } } else { log_err(_("%s: Did not find candidates. You should check and try manually.\n"), progname); terminate(ERR_FAILOVER_FAIL); } /* to force it to re-calculate mode and master node */ failover_done = true; /* and reconnect to the local database */ my_local_conn = establish_db_connection(local_options.conninfo, true); }
static void do_failover(void) { PGresult *res1; PGresult *res2; char sqlquery[8192]; int total_nodes = 0; int visible_nodes = 0; bool find_best = false; bool witness = false; int i; int r; int node; char nodeConninfo[MAXLEN]; unsigned int uxlogid; unsigned int uxrecoff; char last_wal_standby_applied[MAXLEN]; PGconn *nodeConn = NULL; /* * will get info about until 50 nodes, * which seems to be large enough for most scenarios */ nodeInfo nodes[50]; /* initialize to keep compiler quiet */ nodeInfo best_candidate = {-1, InvalidXLogRecPtr, false, false}; /* first we get info about this node, and update shared memory */ sprintf(sqlquery, "SELECT pg_last_xlog_receive_location()"); res1 = PQexec(myLocalConn, sqlquery); if (PQresultStatus(res1) != PGRES_TUPLES_OK) { log_err(_("PQexec failed: %s.\nReport an invalid value to not be considered as new primary and exit.\n"), PQerrorMessage(myLocalConn)); PQclear(res1); sprintf(last_wal_standby_applied, "'%X/%X'", 0, 0); update_shared_memory(last_wal_standby_applied); exit(ERR_DB_QUERY); } /* write last location in shared memory */ update_shared_memory(PQgetvalue(res1, 0, 0)); /* * we sleep the monitor time + one second * we bet it should be enough for other repmgrd to update their own data */ sleep(SLEEP_MONITOR + 1); /* get a list of standby nodes, including myself */ sprintf(sqlquery, "SELECT id, conninfo, witness " " FROM %s.repl_nodes " " WHERE id <> %d " " AND cluster = '%s' " " ORDER BY priority ", repmgr_schema, primary_options.node, local_options.cluster_name); res1 = PQexec(myLocalConn, sqlquery); if (PQresultStatus(res1) != PGRES_TUPLES_OK) { log_err(_("Can't get nodes info: %s\n"), PQerrorMessage(myLocalConn)); PQclear(res1); PQfinish(myLocalConn); exit(ERR_DB_QUERY); } log_debug(_("%s: there are %d nodes registered"), progname, PQntuples(res1)); /* ask for the locations */ for (i = 0; i < PQntuples(res1); i++) { node = atoi(PQgetvalue(res1, i, 0)); /* Initialize on false so if we can't reach this node we know that 
later */ nodes[i].is_ready = false; strncpy(nodeConninfo, PQgetvalue(res1, i, 1), MAXLEN); witness = (strcmp(PQgetvalue(res1, i, 2), "t") == 0) ? true : false; log_debug(_("%s: node=%d conninfo=\"%s\" witness=%s"), progname, node, nodeConninfo, (witness) ? "true" : "false"); nodeConn = establishDBConnection(nodeConninfo, false); /* if we can't see the node just skip it */ if (PQstatus(nodeConn) != CONNECTION_OK) continue; /* the witness will always show 0/0 so avoid a useless query */ if (!witness) { sqlquery_snprintf(sqlquery, "SELECT %s.repmgr_get_last_standby_location()", repmgr_schema); res2 = PQexec(nodeConn, sqlquery); if (PQresultStatus(res2) != PGRES_TUPLES_OK) { log_info(_("Can't get node's last standby location: %s\n"), PQerrorMessage(nodeConn)); log_info(_("Connection details: %s\n"), nodeConninfo); PQclear(res2); PQfinish(nodeConn); continue; } if (sscanf(PQgetvalue(res2, 0, 0), "%X/%X", &uxlogid, &uxrecoff) != 2) log_info(_("could not parse transaction log location \"%s\"\n"), PQgetvalue(res2, 0, 0)); PQclear(res2); } else { uxlogid = 0; uxrecoff = 0; } visible_nodes++; nodes[i].nodeId = node; nodes[i].xlog_location.xlogid = uxlogid; nodes[i].xlog_location.xrecoff = uxrecoff; nodes[i].is_ready = true; nodes[i].is_witness = witness; PQfinish(nodeConn); } PQclear(res1); /* Close the connection to this server */ PQfinish(myLocalConn); /* * total nodes that are registered, include master which is a node but was * not counted because it's not a standby */ total_nodes = i + 1; /* * am i on the group that should keep alive? 
* if i see less than half of total_nodes then i should do nothing */ if (visible_nodes < (total_nodes / 2.0)) { log_err(_("Can't reach most of the nodes.\n" "Let the other standby servers decide which one will be the primary.\n" "Manual action will be needed to readd this node to the cluster.\n")); exit(ERR_FAILOVER_FAIL); } /* * determine which one is the best candidate to promote to primary */ for (i = 0; i < total_nodes - 1; i++) { /* witness is never a good candidate */ if (nodes[i].is_witness) continue; if (!nodes[i].is_ready) continue; if (!find_best) { /* start with the first ready node, and then move on to the next one */ best_candidate.nodeId = nodes[i].nodeId; best_candidate.xlog_location.xlogid = nodes[i].xlog_location.xlogid; best_candidate.xlog_location.xrecoff = nodes[i].xlog_location.xrecoff; best_candidate.is_ready = nodes[i].is_ready; best_candidate.is_witness = nodes[i].is_witness; find_best = true; } /* we use the macros provided by xlogdefs.h to compare XLogRecPtr */ /* * Nodes are retrieved ordered by priority, so if the current * best candidate is lower than the next node's wal location * then assign next node as the new best candidate. */ if (XLByteLT(best_candidate.xlog_location, nodes[i].xlog_location)) { best_candidate.nodeId = nodes[i].nodeId; best_candidate.xlog_location.xlogid = nodes[i].xlog_location.xlogid; best_candidate.xlog_location.xrecoff = nodes[i].xlog_location.xrecoff; best_candidate.is_ready = nodes[i].is_ready; best_candidate.is_witness = nodes[i].is_witness; } } /* once we know who is the best candidate, promote it */ if (find_best && (best_candidate.nodeId == local_options.node)) { if (best_candidate.is_witness) { log_err(_("%s: Node selected as new master is a witness. 
Can't be promoted.\n"), progname); exit(ERR_FAILOVER_FAIL); } if (verbose) log_info(_("%s: This node is the best candidate to be the new primary, promoting...\n"), progname); log_debug(_("promote command is: \"%s\"\n"), local_options.promote_command); r = system(local_options.promote_command); if (r != 0) { log_err(_("%s: promote command failed. You could check and try it manually.\n"), progname); exit(ERR_BAD_CONFIG); } } else if (find_best) { if (verbose) log_info(_("%s: Node %d is the best candidate to be the new primary, we should follow it...\n"), progname, best_candidate.nodeId); log_debug(_("follow command is: \"%s\"\n"), local_options.follow_command); /* * New Primary need some time to be promoted. * The follow command should take care of that. */ r = system(local_options.follow_command); if (r != 0) { log_err(_("%s: follow command failed. You could check and try it manually.\n"), progname); exit(ERR_BAD_CONFIG); } } else { log_err(_("%s: Did not find candidates. You should check and try manually.\n"), progname); exit(ERR_FAILOVER_FAIL); } /* and reconnect to the local database */ myLocalConn = establishDBConnection(local_options.conninfo, true); }
/*
 * bitmap_xlog_insert_bitmap
 *		Replay a WAL record that changes one bitmap page.
 *
 * Two kinds of change are logged:
 *	- xlrec->bm_isOpaque set: link the page to its successor by storing
 *	  bm_next_blkno in the page's opaque area;
 *	- otherwise: append one content word (bm_lastword_in_block) and set the
 *	  matching bit in the header words.
 *
 * The page is left untouched if its LSN shows the change has already been
 * applied.  Undo is not implemented and raises PANIC.
 */
static void
bitmap_xlog_insert_bitmap(bool redo, XLogRecPtr lsn, XLogRecord* record)
{
	xl_bm_bitmappage *xlrec = (xl_bm_bitmappage*) XLogRecGetData(record);
	Relation	reln = XLogOpenRelation(xlrec->bm_node);
	Buffer		buf;
	Page		page;
	BMBitmapOpaque opaque;

	if (!RelationIsValid(reln))
		return;

	if (!redo)
	{
		elog(PANIC, "bm_insert_undo: not implemented.");
		return;
	}

	buf = XLogReadBuffer(false, reln, xlrec->bm_bitmap_blkno);
	if (!BufferIsValid(buf))
		elog(PANIC, "bm_insert_redo: block unfound: %d",
			 xlrec->bm_bitmap_blkno);

	page = BufferGetPage(buf);

	/* Page LSN is not older than this record: nothing to replay */
	if (!XLByteLT(PageGetLSN(page), lsn))
	{
		_bitmap_relbuf(buf);
		return;
	}

	opaque = (BMBitmapOpaque)PageGetSpecialPointer(page);

#ifdef BM_DEBUG
	ereport(LOG, (errcode(LOG),
			errmsg("call bitmap_xlog_insert_bitmap: redo=%d, blkno=%d, isOpaque=%d, words_used=%d, lastword=%d, next_blkno=%d\n",
				   redo, xlrec->bm_bitmap_blkno, xlrec->bm_isOpaque,
				   xlrec->bm_lastword_pos, xlrec->bm_lastword_in_block,
				   xlrec->bm_next_blkno)));
#endif

	if (!xlrec->bm_isOpaque)
	{
		/* Append one word; the page must be exactly one word behind */
		BMBitmap	words;

		if (opaque->bm_hrl_words_used != xlrec->bm_lastword_pos - 1)
			elog(PANIC,
				 "bm_insert_redo: a bit has been inserted in the pos %d",
				 xlrec->bm_lastword_pos);

		Assert (xlrec->bm_lastword_in_block != 0);

		words = (BMBitmap) PageGetContents(page);

		/* flag the new word's position in the header bitmap */
		words->bm_headerWords
			[(opaque->bm_hrl_words_used/BM_HRL_WORD_SIZE)] |=
			(1<<(BM_HRL_WORD_SIZE-1-
				 (opaque->bm_hrl_words_used%BM_HRL_WORD_SIZE)));
		words->bm_contentWords[opaque->bm_hrl_words_used] =
			xlrec->bm_lastword_in_block;
		opaque->bm_hrl_words_used ++;
	}
	else
	{
		/* Chain the next bitmap page; the link must not exist yet */
		if (opaque->bm_bitmap_next != InvalidBlockNumber)
			elog(PANIC,
				 "%s next bitmap page for blkno %d is already set",
				 "bm_insert_redo: ", xlrec->bm_bitmap_blkno);

		Assert(opaque->bm_hrl_words_used == BM_NUM_OF_HRL_WORDS_PER_PAGE);

		opaque->bm_bitmap_next = xlrec->bm_next_blkno;
	}

	PageSetLSN(page, lsn);
	PageSetTLI(page, ThisTimeLineID);
	_bitmap_wrtbuf(buf);
}
/* * Update the LSNs on each queue based upon our latest state. This * implements a simple policy of first-valid-standby-releases-waiter. * * Other policies are possible, which would change what we do here and what * perhaps also which information we store as well. */ void SyncRepReleaseWaiters(void) { volatile WalSndCtlData *walsndctl = WalSndCtl; volatile WalSnd *syncWalSnd = NULL; int numprocs = 0; int priority = 0; int i; /* * If this WALSender is serving a standby that is not on the list of * potential standbys then we have nothing to do. If we are still * starting up or still running base backup, then leave quickly also. */ if (MyWalSnd->sync_standby_priority == 0 || MyWalSnd->state < WALSNDSTATE_STREAMING) return; /* * We're a potential sync standby. Release waiters if we are the * highest priority standby. If there are multiple standbys with * same priorities then we use the first mentioned standby. * If you change this, also change pg_stat_get_wal_senders(). */ LWLockAcquire(SyncRepLock, LW_EXCLUSIVE); for (i = 0; i < max_wal_senders; i++) { /* use volatile pointer to prevent code rearrangement */ volatile WalSnd *walsnd = &walsndctl->walsnds[i]; if (walsnd->pid != 0 && walsnd->sync_standby_priority > 0 && (priority == 0 || priority > walsnd->sync_standby_priority)) { priority = walsnd->sync_standby_priority; syncWalSnd = walsnd; } } /* * We should have found ourselves at least. */ Assert(syncWalSnd); /* * If we aren't managing the highest priority standby then just leave. */ if (syncWalSnd != MyWalSnd) { LWLockRelease(SyncRepLock); announce_next_takeover = true; return; } if (XLByteLT(walsndctl->lsn, MyWalSnd->flush)) { /* * Set the lsn first so that when we wake backends they will * release up to this location. 
*/ walsndctl->lsn = MyWalSnd->flush; numprocs = SyncRepWakeQueue(false); } LWLockRelease(SyncRepLock); elog(DEBUG3, "released %d procs up to %X/%X", numprocs, MyWalSnd->flush.xlogid, MyWalSnd->flush.xrecoff); /* * If we are managing the highest priority standby, though we weren't * prior to this, then announce we are now the sync standby. */ if (announce_next_takeover) { announce_next_takeover = false; ereport(LOG, (errmsg("standby \"%s\" is now the synchronous standby with priority %u", application_name, MyWalSnd->sync_standby_priority))); } }
/*
 * gistfindleaf -- descend from state->stack to the leaf page chosen for
 * the tuple state->itup[0].
 *
 * On entry state->stack is the stack item to start from.  Each internal
 * page visited gets a new GISTInsertStack item pushed below it (linked via
 * parent/child).  On return state->stack refers to a leaf page whose buffer
 * is locked GIST_EXCLUSIVE, with page and lsn filled in.
 *
 * Pages are share-locked only while they are examined, so a concurrent
 * split can happen between visits.  It is detected by comparing the page's
 * NSN against the LSN remembered for the parent; when that happens the
 * function steps back up to the parent and chooses a child again.
 */
static void
gistfindleaf(GISTInsertState *state, GISTSTATE *giststate)
{
	ItemId		iid;
	IndexTuple	idxtuple;
	GISTPageOpaque opaque;

	MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD;

	/*
	 * Walk down.  We don't hold a page lock for long, so we must be ready
	 * to recheck the path in the bad case.  We rely on page->lsn never
	 * being invalid (see the Assert below for the temp-relation exception).
	 */
	for (;;)
	{
		/*
		 * An invalid lsn in the stack item means the page has not been
		 * visited yet, so its buffer must be read in.  Items we come back
		 * to already carry a valid lsn and reuse their stored buffer.
		 */
		if (XLogRecPtrIsInvalid(state->stack->lsn))
			state->stack->buffer = ReadBuffer(state->r, state->stack->blkno);
		LockBuffer(state->stack->buffer, GIST_SHARE);
		gistcheckpage(state->r, state->stack->buffer);

		state->stack->page = (Page) BufferGetPage(state->stack->buffer);
		opaque = GistPageGetOpaque(state->stack->page);

		/* remember the page LSN; children compare their NSN against it */
		state->stack->lsn = PageGetLSN(state->stack->page);
		Assert(state->r->rd_istemp || !XLogRecPtrIsInvalid(state->stack->lsn));

		if (state->stack->blkno != GIST_ROOT_BLKNO &&
			XLByteLT(state->stack->parent->lsn, opaque->nsn))
		{
			/*
			 * A split of this non-root page happened after we read its
			 * parent: go back up to the parent to choose the best child
			 * again.  This page's buffer is unlocked and released here.
			 */
			UnlockReleaseBuffer(state->stack->buffer);
			state->stack = state->stack->parent;
			continue;
		}

		if (!GistPageIsLeaf(state->stack->page))
		{
			/*
			 * This is an internal page, so continue to walk down the tree:
			 * pick the child entry returned by gistchoose(), push a new
			 * stack item for that child block and iterate.  The chosen
			 * offset is remembered in childoffnum of the current item.
			 */
			GISTInsertStack *item = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));

			state->stack->childoffnum = gistchoose(state->r, state->stack->page, state->itup[0], giststate);

			iid = PageGetItemId(state->stack->page, state->stack->childoffnum);
			idxtuple = (IndexTuple) PageGetItem(state->stack->page, iid);
			item->blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));

			/* only unlock the parent; its buffer is not released here */
			LockBuffer(state->stack->buffer, GIST_UNLOCK);

			item->parent = state->stack;
			item->child = NULL;

			/*
			 * NOTE(review): state->stack has already been dereferenced
			 * above, so this test cannot be false at this point.
			 */
			if (state->stack)
				state->stack->child = item;
			state->stack = item;
		}
		else
		{
			/*
			 * Leaf page: trade the share lock for an exclusive one.  Be
			 * careful, the page may change while it is unlocked, so page
			 * and opaque are re-fetched and the split checks repeated.
			 */
			LockBuffer(state->stack->buffer, GIST_UNLOCK);
			LockBuffer(state->stack->buffer, GIST_EXCLUSIVE);
			state->stack->page = (Page) BufferGetPage(state->stack->buffer);
			opaque = GistPageGetOpaque(state->stack->page);

			if (state->stack->blkno == GIST_ROOT_BLKNO)
			{
				/*
				 * The only page that can become inner instead of leaf is
				 * the root page, so for the root we must recheck it.
				 */
				if (!GistPageIsLeaf(state->stack->page))
				{
					/*
					 * Very rare situation: while the page was unlocked, an
					 * index consisting of a single page grew.  Unlock and
					 * retry this same stack item as an internal page.
					 */
					LockBuffer(state->stack->buffer, GIST_UNLOCK);
					continue;
				}

				/*
				 * We don't need to check for a root split, because the
				 * leaf/inner check is enough to recognize it for the root.
				 */
			}
			else if (XLByteLT(state->stack->parent->lsn, opaque->nsn))
			{
				/*
				 * A split was detected during the unlock/lock window, so
				 * we should find a better child on the parent.
				 */

				/* forget buffer */
				UnlockReleaseBuffer(state->stack->buffer);

				state->stack = state->stack->parent;
				continue;
			}

			/* refresh the remembered LSN now that we hold the X lock */
			state->stack->lsn = PageGetLSN(state->stack->page);

			/* ok, we found a leaf page and it is X-locked */
			break;
		}
	}

	/* now state->stack->(page, buffer and blkno) points to the leaf page */
}
/*
 * gistnext -- fetch heap TIDs of index tuples that match the search key.
 *
 * This can be invoked either to fetch the first matching tuples or to
 * continue a scan already in progress (so->curpos valid).
 *
 *	scan	- the index scan; scan->opaque holds the GISTScanOpaque state
 *	dir		- scan direction, used to pick the starting offset on each page
 *			  and the direction in which offsets are stepped
 *	tids	- output array receiving matching heap TIDs
 *	maxtids	- capacity of tids[]
 *	ignore_killed_tuples - if true, leaf items marked dead are skipped
 *
 * Returns the number of TIDs stored into tids[] (0 .. maxtids).  A result
 * smaller than maxtids means the search stack was exhausted.  Matches
 * found on a page but not yet returned stay in so->pageData and are handed
 * out first on the next call.
 *
 * The MirroredLock taken at the top is released on every return path.
 */
static int
gistnext(IndexScanDesc scan, ScanDirection dir, ItemPointer tids,
		 int maxtids, bool ignore_killed_tuples)
{
	MIRROREDLOCK_BUFMGR_DECLARE;

	Page		p;
	OffsetNumber n;
	GISTScanOpaque so;
	GISTSearchStack *stk;
	IndexTuple	it;
	GISTPageOpaque opaque;
	int			ntids = 0;

	so = (GISTScanOpaque) scan->opaque;

	// -------- MirroredLock ----------
	MIRROREDLOCK_BUFMGR_LOCK;

	if (so->qual_ok == false)
	{
		/*
		 * Nothing can match.  The lock was already taken above, so it must
		 * be dropped here like on every other exit from this function.
		 */
		MIRROREDLOCK_BUFMGR_UNLOCK;
		// -------- MirroredLock ----------

		return 0;
	}

	if (ItemPointerIsValid(&so->curpos) == false)
	{
		/* Being asked to fetch the first entry, so start at the root */
		Assert(so->curbuf == InvalidBuffer);
		Assert(so->stack == NULL);

		so->curbuf = ReadBuffer(scan->indexRelation, GIST_ROOT_BLKNO);

		/* palloc0 leaves lsn and parentlsn of the root entry zeroed */
		stk = so->stack = (GISTSearchStack *) palloc0(sizeof(GISTSearchStack));

		stk->next = NULL;
		stk->block = GIST_ROOT_BLKNO;

		pgstat_count_index_scan(scan->indexRelation);
	}
	else if (so->curbuf == InvalidBuffer)
	{
		/* scan was already run to completion */
		MIRROREDLOCK_BUFMGR_UNLOCK;
		// -------- MirroredLock ----------

		return 0;
	}

	/*
	 * check stored pointers from last visit
	 */
	if (so->nPageData > 0)
	{
		while (ntids < maxtids && so->curPageData < so->nPageData)
		{
			tids[ntids] = scan->xs_ctup.t_self = so->pageData[so->curPageData].heapPtr;
			ItemPointerSet(&(so->curpos),
						   BufferGetBlockNumber(so->curbuf),
						   so->pageData[so->curPageData].pageOffset);

			so->curPageData++;
			ntids++;
		}

		if (ntids == maxtids)
		{
			MIRROREDLOCK_BUFMGR_UNLOCK;
			// -------- MirroredLock ----------

			return ntids;
		}

		/*
		 * Go to the next page
		 */
		stk = so->stack->next;
		pfree(so->stack);
		so->stack = stk;

		/* If we're out of stack entries, we're done */
		if (so->stack == NULL)
		{
			ReleaseBuffer(so->curbuf);
			so->curbuf = InvalidBuffer;

			MIRROREDLOCK_BUFMGR_UNLOCK;
			// -------- MirroredLock ----------

			return ntids;
		}

		so->curbuf = ReleaseAndReadBuffer(so->curbuf,
										  scan->indexRelation,
										  stk->block);
	}

	for (;;)
	{
		/* First of all, we need to lock the buffer */
		Assert(so->curbuf != InvalidBuffer);
		LockBuffer(so->curbuf, GIST_SHARE);
		gistcheckpage(scan->indexRelation, so->curbuf);
		p = BufferGetPage(so->curbuf);
		opaque = GistPageGetOpaque(p);

		/* remember lsn to identify page changed for tuple's killing */
		so->stack->lsn = PageGetLSN(p);

		/* check page split, occurred since last visit or visit to parent */
		if (!XLogRecPtrIsInvalid(so->stack->parentlsn) &&
			XLByteLT(so->stack->parentlsn, opaque->nsn) &&
			opaque->rightlink != InvalidBlockNumber /* sanity check */ &&
			(so->stack->next == NULL ||
			 so->stack->next->block != opaque->rightlink)	/* check if already
															 * added */ )
		{
			/* detect page split, follow right link to add pages */
			stk = (GISTSearchStack *) palloc(sizeof(GISTSearchStack));
			stk->next = so->stack->next;
			stk->block = opaque->rightlink;
			stk->parentlsn = so->stack->parentlsn;
			memset(&(stk->lsn), 0, sizeof(GistNSN));
			so->stack->next = stk;
		}

		/* if page is empty, then just skip it */
		if (PageIsEmpty(p))
		{
			LockBuffer(so->curbuf, GIST_UNLOCK);
			stk = so->stack->next;
			pfree(so->stack);
			so->stack = stk;

			if (so->stack == NULL)
			{
				ReleaseBuffer(so->curbuf);
				so->curbuf = InvalidBuffer;

				MIRROREDLOCK_BUFMGR_UNLOCK;
				// -------- MirroredLock ----------

				return ntids;
			}

			so->curbuf = ReleaseAndReadBuffer(so->curbuf,
											  scan->indexRelation,
											  stk->block);
			continue;
		}

		if (ScanDirectionIsBackward(dir))
			n = PageGetMaxOffsetNumber(p);
		else
			n = FirstOffsetNumber;

		/* wonderful, we can look at page; start collecting matches afresh */
		so->nPageData = so->curPageData = 0;

		for (;;)
		{
			n = gistfindnext(scan, n, dir);

			if (!OffsetNumberIsValid(n))
			{
				/* page is done: hand out what was collected from it */
				while (ntids < maxtids && so->curPageData < so->nPageData)
				{
					tids[ntids] = scan->xs_ctup.t_self = so->pageData[so->curPageData].heapPtr;
					ItemPointerSet(&(so->curpos),
								   BufferGetBlockNumber(so->curbuf),
								   so->pageData[so->curPageData].pageOffset);

					so->curPageData++;
					ntids++;
				}

				if (ntids == maxtids)
				{
					/*
					 * Caller's array is full.  Only the buffer lock is
					 * dropped; curbuf is kept so the scan can resume.
					 */
					LockBuffer(so->curbuf, GIST_UNLOCK);

					MIRROREDLOCK_BUFMGR_UNLOCK;
					// -------- MirroredLock ----------

					return ntids;
				}

				/*
				 * We ran out of matching index entries on the current page,
				 * so pop the top stack entry and use it to continue the
				 * search.
				 */
				LockBuffer(so->curbuf, GIST_UNLOCK);
				stk = so->stack->next;
				pfree(so->stack);
				so->stack = stk;

				/* If we're out of stack entries, we're done */
				if (so->stack == NULL)
				{
					ReleaseBuffer(so->curbuf);
					so->curbuf = InvalidBuffer;

					MIRROREDLOCK_BUFMGR_UNLOCK;
					// -------- MirroredLock ----------

					return ntids;
				}

				so->curbuf = ReleaseAndReadBuffer(so->curbuf,
												  scan->indexRelation,
												  stk->block);
				/* XXX	go up */
				break;
			}

			if (GistPageIsLeaf(p))
			{
				/*
				 * We've found a matching index entry in a leaf page; save
				 * it in so->pageData unless it is a dead item the caller
				 * asked us to ignore.
				 */
				if (!(ignore_killed_tuples && ItemIdIsDead(PageGetItemId(p, n))))
				{
					it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
					so->pageData[so->nPageData].heapPtr = it->t_tid;
					so->pageData[so->nPageData].pageOffset = n;
					so->nPageData++;
				}
			}
			else
			{
				/*
				 * We've found an entry in an internal node whose key is
				 * consistent with the search key, so push it to stack.
				 * The current page's lsn becomes the child's parentlsn for
				 * later split detection.
				 */
				stk = (GISTSearchStack *) palloc(sizeof(GISTSearchStack));

				it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
				stk->block = ItemPointerGetBlockNumber(&(it->t_tid));
				memset(&(stk->lsn), 0, sizeof(GistNSN));
				stk->parentlsn = so->stack->lsn;

				stk->next = so->stack->next;
				so->stack->next = stk;
			}

			if (ScanDirectionIsBackward(dir))
				n = OffsetNumberPrev(n);
			else
				n = OffsetNumberNext(n);
		}
	}

	/* not reached: every exit from the loop above returns directly */
	MIRROREDLOCK_BUFMGR_UNLOCK;
	// -------- MirroredLock ----------

	return ntids;
}
/*
 * bitmap_xlog_newpage -- replay a bitmap index "new page" WAL record.
 *
 *	redo	- must be true; the undo direction is not implemented and PANICs
 *	lsn		- LSN of the record being replayed
 *	record	- the WAL record; its data is an xl_bm_newpage
 *
 * When the target page's LSN is older than lsn, the page is initialized
 * according to the record's op code and stamped with lsn, and then the
 * metapage is stamped with lsn as well if it is older.  Otherwise the
 * target buffer is just handed to _bitmap_relbuf().
 */
static void
bitmap_xlog_newpage(bool redo, XLogRecPtr lsn, XLogRecord *record)
{
	xl_bm_newpage *newpageRec = (xl_bm_newpage *) XLogRecGetData(record);
	uint8		opcode = record->xl_info & ~XLR_INFO_MASK;
	Relation	rel;
	Buffer		newBuf;
	Page		newPage;
	Buffer		metaBuf;
	BMMetaPage	metaPage;

	ereport(DEBUG1, (errmsg_internal("into --> XLogOpenRelation")));
	rel = XLogOpenRelation(newpageRec->bm_node);
	ereport(DEBUG1, (errmsg_internal("done --> XLogOpenRelation")));

	/* nothing to replay into if the relation cannot be opened */
	if (!RelationIsValid(rel))
		return;

	ereport(DEBUG1, (errmsg_internal("crash1")));

	/* only the redo direction is supported */
	if (!redo)
	{
		elog(PANIC, "bm_insert_undo: not implemented.");
		return;
	}

#ifdef BM_DEBUG
	ereport(LOG, (errcode(LOG),
				  errmsg("call bitmap_xlog_newpage: redo=%d, info=%x\n",
						 redo, opcode)));
#endif

	newBuf = XLogReadBuffer(true, rel, newpageRec->bm_new_blkno);
	if (!BufferIsValid(newBuf))
		elog(PANIC, "bm_insert_redo: block unfound: %d",
			 newpageRec->bm_new_blkno);

	newPage = BufferGetPage(newBuf);

	/* page LSN is not older than this record: leave the page alone */
	if (!XLByteLT(PageGetLSN(newPage), lsn))
	{
		_bitmap_relbuf(newBuf);
		return;
	}

	/* initialize the new page as the kind the record asks for */
	if (opcode == XLOG_BITMAP_INSERT_NEWLOV)
		_bitmap_lovpageinit(rel, newBuf);
	else if (opcode == XLOG_BITMAP_INSERT_NEWLOVMETA)
		_bitmap_lovmetapageinit(rel, newBuf);
	else if (opcode == XLOG_BITMAP_INSERT_NEWBITMAP)
		_bitmap_bitmappageinit(rel, newBuf);
	else
		elog(PANIC, "bitmap_redo: unknown newpage op code %u", opcode);

	PageSetLSN(newPage, lsn);
	PageSetTLI(newPage, ThisTimeLineID);
	_bitmap_wrtbuf(newBuf);

	/* stamp the metapage too, unless its LSN is already this new */
	metaBuf = XLogReadBuffer(true, rel, BM_METAPAGE);
	if (!BufferIsValid(metaBuf))
		elog(PANIC, "bm_insert_redo: block unfound: %d", BM_METAPAGE);

	metaPage = (BMMetaPage) BufferGetPage(metaBuf);

	if (!XLByteLT(PageGetLSN(metaPage), lsn))
	{
		_bitmap_relbuf(metaBuf);
		return;
	}

	PageSetLSN(metaPage, lsn);
	PageSetTLI(metaPage, ThisTimeLineID);
	_bitmap_wrtbuf(metaBuf);
}