static void
btree_xlog_vacuum(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record);
	Buffer		buffer;
	Page		page;
	BTPageOpaque opaque;

	/*
	 * If queries might be active then we need to ensure every leaf page is
	 * unpinned between the lastBlockVacuumed and the current block, if there
	 * are any.  This prevents replay of the VACUUM from reaching the stage of
	 * removing heap tuples while there could still be indexscans "in flight"
	 * to those particular tuples for those scans which could be confused by
	 * finding new tuples at the old TID locations (see nbtree/README).
	 *
	 * It might be worth checking if there are actually any backends running;
	 * if not, we could just skip this.
	 *
	 * Since VACUUM can visit leaf pages out-of-order, it might issue records
	 * with lastBlockVacuumed >= block; that's not an error, it just means
	 * nothing to do now.
	 *
	 * Note: since we touch all pages in the range, we will lock non-leaf
	 * pages, and also any empty (all-zero) pages that may be in the index. It
	 * doesn't seem worth the complexity to avoid that.  But it's important
	 * that HotStandbyActiveInReplay() will not return true if the database
	 * isn't yet consistent; so we need not fear reading still-corrupt blocks
	 * here during crash recovery.
	 */
	if (HotStandbyActiveInReplay())
	{
		RelFileNode thisrnode;
		BlockNumber thisblkno;
		BlockNumber blkno;

		XLogRecGetBlockTag(record, 0, &thisrnode, NULL, &thisblkno);

		for (blkno = xlrec->lastBlockVacuumed + 1; blkno < thisblkno; blkno++)
		{
			/*
			 * We use RBM_NORMAL_NO_LOG mode because it's not an error
			 * condition to see all-zero pages.  The original btvacuumpage
			 * scan would have skipped over all-zero pages, noting them in FSM
			 * but not bothering to initialize them just yet; so we mustn't
			 * throw an error here.  (We could skip acquiring the cleanup lock
			 * if PageIsNew, but it's probably not worth the cycles to test.)
			 *
			 * XXX we don't actually need to read the block, we just need to
			 * confirm it is unpinned.  If we had a special call into the
			 * buffer manager we could optimise this so that if the block is
			 * not in shared_buffers we confirm it as unpinned.
			 */
			buffer = XLogReadBufferExtended(thisrnode, MAIN_FORKNUM, blkno,
											RBM_NORMAL_NO_LOG);
			if (BufferIsValid(buffer))
			{
				LockBufferForCleanup(buffer);
				UnlockReleaseBuffer(buffer);
			}
		}
	}

	/*
	 * Like in btvacuumpage(), we need to take a cleanup lock on every leaf
	 * page.  See nbtree/README for details.
	 */
	if (XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer)
		== BLK_NEEDS_REDO)
	{
		char	   *ptr;
		Size		len;

		ptr = XLogRecGetBlockData(record, 0, &len);

		page = (Page) BufferGetPage(buffer);

		if (len > 0)
		{
			OffsetNumber *unused;
			OffsetNumber *unend;

			unused = (OffsetNumber *) ptr;
			unend = (OffsetNumber *) ((char *) ptr + len);

			if ((unend - unused) > 0)
				PageIndexMultiDelete(page, unused, unend - unused);
		}

		/*
		 * Mark the page as not containing any LP_DEAD items --- see comments
		 * in _bt_delitems_vacuum().
		 */
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
	}
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);
}
static void spgRedoMoveLeafs(XLogRecPtr lsn, XLogRecord *record) { char *ptr = XLogRecGetData(record); spgxlogMoveLeafs *xldata = (spgxlogMoveLeafs *) ptr; SpGistState state; OffsetNumber *toDelete; OffsetNumber *toInsert; int nInsert; Buffer buffer; Page page; fillFakeState(&state, xldata->stateSrc); nInsert = xldata->replaceDead ? 1 : xldata->nMoves + 1; ptr += MAXALIGN(sizeof(spgxlogMoveLeafs)); toDelete = (OffsetNumber *) ptr; ptr += MAXALIGN(sizeof(OffsetNumber) * xldata->nMoves); toInsert = (OffsetNumber *) ptr; ptr += MAXALIGN(sizeof(OffsetNumber) * nInsert); /* now ptr points to the list of leaf tuples */ /* Insert tuples on the dest page (do first, so redirect is valid) */ if (!(record->xl_info & XLR_BKP_BLOCK_2)) { buffer = XLogReadBuffer(xldata->node, xldata->blknoDst, xldata->newPage); if (BufferIsValid(buffer)) { page = BufferGetPage(buffer); if (xldata->newPage) SpGistInitBuffer(buffer, SPGIST_LEAF); if (!XLByteLE(lsn, PageGetLSN(page))) { int i; for (i = 0; i < nInsert; i++) { SpGistLeafTuple lt = (SpGistLeafTuple) ptr; addOrReplaceTuple(page, (Item) lt, lt->size, toInsert[i]); ptr += lt->size; } PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); MarkBufferDirty(buffer); } UnlockReleaseBuffer(buffer); } } /* Delete tuples from the source page, inserting a redirection pointer */ if (!(record->xl_info & XLR_BKP_BLOCK_1)) { buffer = XLogReadBuffer(xldata->node, xldata->blknoSrc, false); if (BufferIsValid(buffer)) { page = BufferGetPage(buffer); if (!XLByteLE(lsn, PageGetLSN(page))) { spgPageIndexMultiDelete(&state, page, toDelete, xldata->nMoves, state.isBuild ? SPGIST_PLACEHOLDER : SPGIST_REDIRECT, SPGIST_PLACEHOLDER, xldata->blknoDst, toInsert[nInsert - 1]); PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); MarkBufferDirty(buffer); } UnlockReleaseBuffer(buffer); } } /* And update the parent downlink */ if (!(record->xl_info & XLR_BKP_BLOCK_3)) { buffer = XLogReadBuffer(xldata->node, xldata->blknoParent, false); if (BufferIsValid(buffer)) { page = BufferGetPage(buffer); if (!XLByteLE(lsn, PageGetLSN(page))) { SpGistInnerTuple tuple; tuple = (SpGistInnerTuple) PageGetItem(page, PageGetItemId(page, xldata->offnumParent)); spgUpdateNodeLink(tuple, xldata->nodeI, xldata->blknoDst, toInsert[nInsert - 1]); PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); MarkBufferDirty(buffer); } UnlockReleaseBuffer(buffer); } } }
static void spgRedoSplitTuple(XLogRecPtr lsn, XLogRecord *record) { char *ptr = XLogRecGetData(record); spgxlogSplitTuple *xldata = (spgxlogSplitTuple *) ptr; SpGistInnerTuple prefixTuple; SpGistInnerTuple postfixTuple; Buffer buffer; Page page; /* we assume this is adequately aligned */ ptr += sizeof(spgxlogSplitTuple); prefixTuple = (SpGistInnerTuple) ptr; ptr += prefixTuple->size; postfixTuple = (SpGistInnerTuple) ptr; /* insert postfix tuple first to avoid dangling link */ if (xldata->blknoPostfix != xldata->blknoPrefix && !(record->xl_info & XLR_BKP_BLOCK_2)) { buffer = XLogReadBuffer(xldata->node, xldata->blknoPostfix, xldata->newPage); if (BufferIsValid(buffer)) { page = BufferGetPage(buffer); if (xldata->newPage) SpGistInitBuffer(buffer, 0); if (!XLByteLE(lsn, PageGetLSN(page))) { addOrReplaceTuple(page, (Item) postfixTuple, postfixTuple->size, xldata->offnumPostfix); PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); MarkBufferDirty(buffer); } UnlockReleaseBuffer(buffer); } } /* now handle the original page */ if (!(record->xl_info & XLR_BKP_BLOCK_1)) { buffer = XLogReadBuffer(xldata->node, xldata->blknoPrefix, false); if (BufferIsValid(buffer)) { page = BufferGetPage(buffer); if (!XLByteLE(lsn, PageGetLSN(page))) { PageIndexTupleDelete(page, xldata->offnumPrefix); if (PageAddItem(page, (Item) prefixTuple, prefixTuple->size, xldata->offnumPrefix, false, false) != xldata->offnumPrefix) elog(ERROR, "failed to add item of size %u to SPGiST index page", prefixTuple->size); if (xldata->blknoPostfix == xldata->blknoPrefix) addOrReplaceTuple(page, (Item) postfixTuple, postfixTuple->size, xldata->offnumPostfix); PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); MarkBufferDirty(buffer); } UnlockReleaseBuffer(buffer); } } }
static void gistRedoPageSplitRecord(XLogRecPtr lsn, XLogRecord *record) { gistxlogPageSplit *xldata = (gistxlogPageSplit *) XLogRecGetData(record); PageSplitRecord xlrec; Buffer firstbuffer = InvalidBuffer; Buffer buffer; Page page; int i; bool isrootsplit = false; decodePageSplitRecord(&xlrec, record); /* * We must hold lock on the first-listed page throughout the action, * including while updating the left child page (if any). We can unlock * remaining pages in the list as soon as they've been written, because * there is no path for concurrent queries to reach those pages without * first visiting the first-listed page. */ /* loop around all pages */ for (i = 0; i < xlrec.data->npage; i++) { NewPage *newpage = xlrec.page + i; int flags; if (newpage->header->blkno == GIST_ROOT_BLKNO) { Assert(i == 0); isrootsplit = true; } buffer = XLogReadBuffer(xlrec.data->node, newpage->header->blkno, true); Assert(BufferIsValid(buffer)); page = (Page) BufferGetPage(buffer); /* ok, clear buffer */ if (xlrec.data->origleaf && newpage->header->blkno != GIST_ROOT_BLKNO) flags = F_LEAF; else flags = 0; GISTInitBuffer(buffer, flags); /* and fill it */ gistfillbuffer(page, newpage->itup, newpage->header->num, FirstOffsetNumber); if (newpage->header->blkno == GIST_ROOT_BLKNO) { GistPageGetOpaque(page)->rightlink = InvalidBlockNumber; GistPageSetNSN(page, xldata->orignsn); GistClearFollowRight(page); } else { if (i < xlrec.data->npage - 1) GistPageGetOpaque(page)->rightlink = xlrec.page[i + 1].header->blkno; else GistPageGetOpaque(page)->rightlink = xldata->origrlink; GistPageSetNSN(page, xldata->orignsn); if (i < xlrec.data->npage - 1 && !isrootsplit && xldata->markfollowright) GistMarkFollowRight(page); else GistClearFollowRight(page); } PageSetLSN(page, lsn); MarkBufferDirty(buffer); if (i == 0) firstbuffer = buffer; else UnlockReleaseBuffer(buffer); } /* Fix follow-right data on left child page, if any */ if (BlockNumberIsValid(xldata->leftchild)) gistRedoClearFollowRight(lsn, record, 0, xldata->node, xldata->leftchild); /* Finally, release lock on the first page */ UnlockReleaseBuffer(firstbuffer); }
/* * brinbuild() -- build a new BRIN index. */ IndexBuildResult * brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) { IndexBuildResult *result; double reltuples; double idxtuples; BrinRevmap *revmap; BrinBuildState *state; Buffer meta; BlockNumber pagesPerRange; /* * We expect to be called exactly once for any index relation. */ if (RelationGetNumberOfBlocks(index) != 0) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); /* * Critical section not required, because on error the creation of the * whole relation will be rolled back. */ meta = ReadBuffer(index, P_NEW); Assert(BufferGetBlockNumber(meta) == BRIN_METAPAGE_BLKNO); LockBuffer(meta, BUFFER_LOCK_EXCLUSIVE); brin_metapage_init(BufferGetPage(meta), BrinGetPagesPerRange(index), BRIN_CURRENT_VERSION); MarkBufferDirty(meta); if (RelationNeedsWAL(index)) { xl_brin_createidx xlrec; XLogRecPtr recptr; Page page; xlrec.version = BRIN_CURRENT_VERSION; xlrec.pagesPerRange = BrinGetPagesPerRange(index); XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfBrinCreateIdx); XLogRegisterBuffer(0, meta, REGBUF_WILL_INIT); recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_CREATE_INDEX); page = BufferGetPage(meta); PageSetLSN(page, recptr); } UnlockReleaseBuffer(meta); /* * Initialize our state, including the deformed tuple state. */ revmap = brinRevmapInitialize(index, &pagesPerRange, NULL); state = initialize_brin_buildstate(index, revmap, pagesPerRange); /* * Now scan the relation. No syncscan allowed here because we want the * heap blocks in physical order. */ reltuples = IndexBuildHeapScan(heap, index, indexInfo, false, brinbuildCallback, (void *) state); /* process the final batch */ form_and_insert_tuple(state); /* release resources */ idxtuples = state->bs_numtuples; brinRevmapTerminate(state->bs_rmAccess); terminate_brin_buildstate(state); /* * Return statistics */ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); result->heap_tuples = reltuples; result->index_tuples = idxtuples; return result; }
/* * Insert new tuple to the bloom index. */ bool blinsert(Relation index, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, IndexInfo *indexInfo) { BloomState blstate; BloomTuple *itup; MemoryContext oldCtx; MemoryContext insertCtx; BloomMetaPageData *metaData; Buffer buffer, metaBuffer; Page page, metaPage; BlockNumber blkno = InvalidBlockNumber; OffsetNumber nStart; GenericXLogState *state; insertCtx = AllocSetContextCreate(CurrentMemoryContext, "Bloom insert temporary context", ALLOCSET_DEFAULT_SIZES); oldCtx = MemoryContextSwitchTo(insertCtx); initBloomState(&blstate, index); itup = BloomFormTuple(&blstate, ht_ctid, values, isnull); /* * At first, try to insert new tuple to the first page in notFullPage * array. If successful, we don't need to modify the meta page. */ metaBuffer = ReadBuffer(index, BLOOM_METAPAGE_BLKNO); LockBuffer(metaBuffer, BUFFER_LOCK_SHARE); metaData = BloomPageGetMeta(BufferGetPage(metaBuffer)); if (metaData->nEnd > metaData->nStart) { Page page; blkno = metaData->notFullPage[metaData->nStart]; Assert(blkno != InvalidBlockNumber); /* Don't hold metabuffer lock while doing insert */ LockBuffer(metaBuffer, BUFFER_LOCK_UNLOCK); buffer = ReadBuffer(index, blkno); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); state = GenericXLogStart(index); page = GenericXLogRegisterBuffer(state, buffer, 0); /* * We might have found a page that was recently deleted by VACUUM. If * so, we can reuse it, but we must reinitialize it. */ if (PageIsNew(page) || BloomPageIsDeleted(page)) BloomInitPage(page, 0); if (BloomPageAddItem(&blstate, page, itup)) { /* Success! Apply the change, clean up, and exit */ GenericXLogFinish(state); UnlockReleaseBuffer(buffer); ReleaseBuffer(metaBuffer); MemoryContextSwitchTo(oldCtx); MemoryContextDelete(insertCtx); return false; } /* Didn't fit, must try other pages */ GenericXLogAbort(state); UnlockReleaseBuffer(buffer); } else { /* No entries in notFullPage */ LockBuffer(metaBuffer, BUFFER_LOCK_UNLOCK); } /* * Try other pages in notFullPage array. We will have to change nStart in * metapage. Thus, grab exclusive lock on metapage. */ LockBuffer(metaBuffer, BUFFER_LOCK_EXCLUSIVE); /* nStart might have changed while we didn't have lock */ nStart = metaData->nStart; /* Skip first page if we already tried it above */ if (nStart < metaData->nEnd && blkno == metaData->notFullPage[nStart]) nStart++; /* * This loop iterates for each page we try from the notFullPage array, and * will also initialize a GenericXLogState for the fallback case of having * to allocate a new page. */ for (;;) { state = GenericXLogStart(index); /* get modifiable copy of metapage */ metaPage = GenericXLogRegisterBuffer(state, metaBuffer, 0); metaData = BloomPageGetMeta(metaPage); if (nStart >= metaData->nEnd) break; /* no more entries in notFullPage array */ blkno = metaData->notFullPage[nStart]; Assert(blkno != InvalidBlockNumber); buffer = ReadBuffer(index, blkno); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); page = GenericXLogRegisterBuffer(state, buffer, 0); /* Basically same logic as above */ if (PageIsNew(page) || BloomPageIsDeleted(page)) BloomInitPage(page, 0); if (BloomPageAddItem(&blstate, page, itup)) { /* Success! 
Apply the changes, clean up, and exit */ metaData->nStart = nStart; GenericXLogFinish(state); UnlockReleaseBuffer(buffer); UnlockReleaseBuffer(metaBuffer); MemoryContextSwitchTo(oldCtx); MemoryContextDelete(insertCtx); return false; } /* Didn't fit, must try other pages */ GenericXLogAbort(state); UnlockReleaseBuffer(buffer); nStart++; } /* * Didn't find place to insert in notFullPage array. Allocate new page. * (XXX is it good to do this while holding ex-lock on the metapage??) */ buffer = BloomNewBuffer(index); page = GenericXLogRegisterBuffer(state, buffer, GENERIC_XLOG_FULL_IMAGE); BloomInitPage(page, 0); if (!BloomPageAddItem(&blstate, page, itup)) { /* We shouldn't be here since we're inserting to an empty page */ elog(ERROR, "could not add new bloom tuple to empty page"); } /* Reset notFullPage array to contain just this new page */ metaData->nStart = 0; metaData->nEnd = 1; metaData->notFullPage[0] = BufferGetBlockNumber(buffer); /* Apply the changes, clean up, and exit */ GenericXLogFinish(state); UnlockReleaseBuffer(buffer); UnlockReleaseBuffer(metaBuffer); MemoryContextSwitchTo(oldCtx); MemoryContextDelete(insertCtx); return false; }
/* * Get a buffer of the type and parity specified by flags, having at least * as much free space as indicated by needSpace. We use the lastUsedPages * cache to assign the same buffer previously requested when possible. * The returned buffer is already pinned and exclusive-locked. * * *isNew is set true if the page was initialized here, false if it was * already valid. */ Buffer SpGistGetBuffer(Relation index, int flags, int needSpace, bool *isNew) { SpGistCache *cache = spgGetCache(index); SpGistLastUsedPage *lup; /* Bail out if even an empty page wouldn't meet the demand */ if (needSpace > SPGIST_PAGE_CAPACITY) elog(ERROR, "desired SPGiST tuple size is too big"); /* * If possible, increase the space request to include relation's * fillfactor. This ensures that when we add unrelated tuples to a page, * we try to keep 100-fillfactor% available for adding tuples that are * related to the ones already on it. But fillfactor mustn't cause an * error for requests that would otherwise be legal. */ needSpace += RelationGetTargetPageFreeSpace(index, SPGIST_DEFAULT_FILLFACTOR); needSpace = Min(needSpace, SPGIST_PAGE_CAPACITY); /* Get the cache entry for this flags setting */ lup = GET_LUP(cache, flags); /* If we have nothing cached, just turn it over to allocNewBuffer */ if (lup->blkno == InvalidBlockNumber) { *isNew = true; return allocNewBuffer(index, flags); } /* fixed pages should never be in cache */ Assert(!SpGistBlockIsFixed(lup->blkno)); /* If cached freeSpace isn't enough, don't bother looking at the page */ if (lup->freeSpace >= needSpace) { Buffer buffer; Page page; buffer = ReadBuffer(index, lup->blkno); if (!ConditionalLockBuffer(buffer)) { /* * buffer is locked by another process, so return a new buffer */ ReleaseBuffer(buffer); *isNew = true; return allocNewBuffer(index, flags); } page = BufferGetPage(buffer); if (PageIsNew(page) || SpGistPageIsDeleted(page) || PageIsEmpty(page)) { /* OK to initialize the page */ uint16 pageflags = 0; if (GBUF_REQ_LEAF(flags)) pageflags |= SPGIST_LEAF; if (GBUF_REQ_NULLS(flags)) pageflags |= SPGIST_NULLS; SpGistInitBuffer(buffer, pageflags); lup->freeSpace = PageGetExactFreeSpace(page) - needSpace; *isNew = true; return buffer; } /* * Check that page is of right type and has enough space. We must * recheck this since our cache isn't necessarily up to date. */ if ((GBUF_REQ_LEAF(flags) ? SpGistPageIsLeaf(page) : !SpGistPageIsLeaf(page)) && (GBUF_REQ_NULLS(flags) ? SpGistPageStoresNulls(page) : !SpGistPageStoresNulls(page))) { int freeSpace = PageGetExactFreeSpace(page); if (freeSpace >= needSpace) { /* Success, update freespace info and return the buffer */ lup->freeSpace = freeSpace - needSpace; *isNew = false; return buffer; } } /* * fallback to allocation of new buffer */ UnlockReleaseBuffer(buffer); } /* No success with cache, so return a new buffer */ *isNew = true; return allocNewBuffer(index, flags); }
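/*
 * Illustration only --- the arithmetic at the top of SpGistGetBuffer() pads
 * the caller's space request by the relation's fillfactor reservation and
 * then clamps it to the page capacity, so a request that was legal on its
 * own can never become impossible to satisfy.  The following is a minimal
 * standalone sketch of that clamping, using made-up capacity/reservation
 * numbers rather than the real SPGIST_PAGE_CAPACITY and fillfactor values.
 */
#include <stdio.h>

#define SKETCH_PAGE_CAPACITY	8152	/* hypothetical usable bytes per page */
#define SKETCH_FILLFACTOR_FREE	1630	/* hypothetical ~20% reservation */

static int
sketch_effective_request(int needSpace)
{
	/* try to keep the fillfactor reservation free for related tuples */
	needSpace += SKETCH_FILLFACTOR_FREE;

	/* ...but never turn an otherwise-legal request into an impossible one */
	if (needSpace > SKETCH_PAGE_CAPACITY)
		needSpace = SKETCH_PAGE_CAPACITY;

	return needSpace;
}

int
main(void)
{
	printf("small request: %d\n", sketch_effective_request(200));	/* 1830 */
	printf("large request: %d\n", sketch_effective_request(7500));	/* 8152 */
	return 0;
}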
/* * Bulk deletion of all index entries pointing to a set of heap tuples and * check invalid tuples left after upgrade. * The set of target tuples is specified via a callback routine that tells * whether any given heap tuple (identified by ItemPointer) is being deleted. * * Result: a palloc'd struct containing statistical info for VACUUM displays. */ IndexBulkDeleteResult * gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state) { Relation rel = info->index; GistBDItem *stack, *ptr; /* first time through? */ if (stats == NULL) stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); /* we'll re-count the tuples each time */ stats->estimated_count = false; stats->num_index_tuples = 0; stack = (GistBDItem *) palloc0(sizeof(GistBDItem)); stack->blkno = GIST_ROOT_BLKNO; while (stack) { Buffer buffer; Page page; OffsetNumber i, maxoff; IndexTuple idxtuple; ItemId iid; buffer = ReadBufferExtended(rel, MAIN_FORKNUM, stack->blkno, RBM_NORMAL, info->strategy); LockBuffer(buffer, GIST_SHARE); gistcheckpage(rel, buffer); page = (Page) BufferGetPage(buffer); if (GistPageIsLeaf(page)) { OffsetNumber todelete[MaxOffsetNumber]; int ntodelete = 0; LockBuffer(buffer, GIST_UNLOCK); LockBuffer(buffer, GIST_EXCLUSIVE); page = (Page) BufferGetPage(buffer); if (stack->blkno == GIST_ROOT_BLKNO && !GistPageIsLeaf(page)) { /* only the root can become non-leaf during relock */ UnlockReleaseBuffer(buffer); /* one more check */ continue; } /* * check for split proceeded after look at parent, we should check * it after relock */ pushStackIfSplited(page, stack); /* * Remove deletable tuples from page */ maxoff = PageGetMaxOffsetNumber(page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { iid = PageGetItemId(page, i); idxtuple = (IndexTuple) PageGetItem(page, iid); if (callback(&(idxtuple->t_tid), callback_state)) todelete[ntodelete++] = i; else stats->num_index_tuples += 1; } stats->tuples_removed += ntodelete; if (ntodelete) { START_CRIT_SECTION(); MarkBufferDirty(buffer); PageIndexMultiDelete(page, todelete, ntodelete); GistMarkTuplesDeleted(page); if (RelationNeedsWAL(rel)) { XLogRecPtr recptr; recptr = gistXLogUpdate(rel->rd_node, buffer, todelete, ntodelete, NULL, 0, InvalidBuffer); PageSetLSN(page, recptr); } else PageSetLSN(page, gistGetFakeLSN(rel)); END_CRIT_SECTION(); } } else { /* check for split proceeded after look at parent */ pushStackIfSplited(page, stack); maxoff = PageGetMaxOffsetNumber(page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { iid = PageGetItemId(page, i); idxtuple = (IndexTuple) PageGetItem(page, iid); ptr = (GistBDItem *) palloc(sizeof(GistBDItem)); ptr->blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); ptr->parentlsn = PageGetLSN(page); ptr->next = stack->next; stack->next = ptr; if (GistTupleIsInvalid(idxtuple)) ereport(LOG, (errmsg("index \"%s\" contains an inner tuple marked as invalid", RelationGetRelationName(rel)), errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."), errhint("Please REINDEX it."))); } } UnlockReleaseBuffer(buffer); ptr = stack->next; pfree(stack); stack = ptr; vacuum_delay_point(); } return stats; }
/* * VACUUM cleanup: update FSM */ IndexBulkDeleteResult * gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) { Relation rel = info->index; BlockNumber npages, blkno; BlockNumber totFreePages; bool needLock; /* No-op in ANALYZE ONLY mode */ if (info->analyze_only) return stats; /* Set up all-zero stats if gistbulkdelete wasn't called */ if (stats == NULL) { stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); /* use heap's tuple count */ stats->num_index_tuples = info->num_heap_tuples; stats->estimated_count = info->estimated_count; /* * XXX the above is wrong if index is partial. Would it be OK to just * return NULL, or is there work we must do below? */ } /* * Need lock unless it's local to this backend. */ needLock = !RELATION_IS_LOCAL(rel); /* try to find deleted pages */ if (needLock) LockRelationForExtension(rel, ExclusiveLock); npages = RelationGetNumberOfBlocks(rel); if (needLock) UnlockRelationForExtension(rel, ExclusiveLock); totFreePages = 0; for (blkno = GIST_ROOT_BLKNO + 1; blkno < npages; blkno++) { Buffer buffer; Page page; vacuum_delay_point(); buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); LockBuffer(buffer, GIST_SHARE); page = (Page) BufferGetPage(buffer); if (PageIsNew(page) || GistPageIsDeleted(page)) { totFreePages++; RecordFreeIndexPage(rel, blkno); } UnlockReleaseBuffer(buffer); } /* Finally, vacuum the FSM */ IndexFreeSpaceMapVacuum(info->index); /* return statistics */ stats->pages_free = totFreePages; if (needLock) LockRelationForExtension(rel, ExclusiveLock); stats->num_pages = RelationGetNumberOfBlocks(rel); if (needLock) UnlockRelationForExtension(rel, ExclusiveLock); return stats; }
/* * Rescan end pages to verify that they are (still) empty of tuples. * * Returns number of nondeletable pages (last nonempty page + 1). */ static BlockNumber count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats) { BlockNumber blkno; instr_time starttime; instr_time currenttime; instr_time elapsed; /* Initialize the starttime if we check for conflicting lock requests */ INSTR_TIME_SET_CURRENT(starttime); /* Strange coding of loop control is needed because blkno is unsigned */ blkno = vacrelstats->rel_pages; while (blkno > vacrelstats->nonempty_pages) { Buffer buf; Page page; OffsetNumber offnum, maxoff; bool hastup; /* * Check if another process requests a lock on our relation. We are * holding an AccessExclusiveLock here, so they will be waiting. We * only do this in autovacuum_truncate_lock_check millisecond * intervals, and we only check if that interval has elapsed once * every 32 blocks to keep the number of system calls and actual * shared lock table lookups to a minimum. */ if ((blkno % 32) == 0) { INSTR_TIME_SET_CURRENT(currenttime); elapsed = currenttime; INSTR_TIME_SUBTRACT(elapsed, starttime); if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000) >= AUTOVACUUM_TRUNCATE_LOCK_CHECK_INTERVAL) { if (LockHasWaitersRelation(onerel, AccessExclusiveLock)) { ereport(elevel, (errmsg("\"%s\": suspending truncate " "due to conflicting lock request", RelationGetRelationName(onerel)))); vacrelstats->lock_waiter_detected = true; return blkno; } starttime = currenttime; } } /* * We don't insert a vacuum delay point here, because we have an * exclusive lock on the table which we want to hold for as short a * time as possible. We still need to check for interrupts however. */ CHECK_FOR_INTERRUPTS(); blkno--; buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno, RBM_NORMAL, vac_strategy); /* In this phase we only need shared access to the buffer */ LockBuffer(buf, BUFFER_LOCK_SHARE); page = BufferGetPage(buf); if (PageIsNew(page) || PageIsEmpty(page)) { /* PageIsNew probably shouldn't happen... */ UnlockReleaseBuffer(buf); continue; } hastup = false; maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId itemid; itemid = PageGetItemId(page, offnum); /* * Note: any non-unused item should be taken as a reason to keep * this page. We formerly thought that DEAD tuples could be * thrown away, but that's not so, because we'd not have cleaned * out their index entries. */ if (ItemIdIsUsed(itemid)) { hastup = true; break; /* can stop scanning */ } } /* scan along page */ UnlockReleaseBuffer(buf); /* Done scanning if we found a tuple here */ if (hastup) return blkno + 1; } /* * If we fall out of the loop, all the previously-thought-to-be-empty * pages still are; we need not bother to look at the last known-nonempty * page. */ return vacrelstats->nonempty_pages; }
/* * lazy_scan_heap() -- scan an open heap relation * * This routine prunes each page in the heap, which will among other * things truncate dead tuples to dead line pointers, defragment the * page, and set commit status bits (see heap_page_prune). It also builds * lists of dead tuples and pages with free space, calculates statistics * on the number of live tuples in the heap, and marks pages as * all-visible if appropriate. When done, or when we run low on space for * dead-tuple TIDs, invoke vacuuming of indexes and call lazy_vacuum_heap * to reclaim dead line pointers. * * If there are no indexes then we can reclaim line pointers on the fly; * dead line pointers need only be retained until all index pointers that * reference them have been killed. */ static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, Relation *Irel, int nindexes, bool scan_all) { BlockNumber nblocks, blkno; HeapTupleData tuple; char *relname; BlockNumber empty_pages, vacuumed_pages; double num_tuples, tups_vacuumed, nkeep, nunused; IndexBulkDeleteResult **indstats; int i; PGRUsage ru0; Buffer vmbuffer = InvalidBuffer; BlockNumber next_not_all_visible_block; bool skipping_all_visible_blocks; pg_rusage_init(&ru0); relname = RelationGetRelationName(onerel); ereport(elevel, (errmsg("vacuuming \"%s.%s\"", get_namespace_name(RelationGetNamespace(onerel)), relname))); empty_pages = vacuumed_pages = 0; num_tuples = tups_vacuumed = nkeep = nunused = 0; indstats = (IndexBulkDeleteResult **) palloc0(nindexes * sizeof(IndexBulkDeleteResult *)); nblocks = RelationGetNumberOfBlocks(onerel); vacrelstats->rel_pages = nblocks; vacrelstats->scanned_pages = 0; vacrelstats->nonempty_pages = 0; vacrelstats->latestRemovedXid = InvalidTransactionId; lazy_space_alloc(vacrelstats, nblocks); /* * We want to skip pages that don't require vacuuming according to the * visibility map, but only when we can skip at least SKIP_PAGES_THRESHOLD * consecutive pages. Since we're reading sequentially, the OS should be * doing readahead for us, so there's no gain in skipping a page now and * then; that's likely to disable readahead and so be counterproductive. * Also, skipping even a single page means that we can't update * relfrozenxid, so we only want to do it if we can skip a goodly number * of pages. * * Before entering the main loop, establish the invariant that * next_not_all_visible_block is the next block number >= blkno that's not * all-visible according to the visibility map, or nblocks if there's no * such block. Also, we set up the skipping_all_visible_blocks flag, * which is needed because we need hysteresis in the decision: once we've * started skipping blocks, we may as well skip everything up to the next * not-all-visible block. * * Note: if scan_all is true, we won't actually skip any pages; but we * maintain next_not_all_visible_block anyway, so as to set up the * all_visible_according_to_vm flag correctly for each page. * * Note: The value returned by visibilitymap_test could be slightly * out-of-date, since we make this test before reading the corresponding * heap page or locking the buffer. This is OK. If we mistakenly think * that the page is all-visible when in fact the flag's just been cleared, * we might fail to vacuum the page. But it's OK to skip pages when * scan_all is not set, so no great harm done; the next vacuum will find * them. If we make the reverse mistake and vacuum a page unnecessarily, * it'll just be a no-op. 
*/ for (next_not_all_visible_block = 0; next_not_all_visible_block < nblocks; next_not_all_visible_block++) { if (!visibilitymap_test(onerel, next_not_all_visible_block, &vmbuffer)) break; vacuum_delay_point(); } if (next_not_all_visible_block >= SKIP_PAGES_THRESHOLD) skipping_all_visible_blocks = true; else skipping_all_visible_blocks = false; for (blkno = 0; blkno < nblocks; blkno++) { Buffer buf; Page page; OffsetNumber offnum, maxoff; bool tupgone, hastup; int prev_dead_count; OffsetNumber frozen[MaxOffsetNumber]; int nfrozen; Size freespace; bool all_visible_according_to_vm; bool all_visible; bool has_dead_tuples; TransactionId visibility_cutoff_xid = InvalidTransactionId; if (blkno == next_not_all_visible_block) { /* Time to advance next_not_all_visible_block */ for (next_not_all_visible_block++; next_not_all_visible_block < nblocks; next_not_all_visible_block++) { if (!visibilitymap_test(onerel, next_not_all_visible_block, &vmbuffer)) break; vacuum_delay_point(); } /* * We know we can't skip the current block. But set up * skipping_all_visible_blocks to do the right thing at the * following blocks. */ if (next_not_all_visible_block - blkno > SKIP_PAGES_THRESHOLD) skipping_all_visible_blocks = true; else skipping_all_visible_blocks = false; all_visible_according_to_vm = false; } else { /* Current block is all-visible */ if (skipping_all_visible_blocks && !scan_all) continue; all_visible_according_to_vm = true; } vacuum_delay_point(); /* * If we are close to overrunning the available space for dead-tuple * TIDs, pause and do a cycle of vacuuming before we tackle this page. */ if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage && vacrelstats->num_dead_tuples > 0) { /* * Before beginning index vacuuming, we release any pin we may * hold on the visibility map page. This isn't necessary for * correctness, but we do it anyway to avoid holding the pin * across a lengthy, unrelated operation. */ if (BufferIsValid(vmbuffer)) { ReleaseBuffer(vmbuffer); vmbuffer = InvalidBuffer; } /* Log cleanup info before we touch indexes */ vacuum_log_cleanup_info(onerel, vacrelstats); /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats); /* Remove tuples from heap */ lazy_vacuum_heap(onerel, vacrelstats); /* * Forget the now-vacuumed tuples, and press on, but be careful * not to reset latestRemovedXid since we want that value to be * valid. */ vacrelstats->num_dead_tuples = 0; vacrelstats->num_index_scans++; } /* * Pin the visibility map page in case we need to mark the page * all-visible. In most cases this will be very cheap, because we'll * already have the correct page pinned anyway. However, it's * possible that (a) next_not_all_visible_block is covered by a * different VM page than the current block or (b) we released our pin * and did a cycle of index vacuuming. */ visibilitymap_pin(onerel, blkno, &vmbuffer); buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno, RBM_NORMAL, vac_strategy); /* We need buffer cleanup lock so that we can prune HOT chains. */ if (!ConditionalLockBufferForCleanup(buf)) { /* * If we're not scanning the whole relation to guard against XID * wraparound, it's OK to skip vacuuming a page. The next vacuum * will clean it up. */ if (!scan_all) { ReleaseBuffer(buf); continue; } /* * If this is a wraparound checking vacuum, then we read the page * with share lock to see if any xids need to be frozen. If the * page doesn't need attention we just skip and continue. 
If it * does, we wait for cleanup lock. * * We could defer the lock request further by remembering the page * and coming back to it later, or we could even register * ourselves for multiple buffers and then service whichever one * is received first. For now, this seems good enough. */ LockBuffer(buf, BUFFER_LOCK_SHARE); if (!lazy_check_needs_freeze(buf)) { UnlockReleaseBuffer(buf); continue; } LockBuffer(buf, BUFFER_LOCK_UNLOCK); LockBufferForCleanup(buf); /* drop through to normal processing */ } vacrelstats->scanned_pages++; page = BufferGetPage(buf); if (PageIsNew(page)) { /* * An all-zeroes page could be left over if a backend extends the * relation but crashes before initializing the page. Reclaim such * pages for use. * * We have to be careful here because we could be looking at a * page that someone has just added to the relation and not yet * been able to initialize (see RelationGetBufferForTuple). To * protect against that, release the buffer lock, grab the * relation extension lock momentarily, and re-lock the buffer. If * the page is still uninitialized by then, it must be left over * from a crashed backend, and we can initialize it. * * We don't really need the relation lock when this is a new or * temp relation, but it's probably not worth the code space to * check that, since this surely isn't a critical path. * * Note: the comparable code in vacuum.c need not worry because * it's got exclusive lock on the whole relation. */ LockBuffer(buf, BUFFER_LOCK_UNLOCK); LockRelationForExtension(onerel, ExclusiveLock); UnlockRelationForExtension(onerel, ExclusiveLock); LockBufferForCleanup(buf); if (PageIsNew(page)) { ereport(WARNING, (errmsg("relation \"%s\" page %u is uninitialized --- fixing", relname, blkno))); PageInit(page, BufferGetPageSize(buf), 0); empty_pages++; } freespace = PageGetHeapFreeSpace(page); MarkBufferDirty(buf); UnlockReleaseBuffer(buf); RecordPageWithFreeSpace(onerel, blkno, freespace); continue; } if (PageIsEmpty(page)) { empty_pages++; freespace = PageGetHeapFreeSpace(page); /* empty pages are always all-visible */ if (!PageIsAllVisible(page)) { PageSetAllVisible(page); MarkBufferDirty(buf); visibilitymap_set(onerel, blkno, InvalidXLogRecPtr, vmbuffer, InvalidTransactionId); } UnlockReleaseBuffer(buf); RecordPageWithFreeSpace(onerel, blkno, freespace); continue; } /* * Prune all HOT-update chains in this page. * * We count tuples removed by the pruning step as removed by VACUUM. */ tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false, &vacrelstats->latestRemovedXid); /* * Now scan the page to collect vacuumable items and check for tuples * requiring freezing. */ all_visible = true; has_dead_tuples = false; nfrozen = 0; hastup = false; prev_dead_count = vacrelstats->num_dead_tuples; maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId itemid; itemid = PageGetItemId(page, offnum); /* Unused items require no processing, but we count 'em */ if (!ItemIdIsUsed(itemid)) { nunused += 1; continue; } /* Redirect items mustn't be touched */ if (ItemIdIsRedirected(itemid)) { hastup = true; /* this page won't be truncatable */ continue; } ItemPointerSet(&(tuple.t_self), blkno, offnum); /* * DEAD item pointers are to be vacuumed normally; but we don't * count them in tups_vacuumed, else we'd be double-counting (at * least in the common case where heap_page_prune() just freed up * a non-HOT tuple). 
*/ if (ItemIdIsDead(itemid)) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); all_visible = false; continue; } Assert(ItemIdIsNormal(itemid)); tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); tupgone = false; switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf)) { case HEAPTUPLE_DEAD: /* * Ordinarily, DEAD tuples would have been removed by * heap_page_prune(), but it's possible that the tuple * state changed since heap_page_prune() looked. In * particular an INSERT_IN_PROGRESS tuple could have * changed to DEAD if the inserter aborted. So this * cannot be considered an error condition. * * If the tuple is HOT-updated then it must only be * removed by a prune operation; so we keep it just as if * it were RECENTLY_DEAD. Also, if it's a heap-only * tuple, we choose to keep it, because it'll be a lot * cheaper to get rid of it in the next pruning pass than * to treat it like an indexed tuple. */ if (HeapTupleIsHotUpdated(&tuple) || HeapTupleIsHeapOnly(&tuple)) nkeep += 1; else tupgone = true; /* we can delete the tuple */ all_visible = false; break; case HEAPTUPLE_LIVE: /* Tuple is good --- but let's do some validity checks */ if (onerel->rd_rel->relhasoids && !OidIsValid(HeapTupleGetOid(&tuple))) elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid", relname, blkno, offnum); /* * Is the tuple definitely visible to all transactions? * * NB: Like with per-tuple hint bits, we can't set the * PD_ALL_VISIBLE flag if the inserter committed * asynchronously. See SetHintBits for more info. Check * that the HEAP_XMIN_COMMITTED hint bit is set because of * that. */ if (all_visible) { TransactionId xmin; if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED)) { all_visible = false; break; } /* * The inserter definitely committed. But is it old * enough that everyone sees it as committed? */ xmin = HeapTupleHeaderGetXmin(tuple.t_data); if (!TransactionIdPrecedes(xmin, OldestXmin)) { all_visible = false; break; } /* Track newest xmin on page. */ if (TransactionIdFollows(xmin, visibility_cutoff_xid)) visibility_cutoff_xid = xmin; } break; case HEAPTUPLE_RECENTLY_DEAD: /* * If tuple is recently deleted then we must not remove it * from relation. */ nkeep += 1; all_visible = false; break; case HEAPTUPLE_INSERT_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ all_visible = false; break; case HEAPTUPLE_DELETE_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ all_visible = false; break; default: elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); break; } if (tupgone) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data, &vacrelstats->latestRemovedXid); tups_vacuumed += 1; has_dead_tuples = true; } else { num_tuples += 1; hastup = true; /* * Each non-removable tuple must be checked to see if it needs * freezing. Note we already have exclusive buffer lock. */ if (heap_freeze_tuple(tuple.t_data, FreezeLimit)) frozen[nfrozen++] = offnum; } } /* scan along page */ /* * If we froze any tuples, mark the buffer dirty, and write a WAL * record recording the changes. We must log the changes to be * crash-safe against future truncation of CLOG. 
*/ if (nfrozen > 0) { MarkBufferDirty(buf); if (RelationNeedsWAL(onerel)) { XLogRecPtr recptr; recptr = log_heap_freeze(onerel, buf, FreezeLimit, frozen, nfrozen); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } } /* * If there are no indexes then we can vacuum the page right now * instead of doing a second scan. */ if (nindexes == 0 && vacrelstats->num_dead_tuples > 0) { /* Remove tuples from heap */ lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats); /* * Forget the now-vacuumed tuples, and press on, but be careful * not to reset latestRemovedXid since we want that value to be * valid. */ vacrelstats->num_dead_tuples = 0; vacuumed_pages++; } freespace = PageGetHeapFreeSpace(page); /* mark page all-visible, if appropriate */ if (all_visible) { if (!PageIsAllVisible(page)) { PageSetAllVisible(page); MarkBufferDirty(buf); visibilitymap_set(onerel, blkno, InvalidXLogRecPtr, vmbuffer, visibility_cutoff_xid); } else if (!all_visible_according_to_vm) { /* * It should never be the case that the visibility map page is * set while the page-level bit is clear, but the reverse is * allowed. Set the visibility map bit as well so that we get * back in sync. */ visibilitymap_set(onerel, blkno, InvalidXLogRecPtr, vmbuffer, visibility_cutoff_xid); } } /* * As of PostgreSQL 9.2, the visibility map bit should never be set if * the page-level bit is clear. However, it's possible that the bit * got cleared after we checked it and before we took the buffer * content lock, so we must recheck before jumping to the conclusion * that something bad has happened. */ else if (all_visible_according_to_vm && !PageIsAllVisible(page) && visibilitymap_test(onerel, blkno, &vmbuffer)) { elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", relname, blkno); visibilitymap_clear(onerel, blkno, vmbuffer); } /* * It's possible for the value returned by GetOldestXmin() to move * backwards, so it's not wrong for us to see tuples that appear to * not be visible to everyone yet, while PD_ALL_VISIBLE is already * set. The real safe xmin value never moves backwards, but * GetOldestXmin() is conservative and sometimes returns a value * that's unnecessarily small, so if we see that contradiction it just * means that the tuples that we think are not visible to everyone yet * actually are, and the PD_ALL_VISIBLE flag is correct. * * There should never be dead tuples on a page with PD_ALL_VISIBLE * set, however. */ else if (PageIsAllVisible(page) && has_dead_tuples) { elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u", relname, blkno); PageClearAllVisible(page); MarkBufferDirty(buf); visibilitymap_clear(onerel, blkno, vmbuffer); } UnlockReleaseBuffer(buf); /* Remember the location of the last page with nonremovable tuples */ if (hastup) vacrelstats->nonempty_pages = blkno + 1; /* * If we remembered any tuples for deletion, then the page will be * visited again by lazy_vacuum_heap, which will compute and record * its post-compaction free space. If not, then we're done with this * page, so remember its free space as-is. (This path will always be * taken if there are no indexes.) 
*/ if (vacrelstats->num_dead_tuples == prev_dead_count) RecordPageWithFreeSpace(onerel, blkno, freespace); } /* save stats for use later */ vacrelstats->scanned_tuples = num_tuples; vacrelstats->tuples_deleted = tups_vacuumed; /* now we can compute the new value for pg_class.reltuples */ vacrelstats->new_rel_tuples = vac_estimate_reltuples(onerel, false, nblocks, vacrelstats->scanned_pages, num_tuples); /* * Release any remaining pin on visibility map page. */ if (BufferIsValid(vmbuffer)) { ReleaseBuffer(vmbuffer); vmbuffer = InvalidBuffer; } /* If any tuples need to be deleted, perform final vacuum cycle */ /* XXX put a threshold on min number of tuples here? */ if (vacrelstats->num_dead_tuples > 0) { /* Log cleanup info before we touch indexes */ vacuum_log_cleanup_info(onerel, vacrelstats); /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats); /* Remove tuples from heap */ lazy_vacuum_heap(onerel, vacrelstats); vacrelstats->num_index_scans++; } /* Do post-vacuum cleanup and statistics update for each index */ for (i = 0; i < nindexes; i++) lazy_cleanup_index(Irel[i], indstats[i], vacrelstats); /* If no indexes, make log report that lazy_vacuum_heap would've made */ if (vacuumed_pages) ereport(elevel, (errmsg("\"%s\": removed %.0f row versions in %u pages", RelationGetRelationName(onerel), tups_vacuumed, vacuumed_pages))); ereport(elevel, (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages", RelationGetRelationName(onerel), tups_vacuumed, num_tuples, vacrelstats->scanned_pages, nblocks), errdetail("%.0f dead row versions cannot be removed yet.\n" "There were %.0f unused item pointers.\n" "%u pages are entirely empty.\n" "%s.", nkeep, nunused, empty_pages, pg_rusage_show(&ru0)))); }
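/*
 * Illustration only --- the SKIP_PAGES_THRESHOLD hysteresis described in the
 * comments of lazy_scan_heap() above, shown in isolation.  This is a minimal
 * standalone sketch, not PostgreSQL code: a plain bool array stands in for
 * visibilitymap_test(), SKETCH_SKIP_THRESHOLD for SKIP_PAGES_THRESHOLD, and
 * scan_all for the anti-wraparound case in which nothing may be skipped.
 */
#include <stdbool.h>
#include <stdio.h>

#define SKETCH_SKIP_THRESHOLD 32

static void
sketch_scan(const bool *all_visible, unsigned nblocks, bool scan_all)
{
	unsigned	next_not_all_visible;
	unsigned	blkno;
	bool		skipping;

	/* invariant: next_not_all_visible is the next block that is not all-visible */
	for (next_not_all_visible = 0; next_not_all_visible < nblocks; next_not_all_visible++)
	{
		if (!all_visible[next_not_all_visible])
			break;
	}
	skipping = (next_not_all_visible >= SKETCH_SKIP_THRESHOLD);

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		if (blkno == next_not_all_visible)
		{
			/* re-establish the invariant for the blocks that follow */
			for (next_not_all_visible++; next_not_all_visible < nblocks; next_not_all_visible++)
			{
				if (!all_visible[next_not_all_visible])
					break;
			}
			skipping = (next_not_all_visible - blkno > SKETCH_SKIP_THRESHOLD);
		}
		else if (skipping && !scan_all)
			continue;			/* all-visible block inside a long enough run */

		printf("processing block %u\n", blkno);
	}
}

int
main(void)
{
	bool		vm[100];
	unsigned	i;

	for (i = 0; i < 100; i++)
		vm[i] = (i < 60);		/* one 60-block all-visible run, then ordinary pages */

	sketch_scan(vm, 100, false);	/* blocks 0..59 are skipped, 60..99 are processed */
	return 0;
}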
static void btree_xlog_unlink_page(uint8 info, XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) XLogRecGetData(record); BlockNumber leftsib; BlockNumber rightsib; Buffer buffer; Page page; BTPageOpaque pageop; leftsib = xlrec->leftsib; rightsib = xlrec->rightsib; /* * In normal operation, we would lock all the pages this WAL record * touches before changing any of them. In WAL replay, it should be okay * to lock just one page at a time, since no concurrent index updates can * be happening, and readers should not care whether they arrive at the * target page or not (since it's surely empty). */ /* Fix left-link of right sibling */ if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO) { page = (Page) BufferGetPage(buffer); pageop = (BTPageOpaque) PageGetSpecialPointer(page); pageop->btpo_prev = leftsib; PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); /* Fix right-link of left sibling, if any */ if (leftsib != P_NONE) { if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) { page = (Page) BufferGetPage(buffer); pageop = (BTPageOpaque) PageGetSpecialPointer(page); pageop->btpo_next = rightsib; PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); } /* Rewrite target page as empty deleted page */ buffer = XLogInitBufferForRedo(record, 0); page = (Page) BufferGetPage(buffer); _bt_pageinit(page, BufferGetPageSize(buffer)); pageop = (BTPageOpaque) PageGetSpecialPointer(page); pageop->btpo_prev = leftsib; pageop->btpo_next = rightsib; pageop->btpo.xact = xlrec->btpo_xact; pageop->btpo_flags = BTP_DELETED; pageop->btpo_cycleid = 0; PageSetLSN(page, lsn); MarkBufferDirty(buffer); UnlockReleaseBuffer(buffer); /* * If we deleted a parent of the targeted leaf page, instead of the leaf * itself, update the leaf to point to the next remaining child in the * branch. */ if (XLogRecHasBlockRef(record, 3)) { /* * There is no real data on the page, so we just re-create it from * scratch using the information from the WAL record. */ IndexTupleData trunctuple; buffer = XLogInitBufferForRedo(record, 3); page = (Page) BufferGetPage(buffer); pageop = (BTPageOpaque) PageGetSpecialPointer(page); _bt_pageinit(page, BufferGetPageSize(buffer)); pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF; pageop->btpo_prev = xlrec->leafleftsib; pageop->btpo_next = xlrec->leafrightsib; pageop->btpo.level = 0; pageop->btpo_cycleid = 0; /* Add a dummy hikey item */ MemSet(&trunctuple, 0, sizeof(IndexTupleData)); trunctuple.t_info = sizeof(IndexTupleData); if (xlrec->topparent != InvalidBlockNumber) ItemPointerSet(&trunctuple.t_tid, xlrec->topparent, P_HIKEY); else ItemPointerSetInvalid(&trunctuple.t_tid); if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY, false, false) == InvalidOffsetNumber) elog(ERROR, "could not add dummy high key to half-dead page"); PageSetLSN(page, lsn); MarkBufferDirty(buffer); UnlockReleaseBuffer(buffer); } /* Update metapage if needed */ if (info == XLOG_BTREE_UNLINK_PAGE_META) _bt_restore_meta(record, 4); }
static void btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_btree_mark_page_halfdead *xlrec = (xl_btree_mark_page_halfdead *) XLogRecGetData(record); Buffer buffer; Page page; BTPageOpaque pageop; IndexTupleData trunctuple; /* * In normal operation, we would lock all the pages this WAL record * touches before changing any of them. In WAL replay, it should be okay * to lock just one page at a time, since no concurrent index updates can * be happening, and readers should not care whether they arrive at the * target page or not (since it's surely empty). */ /* parent page */ if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) { OffsetNumber poffset; ItemId itemid; IndexTuple itup; OffsetNumber nextoffset; BlockNumber rightsib; page = (Page) BufferGetPage(buffer); pageop = (BTPageOpaque) PageGetSpecialPointer(page); poffset = xlrec->poffset; nextoffset = OffsetNumberNext(poffset); itemid = PageGetItemId(page, nextoffset); itup = (IndexTuple) PageGetItem(page, itemid); rightsib = ItemPointerGetBlockNumber(&itup->t_tid); itemid = PageGetItemId(page, poffset); itup = (IndexTuple) PageGetItem(page, itemid); ItemPointerSet(&(itup->t_tid), rightsib, P_HIKEY); nextoffset = OffsetNumberNext(poffset); PageIndexTupleDelete(page, nextoffset); PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); /* Rewrite the leaf page as a halfdead page */ buffer = XLogInitBufferForRedo(record, 0); page = (Page) BufferGetPage(buffer); _bt_pageinit(page, BufferGetPageSize(buffer)); pageop = (BTPageOpaque) PageGetSpecialPointer(page); pageop->btpo_prev = xlrec->leftblk; pageop->btpo_next = xlrec->rightblk; pageop->btpo.level = 0; pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF; pageop->btpo_cycleid = 0; /* * Construct a dummy hikey item that points to the next parent to be * deleted (if any). */ MemSet(&trunctuple, 0, sizeof(IndexTupleData)); trunctuple.t_info = sizeof(IndexTupleData); if (xlrec->topparent != InvalidBlockNumber) ItemPointerSet(&trunctuple.t_tid, xlrec->topparent, P_HIKEY); else ItemPointerSetInvalid(&trunctuple.t_tid); if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY, false, false) == InvalidOffsetNumber) elog(ERROR, "could not add dummy high key to half-dead page"); PageSetLSN(page, lsn); MarkBufferDirty(buffer); UnlockReleaseBuffer(buffer); }
/*
 * Get the latestRemovedXid from the heap pages pointed at by the index
 * tuples being deleted. This puts the work for calculating latestRemovedXid
 * into the recovery path rather than the primary path.
 *
 * It's possible that this generates a fair amount of I/O, since an index
 * block may have hundreds of tuples being deleted.  Repeat accesses to the
 * same heap blocks are common, though are not yet optimised.
 *
 * XXX optimise later with something like XLogPrefetchBuffer()
 */
static TransactionId
btree_xlog_delete_get_latestRemovedXid(XLogReaderState *record)
{
	xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
	OffsetNumber *unused;
	Buffer		ibuffer,
				hbuffer;
	Page		ipage,
				hpage;
	RelFileNode rnode;
	BlockNumber blkno;
	ItemId		iitemid,
				hitemid;
	IndexTuple	itup;
	HeapTupleHeader htuphdr;
	BlockNumber hblkno;
	OffsetNumber hoffnum;
	TransactionId latestRemovedXid = InvalidTransactionId;
	int			i;

	/*
	 * If there's nothing running on the standby we don't need to derive a
	 * full latestRemovedXid value, so use a fast path out of here.  This
	 * returns InvalidTransactionId, and so will conflict with all HS
	 * transactions; but since we just worked out that that's zero people,
	 * it's OK.
	 *
	 * XXX There is a race condition here, which is that a new backend might
	 * start just after we look.  If so, it cannot need to conflict, but this
	 * coding will result in throwing a conflict anyway.
	 */
	if (CountDBBackends(InvalidOid) == 0)
		return latestRemovedXid;

	/*
	 * In what follows, we have to examine the previous state of the index
	 * page, as well as the heap page(s) it points to.  This is only valid if
	 * WAL replay has reached a consistent database state; which means that
	 * the preceding check is not just an optimization, but is *necessary*. We
	 * won't have let in any user sessions before we reach consistency.
	 */
	if (!reachedConsistency)
		elog(PANIC, "btree_xlog_delete_get_latestRemovedXid: cannot operate with inconsistent data");

	/*
	 * Get index page.  If the DB is consistent, this should not fail, nor
	 * should any of the heap page fetches below.  If one does, we return
	 * InvalidTransactionId to cancel all HS transactions.  That's probably
	 * overkill, but it's safe, and certainly better than panicking here.
	 */
	XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
	ibuffer = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, RBM_NORMAL);
	if (!BufferIsValid(ibuffer))
		return InvalidTransactionId;
	LockBuffer(ibuffer, BT_READ);
	ipage = (Page) BufferGetPage(ibuffer);

	/*
	 * Loop through the deleted index items to obtain the TransactionId from
	 * the heap items they point to.
	 */
	unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeDelete);

	for (i = 0; i < xlrec->nitems; i++)
	{
		/*
		 * Identify the index tuple about to be deleted
		 */
		iitemid = PageGetItemId(ipage, unused[i]);
		itup = (IndexTuple) PageGetItem(ipage, iitemid);

		/*
		 * Locate the heap page that the index tuple points at
		 */
		hblkno = ItemPointerGetBlockNumber(&(itup->t_tid));
		hbuffer = XLogReadBufferExtended(xlrec->hnode, MAIN_FORKNUM, hblkno,
										 RBM_NORMAL);
		if (!BufferIsValid(hbuffer))
		{
			UnlockReleaseBuffer(ibuffer);
			return InvalidTransactionId;
		}
		LockBuffer(hbuffer, BUFFER_LOCK_SHARE);
		hpage = (Page) BufferGetPage(hbuffer);

		/*
		 * Look up the heap tuple header that the index tuple points at by
		 * using the heap node supplied with the xlrec.  We can't use
		 * heap_fetch, since it uses ReadBuffer rather than XLogReadBuffer.
		 * Note that we are not looking at tuple data here, just headers.
		 */
		hoffnum = ItemPointerGetOffsetNumber(&(itup->t_tid));
		hitemid = PageGetItemId(hpage, hoffnum);

		/*
		 * Follow any redirections until we find something useful.
		 */
		while (ItemIdIsRedirected(hitemid))
		{
			hoffnum = ItemIdGetRedirect(hitemid);
			hitemid = PageGetItemId(hpage, hoffnum);
			CHECK_FOR_INTERRUPTS();
		}

		/*
		 * If the heap item has storage, then read the header and use that to
		 * set latestRemovedXid.
		 *
		 * Some LP_DEAD items may not be accessible, so we ignore them.
		 */
		if (ItemIdHasStorage(hitemid))
		{
			htuphdr = (HeapTupleHeader) PageGetItem(hpage, hitemid);

			HeapTupleHeaderAdvanceLatestRemovedXid(htuphdr, &latestRemovedXid);
		}
		else if (ItemIdIsDead(hitemid))
		{
			/*
			 * Conjecture: if hitemid is dead then it had xids before the xids
			 * marked on LP_NORMAL items. So we just ignore this item and move
			 * onto the next, for the purposes of calculating
			 * latestRemovedxids.
			 */
		}
		else
			Assert(!ItemIdIsUsed(hitemid));

		UnlockReleaseBuffer(hbuffer);
	}

	UnlockReleaseBuffer(ibuffer);

	/*
	 * If all heap tuples were LP_DEAD then we will be returning
	 * InvalidTransactionId here, which avoids conflicts. This matches
	 * existing logic which assumes that LP_DEAD tuples must already be older
	 * than the latestRemovedXid on the cleanup record that set them as
	 * LP_DEAD, hence must already have generated a conflict.
	 */
	return latestRemovedXid;
}
/* * Process an index tuple. Runs the tuple down the tree until we reach a leaf * page or node buffer, and inserts the tuple there. Returns true if we have * to stop buffer emptying process (because one of child buffers can't take * index tuples anymore). */ static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup, BlockNumber startblkno, int startlevel) { GISTSTATE *giststate = buildstate->giststate; GISTBuildBuffers *gfbb = buildstate->gfbb; Relation indexrel = buildstate->indexrel; BlockNumber childblkno; Buffer buffer; bool result = false; BlockNumber blkno; int level; OffsetNumber downlinkoffnum = InvalidOffsetNumber; BlockNumber parentblkno = InvalidBlockNumber; CHECK_FOR_INTERRUPTS(); /* * Loop until we reach a leaf page (level == 0) or a level with buffers * (not including the level we start at, because we would otherwise make * no progress). */ blkno = startblkno; level = startlevel; for (;;) { ItemId iid; IndexTuple idxtuple, newtup; Page page; OffsetNumber childoffnum; /* Have we reached a level with buffers? */ if (LEVEL_HAS_BUFFERS(level, gfbb) && level != startlevel) break; /* Have we reached a leaf page? */ if (level == 0) break; /* * Nope. Descend down to the next level then. Choose a child to * descend down to. */ buffer = ReadBuffer(indexrel, blkno); LockBuffer(buffer, GIST_EXCLUSIVE); page = (Page) BufferGetPage(buffer); childoffnum = gistchoose(indexrel, page, itup, giststate); iid = PageGetItemId(page, childoffnum); idxtuple = (IndexTuple) PageGetItem(page, iid); childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); if (level > 1) gistMemorizeParent(buildstate, childblkno, blkno); /* * Check that the key representing the target child node is consistent * with the key we're inserting. Update it if it's not. */ newtup = gistgetadjusted(indexrel, idxtuple, itup, giststate); if (newtup) { blkno = gistbufferinginserttuples(buildstate, buffer, level, &newtup, 1, childoffnum, InvalidBlockNumber, InvalidOffsetNumber); /* gistbufferinginserttuples() released the buffer */ } else UnlockReleaseBuffer(buffer); /* Descend to the child */ parentblkno = blkno; blkno = childblkno; downlinkoffnum = childoffnum; Assert(level > 0); level--; } if (LEVEL_HAS_BUFFERS(level, gfbb)) { /* * We've reached level with buffers. Place the index tuple to the * buffer, and add the buffer to the emptying queue if it overflows. */ GISTNodeBuffer *childNodeBuffer; /* Find the buffer or create a new one */ childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, blkno, level); /* Add index tuple to it */ gistPushItupToNodeBuffer(gfbb, childNodeBuffer, itup); if (BUFFER_OVERFLOWED(childNodeBuffer, gfbb)) result = true; } else { /* * We've reached a leaf page. Place the tuple here. */ Assert(level == 0); buffer = ReadBuffer(indexrel, blkno); LockBuffer(buffer, GIST_EXCLUSIVE); gistbufferinginserttuples(buildstate, buffer, level, &itup, 1, InvalidOffsetNumber, parentblkno, downlinkoffnum); /* gistbufferinginserttuples() released the buffer */ } return result; }
/* * visibilitymap_truncate - truncate the visibility map * * The caller must hold AccessExclusiveLock on the relation, to ensure that * other backends receive the smgr invalidation event that this function sends * before they access the VM again. * * nheapblocks is the new size of the heap. */ void visibilitymap_truncate(Relation rel, BlockNumber nheapblocks) { BlockNumber newnblocks; /* last remaining block, byte, and bit */ BlockNumber truncBlock = HEAPBLK_TO_MAPBLOCK(nheapblocks); uint32 truncByte = HEAPBLK_TO_MAPBYTE(nheapblocks); uint8 truncBit = HEAPBLK_TO_MAPBIT(nheapblocks); #ifdef TRACE_VISIBILITYMAP elog(DEBUG1, "vm_truncate %s %d", RelationGetRelationName(rel), nheapblocks); #endif RelationOpenSmgr(rel); /* * If no visibility map has been created yet for this relation, there's * nothing to truncate. */ if (!smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM)) return; /* * Unless the new size is exactly at a visibility map page boundary, the * tail bits in the last remaining map page, representing truncated heap * blocks, need to be cleared. This is not only tidy, but also necessary * because we don't get a chance to clear the bits if the heap is extended * again. */ if (truncByte != 0 || truncBit != 0) { Buffer mapBuffer; Page page; char *map; newnblocks = truncBlock + 1; mapBuffer = vm_readbuf(rel, truncBlock, false); if (!BufferIsValid(mapBuffer)) { /* nothing to do, the file was already smaller */ return; } page = BufferGetPage(mapBuffer); map = PageGetContents(page); LockBuffer(mapBuffer, BUFFER_LOCK_EXCLUSIVE); /* Clear out the unwanted bytes. */ MemSet(&map[truncByte + 1], 0, MAPSIZE - (truncByte + 1)); /* * Mask out the unwanted bits of the last remaining byte. * * ((1 << 0) - 1) = 00000000 ((1 << 1) - 1) = 00000001 ... ((1 << 6) - * 1) = 00111111 ((1 << 7) - 1) = 01111111 */ map[truncByte] &= (1 << truncBit) - 1; MarkBufferDirty(mapBuffer); UnlockReleaseBuffer(mapBuffer); } else newnblocks = truncBlock; if (smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM) <= newnblocks) { /* nothing to do, the file was already smaller than requested size */ return; } /* Truncate the unused VM pages, and send smgr inval message */ smgrtruncate(rel->rd_smgr, VISIBILITYMAP_FORKNUM, newnblocks); /* * We might as well update the local smgr_vm_nblocks setting. smgrtruncate * sent an smgr cache inval message, which will cause other backends to * invalidate their copy of smgr_vm_nblocks, and this one too at the next * command boundary. But this ensures it isn't outright wrong until then. */ if (rel->rd_smgr) rel->rd_smgr->smgr_vm_nblocks = newnblocks; }
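/*
 * Illustrative sketch, not part of the original source: the address
 * arithmetic behind HEAPBLK_TO_MAPBLOCK/MAPBYTE/MAPBIT and the tail-byte
 * mask used in visibilitymap_truncate() above.  It assumes one visibility
 * bit per heap block, as in the code above; TOY_MAPSIZE and the variable
 * names are stand-ins, not the real macros.
 */
#include <stdio.h>

#define TOY_MAPSIZE        8168    /* assumed usable bytes per VM page */
#define TOY_BITS_PER_BYTE  8
#define TOY_BITS_PER_PAGE  (TOY_MAPSIZE * TOY_BITS_PER_BYTE)

int
main(void)
{
    unsigned    nheapblocks = 100000;   /* new heap size, in blocks */

    unsigned    truncBlock = nheapblocks / TOY_BITS_PER_PAGE;                      /* VM page */
    unsigned    truncByte = (nheapblocks % TOY_BITS_PER_PAGE) / TOY_BITS_PER_BYTE; /* byte within page */
    unsigned    truncBit = nheapblocks % TOY_BITS_PER_BYTE;                        /* bit within byte */

    /* keep only the bits describing heap blocks below nheapblocks */
    unsigned    mask = (1u << truncBit) - 1;

    printf("map block %u, byte %u, bit %u, tail mask 0x%02x\n",
           truncBlock, truncByte, truncBit, mask);
    return 0;
}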
/* * Insert tuples to a given page. * * This is analogous with gistinserttuples() in the regular insertion code. * * Returns the block number of the page where the (first) new or updated tuple * was inserted. Usually that's the original page, but might be a sibling page * if the original page was split. * * Caller should hold a lock on 'buffer' on entry. This function will unlock * and unpin it. */ static BlockNumber gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level, IndexTuple *itup, int ntup, OffsetNumber oldoffnum, BlockNumber parentblk, OffsetNumber downlinkoffnum) { GISTBuildBuffers *gfbb = buildstate->gfbb; List *splitinfo; bool is_split; BlockNumber placed_to_blk = InvalidBlockNumber; is_split = gistplacetopage(buildstate->indexrel, buildstate->freespace, buildstate->giststate, buffer, itup, ntup, oldoffnum, &placed_to_blk, InvalidBuffer, &splitinfo, false); /* * If this is a root split, update the root path item kept in memory. This * ensures that all path stacks are always complete, including all parent * nodes up to the root. That simplifies the algorithm to re-find correct * parent. */ if (is_split && BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO) { Page page = BufferGetPage(buffer); OffsetNumber off; OffsetNumber maxoff; Assert(level == gfbb->rootlevel); gfbb->rootlevel++; elog(DEBUG2, "splitting GiST root page, now %d levels deep", gfbb->rootlevel); /* * All the downlinks on the old root page are now on one of the child * pages. Visit all the new child pages to memorize the parents of the * grandchildren. */ if (gfbb->rootlevel > 1) { maxoff = PageGetMaxOffsetNumber(page); for (off = FirstOffsetNumber; off <= maxoff; off++) { ItemId iid = PageGetItemId(page, off); IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); Buffer childbuf = ReadBuffer(buildstate->indexrel, childblkno); LockBuffer(childbuf, GIST_SHARE); gistMemorizeAllDownlinks(buildstate, childbuf); UnlockReleaseBuffer(childbuf); /* * Also remember that the parent of the new child page is the * root block. */ gistMemorizeParent(buildstate, childblkno, GIST_ROOT_BLKNO); } } } if (splitinfo) { /* * Insert the downlinks to the parent. This is analogous with * gistfinishsplit() in the regular insertion code, but the locking is * simpler, and we have to maintain the buffers on internal nodes and * the parent map. */ IndexTuple *downlinks; int ndownlinks, i; Buffer parentBuffer; ListCell *lc; /* Parent may have changed since we memorized this path. */ parentBuffer = gistBufferingFindCorrectParent(buildstate, BufferGetBlockNumber(buffer), level, &parentblk, &downlinkoffnum); /* * If there's a buffer associated with this page, that needs to be * split too. gistRelocateBuildBuffersOnSplit() will also adjust the * downlinks in 'splitinfo', to make sure they're consistent not only * with the tuples already on the pages, but also the tuples in the * buffers that will eventually be inserted to them. */ gistRelocateBuildBuffersOnSplit(gfbb, buildstate->giststate, buildstate->indexrel, level, buffer, splitinfo); /* Create an array of all the downlink tuples */ ndownlinks = list_length(splitinfo); downlinks = (IndexTuple *) palloc(sizeof(IndexTuple) * ndownlinks); i = 0; foreach(lc, splitinfo) { GISTPageSplitInfo *splitinfo = lfirst(lc); /* * Remember the parent of each new child page in our parent map. * This assumes that the downlinks fit on the parent page. 
If the * parent page is split, too, when we recurse up to insert the * downlinks, the recursive gistbufferinginserttuples() call will * update the map again. */ if (level > 0) gistMemorizeParent(buildstate, BufferGetBlockNumber(splitinfo->buf), BufferGetBlockNumber(parentBuffer)); /* * Also update the parent map for all the downlinks that got moved * to a different page. (actually this also loops through the * downlinks that stayed on the original page, but it does no * harm). */ if (level > 1) gistMemorizeAllDownlinks(buildstate, splitinfo->buf); /* * Since there's no concurrent access, we can release the lower * level buffers immediately. This includes the original page. */ UnlockReleaseBuffer(splitinfo->buf); downlinks[i++] = splitinfo->downlink; } /* Insert them into parent. */ gistbufferinginserttuples(buildstate, parentBuffer, level + 1, downlinks, ndownlinks, downlinkoffnum, InvalidBlockNumber, InvalidOffsetNumber); list_free_deep(splitinfo); /* we don't need this anymore */ }
/* * Place tuples from 'itup' to 'buffer'. If 'oldoffnum' is valid, the tuple * at that offset is atomically removed along with inserting the new tuples. * This is used to replace a tuple with a new one. * * If 'leftchildbuf' is valid, we're inserting the downlink for the page * to the right of 'leftchildbuf', or updating the downlink for 'leftchildbuf'. * F_FOLLOW_RIGHT flag on 'leftchildbuf' is cleared and NSN is set. * * If 'markfollowright' is true and the page is split, the left child is * marked with F_FOLLOW_RIGHT flag. That is the normal case. During buffered * index build, however, there is no concurrent access and the page splitting * is done in a slightly simpler fashion, and false is passed. * * If there is not enough room on the page, it is split. All the split * pages are kept pinned and locked and returned in *splitinfo, the caller * is responsible for inserting the downlinks for them. However, if * 'buffer' is the root page and it needs to be split, gistplacetopage() * performs the split as one atomic operation, and *splitinfo is set to NIL. * In that case, we continue to hold the root page locked, and the child * pages are released; note that new tuple(s) are *not* on the root page * but in one of the new child pages. * * If 'newblkno' is not NULL, returns the block number of page the first * new/updated tuple was inserted to. Usually it's the given page, but could * be its right sibling if the page was split. * * Returns 'true' if the page was split, 'false' otherwise. */ bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate, Buffer buffer, IndexTuple *itup, int ntup, OffsetNumber oldoffnum, BlockNumber *newblkno, Buffer leftchildbuf, List **splitinfo, bool markfollowright) { BlockNumber blkno = BufferGetBlockNumber(buffer); Page page = BufferGetPage(buffer); bool is_leaf = (GistPageIsLeaf(page)) ? true : false; XLogRecPtr recptr; int i; bool is_split; /* * Refuse to modify a page that's incompletely split. This should not * happen because we finish any incomplete splits while we walk down the * tree. However, it's remotely possible that another concurrent inserter * splits a parent page, and errors out before completing the split. We * will just throw an error in that case, and leave any split we had in * progress unfinished too. The next insert that comes along will clean up * the mess. */ if (GistFollowRight(page)) elog(ERROR, "concurrent GiST page split was incomplete"); *splitinfo = NIL; /* * if isupdate, remove old key: This node's key has been modified, either * because a child split occurred or because we needed to adjust our key * for an insert in a child node. Therefore, remove the old version of * this node's key. * * for WAL replay, in the non-split case we handle this by setting up a * one-element todelete array; in the split case, it's handled implicitly * because the tuple vector passed to gistSplit won't include this tuple. */ is_split = gistnospace(page, itup, ntup, oldoffnum, freespace); if (is_split) { /* no space for insertion */ IndexTuple *itvec; int tlen; SplitedPageLayout *dist = NULL, *ptr; BlockNumber oldrlink = InvalidBlockNumber; GistNSN oldnsn = 0; SplitedPageLayout rootpg; bool is_rootsplit; int npage; is_rootsplit = (blkno == GIST_ROOT_BLKNO); /* * Form index tuples vector to split. If we're replacing an old tuple, * remove the old version from the vector. 
*/ itvec = gistextractpage(page, &tlen); if (OffsetNumberIsValid(oldoffnum)) { /* on inner page we should remove old tuple */ int pos = oldoffnum - FirstOffsetNumber; tlen--; if (pos != tlen) memmove(itvec + pos, itvec + pos + 1, sizeof(IndexTuple) * (tlen - pos)); } itvec = gistjoinvector(itvec, &tlen, itup, ntup); dist = gistSplit(rel, page, itvec, tlen, giststate); /* * Check that split didn't produce too many pages. */ npage = 0; for (ptr = dist; ptr; ptr = ptr->next) npage++; /* in a root split, we'll add one more page to the list below */ if (is_rootsplit) npage++; if (npage > GIST_MAX_SPLIT_PAGES) elog(ERROR, "GiST page split into too many halves (%d, maximum %d)", npage, GIST_MAX_SPLIT_PAGES); /* * Set up pages to work with. Allocate new buffers for all but the * leftmost page. The original page becomes the new leftmost page, and * is just replaced with the new contents. * * For a root-split, allocate new buffers for all child pages, the * original page is overwritten with new root page containing * downlinks to the new child pages. */ ptr = dist; if (!is_rootsplit) { /* save old rightlink and NSN */ oldrlink = GistPageGetOpaque(page)->rightlink; oldnsn = GistPageGetNSN(page); dist->buffer = buffer; dist->block.blkno = BufferGetBlockNumber(buffer); dist->page = PageGetTempPageCopySpecial(BufferGetPage(buffer)); /* clean all flags except F_LEAF */ GistPageGetOpaque(dist->page)->flags = (is_leaf) ? F_LEAF : 0; ptr = ptr->next; } for (; ptr; ptr = ptr->next) { /* Allocate new page */ ptr->buffer = gistNewBuffer(rel); GISTInitBuffer(ptr->buffer, (is_leaf) ? F_LEAF : 0); ptr->page = BufferGetPage(ptr->buffer); ptr->block.blkno = BufferGetBlockNumber(ptr->buffer); } /* * Now that we know which blocks the new pages go to, set up downlink * tuples to point to them. */ for (ptr = dist; ptr; ptr = ptr->next) { ItemPointerSetBlockNumber(&(ptr->itup->t_tid), ptr->block.blkno); GistTupleSetValid(ptr->itup); } /* * If this is a root split, we construct the new root page with the * downlinks here directly, instead of requiring the caller to insert * them. Add the new root page to the list along with the child pages. */ if (is_rootsplit) { IndexTuple *downlinks; int ndownlinks = 0; int i; rootpg.buffer = buffer; rootpg.page = PageGetTempPageCopySpecial(BufferGetPage(rootpg.buffer)); GistPageGetOpaque(rootpg.page)->flags = 0; /* Prepare a vector of all the downlinks */ for (ptr = dist; ptr; ptr = ptr->next) ndownlinks++; downlinks = palloc(sizeof(IndexTuple) * ndownlinks); for (i = 0, ptr = dist; ptr; ptr = ptr->next) downlinks[i++] = ptr->itup; rootpg.block.blkno = GIST_ROOT_BLKNO; rootpg.block.num = ndownlinks; rootpg.list = gistfillitupvec(downlinks, ndownlinks, &(rootpg.lenlist)); rootpg.itup = NULL; rootpg.next = dist; dist = &rootpg; } else { /* Prepare split-info to be returned to caller */ for (ptr = dist; ptr; ptr = ptr->next) { GISTPageSplitInfo *si = palloc(sizeof(GISTPageSplitInfo)); si->buf = ptr->buffer; si->downlink = ptr->itup; *splitinfo = lappend(*splitinfo, si); } } /* * Fill all pages. All the pages are new, ie. freshly allocated empty * pages, or a temporary copy of the old page. 
*/ for (ptr = dist; ptr; ptr = ptr->next) { char *data = (char *) (ptr->list); for (i = 0; i < ptr->block.num; i++) { IndexTuple thistup = (IndexTuple) data; if (PageAddItem(ptr->page, (Item) data, IndexTupleSize(thistup), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(rel)); /* * If this is the first inserted/updated tuple, let the caller * know which page it landed on. */ if (newblkno && ItemPointerEquals(&thistup->t_tid, &(*itup)->t_tid)) *newblkno = ptr->block.blkno; data += IndexTupleSize(thistup); } /* Set up rightlinks */ if (ptr->next && ptr->block.blkno != GIST_ROOT_BLKNO) GistPageGetOpaque(ptr->page)->rightlink = ptr->next->block.blkno; else GistPageGetOpaque(ptr->page)->rightlink = oldrlink; /* * Mark the all but the right-most page with the follow-right * flag. It will be cleared as soon as the downlink is inserted * into the parent, but this ensures that if we error out before * that, the index is still consistent. (in buffering build mode, * any error will abort the index build anyway, so this is not * needed.) */ if (ptr->next && !is_rootsplit && markfollowright) GistMarkFollowRight(ptr->page); else GistClearFollowRight(ptr->page); /* * Copy the NSN of the original page to all pages. The * F_FOLLOW_RIGHT flags ensure that scans will follow the * rightlinks until the downlinks are inserted. */ GistPageSetNSN(ptr->page, oldnsn); } START_CRIT_SECTION(); /* * Must mark buffers dirty before XLogInsert, even though we'll still * be changing their opaque fields below. */ for (ptr = dist; ptr; ptr = ptr->next) MarkBufferDirty(ptr->buffer); if (BufferIsValid(leftchildbuf)) MarkBufferDirty(leftchildbuf); /* * The first page in the chain was a temporary working copy meant to * replace the old page. Copy it over the old page. */ PageRestoreTempPage(dist->page, BufferGetPage(dist->buffer)); dist->page = BufferGetPage(dist->buffer); /* Write the WAL record */ if (RelationNeedsWAL(rel)) recptr = gistXLogSplit(rel->rd_node, blkno, is_leaf, dist, oldrlink, oldnsn, leftchildbuf, markfollowright); else recptr = gistGetFakeLSN(rel); for (ptr = dist; ptr; ptr = ptr->next) { PageSetLSN(ptr->page, recptr); } /* * Return the new child buffers to the caller. * * If this was a root split, we've already inserted the downlink * pointers, in the form of a new root page. Therefore we can release * all the new buffers, and keep just the root page locked. */ if (is_rootsplit) { for (ptr = dist->next; ptr; ptr = ptr->next) UnlockReleaseBuffer(ptr->buffer); } } else { /* * Enough space. We also get here if ntuples==0. */ START_CRIT_SECTION(); if (OffsetNumberIsValid(oldoffnum)) PageIndexTupleDelete(page, oldoffnum); gistfillbuffer(page, itup, ntup, InvalidOffsetNumber); MarkBufferDirty(buffer); if (BufferIsValid(leftchildbuf)) MarkBufferDirty(leftchildbuf); if (RelationNeedsWAL(rel)) { OffsetNumber ndeloffs = 0, deloffs[1]; if (OffsetNumberIsValid(oldoffnum)) { deloffs[0] = oldoffnum; ndeloffs = 1; } recptr = gistXLogUpdate(rel->rd_node, buffer, deloffs, ndeloffs, itup, ntup, leftchildbuf); PageSetLSN(page, recptr); } else { recptr = gistGetFakeLSN(rel); PageSetLSN(page, recptr); } if (newblkno) *newblkno = blkno; } /* * If we inserted the downlink for a child page, set NSN and clear * F_FOLLOW_RIGHT flag on the left child, so that concurrent scans know to * follow the rightlink if and only if they looked at the parent page * before we inserted the downlink. * * Note that we do this *after* writing the WAL record. 
That means that * the possible full page image in the WAL record does not include these * changes, and they must be replayed even if the page is restored from * the full page image. There's a chicken-and-egg problem: if we updated * the child pages first, we wouldn't know the recptr of the WAL record * we're about to write. */ if (BufferIsValid(leftchildbuf)) { Page leftpg = BufferGetPage(leftchildbuf); GistPageSetNSN(leftpg, recptr); GistClearFollowRight(leftpg); PageSetLSN(leftpg, recptr); } END_CRIT_SECTION(); return is_split; }
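/*
 * Illustrative sketch, not part of the original source: the idiom the split
 * path above uses to drop the old tuple from the extracted vector -- remove
 * element 'pos' from an array of 'len' pointers by shifting the tail down
 * with memmove().  toy_remove_at() is a hypothetical helper.
 */
#include <stdio.h>
#include <string.h>

/* Remove the element at 'pos'; returns the new length. */
static int
toy_remove_at(const char **vec, int len, int pos)
{
    len--;
    if (pos != len)
        memmove(vec + pos, vec + pos + 1, sizeof(const char *) * (len - pos));
    return len;
}

int
main(void)
{
    const char *vec[] = {"a", "b", "c", "d"};
    int         len = 4;

    len = toy_remove_at(vec, len, 1);   /* drop "b" */
    for (int i = 0; i < len; i++)
        printf("%s ", vec[i]);          /* a c d */
    printf("\n");
    return 0;
}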
/* * Build an SP-GiST index. */ Datum spgbuild(PG_FUNCTION_ARGS) { Relation heap = (Relation) PG_GETARG_POINTER(0); Relation index = (Relation) PG_GETARG_POINTER(1); IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2); IndexBuildResult *result; double reltuples; SpGistBuildState buildstate; Buffer metabuffer, rootbuffer; if (RelationGetNumberOfBlocks(index) != 0) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); /* * Initialize the meta page and root page */ metabuffer = SpGistNewBuffer(index); rootbuffer = SpGistNewBuffer(index); Assert(BufferGetBlockNumber(metabuffer) == SPGIST_METAPAGE_BLKNO); Assert(BufferGetBlockNumber(rootbuffer) == SPGIST_HEAD_BLKNO); START_CRIT_SECTION(); SpGistInitMetapage(BufferGetPage(metabuffer)); MarkBufferDirty(metabuffer); SpGistInitBuffer(rootbuffer, SPGIST_LEAF); MarkBufferDirty(rootbuffer); if (RelationNeedsWAL(index)) { XLogRecPtr recptr; XLogRecData rdata; /* WAL data is just the relfilenode */ rdata.data = (char *) &(index->rd_node); rdata.len = sizeof(RelFileNode); rdata.buffer = InvalidBuffer; rdata.next = NULL; recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_CREATE_INDEX, &rdata); PageSetLSN(BufferGetPage(metabuffer), recptr); PageSetTLI(BufferGetPage(metabuffer), ThisTimeLineID); PageSetLSN(BufferGetPage(rootbuffer), recptr); PageSetTLI(BufferGetPage(rootbuffer), ThisTimeLineID); } END_CRIT_SECTION(); UnlockReleaseBuffer(metabuffer); UnlockReleaseBuffer(rootbuffer); /* * Now insert all the heap data into the index */ initSpGistState(&buildstate.spgstate, index); buildstate.spgstate.isBuild = true; buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext, "SP-GiST build temporary context", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); reltuples = IndexBuildHeapScan(heap, index, indexInfo, true, spgistBuildCallback, (void *) &buildstate); MemoryContextDelete(buildstate.tmpCtx); SpGistUpdateMetaPage(index); result = (IndexBuildResult *) palloc0(sizeof(IndexBuildResult)); result->heap_tuples = result->index_tuples = reltuples; PG_RETURN_POINTER(result); }
/* * Workhouse routine for doing insertion into a GiST index. Note that * this routine assumes it is invoked in a short-lived memory context, * so it does not bother releasing palloc'd allocations. */ void gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate) { ItemId iid; IndexTuple idxtuple; GISTInsertStack firststack; GISTInsertStack *stack; GISTInsertState state; bool xlocked = false; memset(&state, 0, sizeof(GISTInsertState)); state.freespace = freespace; state.r = r; /* Start from the root */ firststack.blkno = GIST_ROOT_BLKNO; firststack.lsn = 0; firststack.parent = NULL; firststack.downlinkoffnum = InvalidOffsetNumber; state.stack = stack = &firststack; /* * Walk down along the path of smallest penalty, updating the parent * pointers with the key we're inserting as we go. If we crash in the * middle, the tree is consistent, although the possible parent updates * were a waste. */ for (;;) { if (XLogRecPtrIsInvalid(stack->lsn)) stack->buffer = ReadBuffer(state.r, stack->blkno); /* * Be optimistic and grab shared lock first. Swap it for an exclusive * lock later if we need to update the page. */ if (!xlocked) { LockBuffer(stack->buffer, GIST_SHARE); gistcheckpage(state.r, stack->buffer); } stack->page = (Page) BufferGetPage(stack->buffer); stack->lsn = PageGetLSN(stack->page); Assert(!RelationNeedsWAL(state.r) || !XLogRecPtrIsInvalid(stack->lsn)); /* * If this page was split but the downlink was never inserted to the * parent because the inserting backend crashed before doing that, fix * that now. */ if (GistFollowRight(stack->page)) { if (!xlocked) { LockBuffer(stack->buffer, GIST_UNLOCK); LockBuffer(stack->buffer, GIST_EXCLUSIVE); xlocked = true; /* someone might've completed the split when we unlocked */ if (!GistFollowRight(stack->page)) continue; } gistfixsplit(&state, giststate); UnlockReleaseBuffer(stack->buffer); xlocked = false; state.stack = stack = stack->parent; continue; } if (stack->blkno != GIST_ROOT_BLKNO && stack->parent->lsn < GistPageGetNSN(stack->page)) { /* * Concurrent split detected. There's no guarantee that the * downlink for this page is consistent with the tuple we're * inserting anymore, so go back to parent and rechoose the best * child. */ UnlockReleaseBuffer(stack->buffer); xlocked = false; state.stack = stack = stack->parent; continue; } if (!GistPageIsLeaf(stack->page)) { /* * This is an internal page so continue to walk down the tree. * Find the child node that has the minimum insertion penalty. */ BlockNumber childblkno; IndexTuple newtup; GISTInsertStack *item; OffsetNumber downlinkoffnum; downlinkoffnum = gistchoose(state.r, stack->page, itup, giststate); iid = PageGetItemId(stack->page, downlinkoffnum); idxtuple = (IndexTuple) PageGetItem(stack->page, iid); childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); /* * Check that it's not a leftover invalid tuple from pre-9.1 */ if (GistTupleIsInvalid(idxtuple)) ereport(ERROR, (errmsg("index \"%s\" contains an inner tuple marked as invalid", RelationGetRelationName(r)), errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."), errhint("Please REINDEX it."))); /* * Check that the key representing the target child node is * consistent with the key we're inserting. Update it if it's not. */ newtup = gistgetadjusted(state.r, idxtuple, itup, giststate); if (newtup) { /* * Swap shared lock for an exclusive one. Beware, the page may * change while we unlock/lock the page... 
*/ if (!xlocked) { LockBuffer(stack->buffer, GIST_UNLOCK); LockBuffer(stack->buffer, GIST_EXCLUSIVE); xlocked = true; stack->page = (Page) BufferGetPage(stack->buffer); if (PageGetLSN(stack->page) != stack->lsn) { /* the page was changed while we unlocked it, retry */ continue; } } /* * Update the tuple. * * We still hold the lock after gistinserttuple(), but it * might have to split the page to make the updated tuple fit. * In that case the updated tuple might migrate to the other * half of the split, so we have to go back to the parent and * descend back to the half that's a better fit for the new * tuple. */ if (gistinserttuple(&state, stack, giststate, newtup, downlinkoffnum)) { /* * If this was a root split, the root page continues to be * the parent and the updated tuple went to one of the * child pages, so we just need to retry from the root * page. */ if (stack->blkno != GIST_ROOT_BLKNO) { UnlockReleaseBuffer(stack->buffer); xlocked = false; state.stack = stack = stack->parent; } continue; } } LockBuffer(stack->buffer, GIST_UNLOCK); xlocked = false; /* descend to the chosen child */ item = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); item->blkno = childblkno; item->parent = stack; item->downlinkoffnum = downlinkoffnum; state.stack = stack = item; } else { /* * Leaf page. Insert the new key. We've already updated all the * parents on the way down, but we might have to split the page if * it doesn't fit. gistinserthere() will take care of that. */ /* * Swap shared lock for an exclusive one. Be careful, the page may * change while we unlock/lock the page... */ if (!xlocked) { LockBuffer(stack->buffer, GIST_UNLOCK); LockBuffer(stack->buffer, GIST_EXCLUSIVE); xlocked = true; stack->page = (Page) BufferGetPage(stack->buffer); stack->lsn = PageGetLSN(stack->page); if (stack->blkno == GIST_ROOT_BLKNO) { /* * the only page that can become inner instead of leaf is * the root page, so for root we should recheck it */ if (!GistPageIsLeaf(stack->page)) { /* * very rare situation: during unlock/lock index with * number of pages = 1 was increased */ LockBuffer(stack->buffer, GIST_UNLOCK); xlocked = false; continue; } /* * we don't need to check root split, because checking * leaf/inner is enough to recognize split for root */ } else if (GistFollowRight(stack->page) || stack->parent->lsn < GistPageGetNSN(stack->page)) { /* * The page was split while we momentarily unlocked the * page. Go back to parent. */ UnlockReleaseBuffer(stack->buffer); xlocked = false; state.stack = stack = stack->parent; continue; } } /* now state.stack->(page, buffer and blkno) points to leaf page */ gistinserttuple(&state, stack, giststate, itup, InvalidOffsetNumber); LockBuffer(stack->buffer, GIST_UNLOCK); /* Release any pins we might still hold before exiting */ for (; stack; stack = stack->parent) ReleaseBuffer(stack->buffer); break; } } }
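/*
 * Illustrative sketch, not part of the original source: the optimistic
 * locking pattern gistdoinsert() relies on above -- read under a shared
 * lock, remember the page LSN, and when an exclusive lock turns out to be
 * needed, drop the shared lock, re-acquire exclusively, and re-check the
 * LSN; a changed LSN means the page was modified while unlocked and the
 * decision must be redone.  ToyPage and the toy_* stubs are hypothetical
 * stand-ins for Buffer and LockBuffer().
 */
#include <stdbool.h>
#include <stdint.h>

typedef struct ToyPage
{
    uint64_t    lsn;            /* bumped on every modification */
} ToyPage;

static void toy_lock_shared(ToyPage *p)    { (void) p; }
static void toy_lock_exclusive(ToyPage *p) { (void) p; }
static void toy_unlock(ToyPage *p)         { (void) p; }

/*
 * Returns true if the page is unchanged since 'lsn_seen' and is now held
 * exclusively; false means the caller must re-read the page and retry.
 */
static bool
toy_upgrade_and_recheck(ToyPage *p, uint64_t lsn_seen)
{
    toy_unlock(p);              /* shared -> exclusive cannot be done atomically */
    toy_lock_exclusive(p);
    return p->lsn == lsn_seen;
}

int
main(void)
{
    ToyPage     page = {.lsn = 42};
    uint64_t    seen;

    toy_lock_shared(&page);
    seen = page.lsn;            /* decision made while holding the shared lock */

    if (!toy_upgrade_and_recheck(&page, seen))
    {
        /* page changed while unlocked: re-read it and redo the decision */
    }
    toy_unlock(&page);
    return 0;
}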
/* * Fetch local cache of AM-specific info about the index, initializing it * if necessary */ SpGistCache * spgGetCache(Relation index) { SpGistCache *cache; if (index->rd_amcache == NULL) { Oid atttype; spgConfigIn in; FmgrInfo *procinfo; Buffer metabuffer; SpGistMetaPageData *metadata; cache = MemoryContextAllocZero(index->rd_indexcxt, sizeof(SpGistCache)); /* SPGiST doesn't support multi-column indexes */ Assert(index->rd_att->natts == 1); /* * Get the actual data type of the indexed column from the index * tupdesc. We pass this to the opclass config function so that * polymorphic opclasses are possible. */ atttype = index->rd_att->attrs[0]->atttypid; /* Call the config function to get config info for the opclass */ in.attType = atttype; procinfo = index_getprocinfo(index, 1, SPGIST_CONFIG_PROC); FunctionCall2Coll(procinfo, index->rd_indcollation[0], PointerGetDatum(&in), PointerGetDatum(&cache->config)); /* Get the information we need about each relevant datatype */ fillTypeDesc(&cache->attType, atttype); fillTypeDesc(&cache->attPrefixType, cache->config.prefixType); fillTypeDesc(&cache->attLabelType, cache->config.labelType); /* Last, get the lastUsedPages data from the metapage */ metabuffer = ReadBuffer(index, SPGIST_METAPAGE_BLKNO); LockBuffer(metabuffer, BUFFER_LOCK_SHARE); metadata = SpGistPageGetMeta(BufferGetPage(metabuffer)); if (metadata->magicNumber != SPGIST_MAGIC_NUMBER) elog(ERROR, "index \"%s\" is not an SP-GiST index", RelationGetRelationName(index)); cache->lastUsedPages = metadata->lastUsedPages; UnlockReleaseBuffer(metabuffer); index->rd_amcache = (void *) cache; } else { /* assume it's up to date */ cache = (SpGistCache *) index->rd_amcache; } return cache; }
/* * Traverse the tree to find path from root page to specified "child" block. * * returns a new insertion stack, starting from the parent of "child", up * to the root. *downlinkoffnum is set to the offset of the downlink in the * direct parent of child. * * To prevent deadlocks, this should lock only one page at a time. */ static GISTInsertStack * gistFindPath(Relation r, BlockNumber child, OffsetNumber *downlinkoffnum) { Page page; Buffer buffer; OffsetNumber i, maxoff; ItemId iid; IndexTuple idxtuple; List *fifo; GISTInsertStack *top, *ptr; BlockNumber blkno; top = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); top->blkno = GIST_ROOT_BLKNO; top->downlinkoffnum = InvalidOffsetNumber; fifo = list_make1(top); while (fifo != NIL) { /* Get next page to visit */ top = linitial(fifo); fifo = list_delete_first(fifo); buffer = ReadBuffer(r, top->blkno); LockBuffer(buffer, GIST_SHARE); gistcheckpage(r, buffer); page = (Page) BufferGetPage(buffer); if (GistPageIsLeaf(page)) { /* * Because we scan the index top-down, all the rest of the pages * in the queue must be leaf pages as well. */ UnlockReleaseBuffer(buffer); break; } top->lsn = PageGetLSN(page); /* * If F_FOLLOW_RIGHT is set, the page to the right doesn't have a * downlink. This should not normally happen.. */ if (GistFollowRight(page)) elog(ERROR, "concurrent GiST page split was incomplete"); if (top->parent && top->parent->lsn < GistPageGetNSN(page) && GistPageGetOpaque(page)->rightlink != InvalidBlockNumber /* sanity check */ ) { /* * Page was split while we looked elsewhere. We didn't see the * downlink to the right page when we scanned the parent, so add * it to the queue now. * * Put the right page ahead of the queue, so that we visit it * next. That's important, because if this is the lowest internal * level, just above leaves, we might already have queued up some * leaf pages, and we assume that there can't be any non-leaf * pages behind leaf pages. */ ptr = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); ptr->blkno = GistPageGetOpaque(page)->rightlink; ptr->downlinkoffnum = InvalidOffsetNumber; ptr->parent = top->parent; fifo = lcons(ptr, fifo); } maxoff = PageGetMaxOffsetNumber(page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { iid = PageGetItemId(page, i); idxtuple = (IndexTuple) PageGetItem(page, iid); blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); if (blkno == child) { /* Found it! */ UnlockReleaseBuffer(buffer); *downlinkoffnum = i; return top; } else { /* Append this child to the list of pages to visit later */ ptr = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); ptr->blkno = blkno; ptr->downlinkoffnum = i; ptr->parent = top; fifo = lappend(fifo, ptr); } } UnlockReleaseBuffer(buffer); } elog(ERROR, "failed to re-find parent of a page in index \"%s\", block %u", RelationGetRelationName(r), child); return NULL; /* keep compiler quiet */ }
/* * redo any page update (except page split) */ static void gistRedoPageUpdateRecord(XLogRecPtr lsn, XLogRecord *record) { char *begin = XLogRecGetData(record); gistxlogPageUpdate *xldata = (gistxlogPageUpdate *) begin; Buffer buffer; Page page; char *data; /* * We need to acquire and hold lock on target page while updating the left * child page. If we have a full-page image of target page, getting the * lock is a side-effect of restoring that image. Note that even if the * target page no longer exists, we'll still attempt to replay the change * on the child page. */ if (record->xl_info & XLR_BKP_BLOCK(0)) buffer = RestoreBackupBlock(lsn, record, 0, false, true); else buffer = XLogReadBuffer(xldata->node, xldata->blkno, false); /* Fix follow-right data on left child page */ if (BlockNumberIsValid(xldata->leftchild)) gistRedoClearFollowRight(lsn, record, 1, xldata->node, xldata->leftchild); /* Done if target page no longer exists */ if (!BufferIsValid(buffer)) return; /* nothing more to do if page was backed up (and no info to do it with) */ if (record->xl_info & XLR_BKP_BLOCK(0)) { UnlockReleaseBuffer(buffer); return; } page = (Page) BufferGetPage(buffer); /* nothing more to do if change already applied */ if (lsn <= PageGetLSN(page)) { UnlockReleaseBuffer(buffer); return; } data = begin + sizeof(gistxlogPageUpdate); /* Delete old tuples */ if (xldata->ntodelete > 0) { int i; OffsetNumber *todelete = (OffsetNumber *) data; data += sizeof(OffsetNumber) * xldata->ntodelete; for (i = 0; i < xldata->ntodelete; i++) PageIndexTupleDelete(page, todelete[i]); if (GistPageIsLeaf(page)) GistMarkTuplesDeleted(page); } /* add tuples */ if (data - begin < record->xl_len) { OffsetNumber off = (PageIsEmpty(page)) ? FirstOffsetNumber : OffsetNumberNext(PageGetMaxOffsetNumber(page)); while (data - begin < record->xl_len) { IndexTuple itup = (IndexTuple) data; Size sz = IndexTupleSize(itup); OffsetNumber l; data += sz; l = PageAddItem(page, (Item) itup, sz, off, false, false); if (l == InvalidOffsetNumber) elog(ERROR, "failed to add item to GiST index page, size %d bytes", (int) sz); off++; } } else { /* * special case: leafpage, nothing to insert, nothing to delete, then * vacuum marks page */ if (GistPageIsLeaf(page) && xldata->ntodelete == 0) GistClearTuplesDeleted(page); } if (!GistPageIsLeaf(page) && PageGetMaxOffsetNumber(page) == InvalidOffsetNumber && xldata->blkno == GIST_ROOT_BLKNO) { /* * all links on non-leaf root page was deleted by vacuum full, so root * page becomes a leaf */ GistPageSetLeaf(page); } GistPageGetOpaque(page)->rightlink = InvalidBlockNumber; PageSetLSN(page, lsn); MarkBufferDirty(buffer); UnlockReleaseBuffer(buffer); }
/* * Updates the stack so that child->parent is the correct parent of the * child. child->parent must be exclusively locked on entry, and will * remain so at exit, but it might not be the same page anymore. */ static void gistFindCorrectParent(Relation r, GISTInsertStack *child) { GISTInsertStack *parent = child->parent; gistcheckpage(r, parent->buffer); parent->page = (Page) BufferGetPage(parent->buffer); /* here we don't need to distinguish between split and page update */ if (child->downlinkoffnum == InvalidOffsetNumber || parent->lsn != PageGetLSN(parent->page)) { /* the parent has changed; follow rightlinks until we find the child */ OffsetNumber i, maxoff; ItemId iid; IndexTuple idxtuple; GISTInsertStack *ptr; while (true) { maxoff = PageGetMaxOffsetNumber(parent->page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { iid = PageGetItemId(parent->page, i); idxtuple = (IndexTuple) PageGetItem(parent->page, iid); if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno) { /* found it */ child->downlinkoffnum = i; return; } } parent->blkno = GistPageGetOpaque(parent->page)->rightlink; UnlockReleaseBuffer(parent->buffer); if (parent->blkno == InvalidBlockNumber) { /* * End of chain and we still didn't find the parent. This is a * very rare situation, which can happen when the root was split. */ break; } parent->buffer = ReadBuffer(r, parent->blkno); LockBuffer(parent->buffer, GIST_EXCLUSIVE); gistcheckpage(r, parent->buffer); parent->page = (Page) BufferGetPage(parent->buffer); } /* * Worst case: we have to search the whole tree to re-find the parent, * but first release all the old parent buffers. */ ptr = child->parent->parent; /* child->parent already released * above */ while (ptr) { ReleaseBuffer(ptr->buffer); ptr = ptr->parent; } /* ok, find new path */ ptr = parent = gistFindPath(r, child->blkno, &child->downlinkoffnum); /* read all buffers as expected by caller */ /* note we don't lock them or gistcheckpage them here! */ while (ptr) { ptr->buffer = ReadBuffer(r, ptr->blkno); ptr->page = (Page) BufferGetPage(ptr->buffer); ptr = ptr->parent; } /* install new chain of parents to stack */ child->parent = parent; /* make recursive call to normal processing */ LockBuffer(child->parent->buffer, GIST_EXCLUSIVE); gistFindCorrectParent(r, child); } return; }
/* * Bulk deletion of all index entries pointing to a set of heap tuples. * The set of target tuples is specified via a callback routine that tells * whether any given heap tuple (identified by ItemPointer) is being deleted. * * Result: a palloc'd struct containing statistical info for VACUUM displays. */ IndexBulkDeleteResult * blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state) { Relation index = info->index; BlockNumber blkno, npages; FreeBlockNumberArray notFullPage; int countPage = 0; BloomState state; Buffer buffer; Page page; GenericXLogState *gxlogState; if (stats == NULL) stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); initBloomState(&state, index); /* * Iterate over the pages. We don't care about concurrently added pages, * they can't contain tuples to delete. */ npages = RelationGetNumberOfBlocks(index); for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++) { BloomTuple *itup, *itupPtr, *itupEnd; buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); gxlogState = GenericXLogStart(index); page = GenericXLogRegisterBuffer(gxlogState, buffer, 0); if (BloomPageIsDeleted(page)) { UnlockReleaseBuffer(buffer); GenericXLogAbort(gxlogState); CHECK_FOR_INTERRUPTS(); continue; } /* Iterate over the tuples */ itup = itupPtr = BloomPageGetTuple(&state, page, FirstOffsetNumber); itupEnd = BloomPageGetTuple(&state, page, OffsetNumberNext(BloomPageGetMaxOffset(page))); while (itup < itupEnd) { /* Do we have to delete this tuple? */ if (callback(&itup->heapPtr, callback_state)) { stats->tuples_removed += 1; BloomPageGetOpaque(page)->maxoff--; } else { if (itupPtr != itup) { /* * If we have already deleted something, we have to move * this tuple backwards. */ memmove((Pointer) itupPtr, (Pointer) itup, state.sizeOfBloomTuple); } stats->num_index_tuples++; itupPtr = BloomPageGetNextTuple(&state, itupPtr); } itup = BloomPageGetNextTuple(&state, itup); } Assert(itupPtr == BloomPageGetTuple(&state, page, OffsetNumberNext(BloomPageGetMaxOffset(page)))); /* * Add the page to the notFullPage list if we will not mark it as * deleted and there is free space on it. */ if (BloomPageGetMaxOffset(page) != 0 && BloomPageGetFreeSpace(&state, page) > state.sizeOfBloomTuple && countPage < BloomMetaBlockN) notFullPage[countPage++] = blkno; /* Did we delete something? */ if (itupPtr != itup) { /* Is the page empty now? */ if (BloomPageGetMaxOffset(page) == 0) BloomPageSetDeleted(page); /* Adjust pd_lower */ ((PageHeader) page)->pd_lower = (Pointer) itupPtr - page; /* Finish WAL-logging */ GenericXLogFinish(gxlogState); } else { /* Didn't change anything: abort WAL-logging */ GenericXLogAbort(gxlogState); } UnlockReleaseBuffer(buffer); CHECK_FOR_INTERRUPTS(); } if (countPage > 0) { BloomMetaPageData *metaData; buffer = ReadBuffer(index, BLOOM_METAPAGE_BLKNO); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); gxlogState = GenericXLogStart(index); page = GenericXLogRegisterBuffer(gxlogState, buffer, 0); metaData = BloomPageGetMeta(page); memcpy(metaData->notFullPage, notFullPage, sizeof(BlockNumber) * countPage); metaData->nStart = 0; metaData->nEnd = countPage; GenericXLogFinish(gxlogState); UnlockReleaseBuffer(buffer); } return stats; }
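/*
 * Illustrative sketch, not part of the original source: the in-place
 * compaction idiom blbulkdelete() uses above -- walk an array of fixed-size
 * records with a read pointer and a write pointer, memmove() the survivors
 * down over the deleted ones, and shrink the logical length afterwards.
 * ToyTuple, toy_keep() and toy_compact() are hypothetical.
 */
#include <stdio.h>
#include <string.h>

typedef struct ToyTuple
{
    unsigned    heap_ptr;       /* stand-in for the indexed TID */
} ToyTuple;

static int
toy_keep(const ToyTuple *t)
{
    return (t->heap_ptr % 2) == 0;      /* pretend odd TIDs are being deleted */
}

/* Compact 'tuples' in place, returning the new count of live tuples. */
static int
toy_compact(ToyTuple *tuples, int ntuples)
{
    ToyTuple   *itup = tuples;
    ToyTuple   *itupPtr = tuples;
    ToyTuple   *itupEnd = tuples + ntuples;

    while (itup < itupEnd)
    {
        if (toy_keep(itup))
        {
            if (itupPtr != itup)
                memmove(itupPtr, itup, sizeof(ToyTuple));
            itupPtr++;
        }
        itup++;
    }
    return (int) (itupPtr - tuples);
}

int
main(void)
{
    ToyTuple    tuples[] = {{1}, {2}, {3}, {4}, {6}};
    int         n = toy_compact(tuples, 5);

    for (int i = 0; i < n; i++)
        printf("%u ", tuples[i].heap_ptr);
    printf("\n");               /* 2 4 6 */
    return 0;
}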
Datum ginbuild(PG_FUNCTION_ARGS) { Relation heap = (Relation) PG_GETARG_POINTER(0); Relation index = (Relation) PG_GETARG_POINTER(1); IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2); IndexBuildResult *result; double reltuples; GinBuildState buildstate; Buffer RootBuffer, MetaBuffer; ItemPointerData *list; Datum key; GinNullCategory category; uint32 nlist; MemoryContext oldCtx; OffsetNumber attnum; if (RelationGetNumberOfBlocks(index) != 0) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); initGinState(&buildstate.ginstate, index); buildstate.indtuples = 0; memset(&buildstate.buildStats, 0, sizeof(GinStatsData)); /* initialize the meta page */ MetaBuffer = GinNewBuffer(index); /* initialize the root page */ RootBuffer = GinNewBuffer(index); START_CRIT_SECTION(); GinInitMetabuffer(MetaBuffer); MarkBufferDirty(MetaBuffer); GinInitBuffer(RootBuffer, GIN_LEAF); MarkBufferDirty(RootBuffer); if (RelationNeedsWAL(index)) { XLogRecPtr recptr; XLogRecData rdata; Page page; rdata.buffer = InvalidBuffer; rdata.data = (char *) &(index->rd_node); rdata.len = sizeof(RelFileNode); rdata.next = NULL; recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_INDEX, &rdata); page = BufferGetPage(RootBuffer); PageSetLSN(page, recptr); page = BufferGetPage(MetaBuffer); PageSetLSN(page, recptr); } UnlockReleaseBuffer(MetaBuffer); UnlockReleaseBuffer(RootBuffer); END_CRIT_SECTION(); /* count the root as first entry page */ buildstate.buildStats.nEntryPages++; /* * create a temporary memory context that is reset once for each tuple * inserted into the index */ buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext, "Gin build temporary context", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); buildstate.funcCtx = AllocSetContextCreate(buildstate.tmpCtx, "Gin build temporary context for user-defined function", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); buildstate.accum.ginstate = &buildstate.ginstate; ginInitBA(&buildstate.accum); /* * Do the heap scan. We disallow sync scan here because dataPlaceToPage * prefers to receive tuples in TID order. */ reltuples = IndexBuildHeapScan(heap, index, indexInfo, false, ginBuildCallback, (void *) &buildstate); /* dump remaining entries to the index */ oldCtx = MemoryContextSwitchTo(buildstate.tmpCtx); ginBeginBAScan(&buildstate.accum); while ((list = ginGetBAEntry(&buildstate.accum, &attnum, &key, &category, &nlist)) != NULL) { /* there could be many entries, so be willing to abort here */ CHECK_FOR_INTERRUPTS(); ginEntryInsert(&buildstate.ginstate, attnum, key, category, list, nlist, &buildstate.buildStats); } MemoryContextSwitchTo(oldCtx); MemoryContextDelete(buildstate.tmpCtx); /* * Update metapage stats */ buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index); ginUpdateStats(index, &buildstate.buildStats); /* * Return statistics */ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; PG_RETURN_POINTER(result); }
static void spgRedoAddNode(XLogRecPtr lsn, XLogRecord *record) { char *ptr = XLogRecGetData(record); spgxlogAddNode *xldata = (spgxlogAddNode *) ptr; SpGistInnerTuple innerTuple; SpGistState state; Buffer buffer; Page page; int bbi; /* we assume this is adequately aligned */ ptr += sizeof(spgxlogAddNode); innerTuple = (SpGistInnerTuple) ptr; fillFakeState(&state, xldata->stateSrc); if (xldata->blknoNew == InvalidBlockNumber) { /* update in place */ Assert(xldata->blknoParent == InvalidBlockNumber); if (!(record->xl_info & XLR_BKP_BLOCK_1)) { buffer = XLogReadBuffer(xldata->node, xldata->blkno, false); if (BufferIsValid(buffer)) { page = BufferGetPage(buffer); if (!XLByteLE(lsn, PageGetLSN(page))) { PageIndexTupleDelete(page, xldata->offnum); if (PageAddItem(page, (Item) innerTuple, innerTuple->size, xldata->offnum, false, false) != xldata->offnum) elog(ERROR, "failed to add item of size %u to SPGiST index page", innerTuple->size); PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); MarkBufferDirty(buffer); } UnlockReleaseBuffer(buffer); } } } else { /* Install new tuple first so redirect is valid */ if (!(record->xl_info & XLR_BKP_BLOCK_2)) { buffer = XLogReadBuffer(xldata->node, xldata->blknoNew, xldata->newPage); if (BufferIsValid(buffer)) { page = BufferGetPage(buffer); if (xldata->newPage) SpGistInitBuffer(buffer, 0); if (!XLByteLE(lsn, PageGetLSN(page))) { addOrReplaceTuple(page, (Item) innerTuple, innerTuple->size, xldata->offnumNew); PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); MarkBufferDirty(buffer); } UnlockReleaseBuffer(buffer); } } /* Delete old tuple, replacing it with redirect or placeholder tuple */ if (!(record->xl_info & XLR_BKP_BLOCK_1)) { buffer = XLogReadBuffer(xldata->node, xldata->blkno, false); if (BufferIsValid(buffer)) { page = BufferGetPage(buffer); if (!XLByteLE(lsn, PageGetLSN(page))) { SpGistDeadTuple dt; if (state.isBuild) dt = spgFormDeadTuple(&state, SPGIST_PLACEHOLDER, InvalidBlockNumber, InvalidOffsetNumber); else dt = spgFormDeadTuple(&state, SPGIST_REDIRECT, xldata->blknoNew, xldata->offnumNew); PageIndexTupleDelete(page, xldata->offnum); if (PageAddItem(page, (Item) dt, dt->size, xldata->offnum, false, false) != xldata->offnum) elog(ERROR, "failed to add item of size %u to SPGiST index page", dt->size); if (state.isBuild) SpGistPageGetOpaque(page)->nPlaceholder++; else SpGistPageGetOpaque(page)->nRedirection++; PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); MarkBufferDirty(buffer); } UnlockReleaseBuffer(buffer); } } /* * Update parent downlink. Since parent could be in either of the * previous two buffers, it's a bit tricky to determine which BKP bit * applies. */ if (xldata->blknoParent == xldata->blkno) bbi = 0; else if (xldata->blknoParent == xldata->blknoNew) bbi = 1; else bbi = 2; if (!(record->xl_info & XLR_SET_BKP_BLOCK(bbi))) { buffer = XLogReadBuffer(xldata->node, xldata->blknoParent, false); if (BufferIsValid(buffer)) { page = BufferGetPage(buffer); if (!XLByteLE(lsn, PageGetLSN(page))) { SpGistInnerTuple innerTuple; innerTuple = (SpGistInnerTuple) PageGetItem(page, PageGetItemId(page, xldata->offnumParent)); spgUpdateNodeLink(innerTuple, xldata->nodeI, xldata->blknoNew, xldata->offnumNew); PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); MarkBufferDirty(buffer); } UnlockReleaseBuffer(buffer); } } } }
/* * Main entry point to GiST index build. Initially calls insert over and over, * but switches to more efficient buffering build algorithm after a certain * number of tuples (unless buffering mode is disabled). */ Datum gistbuild(PG_FUNCTION_ARGS) { Relation heap = (Relation) PG_GETARG_POINTER(0); Relation index = (Relation) PG_GETARG_POINTER(1); IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2); IndexBuildResult *result; double reltuples; GISTBuildState buildstate; Buffer buffer; Page page; MemoryContext oldcxt = CurrentMemoryContext; int fillfactor; buildstate.indexrel = index; if (index->rd_options) { /* Get buffering mode from the options string */ GiSTOptions *options = (GiSTOptions *) index->rd_options; char *bufferingMode = (char *) options + options->bufferingModeOffset; if (strcmp(bufferingMode, "on") == 0) buildstate.bufferingMode = GIST_BUFFERING_STATS; else if (strcmp(bufferingMode, "off") == 0) buildstate.bufferingMode = GIST_BUFFERING_DISABLED; else buildstate.bufferingMode = GIST_BUFFERING_AUTO; fillfactor = options->fillfactor; } else { /* * By default, switch to buffering mode when the index grows too large * to fit in cache. */ buildstate.bufferingMode = GIST_BUFFERING_AUTO; fillfactor = GIST_DEFAULT_FILLFACTOR; } /* Calculate target amount of free space to leave on pages */ buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100; /* * We expect to be called exactly once for any index relation. If that's * not the case, big trouble's what we have. */ if (RelationGetNumberOfBlocks(index) != 0) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); /* * We can't yet handle unlogged GiST indexes, because we depend on LSNs. * This is duplicative of an error in gistbuildempty, but we want to check * here so as to throw error before doing all the index-build work. */ if (heap->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("unlogged GiST indexes are not supported"))); /* no locking is needed */ buildstate.giststate = initGISTstate(index); /* * Create a temporary memory context that is reset once for each tuple * processed. (Note: we don't bother to make this a child of the * giststate's scanCxt, so we have to delete it separately at the end.) */ buildstate.giststate->tempCxt = createTempGistContext(); /* initialize the root page */ buffer = gistNewBuffer(index); Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO); page = BufferGetPage(buffer); START_CRIT_SECTION(); GISTInitBuffer(buffer, F_LEAF); MarkBufferDirty(buffer); if (RelationNeedsWAL(index)) { XLogRecPtr recptr; XLogRecData rdata; rdata.data = (char *) &(index->rd_node); rdata.len = sizeof(RelFileNode); rdata.buffer = InvalidBuffer; rdata.next = NULL; recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_CREATE_INDEX, &rdata); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } else PageSetLSN(page, GetXLogRecPtrForTemp()); UnlockReleaseBuffer(buffer); END_CRIT_SECTION(); /* build the index */ buildstate.indtuples = 0; buildstate.indtuplesSize = 0; /* * Do the heap scan. */ reltuples = IndexBuildHeapScan(heap, index, indexInfo, true, gistBuildCallback, (void *) &buildstate); /* * If buffering was used, flush out all the tuples that are still in the * buffers. 
*/ if (buildstate.bufferingMode == GIST_BUFFERING_ACTIVE) { elog(DEBUG1, "all tuples processed, emptying buffers"); gistEmptyAllBuffers(&buildstate); gistFreeBuildBuffers(buildstate.gfbb); } /* okay, all heap tuples are indexed */ MemoryContextSwitchTo(oldcxt); MemoryContextDelete(buildstate.giststate->tempCxt); freeGISTstate(buildstate.giststate); /* * Return statistics */ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); result->heap_tuples = reltuples; result->index_tuples = (double) buildstate.indtuples; PG_RETURN_POINTER(result); }
static void spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record) { char *ptr = XLogRecGetData(record); spgxlogPickSplit *xldata = (spgxlogPickSplit *) ptr; SpGistInnerTuple innerTuple; SpGistState state; OffsetNumber *toDelete; OffsetNumber *toInsert; uint8 *leafPageSelect; Buffer srcBuffer; Buffer destBuffer; Page page; int bbi; int i; fillFakeState(&state, xldata->stateSrc); ptr += MAXALIGN(sizeof(spgxlogPickSplit)); innerTuple = (SpGistInnerTuple) ptr; ptr += innerTuple->size; toDelete = (OffsetNumber *) ptr; ptr += MAXALIGN(sizeof(OffsetNumber) * xldata->nDelete); toInsert = (OffsetNumber *) ptr; ptr += MAXALIGN(sizeof(OffsetNumber) * xldata->nInsert); leafPageSelect = (uint8 *) ptr; ptr += MAXALIGN(sizeof(uint8) * xldata->nInsert); /* now ptr points to the list of leaf tuples */ /* * It's a bit tricky to identify which pages have been handled as * full-page images, so we explicitly count each referenced buffer. */ bbi = 0; if (xldata->blknoSrc == SPGIST_HEAD_BLKNO) { /* when splitting root, we touch it only in the guise of new inner */ srcBuffer = InvalidBuffer; } else if (xldata->initSrc) { /* just re-init the source page */ srcBuffer = XLogReadBuffer(xldata->node, xldata->blknoSrc, true); Assert(BufferIsValid(srcBuffer)); page = (Page) BufferGetPage(srcBuffer); SpGistInitBuffer(srcBuffer, SPGIST_LEAF); /* don't update LSN etc till we're done with it */ } else { /* delete the specified tuples from source page */ if (!(record->xl_info & XLR_SET_BKP_BLOCK(bbi))) { srcBuffer = XLogReadBuffer(xldata->node, xldata->blknoSrc, false); if (BufferIsValid(srcBuffer)) { page = BufferGetPage(srcBuffer); if (!XLByteLE(lsn, PageGetLSN(page))) { /* * We have it a bit easier here than in doPickSplit(), * because we know the inner tuple's location already, * so we can inject the correct redirection tuple now. */ if (!state.isBuild) spgPageIndexMultiDelete(&state, page, toDelete, xldata->nDelete, SPGIST_REDIRECT, SPGIST_PLACEHOLDER, xldata->blknoInner, xldata->offnumInner); else spgPageIndexMultiDelete(&state, page, toDelete, xldata->nDelete, SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER, InvalidBlockNumber, InvalidOffsetNumber); /* don't update LSN etc till we're done with it */ } } } else srcBuffer = InvalidBuffer; bbi++; } /* try to access dest page if any */ if (xldata->blknoDest == InvalidBlockNumber) { destBuffer = InvalidBuffer; } else if (xldata->initDest) { /* just re-init the dest page */ destBuffer = XLogReadBuffer(xldata->node, xldata->blknoDest, true); Assert(BufferIsValid(destBuffer)); page = (Page) BufferGetPage(destBuffer); SpGistInitBuffer(destBuffer, SPGIST_LEAF); /* don't update LSN etc till we're done with it */ } else { if (!(record->xl_info & XLR_SET_BKP_BLOCK(bbi))) destBuffer = XLogReadBuffer(xldata->node, xldata->blknoDest, false); else destBuffer = InvalidBuffer; bbi++; } /* restore leaf tuples to src and/or dest page */ for (i = 0; i < xldata->nInsert; i++) { SpGistLeafTuple lt = (SpGistLeafTuple) ptr; Buffer leafBuffer; ptr += lt->size; leafBuffer = leafPageSelect[i] ? 
destBuffer : srcBuffer; if (!BufferIsValid(leafBuffer)) continue; /* no need to touch this page */ page = BufferGetPage(leafBuffer); if (!XLByteLE(lsn, PageGetLSN(page))) { addOrReplaceTuple(page, (Item) lt, lt->size, toInsert[i]); } } /* Now update src and dest page LSNs */ if (BufferIsValid(srcBuffer)) { page = BufferGetPage(srcBuffer); if (!XLByteLE(lsn, PageGetLSN(page))) { PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); MarkBufferDirty(srcBuffer); } UnlockReleaseBuffer(srcBuffer); } if (BufferIsValid(destBuffer)) { page = BufferGetPage(destBuffer); if (!XLByteLE(lsn, PageGetLSN(page))) { PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); MarkBufferDirty(destBuffer); } UnlockReleaseBuffer(destBuffer); } /* restore new inner tuple */ if (!(record->xl_info & XLR_SET_BKP_BLOCK(bbi))) { Buffer buffer = XLogReadBuffer(xldata->node, xldata->blknoInner, xldata->initInner); if (BufferIsValid(buffer)) { page = BufferGetPage(buffer); if (xldata->initInner) SpGistInitBuffer(buffer, 0); if (!XLByteLE(lsn, PageGetLSN(page))) { addOrReplaceTuple(page, (Item) innerTuple, innerTuple->size, xldata->offnumInner); /* if inner is also parent, update link while we're here */ if (xldata->blknoInner == xldata->blknoParent) { SpGistInnerTuple parent; parent = (SpGistInnerTuple) PageGetItem(page, PageGetItemId(page, xldata->offnumParent)); spgUpdateNodeLink(parent, xldata->nodeI, xldata->blknoInner, xldata->offnumInner); } PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); MarkBufferDirty(buffer); } UnlockReleaseBuffer(buffer); } } bbi++; /* update parent downlink, unless we did it above */ if (xldata->blknoParent == InvalidBlockNumber) { /* no parent cause we split the root */ Assert(xldata->blknoInner == SPGIST_HEAD_BLKNO); } else if (xldata->blknoInner != xldata->blknoParent) { if (!(record->xl_info & XLR_SET_BKP_BLOCK(bbi))) { Buffer buffer = XLogReadBuffer(xldata->node, xldata->blknoParent, false); if (BufferIsValid(buffer)) { page = BufferGetPage(buffer); if (!XLByteLE(lsn, PageGetLSN(page))) { SpGistInnerTuple parent; parent = (SpGistInnerTuple) PageGetItem(page, PageGetItemId(page, xldata->offnumParent)); spgUpdateNodeLink(parent, xldata->nodeI, xldata->blknoInner, xldata->offnumInner); PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); MarkBufferDirty(buffer); } UnlockReleaseBuffer(buffer); } } } }
static void btree_xlog_split(bool onleft, bool isroot, XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record); bool isleaf = (xlrec->level == 0); Buffer lbuf; Buffer rbuf; Page rpage; BTPageOpaque ropaque; char *datapos; Size datalen; Item left_hikey = NULL; Size left_hikeysz = 0; BlockNumber leftsib; BlockNumber rightsib; BlockNumber rnext; XLogRecGetBlockTag(record, 0, NULL, NULL, &leftsib); XLogRecGetBlockTag(record, 1, NULL, NULL, &rightsib); if (!XLogRecGetBlockTag(record, 2, NULL, NULL, &rnext)) rnext = P_NONE; /* * Clear the incomplete split flag on the left sibling of the child page * this is a downlink for. (Like in btree_xlog_insert, this can be done * before locking the other pages) */ if (!isleaf) _bt_clear_incomplete_split(record, 3); /* Reconstruct right (new) sibling page from scratch */ rbuf = XLogInitBufferForRedo(record, 1); datapos = XLogRecGetBlockData(record, 1, &datalen); rpage = (Page) BufferGetPage(rbuf); _bt_pageinit(rpage, BufferGetPageSize(rbuf)); ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage); ropaque->btpo_prev = leftsib; ropaque->btpo_next = rnext; ropaque->btpo.level = xlrec->level; ropaque->btpo_flags = isleaf ? BTP_LEAF : 0; ropaque->btpo_cycleid = 0; _bt_restore_page(rpage, datapos, datalen); /* * On leaf level, the high key of the left page is equal to the first key * on the right page. */ if (isleaf) { ItemId hiItemId = PageGetItemId(rpage, P_FIRSTDATAKEY(ropaque)); left_hikey = PageGetItem(rpage, hiItemId); left_hikeysz = ItemIdGetLength(hiItemId); } PageSetLSN(rpage, lsn); MarkBufferDirty(rbuf); /* don't release the buffer yet; we touch right page's first item below */ /* Now reconstruct left (original) sibling page */ if (XLogReadBufferForRedo(record, 0, &lbuf) == BLK_NEEDS_REDO) { /* * To retain the same physical order of the tuples that they had, we * initialize a temporary empty page for the left page and add all the * items to that in item number order. This mirrors how _bt_split() * works. It's not strictly required to retain the same physical * order, as long as the items are in the correct item number order, * but it helps debugging. See also _bt_restore_page(), which does * the same for the right page.
*/ Page lpage = (Page) BufferGetPage(lbuf); BTPageOpaque lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage); OffsetNumber off; Item newitem = NULL; Size newitemsz = 0; Page newlpage; OffsetNumber leftoff; datapos = XLogRecGetBlockData(record, 0, &datalen); if (onleft) { newitem = (Item) datapos; newitemsz = MAXALIGN(IndexTupleSize(newitem)); datapos += newitemsz; datalen -= newitemsz; } /* Extract left hikey and its size (assuming 16-bit alignment) */ if (!isleaf) { left_hikey = (Item) datapos; left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey)); datapos += left_hikeysz; datalen -= left_hikeysz; } Assert(datalen == 0); newlpage = PageGetTempPageCopySpecial(lpage); /* Set high key */ leftoff = P_HIKEY; if (PageAddItem(newlpage, left_hikey, left_hikeysz, P_HIKEY, false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add high key to left page after split"); leftoff = OffsetNumberNext(leftoff); for (off = P_FIRSTDATAKEY(lopaque); off < xlrec->firstright; off++) { ItemId itemid; Size itemsz; Item item; /* add the new item if it was inserted on left page */ if (onleft && off == xlrec->newitemoff) { if (PageAddItem(newlpage, newitem, newitemsz, leftoff, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add new item to left page after split"); leftoff = OffsetNumberNext(leftoff); } itemid = PageGetItemId(lpage, off); itemsz = ItemIdGetLength(itemid); item = PageGetItem(lpage, itemid); if (PageAddItem(newlpage, item, itemsz, leftoff, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add old item to left page after split"); leftoff = OffsetNumberNext(leftoff); } /* cope with possibility that newitem goes at the end */ if (onleft && off == xlrec->newitemoff) { if (PageAddItem(newlpage, newitem, newitemsz, leftoff, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add new item to left page after split"); leftoff = OffsetNumberNext(leftoff); } PageRestoreTempPage(newlpage, lpage); /* Fix opaque fields */ lopaque->btpo_flags = BTP_INCOMPLETE_SPLIT; if (isleaf) lopaque->btpo_flags |= BTP_LEAF; lopaque->btpo_next = rightsib; lopaque->btpo_cycleid = 0; PageSetLSN(lpage, lsn); MarkBufferDirty(lbuf); } /* We no longer need the buffers */ if (BufferIsValid(lbuf)) UnlockReleaseBuffer(lbuf); UnlockReleaseBuffer(rbuf); /* * Fix left-link of the page to the right of the new right sibling. * * Note: in normal operation, we do this while still holding lock on the * two split pages. However, that's not necessary for correctness in WAL * replay, because no other index update can be in progress, and readers * will cope properly when following an obsolete left-link. */ if (rnext != P_NONE) { Buffer buffer; if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO) { Page page = (Page) BufferGetPage(buffer); BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page); pageop->btpo_prev = rightsib; PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); } }
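/*
 * Illustrative sketch, not part of the original source: the rebuild-into-a-
 * temporary-copy technique used above for the left half of the split -- copy
 * the items that stay (plus the new item at its offset) into a fresh array in
 * order and then swap it for the original, instead of deleting items in
 * place.  The toy_* names are hypothetical.
 */
#include <stdio.h>
#include <string.h>

#define TOY_MAXITEMS 16

/*
 * Build 'dst' from the first 'firstright' items of 'src', inserting 'newitem'
 * at offset 'newoff' when 'onleft' is set; returns the number of items in dst.
 */
static int
toy_rebuild_left(const int *src, int nsrc, int firstright,
                 int onleft, int newoff, int newitem, int *dst)
{
    int         ndst = 0;
    int         off;

    for (off = 0; off < firstright && off < nsrc; off++)
    {
        if (onleft && off == newoff)
            dst[ndst++] = newitem;
        dst[ndst++] = src[off];
    }
    if (onleft && off == newoff)        /* new item goes at the end */
        dst[ndst++] = newitem;
    return ndst;
}

int
main(void)
{
    int         src[] = {10, 20, 30, 40, 50, 60};
    int         dst[TOY_MAXITEMS];
    int         n;

    /* keep the first four items on the "left page", new item 25 at offset 2 */
    n = toy_rebuild_left(src, 6, 4, 1, 2, 25, dst);
    memcpy(src, dst, sizeof(int) * n);  /* analogous to PageRestoreTempPage() */

    for (int i = 0; i < n; i++)
        printf("%d ", src[i]);          /* 10 20 25 30 40 */
    printf("\n");
    return 0;
}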