/* * _bt_pagedel() -- Delete a page from the b-tree, if legal to do so. * * This action unlinks the page from the b-tree structure, removing all * pointers leading to it --- but not touching its own left and right links. * The page cannot be physically reclaimed right away, since other processes * may currently be trying to follow links leading to the page; they have to * be allowed to use its right-link to recover. See nbtree/README. * * On entry, the target buffer must be pinned and locked (either read or write * lock is OK). This lock and pin will be dropped before exiting. * * The "stack" argument can be a search stack leading (approximately) to the * target page, or NULL --- outside callers typically pass NULL since they * have not done such a search, but internal recursion cases pass the stack * to avoid duplicated search effort. * * Returns the number of pages successfully deleted (zero if page cannot * be deleted now; could be more than one if parent pages were deleted too). * * NOTE: this leaks memory. Rather than trying to clean up everything * carefully, it's better to run it in a temp context that can be reset * frequently. */ int _bt_pagedel(Relation rel, Buffer buf, BTStack stack) { int result; BlockNumber target, leftsib, rightsib, parent; OffsetNumber poffset, maxoff; uint32 targetlevel, ilevel; ItemId itemid; IndexTuple targetkey, itup; ScanKey itup_scankey; Buffer lbuf, rbuf, pbuf; bool parent_half_dead; bool parent_one_child; bool rightsib_empty; Buffer metabuf = InvalidBuffer; Page metapg = NULL; BTMetaPageData *metad = NULL; Page page; BTPageOpaque opaque; /* * We can never delete rightmost pages nor root pages. While at it, check * that page is not already deleted and is empty. */ page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) || P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page)) { /* Should never fail to delete a half-dead page */ Assert(!P_ISHALFDEAD(opaque)); _bt_relbuf(rel, buf); return 0; } /* * Save info about page, including a copy of its high key (it must have * one, being non-rightmost). */ target = BufferGetBlockNumber(buf); targetlevel = opaque->btpo.level; leftsib = opaque->btpo_prev; itemid = PageGetItemId(page, P_HIKEY); targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid)); /* * To avoid deadlocks, we'd better drop the target page lock before going * further. */ _bt_relbuf(rel, buf); /* * We need an approximate pointer to the page's parent page. We use the * standard search mechanism to search for the page's high key; this will * give us a link to either the current parent or someplace to its left * (if there are multiple equal high keys). In recursion cases, the * caller already generated a search stack and we can just re-use that * work. */ if (stack == NULL) { if (!InRecovery) { /* we need an insertion scan key to do our search, so build one */ itup_scankey = _bt_mkscankey(rel, targetkey); /* find the leftmost leaf page containing this key */ stack = _bt_search(rel, rel->rd_rel->relnatts, itup_scankey, false, &lbuf, BT_READ); /* don't need a pin on that either */ _bt_relbuf(rel, lbuf); /* * If we are trying to delete an interior page, _bt_search did * more than we needed. Locate the stack item pointing to our * parent level. */ ilevel = 0; for (;;) { if (stack == NULL) elog(ERROR, "not enough stack items"); if (ilevel == targetlevel) break; stack = stack->bts_parent; ilevel++; } } else { /* * During WAL recovery, we can't use _bt_search (for one reason, * it might invoke user-defined comparison functions that expect * facilities not available in recovery mode). Instead, just set * up a dummy stack pointing to the left end of the parent tree * level, from which _bt_getstackbuf will walk right to the parent * page. Painful, but we don't care too much about performance in * this scenario. */ pbuf = _bt_get_endpoint(rel, targetlevel + 1, false); stack = (BTStack) palloc(sizeof(BTStackData)); stack->bts_blkno = BufferGetBlockNumber(pbuf); stack->bts_offset = InvalidOffsetNumber; /* bts_btentry will be initialized below */ stack->bts_parent = NULL; _bt_relbuf(rel, pbuf); } } /* * We cannot delete a page that is the rightmost child of its immediate * parent, unless it is the only child --- in which case the parent has to * be deleted too, and the same condition applies recursively to it. We * have to check this condition all the way up before trying to delete. We * don't need to re-test when deleting a non-leaf page, though. */ if (targetlevel == 0 && !_bt_parent_deletion_safe(rel, target, stack)) return 0; /* * We have to lock the pages we need to modify in the standard order: * moving right, then up. Else we will deadlock against other writers. * * So, we need to find and write-lock the current left sibling of the * target page. The sibling that was current a moment ago could have * split, so we may have to move right. This search could fail if either * the sibling or the target page was deleted by someone else meanwhile; * if so, give up. (Right now, that should never happen, since page * deletion is only done in VACUUM and there shouldn't be multiple VACUUMs * concurrently on the same table.) */ if (leftsib != P_NONE) { lbuf = _bt_getbuf(rel, leftsib, BT_WRITE); page = BufferGetPage(lbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); while (P_ISDELETED(opaque) || opaque->btpo_next != target) { /* step right one page */ leftsib = opaque->btpo_next; _bt_relbuf(rel, lbuf); if (leftsib == P_NONE) { elog(LOG, "no left sibling (concurrent deletion?) in \"%s\"", RelationGetRelationName(rel)); return 0; } lbuf = _bt_getbuf(rel, leftsib, BT_WRITE); page = BufferGetPage(lbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); } } else lbuf = InvalidBuffer; /* * Next write-lock the target page itself. It should be okay to take just * a write lock not a superexclusive lock, since no scans would stop on an * empty page. */ buf = _bt_getbuf(rel, target, BT_WRITE); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); /* * Check page is still empty etc, else abandon deletion. The empty check * is necessary since someone else might have inserted into it while we * didn't have it locked; the others are just for paranoia's sake. */ if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) || P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page)) { _bt_relbuf(rel, buf); if (BufferIsValid(lbuf)) _bt_relbuf(rel, lbuf); return 0; } if (opaque->btpo_prev != leftsib) elog(ERROR, "left link changed unexpectedly in block %u of index \"%s\"", target, RelationGetRelationName(rel)); /* * And next write-lock the (current) right sibling. */ rightsib = opaque->btpo_next; rbuf = _bt_getbuf(rel, rightsib, BT_WRITE); page = BufferGetPage(rbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (opaque->btpo_prev != target) elog(ERROR, "right sibling's left-link doesn't match: " "block %u links to %u instead of expected %u in index \"%s\"", rightsib, opaque->btpo_prev, target, RelationGetRelationName(rel)); /* * Any insert which would have gone on the target block will now go to the * right sibling block. */ PredicateLockPageCombine(rel, target, rightsib); /* * Next find and write-lock the current parent of the target page. This is * essentially the same as the corresponding step of splitting. */ ItemPointerSet(&(stack->bts_btentry.t_tid), target, P_HIKEY); pbuf = _bt_getstackbuf(rel, stack, BT_WRITE); if (pbuf == InvalidBuffer) elog(ERROR, "failed to re-find parent key in index \"%s\" for deletion target page %u", RelationGetRelationName(rel), target); parent = stack->bts_blkno; poffset = stack->bts_offset; /* * If the target is the rightmost child of its parent, then we can't * delete, unless it's also the only child --- in which case the parent * changes to half-dead status. The "can't delete" case should have been * detected by _bt_parent_deletion_safe, so complain if we see it now. */ page = BufferGetPage(pbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); maxoff = PageGetMaxOffsetNumber(page); parent_half_dead = false; parent_one_child = false; if (poffset >= maxoff) { if (poffset == P_FIRSTDATAKEY(opaque)) parent_half_dead = true; else elog(ERROR, "failed to delete rightmost child %u of block %u in index \"%s\"", target, parent, RelationGetRelationName(rel)); } else { /* Will there be exactly one child left in this parent? */ if (OffsetNumberNext(P_FIRSTDATAKEY(opaque)) == maxoff) parent_one_child = true; } /* * If we are deleting the next-to-last page on the target's level, then * the rightsib is a candidate to become the new fast root. (In theory, it * might be possible to push the fast root even further down, but the odds * of doing so are slim, and the locking considerations daunting.) * * We don't support handling this in the case where the parent is becoming * half-dead, even though it theoretically could occur. * * We can safely acquire a lock on the metapage here --- see comments for * _bt_newroot(). */ if (leftsib == P_NONE && !parent_half_dead) { page = BufferGetPage(rbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); Assert(opaque->btpo.level == targetlevel); if (P_RIGHTMOST(opaque)) { /* rightsib will be the only one left on the level */ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); metapg = BufferGetPage(metabuf); metad = BTPageGetMeta(metapg); /* * The expected case here is btm_fastlevel == targetlevel+1; if * the fastlevel is <= targetlevel, something is wrong, and we * choose to overwrite it to fix it. */ if (metad->btm_fastlevel > targetlevel + 1) { /* no update wanted */ _bt_relbuf(rel, metabuf); metabuf = InvalidBuffer; } } } /* * Check that the parent-page index items we're about to delete/overwrite * contain what we expect. This can fail if the index has become * corrupt for some reason. We want to throw any error before entering * the critical section --- otherwise it'd be a PANIC. * * The test on the target item is just an Assert because _bt_getstackbuf * should have guaranteed it has the expected contents. The test on the * next-child downlink is known to sometimes fail in the field, though. */ page = BufferGetPage(pbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); #ifdef USE_ASSERT_CHECKING itemid = PageGetItemId(page, poffset); itup = (IndexTuple) PageGetItem(page, itemid); Assert(ItemPointerGetBlockNumber(&(itup->t_tid)) == target); #endif if (!parent_half_dead) { OffsetNumber nextoffset; nextoffset = OffsetNumberNext(poffset); itemid = PageGetItemId(page, nextoffset); itup = (IndexTuple) PageGetItem(page, itemid); if (ItemPointerGetBlockNumber(&(itup->t_tid)) != rightsib) elog(ERROR, "right sibling %u of block %u is not next child %u of block %u in index \"%s\"", rightsib, target, ItemPointerGetBlockNumber(&(itup->t_tid)), parent, RelationGetRelationName(rel)); } /* * Here we begin doing the deletion. */ /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); /* * Update parent. The normal case is a tad tricky because we want to * delete the target's downlink and the *following* key. Easiest way is * to copy the right sibling's downlink over the target downlink, and then * delete the following item. */ if (parent_half_dead) { PageIndexTupleDelete(page, poffset); opaque->btpo_flags |= BTP_HALF_DEAD; } else { OffsetNumber nextoffset; itemid = PageGetItemId(page, poffset); itup = (IndexTuple) PageGetItem(page, itemid); ItemPointerSet(&(itup->t_tid), rightsib, P_HIKEY); nextoffset = OffsetNumberNext(poffset); PageIndexTupleDelete(page, nextoffset); } /* * Update siblings' side-links. Note the target page's side-links will * continue to point to the siblings. Asserts here are just rechecking * things we already verified above. */ if (BufferIsValid(lbuf)) { page = BufferGetPage(lbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); Assert(opaque->btpo_next == target); opaque->btpo_next = rightsib; } page = BufferGetPage(rbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); Assert(opaque->btpo_prev == target); opaque->btpo_prev = leftsib; rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page)); /* * Mark the page itself deleted. It can be recycled when all current * transactions are gone. */ page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); opaque->btpo_flags &= ~BTP_HALF_DEAD; opaque->btpo_flags |= BTP_DELETED; opaque->btpo.xact = ReadNewTransactionId(); /* And update the metapage, if needed */ if (BufferIsValid(metabuf)) { metad->btm_fastroot = rightsib; metad->btm_fastlevel = targetlevel; MarkBufferDirty(metabuf); } /* Must mark buffers dirty before XLogInsert */ MarkBufferDirty(pbuf); MarkBufferDirty(rbuf); MarkBufferDirty(buf); if (BufferIsValid(lbuf)) MarkBufferDirty(lbuf); /* XLOG stuff */ if (RelationNeedsWAL(rel)) { xl_btree_delete_page xlrec; xl_btree_metadata xlmeta; uint8 xlinfo; XLogRecPtr recptr; XLogRecData rdata[5]; XLogRecData *nextrdata; xlrec.target.node = rel->rd_node; ItemPointerSet(&(xlrec.target.tid), parent, poffset); xlrec.deadblk = target; xlrec.leftblk = leftsib; xlrec.rightblk = rightsib; xlrec.btpo_xact = opaque->btpo.xact; rdata[0].data = (char *) &xlrec; rdata[0].len = SizeOfBtreeDeletePage; rdata[0].buffer = InvalidBuffer; rdata[0].next = nextrdata = &(rdata[1]); if (BufferIsValid(metabuf)) { xlmeta.root = metad->btm_root; xlmeta.level = metad->btm_level; xlmeta.fastroot = metad->btm_fastroot; xlmeta.fastlevel = metad->btm_fastlevel; nextrdata->data = (char *) &xlmeta; nextrdata->len = sizeof(xl_btree_metadata); nextrdata->buffer = InvalidBuffer; nextrdata->next = nextrdata + 1; nextrdata++; xlinfo = XLOG_BTREE_DELETE_PAGE_META; } else if (parent_half_dead) xlinfo = XLOG_BTREE_DELETE_PAGE_HALF; else xlinfo = XLOG_BTREE_DELETE_PAGE; nextrdata->data = NULL; nextrdata->len = 0; nextrdata->next = nextrdata + 1; nextrdata->buffer = pbuf; nextrdata->buffer_std = true; nextrdata++; nextrdata->data = NULL; nextrdata->len = 0; nextrdata->buffer = rbuf; nextrdata->buffer_std = true; nextrdata->next = NULL; if (BufferIsValid(lbuf)) { nextrdata->next = nextrdata + 1; nextrdata++; nextrdata->data = NULL; nextrdata->len = 0; nextrdata->buffer = lbuf; nextrdata->buffer_std = true; nextrdata->next = NULL; } recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata); if (BufferIsValid(metabuf)) { PageSetLSN(metapg, recptr); PageSetTLI(metapg, ThisTimeLineID); } page = BufferGetPage(pbuf); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); page = BufferGetPage(rbuf); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); page = BufferGetPage(buf); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); if (BufferIsValid(lbuf)) { page = BufferGetPage(lbuf); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } } END_CRIT_SECTION(); /* release metapage; send out relcache inval if metapage changed */ if (BufferIsValid(metabuf)) { CacheInvalidateRelcache(rel); _bt_relbuf(rel, metabuf); } /* can always release leftsib immediately */ if (BufferIsValid(lbuf)) _bt_relbuf(rel, lbuf); /* * If parent became half dead, recurse to delete it. Otherwise, if right * sibling is empty and is now the last child of the parent, recurse to * try to delete it. (These cases cannot apply at the same time, though * the second case might itself recurse to the first.) * * When recursing to parent, we hold the lock on the target page until * done. This delays any insertions into the keyspace that was just * effectively reassigned to the parent's right sibling. If we allowed * that, and there were enough such insertions before we finish deleting * the parent, page splits within that keyspace could lead to inserting * out-of-order keys into the grandparent level. It is thought that that * wouldn't have any serious consequences, but it still seems like a * pretty bad idea. */ if (parent_half_dead) { /* recursive call will release pbuf */ _bt_relbuf(rel, rbuf); result = _bt_pagedel(rel, pbuf, stack->bts_parent) + 1; _bt_relbuf(rel, buf); } else if (parent_one_child && rightsib_empty) { _bt_relbuf(rel, pbuf); _bt_relbuf(rel, buf); /* recursive call will release rbuf */ result = _bt_pagedel(rel, rbuf, stack) + 1; } else { _bt_relbuf(rel, pbuf); _bt_relbuf(rel, buf); _bt_relbuf(rel, rbuf); result = 1; } return result; }
/* * visibilitymap_set - set a bit on a previously pinned page * * recptr is the LSN of the XLOG record we're replaying, if we're in recovery, * or InvalidXLogRecPtr in normal running. The page LSN is advanced to the * one provided; in normal running, we generate a new XLOG record and set the * page LSN to that value. cutoff_xid is the largest xmin on the page being * marked all-visible; it is needed for Hot Standby, and can be * InvalidTransactionId if the page contains no tuples. * * Caller is expected to set the heap page's PD_ALL_VISIBLE bit before calling * this function. Except in recovery, caller should also pass the heap * buffer. When checksums are enabled and we're not in recovery, we must add * the heap buffer to the WAL chain to protect it from being torn. * * You must pass a buffer containing the correct map page to this function. * Call visibilitymap_pin first to pin the right one. This function doesn't do * any I/O. */ void visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid) { BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk); uint8 mapBit = HEAPBLK_TO_MAPBIT(heapBlk); Page page; char *map; #ifdef TRACE_VISIBILITYMAP elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk); #endif Assert(InRecovery || XLogRecPtrIsInvalid(recptr)); Assert(InRecovery || BufferIsValid(heapBuf)); /* Check that we have the right heap page pinned, if present */ if (BufferIsValid(heapBuf) && BufferGetBlockNumber(heapBuf) != heapBlk) elog(ERROR, "wrong heap buffer passed to visibilitymap_set"); /* Check that we have the right VM page pinned */ if (!BufferIsValid(vmBuf) || BufferGetBlockNumber(vmBuf) != mapBlock) elog(ERROR, "wrong VM buffer passed to visibilitymap_set"); page = BufferGetPage(vmBuf); map = PageGetContents(page); LockBuffer(vmBuf, BUFFER_LOCK_EXCLUSIVE); if (!(map[mapByte] & (1 << mapBit))) { START_CRIT_SECTION(); map[mapByte] |= (1 << mapBit); MarkBufferDirty(vmBuf); if (RelationNeedsWAL(rel)) { if (XLogRecPtrIsInvalid(recptr)) { Assert(!InRecovery); recptr = log_heap_visible(rel->rd_node, heapBuf, vmBuf, cutoff_xid); /* * If data checksums are enabled (or wal_log_hints=on), we * need to protect the heap page from being torn. */ if (XLogHintBitIsNeeded()) { Page heapPage = BufferGetPage(heapBuf); /* caller is expected to set PD_ALL_VISIBLE first */ Assert(PageIsAllVisible(heapPage)); PageSetLSN(heapPage, recptr); } } PageSetLSN(page, recptr); } END_CRIT_SECTION(); } LockBuffer(vmBuf, BUFFER_LOCK_UNLOCK); }
/* * Main entry point to GiST index build. Initially calls insert over and over, * but switches to more efficient buffering build algorithm after a certain * number of tuples (unless buffering mode is disabled). */ IndexBuildResult * gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) { IndexBuildResult *result; double reltuples; GISTBuildState buildstate; Buffer buffer; Page page; MemoryContext oldcxt = CurrentMemoryContext; int fillfactor; buildstate.indexrel = index; if (index->rd_options) { /* Get buffering mode from the options string */ GiSTOptions *options = (GiSTOptions *) index->rd_options; char *bufferingMode = (char *) options + options->bufferingModeOffset; if (strcmp(bufferingMode, "on") == 0) buildstate.bufferingMode = GIST_BUFFERING_STATS; else if (strcmp(bufferingMode, "off") == 0) buildstate.bufferingMode = GIST_BUFFERING_DISABLED; else buildstate.bufferingMode = GIST_BUFFERING_AUTO; fillfactor = options->fillfactor; } else { /* * By default, switch to buffering mode when the index grows too large * to fit in cache. */ buildstate.bufferingMode = GIST_BUFFERING_AUTO; fillfactor = GIST_DEFAULT_FILLFACTOR; } /* Calculate target amount of free space to leave on pages */ buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100; /* * We expect to be called exactly once for any index relation. If that's * not the case, big trouble's what we have. */ if (RelationGetNumberOfBlocks(index) != 0) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); /* no locking is needed */ buildstate.giststate = initGISTstate(index); /* * Create a temporary memory context that is reset once for each tuple * processed. (Note: we don't bother to make this a child of the * giststate's scanCxt, so we have to delete it separately at the end.) */ buildstate.giststate->tempCxt = createTempGistContext(); /* initialize the root page */ buffer = gistNewBuffer(index); Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO); page = BufferGetPage(buffer, NULL, NULL, BGP_NO_SNAPSHOT_TEST); START_CRIT_SECTION(); GISTInitBuffer(buffer, F_LEAF); MarkBufferDirty(buffer); if (RelationNeedsWAL(index)) { XLogRecPtr recptr; XLogBeginInsert(); XLogRegisterBuffer(0, buffer, REGBUF_WILL_INIT); recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_CREATE_INDEX); PageSetLSN(page, recptr); } else PageSetLSN(page, gistGetFakeLSN(heap)); UnlockReleaseBuffer(buffer); END_CRIT_SECTION(); /* build the index */ buildstate.indtuples = 0; buildstate.indtuplesSize = 0; /* * Do the heap scan. */ reltuples = IndexBuildHeapScan(heap, index, indexInfo, true, gistBuildCallback, (void *) &buildstate); /* * If buffering was used, flush out all the tuples that are still in the * buffers. */ if (buildstate.bufferingMode == GIST_BUFFERING_ACTIVE) { elog(DEBUG1, "all tuples processed, emptying buffers"); gistEmptyAllBuffers(&buildstate); gistFreeBuildBuffers(buildstate.gfbb); } /* okay, all heap tuples are indexed */ MemoryContextSwitchTo(oldcxt); MemoryContextDelete(buildstate.giststate->tempCxt); freeGISTstate(buildstate.giststate); /* * Return statistics */ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); result->heap_tuples = reltuples; result->index_tuples = (double) buildstate.indtuples; return result; }
/* * Build a pending-list page from the given array of tuples, and write it out. * * Returns amount of free space left on the page. */ static int32 writeListPage(Relation index, Buffer buffer, IndexTuple *tuples, int32 ntuples, BlockNumber rightlink) { Page page = BufferGetPage(buffer); int32 i, freesize, size = 0; OffsetNumber l, off; char *workspace; char *ptr; /* workspace could be a local array; we use palloc for alignment */ workspace = palloc(BLCKSZ); START_CRIT_SECTION(); GinInitBuffer(buffer, GIN_LIST); off = FirstOffsetNumber; ptr = workspace; for (i = 0; i < ntuples; i++) { int this_size = IndexTupleSize(tuples[i]); memcpy(ptr, tuples[i], this_size); ptr += this_size; size += this_size; l = PageAddItem(page, (Item) tuples[i], this_size, off, false, false); if (l == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(index)); off++; } Assert(size <= BLCKSZ); /* else we overran workspace */ GinPageGetOpaque(page)->rightlink = rightlink; /* * tail page may contain only whole row(s) or final part of row placed on * previous pages (a "row" here meaning all the index tuples generated for * one heap tuple) */ if (rightlink == InvalidBlockNumber) { GinPageSetFullRow(page); GinPageGetOpaque(page)->maxoff = 1; } else { GinPageGetOpaque(page)->maxoff = 0; } MarkBufferDirty(buffer); if (RelationNeedsWAL(index)) { ginxlogInsertListPage data; XLogRecPtr recptr; data.rightlink = rightlink; data.ntuples = ntuples; XLogBeginInsert(); XLogRegisterData((char *) &data, sizeof(ginxlogInsertListPage)); XLogRegisterBuffer(0, buffer, REGBUF_WILL_INIT); XLogRegisterBufData(0, workspace, size); recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT_LISTPAGE); PageSetLSN(page, recptr); } /* get free space before releasing buffer */ freesize = PageGetExactFreeSpace(page); UnlockReleaseBuffer(buffer); END_CRIT_SECTION(); pfree(workspace); return freesize; }
/* * Place tuples from 'itup' to 'buffer'. If 'oldoffnum' is valid, the tuple * at that offset is atomically removed along with inserting the new tuples. * This is used to replace a tuple with a new one. * * If 'leftchildbuf' is valid, we're inserting the downlink for the page * to the right of 'leftchildbuf', or updating the downlink for 'leftchildbuf'. * F_FOLLOW_RIGHT flag on 'leftchildbuf' is cleared and NSN is set. * * If 'markfollowright' is true and the page is split, the left child is * marked with F_FOLLOW_RIGHT flag. That is the normal case. During buffered * index build, however, there is no concurrent access and the page splitting * is done in a slightly simpler fashion, and false is passed. * * If there is not enough room on the page, it is split. All the split * pages are kept pinned and locked and returned in *splitinfo, the caller * is responsible for inserting the downlinks for them. However, if * 'buffer' is the root page and it needs to be split, gistplacetopage() * performs the split as one atomic operation, and *splitinfo is set to NIL. * In that case, we continue to hold the root page locked, and the child * pages are released; note that new tuple(s) are *not* on the root page * but in one of the new child pages. * * If 'newblkno' is not NULL, returns the block number of page the first * new/updated tuple was inserted to. Usually it's the given page, but could * be its right sibling if the page was split. * * Returns 'true' if the page was split, 'false' otherwise. */ bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate, Buffer buffer, IndexTuple *itup, int ntup, OffsetNumber oldoffnum, BlockNumber *newblkno, Buffer leftchildbuf, List **splitinfo, bool markfollowright, Relation heapRel, bool is_build) { BlockNumber blkno = BufferGetBlockNumber(buffer); Page page = BufferGetPage(buffer); bool is_leaf = (GistPageIsLeaf(page)) ? true : false; XLogRecPtr recptr; int i; bool is_split; /* * Refuse to modify a page that's incompletely split. This should not * happen because we finish any incomplete splits while we walk down the * tree. However, it's remotely possible that another concurrent inserter * splits a parent page, and errors out before completing the split. We * will just throw an error in that case, and leave any split we had in * progress unfinished too. The next insert that comes along will clean up * the mess. */ if (GistFollowRight(page)) elog(ERROR, "concurrent GiST page split was incomplete"); *splitinfo = NIL; /* * if isupdate, remove old key: This node's key has been modified, either * because a child split occurred or because we needed to adjust our key * for an insert in a child node. Therefore, remove the old version of * this node's key. * * for WAL replay, in the non-split case we handle this by setting up a * one-element todelete array; in the split case, it's handled implicitly * because the tuple vector passed to gistSplit won't include this tuple. */ is_split = gistnospace(page, itup, ntup, oldoffnum, freespace); /* * If leaf page is full, try at first to delete dead tuples. And then * check again. */ if (is_split && GistPageIsLeaf(page) && GistPageHasGarbage(page)) { gistprunepage(rel, page, buffer, heapRel); is_split = gistnospace(page, itup, ntup, oldoffnum, freespace); } if (is_split) { /* no space for insertion */ IndexTuple *itvec; int tlen; SplitedPageLayout *dist = NULL, *ptr; BlockNumber oldrlink = InvalidBlockNumber; GistNSN oldnsn = 0; SplitedPageLayout rootpg; bool is_rootsplit; int npage; is_rootsplit = (blkno == GIST_ROOT_BLKNO); /* * Form index tuples vector to split. If we're replacing an old tuple, * remove the old version from the vector. */ itvec = gistextractpage(page, &tlen); if (OffsetNumberIsValid(oldoffnum)) { /* on inner page we should remove old tuple */ int pos = oldoffnum - FirstOffsetNumber; tlen--; if (pos != tlen) memmove(itvec + pos, itvec + pos + 1, sizeof(IndexTuple) * (tlen - pos)); } itvec = gistjoinvector(itvec, &tlen, itup, ntup); dist = gistSplit(rel, page, itvec, tlen, giststate); /* * Check that split didn't produce too many pages. */ npage = 0; for (ptr = dist; ptr; ptr = ptr->next) npage++; /* in a root split, we'll add one more page to the list below */ if (is_rootsplit) npage++; if (npage > GIST_MAX_SPLIT_PAGES) elog(ERROR, "GiST page split into too many halves (%d, maximum %d)", npage, GIST_MAX_SPLIT_PAGES); /* * Set up pages to work with. Allocate new buffers for all but the * leftmost page. The original page becomes the new leftmost page, and * is just replaced with the new contents. * * For a root-split, allocate new buffers for all child pages, the * original page is overwritten with new root page containing * downlinks to the new child pages. */ ptr = dist; if (!is_rootsplit) { /* save old rightlink and NSN */ oldrlink = GistPageGetOpaque(page)->rightlink; oldnsn = GistPageGetNSN(page); dist->buffer = buffer; dist->block.blkno = BufferGetBlockNumber(buffer); dist->page = PageGetTempPageCopySpecial(BufferGetPage(buffer)); /* clean all flags except F_LEAF */ GistPageGetOpaque(dist->page)->flags = (is_leaf) ? F_LEAF : 0; ptr = ptr->next; } for (; ptr; ptr = ptr->next) { /* Allocate new page */ ptr->buffer = gistNewBuffer(rel); GISTInitBuffer(ptr->buffer, (is_leaf) ? F_LEAF : 0); ptr->page = BufferGetPage(ptr->buffer); ptr->block.blkno = BufferGetBlockNumber(ptr->buffer); PredicateLockPageSplit(rel, BufferGetBlockNumber(buffer), BufferGetBlockNumber(ptr->buffer)); } /* * Now that we know which blocks the new pages go to, set up downlink * tuples to point to them. */ for (ptr = dist; ptr; ptr = ptr->next) { ItemPointerSetBlockNumber(&(ptr->itup->t_tid), ptr->block.blkno); GistTupleSetValid(ptr->itup); } /* * If this is a root split, we construct the new root page with the * downlinks here directly, instead of requiring the caller to insert * them. Add the new root page to the list along with the child pages. */ if (is_rootsplit) { IndexTuple *downlinks; int ndownlinks = 0; int i; rootpg.buffer = buffer; rootpg.page = PageGetTempPageCopySpecial(BufferGetPage(rootpg.buffer)); GistPageGetOpaque(rootpg.page)->flags = 0; /* Prepare a vector of all the downlinks */ for (ptr = dist; ptr; ptr = ptr->next) ndownlinks++; downlinks = palloc(sizeof(IndexTuple) * ndownlinks); for (i = 0, ptr = dist; ptr; ptr = ptr->next) downlinks[i++] = ptr->itup; rootpg.block.blkno = GIST_ROOT_BLKNO; rootpg.block.num = ndownlinks; rootpg.list = gistfillitupvec(downlinks, ndownlinks, &(rootpg.lenlist)); rootpg.itup = NULL; rootpg.next = dist; dist = &rootpg; } else { /* Prepare split-info to be returned to caller */ for (ptr = dist; ptr; ptr = ptr->next) { GISTPageSplitInfo *si = palloc(sizeof(GISTPageSplitInfo)); si->buf = ptr->buffer; si->downlink = ptr->itup; *splitinfo = lappend(*splitinfo, si); } } /* * Fill all pages. All the pages are new, ie. freshly allocated empty * pages, or a temporary copy of the old page. */ for (ptr = dist; ptr; ptr = ptr->next) { char *data = (char *) (ptr->list); for (i = 0; i < ptr->block.num; i++) { IndexTuple thistup = (IndexTuple) data; if (PageAddItem(ptr->page, (Item) data, IndexTupleSize(thistup), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(rel)); /* * If this is the first inserted/updated tuple, let the caller * know which page it landed on. */ if (newblkno && ItemPointerEquals(&thistup->t_tid, &(*itup)->t_tid)) *newblkno = ptr->block.blkno; data += IndexTupleSize(thistup); } /* Set up rightlinks */ if (ptr->next && ptr->block.blkno != GIST_ROOT_BLKNO) GistPageGetOpaque(ptr->page)->rightlink = ptr->next->block.blkno; else GistPageGetOpaque(ptr->page)->rightlink = oldrlink; /* * Mark the all but the right-most page with the follow-right * flag. It will be cleared as soon as the downlink is inserted * into the parent, but this ensures that if we error out before * that, the index is still consistent. (in buffering build mode, * any error will abort the index build anyway, so this is not * needed.) */ if (ptr->next && !is_rootsplit && markfollowright) GistMarkFollowRight(ptr->page); else GistClearFollowRight(ptr->page); /* * Copy the NSN of the original page to all pages. The * F_FOLLOW_RIGHT flags ensure that scans will follow the * rightlinks until the downlinks are inserted. */ GistPageSetNSN(ptr->page, oldnsn); } /* * gistXLogSplit() needs to WAL log a lot of pages, prepare WAL * insertion for that. NB: The number of pages and data segments * specified here must match the calculations in gistXLogSplit()! */ if (!is_build && RelationNeedsWAL(rel)) XLogEnsureRecordSpace(npage, 1 + npage * 2); START_CRIT_SECTION(); /* * Must mark buffers dirty before XLogInsert, even though we'll still * be changing their opaque fields below. */ for (ptr = dist; ptr; ptr = ptr->next) MarkBufferDirty(ptr->buffer); if (BufferIsValid(leftchildbuf)) MarkBufferDirty(leftchildbuf); /* * The first page in the chain was a temporary working copy meant to * replace the old page. Copy it over the old page. */ PageRestoreTempPage(dist->page, BufferGetPage(dist->buffer)); dist->page = BufferGetPage(dist->buffer); /* * Write the WAL record. * * If we're building a new index, however, we don't WAL-log changes * yet. The LSN-NSN interlock between parent and child requires that * LSNs never move backwards, so set the LSNs to a value that's * smaller than any real or fake unlogged LSN that might be generated * later. (There can't be any concurrent scans during index build, so * we don't need to be able to detect concurrent splits yet.) */ if (is_build) recptr = GistBuildLSN; else { if (RelationNeedsWAL(rel)) recptr = gistXLogSplit(is_leaf, dist, oldrlink, oldnsn, leftchildbuf, markfollowright); else recptr = gistGetFakeLSN(rel); } for (ptr = dist; ptr; ptr = ptr->next) PageSetLSN(ptr->page, recptr); /* * Return the new child buffers to the caller. * * If this was a root split, we've already inserted the downlink * pointers, in the form of a new root page. Therefore we can release * all the new buffers, and keep just the root page locked. */ if (is_rootsplit) { for (ptr = dist->next; ptr; ptr = ptr->next) UnlockReleaseBuffer(ptr->buffer); } } else { /* * Enough space. We always get here if ntup==0. */ START_CRIT_SECTION(); /* * Delete old tuple if any, then insert new tuple(s) if any. If * possible, use the fast path of PageIndexTupleOverwrite. */ if (OffsetNumberIsValid(oldoffnum)) { if (ntup == 1) { /* One-for-one replacement, so use PageIndexTupleOverwrite */ if (!PageIndexTupleOverwrite(page, oldoffnum, (Item) *itup, IndexTupleSize(*itup))) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(rel)); } else { /* Delete old, then append new tuple(s) to page */ PageIndexTupleDelete(page, oldoffnum); gistfillbuffer(page, itup, ntup, InvalidOffsetNumber); } } else { /* Just append new tuples at the end of the page */ gistfillbuffer(page, itup, ntup, InvalidOffsetNumber); } MarkBufferDirty(buffer); if (BufferIsValid(leftchildbuf)) MarkBufferDirty(leftchildbuf); if (is_build) recptr = GistBuildLSN; else { if (RelationNeedsWAL(rel)) { OffsetNumber ndeloffs = 0, deloffs[1]; if (OffsetNumberIsValid(oldoffnum)) { deloffs[0] = oldoffnum; ndeloffs = 1; } recptr = gistXLogUpdate(buffer, deloffs, ndeloffs, itup, ntup, leftchildbuf); } else recptr = gistGetFakeLSN(rel); } PageSetLSN(page, recptr); if (newblkno) *newblkno = blkno; } /* * If we inserted the downlink for a child page, set NSN and clear * F_FOLLOW_RIGHT flag on the left child, so that concurrent scans know to * follow the rightlink if and only if they looked at the parent page * before we inserted the downlink. * * Note that we do this *after* writing the WAL record. That means that * the possible full page image in the WAL record does not include these * changes, and they must be replayed even if the page is restored from * the full page image. There's a chicken-and-egg problem: if we updated * the child pages first, we wouldn't know the recptr of the WAL record * we're about to write. */ if (BufferIsValid(leftchildbuf)) { Page leftpg = BufferGetPage(leftchildbuf); GistPageSetNSN(leftpg, recptr); GistClearFollowRight(leftpg); PageSetLSN(leftpg, recptr); } END_CRIT_SECTION(); return is_split; }
/* * Prune and repair fragmentation in the specified page. * * Caller must have pin and buffer cleanup lock on the page. * * OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum). * * If report_stats is true then we send the number of reclaimed heap-only * tuples to pgstats. (This must be FALSE during vacuum, since vacuum will * send its own new total to pgstats, and we don't want this delta applied * on top of that.) * * Returns the number of tuples deleted from the page and sets * latestRemovedXid. */ int heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, bool report_stats, TransactionId *latestRemovedXid) { int ndeleted = 0; Page page = BufferGetPage(buffer); OffsetNumber offnum, maxoff; PruneState prstate; /* * Our strategy is to scan the page and make lists of items to change, * then apply the changes within a critical section. This keeps as much * logic as possible out of the critical section, and also ensures that * WAL replay will work the same as the normal case. * * First, initialize the new pd_prune_xid value to zero (indicating no * prunable tuples). If we find any tuples which may soon become * prunable, we will save the lowest relevant XID in new_prune_xid. Also * initialize the rest of our working state. */ prstate.new_prune_xid = InvalidTransactionId; prstate.latestRemovedXid = *latestRemovedXid; prstate.nredirected = prstate.ndead = prstate.nunused = 0; memset(prstate.marked, 0, sizeof(prstate.marked)); /* Scan the page */ maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId itemid; /* Ignore items already processed as part of an earlier chain */ if (prstate.marked[offnum]) continue; /* Nothing to do if slot is empty or already dead */ itemid = PageGetItemId(page, offnum); if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid)) continue; /* Process this item or chain of items */ ndeleted += heap_prune_chain(relation, buffer, offnum, OldestXmin, &prstate); } /* Any error while applying the changes is critical */ START_CRIT_SECTION(); /* Have we found any prunable items? */ if (prstate.nredirected > 0 || prstate.ndead > 0 || prstate.nunused > 0) { /* * Apply the planned item changes, then repair page fragmentation, and * update the page's hint bit about whether it has free line pointers. */ heap_page_prune_execute(buffer, prstate.redirected, prstate.nredirected, prstate.nowdead, prstate.ndead, prstate.nowunused, prstate.nunused); /* * Update the page's pd_prune_xid field to either zero, or the lowest * XID of any soon-prunable tuple. */ ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid; /* * Also clear the "page is full" flag, since there's no point in * repeating the prune/defrag process until something else happens to * the page. */ PageClearFull(page); MarkBufferDirty(buffer); /* * Emit a WAL HEAP_CLEAN record showing what we did */ if (RelationNeedsWAL(relation)) { XLogRecPtr recptr; recptr = log_heap_clean(relation, buffer, prstate.redirected, prstate.nredirected, prstate.nowdead, prstate.ndead, prstate.nowunused, prstate.nunused, prstate.latestRemovedXid); PageSetLSN(BufferGetPage(buffer), recptr); } } else { /* * If we didn't prune anything, but have found a new value for the * pd_prune_xid field, update it and mark the buffer dirty. This is * treated as a non-WAL-logged hint. * * Also clear the "page is full" flag if it is set, since there's no * point in repeating the prune/defrag process until something else * happens to the page. */ if (((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid || PageIsFull(page)) { ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid; PageClearFull(page); MarkBufferDirtyHint(buffer, true); } } END_CRIT_SECTION(); /* * If requested, report the number of tuples reclaimed to pgstats. This is * ndeleted minus ndead, because we don't want to count a now-DEAD root * item as a deletion for this purpose. */ if (report_stats && ndeleted > prstate.ndead) pgstat_update_heap_dead_tuples(relation, ndeleted - prstate.ndead); *latestRemovedXid = prstate.latestRemovedXid; /* * XXX Should we update the FSM information of this page ? * * There are two schools of thought here. We may not want to update FSM * information so that the page is not used for unrelated UPDATEs/INSERTs * and any free space in this page will remain available for further * UPDATEs in *this* page, thus improving chances for doing HOT updates. * * But for a large table and where a page does not receive further UPDATEs * for a long time, we might waste this space by not updating the FSM * information. The relation may get extended and fragmented further. * * One possibility is to leave "fillfactor" worth of space in this page * and update FSM with the remaining space. */ return ndeleted; }
/* * Write the index tuples contained in *collector into the index's * pending list. * * Function guarantees that all these tuples will be inserted consecutively, * preserving order */ void ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) { Relation index = ginstate->index; Buffer metabuffer; Page metapage; GinMetaPageData *metadata = NULL; Buffer buffer = InvalidBuffer; Page page = NULL; ginxlogUpdateMeta data; bool separateList = false; bool needCleanup = false; int cleanupSize; bool needWal; if (collector->ntuples == 0) return; needWal = RelationNeedsWAL(index); data.node = index->rd_node; data.ntuples = 0; data.newRightlink = data.prevTail = InvalidBlockNumber; metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO); metapage = BufferGetPage(metabuffer); if (collector->sumsize + collector->ntuples * sizeof(ItemIdData) > GinListPageSize) { /* * Total size is greater than one page => make sublist */ separateList = true; } else { LockBuffer(metabuffer, GIN_EXCLUSIVE); metadata = GinPageGetMeta(metapage); if (metadata->head == InvalidBlockNumber || collector->sumsize + collector->ntuples * sizeof(ItemIdData) > metadata->tailFreeSize) { /* * Pending list is empty or total size is greater than freespace * on tail page => make sublist * * We unlock metabuffer to keep high concurrency */ separateList = true; LockBuffer(metabuffer, GIN_UNLOCK); } } if (separateList) { /* * We should make sublist separately and append it to the tail */ GinMetaPageData sublist; memset(&sublist, 0, sizeof(GinMetaPageData)); makeSublist(index, collector->tuples, collector->ntuples, &sublist); if (needWal) XLogBeginInsert(); /* * metapage was unlocked, see above */ LockBuffer(metabuffer, GIN_EXCLUSIVE); metadata = GinPageGetMeta(metapage); if (metadata->head == InvalidBlockNumber) { /* * Main list is empty, so just insert sublist as main list */ START_CRIT_SECTION(); metadata->head = sublist.head; metadata->tail = sublist.tail; metadata->tailFreeSize = sublist.tailFreeSize; metadata->nPendingPages = sublist.nPendingPages; metadata->nPendingHeapTuples = sublist.nPendingHeapTuples; } else { /* * Merge lists */ data.prevTail = metadata->tail; data.newRightlink = sublist.head; buffer = ReadBuffer(index, metadata->tail); LockBuffer(buffer, GIN_EXCLUSIVE); page = BufferGetPage(buffer); Assert(GinPageGetOpaque(page)->rightlink == InvalidBlockNumber); START_CRIT_SECTION(); GinPageGetOpaque(page)->rightlink = sublist.head; MarkBufferDirty(buffer); metadata->tail = sublist.tail; metadata->tailFreeSize = sublist.tailFreeSize; metadata->nPendingPages += sublist.nPendingPages; metadata->nPendingHeapTuples += sublist.nPendingHeapTuples; if (needWal) XLogRegisterBuffer(1, buffer, REGBUF_STANDARD); } } else { /* * Insert into tail page. Metapage is already locked */ OffsetNumber l, off; int i, tupsize; char *ptr; char *collectordata; buffer = ReadBuffer(index, metadata->tail); LockBuffer(buffer, GIN_EXCLUSIVE); page = BufferGetPage(buffer); off = (PageIsEmpty(page)) ? FirstOffsetNumber : OffsetNumberNext(PageGetMaxOffsetNumber(page)); collectordata = ptr = (char *) palloc(collector->sumsize); data.ntuples = collector->ntuples; if (needWal) XLogBeginInsert(); START_CRIT_SECTION(); /* * Increase counter of heap tuples */ Assert(GinPageGetOpaque(page)->maxoff <= metadata->nPendingHeapTuples); GinPageGetOpaque(page)->maxoff++; metadata->nPendingHeapTuples++; for (i = 0; i < collector->ntuples; i++) { tupsize = IndexTupleSize(collector->tuples[i]); l = PageAddItem(page, (Item) collector->tuples[i], tupsize, off, false, false); if (l == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(index)); memcpy(ptr, collector->tuples[i], tupsize); ptr += tupsize; off++; } Assert((ptr - collectordata) <= collector->sumsize); if (needWal) { XLogRegisterBuffer(1, buffer, REGBUF_STANDARD); XLogRegisterBufData(1, collectordata, collector->sumsize); } metadata->tailFreeSize = PageGetExactFreeSpace(page); MarkBufferDirty(buffer); } /* * Set pd_lower just past the end of the metadata. This is essential, * because without doing so, metadata will be lost if xlog.c compresses * the page. (We must do this here because pre-v11 versions of PG did not * set the metapage's pd_lower correctly, so a pg_upgraded index might * contain the wrong value.) */ ((PageHeader) metapage)->pd_lower = ((char *) metadata + sizeof(GinMetaPageData)) - (char *) metapage; /* * Write metabuffer, make xlog entry */ MarkBufferDirty(metabuffer); if (needWal) { XLogRecPtr recptr; memcpy(&data.metadata, metadata, sizeof(GinMetaPageData)); XLogRegisterBuffer(0, metabuffer, REGBUF_WILL_INIT | REGBUF_STANDARD); XLogRegisterData((char *) &data, sizeof(ginxlogUpdateMeta)); recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_UPDATE_META_PAGE); PageSetLSN(metapage, recptr); if (buffer != InvalidBuffer) { PageSetLSN(page, recptr); } } if (buffer != InvalidBuffer) UnlockReleaseBuffer(buffer); /* * Force pending list cleanup when it becomes too long. And, * ginInsertCleanup could take significant amount of time, so we prefer to * call it when it can do all the work in a single collection cycle. In * non-vacuum mode, it shouldn't require maintenance_work_mem, so fire it * while pending list is still small enough to fit into * gin_pending_list_limit. * * ginInsertCleanup() should not be called inside our CRIT_SECTION. */ cleanupSize = GinGetPendingListCleanupSize(index); if (metadata->nPendingPages * GIN_PAGE_FREESIZE > cleanupSize * 1024L) needCleanup = true; UnlockReleaseBuffer(metabuffer); END_CRIT_SECTION(); /* * Since it could contend with concurrent cleanup process we cleanup * pending list not forcibly. */ if (needCleanup) ginInsertCleanup(ginstate, false, true, false, NULL); }
/* * brinbuild() -- build a new BRIN index. */ IndexBuildResult * brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) { IndexBuildResult *result; double reltuples; double idxtuples; BrinRevmap *revmap; BrinBuildState *state; Buffer meta; BlockNumber pagesPerRange; /* * We expect to be called exactly once for any index relation. */ if (RelationGetNumberOfBlocks(index) != 0) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); /* * Critical section not required, because on error the creation of the * whole relation will be rolled back. */ meta = ReadBuffer(index, P_NEW); Assert(BufferGetBlockNumber(meta) == BRIN_METAPAGE_BLKNO); LockBuffer(meta, BUFFER_LOCK_EXCLUSIVE); brin_metapage_init(BufferGetPage(meta), BrinGetPagesPerRange(index), BRIN_CURRENT_VERSION); MarkBufferDirty(meta); if (RelationNeedsWAL(index)) { xl_brin_createidx xlrec; XLogRecPtr recptr; Page page; xlrec.version = BRIN_CURRENT_VERSION; xlrec.pagesPerRange = BrinGetPagesPerRange(index); XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfBrinCreateIdx); XLogRegisterBuffer(0, meta, REGBUF_WILL_INIT | REGBUF_STANDARD); recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_CREATE_INDEX); page = BufferGetPage(meta); PageSetLSN(page, recptr); } UnlockReleaseBuffer(meta); /* * Initialize our state, including the deformed tuple state. */ revmap = brinRevmapInitialize(index, &pagesPerRange, NULL); state = initialize_brin_buildstate(index, revmap, pagesPerRange); /* * Now scan the relation. No syncscan allowed here because we want the * heap blocks in physical order. */ reltuples = IndexBuildHeapScan(heap, index, indexInfo, false, brinbuildCallback, (void *) state, NULL); /* process the final batch */ form_and_insert_tuple(state); /* release resources */ idxtuples = state->bs_numtuples; brinRevmapTerminate(state->bs_rmAccess); terminate_brin_buildstate(state); /* * Return statistics */ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); result->heap_tuples = reltuples; result->index_tuples = idxtuples; return result; }
/* * Delete a posting tree page. */ static void ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkno, BlockNumber parentBlkno, OffsetNumber myoff, bool isParentRoot) { Buffer dBuffer; Buffer lBuffer; Buffer pBuffer; Page page, parentPage; BlockNumber rightlink; /* * Lock the pages in the same order as an insertion would, to avoid * deadlocks: left, then right, then parent. */ lBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, leftBlkno, RBM_NORMAL, gvs->strategy); dBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, deleteBlkno, RBM_NORMAL, gvs->strategy); pBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, parentBlkno, RBM_NORMAL, gvs->strategy); LockBuffer(lBuffer, GIN_EXCLUSIVE); LockBuffer(dBuffer, GIN_EXCLUSIVE); if (!isParentRoot) /* parent is already locked by * LockBufferForCleanup() */ LockBuffer(pBuffer, GIN_EXCLUSIVE); START_CRIT_SECTION(); /* Unlink the page by changing left sibling's rightlink */ page = BufferGetPage(dBuffer); rightlink = GinPageGetOpaque(page)->rightlink; page = BufferGetPage(lBuffer); GinPageGetOpaque(page)->rightlink = rightlink; /* Delete downlink from parent */ parentPage = BufferGetPage(pBuffer); #ifdef USE_ASSERT_CHECKING do { PostingItem *tod = GinDataPageGetPostingItem(parentPage, myoff); Assert(PostingItemGetBlockNumber(tod) == deleteBlkno); } while (0); #endif GinPageDeletePostingItem(parentPage, myoff); page = BufferGetPage(dBuffer); /* * we shouldn't change rightlink field to save workability of running * search scan */ GinPageGetOpaque(page)->flags = GIN_DELETED; MarkBufferDirty(pBuffer); if (leftBlkno != InvalidBlockNumber) MarkBufferDirty(lBuffer); MarkBufferDirty(dBuffer); if (RelationNeedsWAL(gvs->index)) { XLogRecPtr recptr; XLogRecData rdata[4]; ginxlogDeletePage data; int n; data.node = gvs->index->rd_node; data.blkno = deleteBlkno; data.parentBlkno = parentBlkno; data.parentOffset = myoff; data.leftBlkno = leftBlkno; data.rightLink = GinPageGetOpaque(page)->rightlink; rdata[0].buffer = dBuffer; rdata[0].buffer_std = FALSE; rdata[0].data = NULL; rdata[0].len = 0; rdata[0].next = rdata + 1; rdata[1].buffer = pBuffer; rdata[1].buffer_std = FALSE; rdata[1].data = NULL; rdata[1].len = 0; rdata[1].next = rdata + 2; if (leftBlkno != InvalidBlockNumber) { rdata[2].buffer = lBuffer; rdata[2].buffer_std = FALSE; rdata[2].data = NULL; rdata[2].len = 0; rdata[2].next = rdata + 3; n = 3; } else n = 2; rdata[n].buffer = InvalidBuffer; rdata[n].buffer_std = FALSE; rdata[n].len = sizeof(ginxlogDeletePage); rdata[n].data = (char *) &data; rdata[n].next = NULL; recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_DELETE_PAGE, rdata); PageSetLSN(page, recptr); PageSetLSN(parentPage, recptr); if (leftBlkno != InvalidBlockNumber) { page = BufferGetPage(lBuffer); PageSetLSN(page, recptr); } } if (!isParentRoot) LockBuffer(pBuffer, GIN_UNLOCK); ReleaseBuffer(pBuffer); UnlockReleaseBuffer(lBuffer); UnlockReleaseBuffer(dBuffer); END_CRIT_SECTION(); gvs->result->pages_deleted++; }
/* * RelationTruncate * Physically truncate a relation to the specified number of blocks. * * This includes getting rid of any buffers for the blocks that are to be * dropped. */ void RelationTruncate(Relation rel, BlockNumber nblocks) { bool fsm; bool vm; /* Open it at the smgr level if not already done */ RelationOpenSmgr(rel); /* * Make sure smgr_targblock etc aren't pointing somewhere past new end */ rel->rd_smgr->smgr_targblock = InvalidBlockNumber; rel->rd_smgr->smgr_fsm_nblocks = InvalidBlockNumber; rel->rd_smgr->smgr_vm_nblocks = InvalidBlockNumber; /* Truncate the FSM first if it exists */ fsm = smgrexists(rel->rd_smgr, FSM_FORKNUM); if (fsm) FreeSpaceMapTruncateRel(rel, nblocks); /* Truncate the visibility map too if it exists. */ vm = smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM); if (vm) visibilitymap_truncate(rel, nblocks); /* * We WAL-log the truncation before actually truncating, which means * trouble if the truncation fails. If we then crash, the WAL replay * likely isn't going to succeed in the truncation either, and cause a * PANIC. It's tempting to put a critical section here, but that cure * would be worse than the disease. It would turn a usually harmless * failure to truncate, that might spell trouble at WAL replay, into a * certain PANIC. */ if (RelationNeedsWAL(rel)) { /* * Make an XLOG entry reporting the file truncation. */ XLogRecPtr lsn; xl_smgr_truncate xlrec; xlrec.blkno = nblocks; xlrec.rnode = rel->rd_node; xlrec.flags = SMGR_TRUNCATE_ALL; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, sizeof(xlrec)); lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE); /* * Flush, because otherwise the truncation of the main relation might * hit the disk before the WAL record, and the truncation of the FSM * or visibility map. If we crashed during that window, we'd be left * with a truncated heap, but the FSM or visibility map would still * contain entries for the non-existent heap pages. */ if (fsm || vm) XLogFlush(lsn); } /* Do the real work */ smgrtruncate(rel->rd_smgr, MAIN_FORKNUM, nblocks); }
/* * Insert a new item to a page. * * Returns true if the insertion was finished. On false, the page was split and * the parent needs to be updated. (A root split returns true as it doesn't * need any further action by the caller to complete.) * * When inserting a downlink to an internal page, 'childbuf' contains the * child page that was split. Its GIN_INCOMPLETE_SPLIT flag will be cleared * atomically with the insert. Also, the existing item at offset stack->off * in the target page is updated to point to updateblkno. * * stack->buffer is locked on entry, and is kept locked. * Likewise for childbuf, if given. */ static bool ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, void *insertdata, BlockNumber updateblkno, Buffer childbuf, GinStatsData *buildStats) { Page page = BufferGetPage(stack->buffer); bool result; GinPlaceToPageRC rc; uint16 xlflags = 0; Page childpage = NULL; Page newlpage = NULL, newrpage = NULL; void *ptp_workspace = NULL; MemoryContext tmpCxt; MemoryContext oldCxt; /* * We do all the work of this function and its subfunctions in a temporary * memory context. This avoids leakages and simplifies APIs, since some * subfunctions allocate storage that has to survive until we've finished * the WAL insertion. */ tmpCxt = AllocSetContextCreate(CurrentMemoryContext, "ginPlaceToPage temporary context", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); oldCxt = MemoryContextSwitchTo(tmpCxt); if (GinPageIsData(page)) xlflags |= GIN_INSERT_ISDATA; if (GinPageIsLeaf(page)) { xlflags |= GIN_INSERT_ISLEAF; Assert(!BufferIsValid(childbuf)); Assert(updateblkno == InvalidBlockNumber); } else { Assert(BufferIsValid(childbuf)); Assert(updateblkno != InvalidBlockNumber); childpage = BufferGetPage(childbuf); } /* * See if the incoming tuple will fit on the page. beginPlaceToPage will * decide if the page needs to be split, and will compute the split * contents if so. See comments for beginPlaceToPage and execPlaceToPage * functions for more details of the API here. */ rc = btree->beginPlaceToPage(btree, stack->buffer, stack, insertdata, updateblkno, &ptp_workspace, &newlpage, &newrpage); if (rc == GPTP_NO_WORK) { /* Nothing to do */ result = true; } else if (rc == GPTP_INSERT) { /* It will fit, perform the insertion */ START_CRIT_SECTION(); if (RelationNeedsWAL(btree->index)) { XLogBeginInsert(); XLogRegisterBuffer(0, stack->buffer, REGBUF_STANDARD); if (BufferIsValid(childbuf)) XLogRegisterBuffer(1, childbuf, REGBUF_STANDARD); } /* Perform the page update, and register any extra WAL data */ btree->execPlaceToPage(btree, stack->buffer, stack, insertdata, updateblkno, ptp_workspace); MarkBufferDirty(stack->buffer); /* An insert to an internal page finishes the split of the child. */ if (BufferIsValid(childbuf)) { GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT; MarkBufferDirty(childbuf); } if (RelationNeedsWAL(btree->index)) { XLogRecPtr recptr; ginxlogInsert xlrec; BlockIdData childblknos[2]; xlrec.flags = xlflags; XLogRegisterData((char *) &xlrec, sizeof(ginxlogInsert)); /* * Log information about child if this was an insertion of a * downlink. */ if (BufferIsValid(childbuf)) { BlockIdSet(&childblknos[0], BufferGetBlockNumber(childbuf)); BlockIdSet(&childblknos[1], GinPageGetOpaque(childpage)->rightlink); XLogRegisterData((char *) childblknos, sizeof(BlockIdData) * 2); } recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT); PageSetLSN(page, recptr); if (BufferIsValid(childbuf)) PageSetLSN(childpage, recptr); } END_CRIT_SECTION(); /* Insertion is complete. */ result = true; } else if (rc == GPTP_SPLIT) { /* * Didn't fit, need to split. The split has been computed in newlpage * and newrpage, which are pointers to palloc'd pages, not associated * with buffers. stack->buffer is not touched yet. */ Buffer rbuffer; BlockNumber savedRightLink; ginxlogSplit data; Buffer lbuffer = InvalidBuffer; Page newrootpg = NULL; /* Get a new index page to become the right page */ rbuffer = GinNewBuffer(btree->index); /* During index build, count the new page */ if (buildStats) { if (btree->isData) buildStats->nDataPages++; else buildStats->nEntryPages++; } savedRightLink = GinPageGetOpaque(page)->rightlink; /* Begin setting up WAL record */ data.node = btree->index->rd_node; data.flags = xlflags; if (BufferIsValid(childbuf)) { data.leftChildBlkno = BufferGetBlockNumber(childbuf); data.rightChildBlkno = GinPageGetOpaque(childpage)->rightlink; } else data.leftChildBlkno = data.rightChildBlkno = InvalidBlockNumber; if (stack->parent == NULL) { /* * splitting the root, so we need to allocate new left page and * place pointers to left and right page on root page. */ lbuffer = GinNewBuffer(btree->index); /* During index build, count the new left page */ if (buildStats) { if (btree->isData) buildStats->nDataPages++; else buildStats->nEntryPages++; } data.rrlink = InvalidBlockNumber; data.flags |= GIN_SPLIT_ROOT; GinPageGetOpaque(newrpage)->rightlink = InvalidBlockNumber; GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer); /* * Construct a new root page containing downlinks to the new left * and right pages. (Do this in a temporary copy rather than * overwriting the original page directly, since we're not in the * critical section yet.) */ newrootpg = PageGetTempPage(newrpage); GinInitPage(newrootpg, GinPageGetOpaque(newlpage)->flags & ~(GIN_LEAF | GIN_COMPRESSED), BLCKSZ); btree->fillRoot(btree, newrootpg, BufferGetBlockNumber(lbuffer), newlpage, BufferGetBlockNumber(rbuffer), newrpage); } else { /* splitting a non-root page */ data.rrlink = savedRightLink; GinPageGetOpaque(newrpage)->rightlink = savedRightLink; GinPageGetOpaque(newlpage)->flags |= GIN_INCOMPLETE_SPLIT; GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer); } /* * OK, we have the new contents of the left page in a temporary copy * now (newlpage), and likewise for the new contents of the * newly-allocated right block. The original page is still unchanged. * * If this is a root split, we also have a temporary page containing * the new contents of the root. */ START_CRIT_SECTION(); MarkBufferDirty(rbuffer); MarkBufferDirty(stack->buffer); /* * Restore the temporary copies over the real buffers. */ if (stack->parent == NULL) { /* Splitting the root, three pages to update */ MarkBufferDirty(lbuffer); memcpy(page, newrootpg, BLCKSZ); memcpy(BufferGetPage(lbuffer), newlpage, BLCKSZ); memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ); } else { /* Normal split, only two pages to update */ memcpy(page, newlpage, BLCKSZ); memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ); } /* We also clear childbuf's INCOMPLETE_SPLIT flag, if passed */ if (BufferIsValid(childbuf)) { GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT; MarkBufferDirty(childbuf); } /* write WAL record */ if (RelationNeedsWAL(btree->index)) { XLogRecPtr recptr; XLogBeginInsert(); /* * We just take full page images of all the split pages. Splits * are uncommon enough that it's not worth complicating the code * to be more efficient. */ if (stack->parent == NULL) { XLogRegisterBuffer(0, lbuffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); XLogRegisterBuffer(1, rbuffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); XLogRegisterBuffer(2, stack->buffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); } else { XLogRegisterBuffer(0, stack->buffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); XLogRegisterBuffer(1, rbuffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); } if (BufferIsValid(childbuf)) XLogRegisterBuffer(3, childbuf, REGBUF_STANDARD); XLogRegisterData((char *) &data, sizeof(ginxlogSplit)); recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT); PageSetLSN(page, recptr); PageSetLSN(BufferGetPage(rbuffer), recptr); if (stack->parent == NULL) PageSetLSN(BufferGetPage(lbuffer), recptr); if (BufferIsValid(childbuf)) PageSetLSN(childpage, recptr); } END_CRIT_SECTION(); /* * We can release the locks/pins on the new pages now, but keep * stack->buffer locked. childbuf doesn't get unlocked either. */ UnlockReleaseBuffer(rbuffer); if (stack->parent == NULL) UnlockReleaseBuffer(lbuffer); /* * If we split the root, we're done. Otherwise the split is not * complete until the downlink for the new page has been inserted to * the parent. */ result = (stack->parent == NULL); } else { elog(ERROR, "invalid return code from GIN placeToPage method: %d", rc); result = false; /* keep compiler quiet */ } /* Clean up temp context */ MemoryContextSwitchTo(oldCxt); MemoryContextDelete(tmpCxt); return result; }
/* * Attempt to expand the hash table by creating one new bucket. * * This will silently do nothing if we don't get cleanup lock on old or * new bucket. * * Complete the pending splits and remove the tuples from old bucket, * if there are any left over from the previous split. * * The caller must hold a pin, but no lock, on the metapage buffer. * The buffer is returned in the same state. */ void _hash_expandtable(Relation rel, Buffer metabuf) { HashMetaPage metap; Bucket old_bucket; Bucket new_bucket; uint32 spare_ndx; BlockNumber start_oblkno; BlockNumber start_nblkno; Buffer buf_nblkno; Buffer buf_oblkno; Page opage; Page npage; HashPageOpaque oopaque; HashPageOpaque nopaque; uint32 maxbucket; uint32 highmask; uint32 lowmask; bool metap_update_masks = false; bool metap_update_splitpoint = false; restart_expand: /* * Write-lock the meta page. It used to be necessary to acquire a * heavyweight lock to begin a split, but that is no longer required. */ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); _hash_checkpage(rel, metabuf, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); /* * Check to see if split is still needed; someone else might have already * done one while we waited for the lock. * * Make sure this stays in sync with _hash_doinsert() */ if (metap->hashm_ntuples <= (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1)) goto fail; /* * Can't split anymore if maxbucket has reached its maximum possible * value. * * Ideally we'd allow bucket numbers up to UINT_MAX-1 (no higher because * the calculation maxbucket+1 mustn't overflow). Currently we restrict * to half that because of overflow looping in _hash_log2() and * insufficient space in hashm_spares[]. It's moot anyway because an * index with 2^32 buckets would certainly overflow BlockNumber and hence * _hash_alloc_buckets() would fail, but if we supported buckets smaller * than a disk block then this would be an independent constraint. * * If you change this, see also the maximum initial number of buckets in * _hash_init(). */ if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE) goto fail; /* * Determine which bucket is to be split, and attempt to take cleanup lock * on the old bucket. If we can't get the lock, give up. * * The cleanup lock protects us not only against other backends, but * against our own backend as well. * * The cleanup lock is mainly to protect the split from concurrent * inserts. See src/backend/access/hash/README, Lock Definitions for * further details. Due to this locking restriction, if there is any * pending scan, the split will give up which is not good, but harmless. */ new_bucket = metap->hashm_maxbucket + 1; old_bucket = (new_bucket & metap->hashm_lowmask); start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket); buf_oblkno = _hash_getbuf_with_condlock_cleanup(rel, start_oblkno, LH_BUCKET_PAGE); if (!buf_oblkno) goto fail; opage = BufferGetPage(buf_oblkno); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); /* * We want to finish the split from a bucket as there is no apparent * benefit by not doing so and it will make the code complicated to finish * the split that involves multiple buckets considering the case where new * split also fails. We don't need to consider the new bucket for * completing the split here as it is not possible that a re-split of new * bucket starts when there is still a pending split from old bucket. */ if (H_BUCKET_BEING_SPLIT(oopaque)) { /* * Copy bucket mapping info now; refer the comment in code below where * we copy this information before calling _hash_splitbucket to see * why this is okay. */ maxbucket = metap->hashm_maxbucket; highmask = metap->hashm_highmask; lowmask = metap->hashm_lowmask; /* * Release the lock on metapage and old_bucket, before completing the * split. */ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); LockBuffer(buf_oblkno, BUFFER_LOCK_UNLOCK); _hash_finish_split(rel, metabuf, buf_oblkno, old_bucket, maxbucket, highmask, lowmask); /* release the pin on old buffer and retry for expand. */ _hash_dropbuf(rel, buf_oblkno); goto restart_expand; } /* * Clean the tuples remained from the previous split. This operation * requires cleanup lock and we already have one on the old bucket, so * let's do it. We also don't want to allow further splits from the bucket * till the garbage of previous split is cleaned. This has two * advantages; first, it helps in avoiding the bloat due to garbage and * second is, during cleanup of bucket, we are always sure that the * garbage tuples belong to most recently split bucket. On the contrary, * if we allow cleanup of bucket after meta page is updated to indicate * the new split and before the actual split, the cleanup operation won't * be able to decide whether the tuple has been moved to the newly created * bucket and ended up deleting such tuples. */ if (H_NEEDS_SPLIT_CLEANUP(oopaque)) { /* * Copy bucket mapping info now; refer to the comment in code below * where we copy this information before calling _hash_splitbucket to * see why this is okay. */ maxbucket = metap->hashm_maxbucket; highmask = metap->hashm_highmask; lowmask = metap->hashm_lowmask; /* Release the metapage lock. */ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); hashbucketcleanup(rel, old_bucket, buf_oblkno, start_oblkno, NULL, maxbucket, highmask, lowmask, NULL, NULL, true, NULL, NULL); _hash_dropbuf(rel, buf_oblkno); goto restart_expand; } /* * There shouldn't be any active scan on new bucket. * * Note: it is safe to compute the new bucket's blkno here, even though we * may still need to update the BUCKET_TO_BLKNO mapping. This is because * the current value of hashm_spares[hashm_ovflpoint] correctly shows * where we are going to put a new splitpoint's worth of buckets. */ start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket); /* * If the split point is increasing we need to allocate a new batch of * bucket pages. */ spare_ndx = _hash_spareindex(new_bucket + 1); if (spare_ndx > metap->hashm_ovflpoint) { uint32 buckets_to_add; Assert(spare_ndx == metap->hashm_ovflpoint + 1); /* * We treat allocation of buckets as a separate WAL-logged action. * Even if we fail after this operation, won't leak bucket pages; * rather, the next split will consume this space. In any case, even * without failure we don't use all the space in one split operation. */ buckets_to_add = _hash_get_totalbuckets(spare_ndx) - new_bucket; if (!_hash_alloc_buckets(rel, start_nblkno, buckets_to_add)) { /* can't split due to BlockNumber overflow */ _hash_relbuf(rel, buf_oblkno); goto fail; } } /* * Physically allocate the new bucket's primary page. We want to do this * before changing the metapage's mapping info, in case we can't get the * disk space. Ideally, we don't need to check for cleanup lock on new * bucket as no other backend could find this bucket unless meta page is * updated. However, it is good to be consistent with old bucket locking. */ buf_nblkno = _hash_getnewbuf(rel, start_nblkno, MAIN_FORKNUM); if (!IsBufferCleanupOK(buf_nblkno)) { _hash_relbuf(rel, buf_oblkno); _hash_relbuf(rel, buf_nblkno); goto fail; } /* * Since we are scribbling on the pages in the shared buffers, establish a * critical section. Any failure in this next code leaves us with a big * problem: the metapage is effectively corrupt but could get written back * to disk. */ START_CRIT_SECTION(); /* * Okay to proceed with split. Update the metapage bucket mapping info. */ metap->hashm_maxbucket = new_bucket; if (new_bucket > metap->hashm_highmask) { /* Starting a new doubling */ metap->hashm_lowmask = metap->hashm_highmask; metap->hashm_highmask = new_bucket | metap->hashm_lowmask; metap_update_masks = true; } /* * If the split point is increasing we need to adjust the hashm_spares[] * array and hashm_ovflpoint so that future overflow pages will be created * beyond this new batch of bucket pages. */ if (spare_ndx > metap->hashm_ovflpoint) { metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint]; metap->hashm_ovflpoint = spare_ndx; metap_update_splitpoint = true; } MarkBufferDirty(metabuf); /* * Copy bucket mapping info now; this saves re-accessing the meta page * inside _hash_splitbucket's inner loop. Note that once we drop the * split lock, other splits could begin, so these values might be out of * date before _hash_splitbucket finishes. That's okay, since all it * needs is to tell which of these two buckets to map hashkeys into. */ maxbucket = metap->hashm_maxbucket; highmask = metap->hashm_highmask; lowmask = metap->hashm_lowmask; opage = BufferGetPage(buf_oblkno); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); /* * Mark the old bucket to indicate that split is in progress. (At * operation end, we will clear the split-in-progress flag.) Also, for a * primary bucket page, hasho_prevblkno stores the number of buckets that * existed as of the last split, so we must update that value here. */ oopaque->hasho_flag |= LH_BUCKET_BEING_SPLIT; oopaque->hasho_prevblkno = maxbucket; MarkBufferDirty(buf_oblkno); npage = BufferGetPage(buf_nblkno); /* * initialize the new bucket's primary page and mark it to indicate that * split is in progress. */ nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); nopaque->hasho_prevblkno = maxbucket; nopaque->hasho_nextblkno = InvalidBlockNumber; nopaque->hasho_bucket = new_bucket; nopaque->hasho_flag = LH_BUCKET_PAGE | LH_BUCKET_BEING_POPULATED; nopaque->hasho_page_id = HASHO_PAGE_ID; MarkBufferDirty(buf_nblkno); /* XLOG stuff */ if (RelationNeedsWAL(rel)) { xl_hash_split_allocate_page xlrec; XLogRecPtr recptr; xlrec.new_bucket = maxbucket; xlrec.old_bucket_flag = oopaque->hasho_flag; xlrec.new_bucket_flag = nopaque->hasho_flag; xlrec.flags = 0; XLogBeginInsert(); XLogRegisterBuffer(0, buf_oblkno, REGBUF_STANDARD); XLogRegisterBuffer(1, buf_nblkno, REGBUF_WILL_INIT); XLogRegisterBuffer(2, metabuf, REGBUF_STANDARD); if (metap_update_masks) { xlrec.flags |= XLH_SPLIT_META_UPDATE_MASKS; XLogRegisterBufData(2, (char *) &metap->hashm_lowmask, sizeof(uint32)); XLogRegisterBufData(2, (char *) &metap->hashm_highmask, sizeof(uint32)); } if (metap_update_splitpoint) { xlrec.flags |= XLH_SPLIT_META_UPDATE_SPLITPOINT; XLogRegisterBufData(2, (char *) &metap->hashm_ovflpoint, sizeof(uint32)); XLogRegisterBufData(2, (char *) &metap->hashm_spares[metap->hashm_ovflpoint], sizeof(uint32)); } XLogRegisterData((char *) &xlrec, SizeOfHashSplitAllocPage); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_ALLOCATE_PAGE); PageSetLSN(BufferGetPage(buf_oblkno), recptr); PageSetLSN(BufferGetPage(buf_nblkno), recptr); PageSetLSN(BufferGetPage(metabuf), recptr); } END_CRIT_SECTION(); /* drop lock, but keep pin */ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); /* Relocate records to the new bucket */ _hash_splitbucket(rel, metabuf, old_bucket, new_bucket, buf_oblkno, buf_nblkno, NULL, maxbucket, highmask, lowmask); /* all done, now release the pins on primary buckets. */ _hash_dropbuf(rel, buf_oblkno); _hash_dropbuf(rel, buf_nblkno); return; /* Here if decide not to split or fail to acquire old bucket lock */ fail: /* We didn't write the metapage, so just drop lock */ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); }
/* * _hash_init() -- Initialize the metadata page of a hash index, * the initial buckets, and the initial bitmap page. * * The initial number of buckets is dependent on num_tuples, an estimate * of the number of tuples to be loaded into the index initially. The * chosen number of buckets is returned. * * We are fairly cavalier about locking here, since we know that no one else * could be accessing this index. In particular the rule about not holding * multiple buffer locks is ignored. */ uint32 _hash_init(Relation rel, double num_tuples, ForkNumber forkNum) { Buffer metabuf; Buffer buf; Buffer bitmapbuf; Page pg; HashMetaPage metap; RegProcedure procid; int32 data_width; int32 item_width; int32 ffactor; uint32 num_buckets; uint32 i; bool use_wal; /* safety check */ if (RelationGetNumberOfBlocksInFork(rel, forkNum) != 0) elog(ERROR, "cannot initialize non-empty hash index \"%s\"", RelationGetRelationName(rel)); /* * WAL log creation of pages if the relation is persistent, or this is the * init fork. Init forks for unlogged relations always need to be WAL * logged. */ use_wal = RelationNeedsWAL(rel) || forkNum == INIT_FORKNUM; /* * Determine the target fill factor (in tuples per bucket) for this index. * The idea is to make the fill factor correspond to pages about as full * as the user-settable fillfactor parameter says. We can compute it * exactly since the index datatype (i.e. uint32 hash key) is fixed-width. */ data_width = sizeof(uint32); item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) + sizeof(ItemIdData); /* include the line pointer */ ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / item_width; /* keep to a sane range */ if (ffactor < 10) ffactor = 10; procid = index_getprocid(rel, 1, HASHSTANDARD_PROC); /* * We initialize the metapage, the first N bucket pages, and the first * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend() * calls to occur. This ensures that the smgr level has the right idea of * the physical index length. * * Critical section not required, because on error the creation of the * whole relation will be rolled back. */ metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum); _hash_init_metabuffer(metabuf, num_tuples, procid, ffactor, false); MarkBufferDirty(metabuf); pg = BufferGetPage(metabuf); metap = HashPageGetMeta(pg); /* XLOG stuff */ if (use_wal) { xl_hash_init_meta_page xlrec; XLogRecPtr recptr; xlrec.num_tuples = num_tuples; xlrec.procid = metap->hashm_procid; xlrec.ffactor = metap->hashm_ffactor; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHashInitMetaPage); XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_META_PAGE); PageSetLSN(BufferGetPage(metabuf), recptr); } num_buckets = metap->hashm_maxbucket + 1; /* * Release buffer lock on the metapage while we initialize buckets. * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS * won't accomplish anything. It's a bad idea to hold buffer locks for * long intervals in any case, since that can block the bgwriter. */ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); /* * Initialize and WAL Log the first N buckets */ for (i = 0; i < num_buckets; i++) { BlockNumber blkno; /* Allow interrupts, in case N is huge */ CHECK_FOR_INTERRUPTS(); blkno = BUCKET_TO_BLKNO(metap, i); buf = _hash_getnewbuf(rel, blkno, forkNum); _hash_initbuf(buf, metap->hashm_maxbucket, i, LH_BUCKET_PAGE, false); MarkBufferDirty(buf); if (use_wal) log_newpage(&rel->rd_node, forkNum, blkno, BufferGetPage(buf), true); _hash_relbuf(rel, buf); } /* Now reacquire buffer lock on metapage */ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); /* * Initialize bitmap page */ bitmapbuf = _hash_getnewbuf(rel, num_buckets + 1, forkNum); _hash_initbitmapbuffer(bitmapbuf, metap->hashm_bmsize, false); MarkBufferDirty(bitmapbuf); /* add the new bitmap page to the metapage's list of bitmaps */ /* metapage already has a write lock */ if (metap->hashm_nmaps >= HASH_MAX_BITMAPS) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("out of overflow pages in hash index \"%s\"", RelationGetRelationName(rel)))); metap->hashm_mapp[metap->hashm_nmaps] = num_buckets + 1; metap->hashm_nmaps++; MarkBufferDirty(metabuf); /* XLOG stuff */ if (use_wal) { xl_hash_init_bitmap_page xlrec; XLogRecPtr recptr; xlrec.bmsize = metap->hashm_bmsize; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHashInitBitmapPage); XLogRegisterBuffer(0, bitmapbuf, REGBUF_WILL_INIT); /* * This is safe only because nobody else can be modifying the index at * this stage; it's only visible to the transaction that is creating * it. */ XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_BITMAP_PAGE); PageSetLSN(BufferGetPage(bitmapbuf), recptr); PageSetLSN(BufferGetPage(metabuf), recptr); } /* all done */ _hash_relbuf(rel, bitmapbuf); _hash_relbuf(rel, metabuf); return num_buckets; }
/* * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket' * * This routine is used to partition the tuples between old and new bucket and * is used to finish the incomplete split operations. To finish the previously * interrupted split operation, the caller needs to fill htab. If htab is set, * then we skip the movement of tuples that exists in htab, otherwise NULL * value of htab indicates movement of all the tuples that belong to the new * bucket. * * We are splitting a bucket that consists of a base bucket page and zero * or more overflow (bucket chain) pages. We must relocate tuples that * belong in the new bucket. * * The caller must hold cleanup locks on both buckets to ensure that * no one else is trying to access them (see README). * * The caller must hold a pin, but no lock, on the metapage buffer. * The buffer is returned in the same state. (The metapage is only * touched if it becomes necessary to add or remove overflow pages.) * * Split needs to retain pin on primary bucket pages of both old and new * buckets till end of operation. This is to prevent vacuum from starting * while a split is in progress. * * In addition, the caller must have created the new bucket's base page, * which is passed in buffer nbuf, pinned and write-locked. The lock will be * released here and pin must be released by the caller. (The API is set up * this way because we must do _hash_getnewbuf() before releasing the metapage * write lock. So instead of passing the new bucket's start block number, we * pass an actual buffer.) */ static void _hash_splitbucket(Relation rel, Buffer metabuf, Bucket obucket, Bucket nbucket, Buffer obuf, Buffer nbuf, HTAB *htab, uint32 maxbucket, uint32 highmask, uint32 lowmask) { Buffer bucket_obuf; Buffer bucket_nbuf; Page opage; Page npage; HashPageOpaque oopaque; HashPageOpaque nopaque; OffsetNumber itup_offsets[MaxIndexTuplesPerPage]; IndexTuple itups[MaxIndexTuplesPerPage]; Size all_tups_size = 0; int i; uint16 nitups = 0; bucket_obuf = obuf; opage = BufferGetPage(obuf); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); bucket_nbuf = nbuf; npage = BufferGetPage(nbuf); nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); /* * Partition the tuples in the old bucket between the old bucket and the * new bucket, advancing along the old bucket's overflow bucket chain and * adding overflow pages to the new bucket as needed. Outer loop iterates * once per page in old bucket. */ for (;;) { BlockNumber oblkno; OffsetNumber ooffnum; OffsetNumber omaxoffnum; /* Scan each tuple in old page */ omaxoffnum = PageGetMaxOffsetNumber(opage); for (ooffnum = FirstOffsetNumber; ooffnum <= omaxoffnum; ooffnum = OffsetNumberNext(ooffnum)) { IndexTuple itup; Size itemsz; Bucket bucket; bool found = false; /* skip dead tuples */ if (ItemIdIsDead(PageGetItemId(opage, ooffnum))) continue; /* * Before inserting a tuple, probe the hash table containing TIDs * of tuples belonging to new bucket, if we find a match, then * skip that tuple, else fetch the item's hash key (conveniently * stored in the item) and determine which bucket it now belongs * in. */ itup = (IndexTuple) PageGetItem(opage, PageGetItemId(opage, ooffnum)); if (htab) (void) hash_search(htab, &itup->t_tid, HASH_FIND, &found); if (found) continue; bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup), maxbucket, highmask, lowmask); if (bucket == nbucket) { IndexTuple new_itup; /* * make a copy of index tuple as we have to scribble on it. */ new_itup = CopyIndexTuple(itup); /* * mark the index tuple as moved by split, such tuples are * skipped by scan if there is split in progress for a bucket. */ new_itup->t_info |= INDEX_MOVED_BY_SPLIT_MASK; /* * insert the tuple into the new bucket. if it doesn't fit on * the current page in the new bucket, we must allocate a new * overflow page and place the tuple on that page instead. */ itemsz = IndexTupleDSize(*new_itup); itemsz = MAXALIGN(itemsz); if (PageGetFreeSpaceForMultipleTuples(npage, nitups + 1) < (all_tups_size + itemsz)) { /* * Change the shared buffer state in critical section, * otherwise any error could make it unrecoverable. */ START_CRIT_SECTION(); _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups); MarkBufferDirty(nbuf); /* log the split operation before releasing the lock */ log_split_page(rel, nbuf); END_CRIT_SECTION(); /* drop lock, but keep pin */ LockBuffer(nbuf, BUFFER_LOCK_UNLOCK); /* be tidy */ for (i = 0; i < nitups; i++) pfree(itups[i]); nitups = 0; all_tups_size = 0; /* chain to a new overflow page */ nbuf = _hash_addovflpage(rel, metabuf, nbuf, (nbuf == bucket_nbuf) ? true : false); npage = BufferGetPage(nbuf); nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); } itups[nitups++] = new_itup; all_tups_size += itemsz; } else { /* * the tuple stays on this page, so nothing to do. */ Assert(bucket == obucket); } } oblkno = oopaque->hasho_nextblkno; /* retain the pin on the old primary bucket */ if (obuf == bucket_obuf) LockBuffer(obuf, BUFFER_LOCK_UNLOCK); else _hash_relbuf(rel, obuf); /* Exit loop if no more overflow pages in old bucket */ if (!BlockNumberIsValid(oblkno)) { /* * Change the shared buffer state in critical section, otherwise * any error could make it unrecoverable. */ START_CRIT_SECTION(); _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups); MarkBufferDirty(nbuf); /* log the split operation before releasing the lock */ log_split_page(rel, nbuf); END_CRIT_SECTION(); if (nbuf == bucket_nbuf) LockBuffer(nbuf, BUFFER_LOCK_UNLOCK); else _hash_relbuf(rel, nbuf); /* be tidy */ for (i = 0; i < nitups; i++) pfree(itups[i]); break; } /* Else, advance to next old page */ obuf = _hash_getbuf(rel, oblkno, HASH_READ, LH_OVERFLOW_PAGE); opage = BufferGetPage(obuf); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); } /* * We're at the end of the old bucket chain, so we're done partitioning * the tuples. Mark the old and new buckets to indicate split is * finished. * * To avoid deadlocks due to locking order of buckets, first lock the old * bucket and then the new bucket. */ LockBuffer(bucket_obuf, BUFFER_LOCK_EXCLUSIVE); opage = BufferGetPage(bucket_obuf); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); LockBuffer(bucket_nbuf, BUFFER_LOCK_EXCLUSIVE); npage = BufferGetPage(bucket_nbuf); nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); START_CRIT_SECTION(); oopaque->hasho_flag &= ~LH_BUCKET_BEING_SPLIT; nopaque->hasho_flag &= ~LH_BUCKET_BEING_POPULATED; /* * After the split is finished, mark the old bucket to indicate that it * contains deletable tuples. We will clear split-cleanup flag after * deleting such tuples either at the end of split or at the next split * from old bucket or at the time of vacuum. */ oopaque->hasho_flag |= LH_BUCKET_NEEDS_SPLIT_CLEANUP; /* * now write the buffers, here we don't release the locks as caller is * responsible to release locks. */ MarkBufferDirty(bucket_obuf); MarkBufferDirty(bucket_nbuf); if (RelationNeedsWAL(rel)) { XLogRecPtr recptr; xl_hash_split_complete xlrec; xlrec.old_bucket_flag = oopaque->hasho_flag; xlrec.new_bucket_flag = nopaque->hasho_flag; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHashSplitComplete); XLogRegisterBuffer(0, bucket_obuf, REGBUF_STANDARD); XLogRegisterBuffer(1, bucket_nbuf, REGBUF_STANDARD); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_COMPLETE); PageSetLSN(BufferGetPage(bucket_obuf), recptr); PageSetLSN(BufferGetPage(bucket_nbuf), recptr); } END_CRIT_SECTION(); /* * If possible, clean up the old bucket. We might not be able to do this * if someone else has a pin on it, but if not then we can go ahead. This * isn't absolutely necessary, but it reduces bloat; if we don't do it * now, VACUUM will do it eventually, but maybe not until new overflow * pages have been allocated. Note that there's no need to clean up the * new bucket. */ if (IsBufferCleanupOK(bucket_obuf)) { LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK); hashbucketcleanup(rel, obucket, bucket_obuf, BufferGetBlockNumber(bucket_obuf), NULL, maxbucket, highmask, lowmask, NULL, NULL, true, NULL, NULL); } else { LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK); LockBuffer(bucket_obuf, BUFFER_LOCK_UNLOCK); } }
/* * Update tuple origtup (size origsz), located in offset oldoff of buffer * oldbuf, to newtup (size newsz) as summary tuple for the page range starting * at heapBlk. oldbuf must not be locked on entry, and is not locked at exit. * * If samepage is true, attempt to put the new tuple in the same page, but if * there's no room, use some other one. * * If the update is successful, return true; the revmap is updated to point to * the new tuple. If the update is not done for whatever reason, return false. * Caller may retry the update if this happens. */ bool brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, BrinRevmap *revmap, BlockNumber heapBlk, Buffer oldbuf, OffsetNumber oldoff, const BrinTuple *origtup, Size origsz, const BrinTuple *newtup, Size newsz, bool samepage) { Page oldpage; ItemId oldlp; BrinTuple *oldtup; Size oldsz; Buffer newbuf; BrinSpecialSpace *special; bool extended = false; newsz = MAXALIGN(newsz); /* make sure the revmap is long enough to contain the entry we need */ brinRevmapExtend(revmap, heapBlk); if (!samepage) { /* need a page on which to put the item */ newbuf = brin_getinsertbuffer(idxrel, oldbuf, newsz, &extended); /* XXX delay vacuuming FSM until locks are released? */ if (extended) FreeSpaceMapVacuum(idxrel); if (!BufferIsValid(newbuf)) return false; /* * Note: it's possible (though unlikely) that the returned newbuf is * the same as oldbuf, if brin_getinsertbuffer determined that the old * buffer does in fact have enough space. */ if (newbuf == oldbuf) newbuf = InvalidBuffer; } else { LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE); newbuf = InvalidBuffer; } oldpage = BufferGetPage(oldbuf); oldlp = PageGetItemId(oldpage, oldoff); /* * Check that the old tuple wasn't updated concurrently: it might have * moved someplace else entirely ... */ if (!ItemIdIsNormal(oldlp)) { LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); if (BufferIsValid(newbuf)) UnlockReleaseBuffer(newbuf); return false; } oldsz = ItemIdGetLength(oldlp); oldtup = (BrinTuple *) PageGetItem(oldpage, oldlp); /* * ... or it might have been updated in place to different contents. */ if (!brin_tuples_equal(oldtup, oldsz, origtup, origsz)) { LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); if (BufferIsValid(newbuf)) UnlockReleaseBuffer(newbuf); return false; } special = (BrinSpecialSpace *) PageGetSpecialPointer(oldpage); /* * Great, the old tuple is intact. We can proceed with the update. * * If there's enough room in the old page for the new tuple, replace it. * * Note that there might now be enough space on the page even though the * caller told us there isn't, if a concurrent update moved another tuple * elsewhere or replaced a tuple with a smaller one. */ if (((special->flags & BRIN_EVACUATE_PAGE) == 0) && brin_can_do_samepage_update(oldbuf, origsz, newsz)) { if (BufferIsValid(newbuf)) UnlockReleaseBuffer(newbuf); START_CRIT_SECTION(); PageIndexDeleteNoCompact(oldpage, &oldoff, 1); if (PageAddItem(oldpage, (Item) newtup, newsz, oldoff, true, false) == InvalidOffsetNumber) elog(ERROR, "failed to add BRIN tuple"); MarkBufferDirty(oldbuf); /* XLOG stuff */ if (RelationNeedsWAL(idxrel)) { xl_brin_samepage_update xlrec; XLogRecPtr recptr; uint8 info = XLOG_BRIN_SAMEPAGE_UPDATE; xlrec.offnum = oldoff; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfBrinSamepageUpdate); XLogRegisterBuffer(0, oldbuf, REGBUF_STANDARD); XLogRegisterBufData(0, (char *) newtup, newsz); recptr = XLogInsert(RM_BRIN_ID, info); PageSetLSN(oldpage, recptr); } END_CRIT_SECTION(); LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); return true; } else if (newbuf == InvalidBuffer) { /* * Not enough space, but caller said that there was. Tell them to * start over. */ LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); return false; } else { /* * Not enough free space on the oldpage. Put the new tuple on the new * page, and update the revmap. */ Page newpage = BufferGetPage(newbuf); Buffer revmapbuf; ItemPointerData newtid; OffsetNumber newoff; revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk); START_CRIT_SECTION(); PageIndexDeleteNoCompact(oldpage, &oldoff, 1); newoff = PageAddItem(newpage, (Item) newtup, newsz, InvalidOffsetNumber, false, false); if (newoff == InvalidOffsetNumber) elog(ERROR, "failed to add BRIN tuple to new page"); MarkBufferDirty(oldbuf); MarkBufferDirty(newbuf); ItemPointerSet(&newtid, BufferGetBlockNumber(newbuf), newoff); brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, newtid); MarkBufferDirty(revmapbuf); /* XLOG stuff */ if (RelationNeedsWAL(idxrel)) { xl_brin_update xlrec; XLogRecPtr recptr; uint8 info; info = XLOG_BRIN_UPDATE | (extended ? XLOG_BRIN_INIT_PAGE : 0); xlrec.insert.offnum = newoff; xlrec.insert.heapBlk = heapBlk; xlrec.insert.pagesPerRange = pagesPerRange; xlrec.oldOffnum = oldoff; XLogBeginInsert(); /* new page */ XLogRegisterData((char *) &xlrec, SizeOfBrinUpdate); XLogRegisterBuffer(0, newbuf, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0)); XLogRegisterBufData(0, (char *) newtup, newsz); /* revmap page */ XLogRegisterBuffer(1, revmapbuf, REGBUF_STANDARD); /* old page */ XLogRegisterBuffer(2, oldbuf, REGBUF_STANDARD); recptr = XLogInsert(RM_BRIN_ID, info); PageSetLSN(oldpage, recptr); PageSetLSN(newpage, recptr); PageSetLSN(BufferGetPage(revmapbuf), recptr); } END_CRIT_SECTION(); LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK); LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); UnlockReleaseBuffer(newbuf); return true; } }
Datum ginbuild(PG_FUNCTION_ARGS) { Relation heap = (Relation) PG_GETARG_POINTER(0); Relation index = (Relation) PG_GETARG_POINTER(1); IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2); IndexBuildResult *result; double reltuples; GinBuildState buildstate; Buffer RootBuffer, MetaBuffer; ItemPointerData *list; Datum entry; uint32 nlist; MemoryContext oldCtx; OffsetNumber attnum; if (RelationGetNumberOfBlocks(index) != 0) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); initGinState(&buildstate.ginstate, index); buildstate.indtuples = 0; memset(&buildstate.buildStats, 0, sizeof(GinStatsData)); /* initialize the meta page */ MetaBuffer = GinNewBuffer(index); /* initialize the root page */ RootBuffer = GinNewBuffer(index); START_CRIT_SECTION(); GinInitMetabuffer(MetaBuffer); MarkBufferDirty(MetaBuffer); GinInitBuffer(RootBuffer, GIN_LEAF); MarkBufferDirty(RootBuffer); if (RelationNeedsWAL(index)) { XLogRecPtr recptr; XLogRecData rdata; Page page; rdata.buffer = InvalidBuffer; rdata.data = (char *) &(index->rd_node); rdata.len = sizeof(RelFileNode); rdata.next = NULL; recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_INDEX, &rdata); page = BufferGetPage(RootBuffer); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); page = BufferGetPage(MetaBuffer); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } UnlockReleaseBuffer(MetaBuffer); UnlockReleaseBuffer(RootBuffer); END_CRIT_SECTION(); /* count the root as first entry page */ buildstate.buildStats.nEntryPages++; /* * create a temporary memory context that is reset once for each tuple * inserted into the index */ buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext, "Gin build temporary context", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); buildstate.funcCtx = AllocSetContextCreate(buildstate.tmpCtx, "Gin build temporary context for user-defined function", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); buildstate.accum.ginstate = &buildstate.ginstate; ginInitBA(&buildstate.accum); /* * Do the heap scan. We disallow sync scan here because dataPlaceToPage * prefers to receive tuples in TID order. */ reltuples = IndexBuildHeapScan(heap, index, indexInfo, false, ginBuildCallback, (void *) &buildstate); /* dump remaining entries to the index */ oldCtx = MemoryContextSwitchTo(buildstate.tmpCtx); ginBeginBAScan(&buildstate.accum); while ((list = ginGetEntry(&buildstate.accum, &attnum, &entry, &nlist)) != NULL) { /* there could be many entries, so be willing to abort here */ CHECK_FOR_INTERRUPTS(); ginEntryInsert(index, &buildstate.ginstate, attnum, entry, list, nlist, &buildstate.buildStats); } MemoryContextSwitchTo(oldCtx); MemoryContextDelete(buildstate.tmpCtx); /* * Update metapage stats */ buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index); ginUpdateStats(index, &buildstate.buildStats); /* * Return statistics */ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; PG_RETURN_POINTER(result); }
IndexBuildResult * ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) { IndexBuildResult *result; double reltuples; GinBuildState buildstate; Buffer RootBuffer, MetaBuffer; ItemPointerData *list; Datum key; GinNullCategory category; uint32 nlist; MemoryContext oldCtx; OffsetNumber attnum; if (RelationGetNumberOfBlocks(index) != 0) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); initGinState(&buildstate.ginstate, index); buildstate.indtuples = 0; memset(&buildstate.buildStats, 0, sizeof(GinStatsData)); /* initialize the meta page */ MetaBuffer = GinNewBuffer(index); /* initialize the root page */ RootBuffer = GinNewBuffer(index); START_CRIT_SECTION(); GinInitMetabuffer(MetaBuffer); MarkBufferDirty(MetaBuffer); GinInitBuffer(RootBuffer, GIN_LEAF); MarkBufferDirty(RootBuffer); if (RelationNeedsWAL(index)) { XLogRecPtr recptr; Page page; XLogBeginInsert(); XLogRegisterBuffer(0, MetaBuffer, REGBUF_WILL_INIT); XLogRegisterBuffer(1, RootBuffer, REGBUF_WILL_INIT); recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_INDEX); page = BufferGetPage(RootBuffer); PageSetLSN(page, recptr); page = BufferGetPage(MetaBuffer); PageSetLSN(page, recptr); } UnlockReleaseBuffer(MetaBuffer); UnlockReleaseBuffer(RootBuffer); END_CRIT_SECTION(); /* count the root as first entry page */ buildstate.buildStats.nEntryPages++; /* * create a temporary memory context that is used to hold data not yet * dumped out to the index */ buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext, "Gin build temporary context", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); /* * create a temporary memory context that is used for calling * ginExtractEntries(), and can be reset after each tuple */ buildstate.funcCtx = AllocSetContextCreate(CurrentMemoryContext, "Gin build temporary context for user-defined function", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); buildstate.accum.ginstate = &buildstate.ginstate; ginInitBA(&buildstate.accum); /* * Do the heap scan. We disallow sync scan here because dataPlaceToPage * prefers to receive tuples in TID order. */ reltuples = IndexBuildHeapScan(heap, index, indexInfo, false, ginBuildCallback, (void *) &buildstate); /* dump remaining entries to the index */ oldCtx = MemoryContextSwitchTo(buildstate.tmpCtx); ginBeginBAScan(&buildstate.accum); while ((list = ginGetBAEntry(&buildstate.accum, &attnum, &key, &category, &nlist)) != NULL) { /* there could be many entries, so be willing to abort here */ CHECK_FOR_INTERRUPTS(); ginEntryInsert(&buildstate.ginstate, attnum, key, category, list, nlist, &buildstate.buildStats); } MemoryContextSwitchTo(oldCtx); MemoryContextDelete(buildstate.funcCtx); MemoryContextDelete(buildstate.tmpCtx); /* * Update metapage stats */ buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index); ginUpdateStats(index, &buildstate.buildStats); /* * Return statistics */ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; return result; }
/* * Delete a posting tree page. */ static void ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkno, BlockNumber parentBlkno, OffsetNumber myoff, bool isParentRoot) { Buffer dBuffer; Buffer lBuffer; Buffer pBuffer; Page page, parentPage; BlockNumber rightlink; /* * Lock the pages in the same order as an insertion would, to avoid * deadlocks: left, then right, then parent. */ lBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, leftBlkno, RBM_NORMAL, gvs->strategy); dBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, deleteBlkno, RBM_NORMAL, gvs->strategy); pBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, parentBlkno, RBM_NORMAL, gvs->strategy); LockBuffer(lBuffer, GIN_EXCLUSIVE); LockBuffer(dBuffer, GIN_EXCLUSIVE); if (!isParentRoot) /* parent is already locked by * LockBufferForCleanup() */ LockBuffer(pBuffer, GIN_EXCLUSIVE); START_CRIT_SECTION(); /* Unlink the page by changing left sibling's rightlink */ page = BufferGetPage(dBuffer); rightlink = GinPageGetOpaque(page)->rightlink; page = BufferGetPage(lBuffer); GinPageGetOpaque(page)->rightlink = rightlink; /* Delete downlink from parent */ parentPage = BufferGetPage(pBuffer); #ifdef USE_ASSERT_CHECKING do { PostingItem *tod = GinDataPageGetPostingItem(parentPage, myoff); Assert(PostingItemGetBlockNumber(tod) == deleteBlkno); } while (0); #endif GinPageDeletePostingItem(parentPage, myoff); page = BufferGetPage(dBuffer); /* * we shouldn't change rightlink field to save workability of running * search scan */ GinPageGetOpaque(page)->flags = GIN_DELETED; MarkBufferDirty(pBuffer); MarkBufferDirty(lBuffer); MarkBufferDirty(dBuffer); if (RelationNeedsWAL(gvs->index)) { XLogRecPtr recptr; XLogRecData rdata[4]; ginxlogDeletePage data; data.node = gvs->index->rd_node; data.blkno = deleteBlkno; data.parentBlkno = parentBlkno; data.parentOffset = myoff; data.leftBlkno = leftBlkno; data.rightLink = GinPageGetOpaque(page)->rightlink; /* * We can't pass buffer_std = TRUE, because we didn't set pd_lower on * pre-9.4 versions. The page might've been binary-upgraded from an * older version, and hence not have pd_lower set correctly. Ditto for * the left page, but removing the item from the parent updated its * pd_lower, so we know that's OK at this point. */ rdata[0].buffer = dBuffer; rdata[0].buffer_std = FALSE; rdata[0].data = NULL; rdata[0].len = 0; rdata[0].next = rdata + 1; rdata[1].buffer = pBuffer; rdata[1].buffer_std = TRUE; rdata[1].data = NULL; rdata[1].len = 0; rdata[1].next = rdata + 2; rdata[2].buffer = lBuffer; rdata[2].buffer_std = FALSE; rdata[2].data = NULL; rdata[2].len = 0; rdata[2].next = rdata + 3; rdata[3].buffer = InvalidBuffer; rdata[3].buffer_std = FALSE; rdata[3].len = sizeof(ginxlogDeletePage); rdata[3].data = (char *) &data; rdata[3].next = NULL; recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_DELETE_PAGE, rdata); PageSetLSN(page, recptr); PageSetLSN(parentPage, recptr); PageSetLSN(BufferGetPage(lBuffer), recptr); } if (!isParentRoot) LockBuffer(pBuffer, GIN_UNLOCK); ReleaseBuffer(pBuffer); UnlockReleaseBuffer(lBuffer); UnlockReleaseBuffer(dBuffer); END_CRIT_SECTION(); gvs->result->pages_deleted++; }
/* * lazy_scan_heap() -- scan an open heap relation * * This routine sets commit status bits, builds lists of dead tuples * and pages with free space, and calculates statistics on the number * of live tuples in the heap. When done, or when we run low on space * for dead-tuple TIDs, invoke vacuuming of indexes and heap. * * If there are no indexes then we just vacuum each dirty page as we * process it, since there's no point in gathering many tuples. */ static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, Relation *Irel, int nindexes, bool scan_all) { BlockNumber nblocks, blkno; HeapTupleData tuple; char *relname; BlockNumber empty_pages, vacuumed_pages; double num_tuples, tups_vacuumed, nkeep, nunused; IndexBulkDeleteResult **indstats; int i; PGRUsage ru0; Buffer vmbuffer = InvalidBuffer; BlockNumber next_not_all_visible_block; bool skipping_all_visible_blocks; pg_rusage_init(&ru0); relname = RelationGetRelationName(onerel); ereport(elevel, (errmsg("vacuuming \"%s.%s\"", get_namespace_name(RelationGetNamespace(onerel)), relname))); empty_pages = vacuumed_pages = 0; num_tuples = tups_vacuumed = nkeep = nunused = 0; indstats = (IndexBulkDeleteResult **) palloc0(nindexes * sizeof(IndexBulkDeleteResult *)); nblocks = RelationGetNumberOfBlocks(onerel); vacrelstats->rel_pages = nblocks; vacrelstats->scanned_pages = 0; vacrelstats->nonempty_pages = 0; vacrelstats->latestRemovedXid = InvalidTransactionId; lazy_space_alloc(vacrelstats, nblocks); /* * We want to skip pages that don't require vacuuming according to the * visibility map, but only when we can skip at least SKIP_PAGES_THRESHOLD * consecutive pages. Since we're reading sequentially, the OS should be * doing readahead for us, so there's no gain in skipping a page now and * then; that's likely to disable readahead and so be counterproductive. * Also, skipping even a single page means that we can't update * relfrozenxid, so we only want to do it if we can skip a goodly number * of pages. * * Before entering the main loop, establish the invariant that * next_not_all_visible_block is the next block number >= blkno that's not * all-visible according to the visibility map, or nblocks if there's no * such block. Also, we set up the skipping_all_visible_blocks flag, * which is needed because we need hysteresis in the decision: once we've * started skipping blocks, we may as well skip everything up to the next * not-all-visible block. * * Note: if scan_all is true, we won't actually skip any pages; but we * maintain next_not_all_visible_block anyway, so as to set up the * all_visible_according_to_vm flag correctly for each page. */ for (next_not_all_visible_block = 0; next_not_all_visible_block < nblocks; next_not_all_visible_block++) { if (!visibilitymap_test(onerel, next_not_all_visible_block, &vmbuffer)) break; vacuum_delay_point(); } if (next_not_all_visible_block >= SKIP_PAGES_THRESHOLD) skipping_all_visible_blocks = true; else skipping_all_visible_blocks = false; for (blkno = 0; blkno < nblocks; blkno++) { Buffer buf; Page page; OffsetNumber offnum, maxoff; bool tupgone, hastup; int prev_dead_count; OffsetNumber frozen[MaxOffsetNumber]; int nfrozen; Size freespace; bool all_visible_according_to_vm; bool all_visible; bool has_dead_tuples; if (blkno == next_not_all_visible_block) { /* Time to advance next_not_all_visible_block */ for (next_not_all_visible_block++; next_not_all_visible_block < nblocks; next_not_all_visible_block++) { if (!visibilitymap_test(onerel, next_not_all_visible_block, &vmbuffer)) break; vacuum_delay_point(); } /* * We know we can't skip the current block. But set up * skipping_all_visible_blocks to do the right thing at the * following blocks. */ if (next_not_all_visible_block - blkno > SKIP_PAGES_THRESHOLD) skipping_all_visible_blocks = true; else skipping_all_visible_blocks = false; all_visible_according_to_vm = false; } else { /* Current block is all-visible */ if (skipping_all_visible_blocks && !scan_all) continue; all_visible_according_to_vm = true; } vacuum_delay_point(); vacrelstats->scanned_pages++; /* * If we are close to overrunning the available space for dead-tuple * TIDs, pause and do a cycle of vacuuming before we tackle this page. */ if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage && vacrelstats->num_dead_tuples > 0) { /* Log cleanup info before we touch indexes */ vacuum_log_cleanup_info(onerel, vacrelstats); /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats); /* Remove tuples from heap */ lazy_vacuum_heap(onerel, vacrelstats); /* * Forget the now-vacuumed tuples, and press on, but be careful * not to reset latestRemovedXid since we want that value to be * valid. */ vacrelstats->num_dead_tuples = 0; vacrelstats->num_index_scans++; } buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno, RBM_NORMAL, vac_strategy); /* We need buffer cleanup lock so that we can prune HOT chains. */ LockBufferForCleanup(buf); page = BufferGetPage(buf); if (PageIsNew(page)) { /* * An all-zeroes page could be left over if a backend extends the * relation but crashes before initializing the page. Reclaim such * pages for use. * * We have to be careful here because we could be looking at a * page that someone has just added to the relation and not yet * been able to initialize (see RelationGetBufferForTuple). To * protect against that, release the buffer lock, grab the * relation extension lock momentarily, and re-lock the buffer. If * the page is still uninitialized by then, it must be left over * from a crashed backend, and we can initialize it. * * We don't really need the relation lock when this is a new or * temp relation, but it's probably not worth the code space to * check that, since this surely isn't a critical path. * * Note: the comparable code in vacuum.c need not worry because * it's got exclusive lock on the whole relation. */ LockBuffer(buf, BUFFER_LOCK_UNLOCK); LockRelationForExtension(onerel, ExclusiveLock); UnlockRelationForExtension(onerel, ExclusiveLock); LockBufferForCleanup(buf); if (PageIsNew(page)) { ereport(WARNING, (errmsg("relation \"%s\" page %u is uninitialized --- fixing", relname, blkno))); PageInit(page, BufferGetPageSize(buf), 0); empty_pages++; } freespace = PageGetHeapFreeSpace(page); MarkBufferDirty(buf); UnlockReleaseBuffer(buf); RecordPageWithFreeSpace(onerel, blkno, freespace); continue; } if (PageIsEmpty(page)) { empty_pages++; freespace = PageGetHeapFreeSpace(page); if (!PageIsAllVisible(page)) { PageSetAllVisible(page); SetBufferCommitInfoNeedsSave(buf); } LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* Update the visibility map */ if (!all_visible_according_to_vm) { visibilitymap_pin(onerel, blkno, &vmbuffer); LockBuffer(buf, BUFFER_LOCK_SHARE); if (PageIsAllVisible(page)) visibilitymap_set(onerel, blkno, PageGetLSN(page), &vmbuffer); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } ReleaseBuffer(buf); RecordPageWithFreeSpace(onerel, blkno, freespace); continue; } /* * Prune all HOT-update chains in this page. * * We count tuples removed by the pruning step as removed by VACUUM. */ tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false, &vacrelstats->latestRemovedXid); /* * Now scan the page to collect vacuumable items and check for tuples * requiring freezing. */ all_visible = true; has_dead_tuples = false; nfrozen = 0; hastup = false; prev_dead_count = vacrelstats->num_dead_tuples; maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId itemid; itemid = PageGetItemId(page, offnum); /* Unused items require no processing, but we count 'em */ if (!ItemIdIsUsed(itemid)) { nunused += 1; continue; } /* Redirect items mustn't be touched */ if (ItemIdIsRedirected(itemid)) { hastup = true; /* this page won't be truncatable */ continue; } ItemPointerSet(&(tuple.t_self), blkno, offnum); /* * DEAD item pointers are to be vacuumed normally; but we don't * count them in tups_vacuumed, else we'd be double-counting (at * least in the common case where heap_page_prune() just freed up * a non-HOT tuple). */ if (ItemIdIsDead(itemid)) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); all_visible = false; continue; } Assert(ItemIdIsNormal(itemid)); tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); tupgone = false; switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf)) { case HEAPTUPLE_DEAD: /* * Ordinarily, DEAD tuples would have been removed by * heap_page_prune(), but it's possible that the tuple * state changed since heap_page_prune() looked. In * particular an INSERT_IN_PROGRESS tuple could have * changed to DEAD if the inserter aborted. So this * cannot be considered an error condition. * * If the tuple is HOT-updated then it must only be * removed by a prune operation; so we keep it just as if * it were RECENTLY_DEAD. Also, if it's a heap-only * tuple, we choose to keep it, because it'll be a lot * cheaper to get rid of it in the next pruning pass than * to treat it like an indexed tuple. */ if (HeapTupleIsHotUpdated(&tuple) || HeapTupleIsHeapOnly(&tuple)) nkeep += 1; else tupgone = true; /* we can delete the tuple */ all_visible = false; break; case HEAPTUPLE_LIVE: /* Tuple is good --- but let's do some validity checks */ if (onerel->rd_rel->relhasoids && !OidIsValid(HeapTupleGetOid(&tuple))) elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid", relname, blkno, offnum); /* * Is the tuple definitely visible to all transactions? * * NB: Like with per-tuple hint bits, we can't set the * PD_ALL_VISIBLE flag if the inserter committed * asynchronously. See SetHintBits for more info. Check * that the HEAP_XMIN_COMMITTED hint bit is set because of * that. */ if (all_visible) { TransactionId xmin; if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED)) { all_visible = false; break; } /* * The inserter definitely committed. But is it old * enough that everyone sees it as committed? */ xmin = HeapTupleHeaderGetXmin(tuple.t_data); if (!TransactionIdPrecedes(xmin, OldestXmin)) { all_visible = false; break; } } break; case HEAPTUPLE_RECENTLY_DEAD: /* * If tuple is recently deleted then we must not remove it * from relation. */ nkeep += 1; all_visible = false; break; case HEAPTUPLE_INSERT_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ all_visible = false; break; case HEAPTUPLE_DELETE_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ all_visible = false; break; default: elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); break; } if (tupgone) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data, &vacrelstats->latestRemovedXid); tups_vacuumed += 1; has_dead_tuples = true; } else { num_tuples += 1; hastup = true; /* * Each non-removable tuple must be checked to see if it needs * freezing. Note we already have exclusive buffer lock. */ if (heap_freeze_tuple(tuple.t_data, FreezeLimit, InvalidBuffer)) frozen[nfrozen++] = offnum; } } /* scan along page */ /* * If we froze any tuples, mark the buffer dirty, and write a WAL * record recording the changes. We must log the changes to be * crash-safe against future truncation of CLOG. */ if (nfrozen > 0) { MarkBufferDirty(buf); if (RelationNeedsWAL(onerel)) { XLogRecPtr recptr; recptr = log_heap_freeze(onerel, buf, FreezeLimit, frozen, nfrozen); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } } /* * If there are no indexes then we can vacuum the page right now * instead of doing a second scan. */ if (nindexes == 0 && vacrelstats->num_dead_tuples > 0) { /* Remove tuples from heap */ lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats); has_dead_tuples = false; /* * Forget the now-vacuumed tuples, and press on, but be careful * not to reset latestRemovedXid since we want that value to be * valid. */ vacrelstats->num_dead_tuples = 0; vacuumed_pages++; } freespace = PageGetHeapFreeSpace(page); /* Update the all-visible flag on the page */ if (!PageIsAllVisible(page) && all_visible) { PageSetAllVisible(page); SetBufferCommitInfoNeedsSave(buf); } /* * It's possible for the value returned by GetOldestXmin() to move * backwards, so it's not wrong for us to see tuples that appear to * not be visible to everyone yet, while PD_ALL_VISIBLE is already * set. The real safe xmin value never moves backwards, but * GetOldestXmin() is conservative and sometimes returns a value * that's unnecessarily small, so if we see that contradiction it just * means that the tuples that we think are not visible to everyone yet * actually are, and the PD_ALL_VISIBLE flag is correct. * * There should never be dead tuples on a page with PD_ALL_VISIBLE * set, however. */ else if (PageIsAllVisible(page) && has_dead_tuples) { elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u", relname, blkno); PageClearAllVisible(page); SetBufferCommitInfoNeedsSave(buf); /* * Normally, we would drop the lock on the heap page before * updating the visibility map, but since this case shouldn't * happen anyway, don't worry about that. */ visibilitymap_clear(onerel, blkno); } LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* Update the visibility map */ if (!all_visible_according_to_vm && all_visible) { visibilitymap_pin(onerel, blkno, &vmbuffer); LockBuffer(buf, BUFFER_LOCK_SHARE); if (PageIsAllVisible(page)) visibilitymap_set(onerel, blkno, PageGetLSN(page), &vmbuffer); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } ReleaseBuffer(buf); /* Remember the location of the last page with nonremovable tuples */ if (hastup) vacrelstats->nonempty_pages = blkno + 1; /* * If we remembered any tuples for deletion, then the page will be * visited again by lazy_vacuum_heap, which will compute and record * its post-compaction free space. If not, then we're done with this * page, so remember its free space as-is. (This path will always be * taken if there are no indexes.) */ if (vacrelstats->num_dead_tuples == prev_dead_count) RecordPageWithFreeSpace(onerel, blkno, freespace); } /* save stats for use later */ vacrelstats->scanned_tuples = num_tuples; vacrelstats->tuples_deleted = tups_vacuumed; /* now we can compute the new value for pg_class.reltuples */ vacrelstats->new_rel_tuples = vac_estimate_reltuples(onerel, false, nblocks, vacrelstats->scanned_pages, num_tuples); /* * Release any remaining pin on visibility map page. */ if (BufferIsValid(vmbuffer)) { ReleaseBuffer(vmbuffer); vmbuffer = InvalidBuffer; } /* If any tuples need to be deleted, perform final vacuum cycle */ /* XXX put a threshold on min number of tuples here? */ if (vacrelstats->num_dead_tuples > 0) { /* Log cleanup info before we touch indexes */ vacuum_log_cleanup_info(onerel, vacrelstats); /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats); /* Remove tuples from heap */ lazy_vacuum_heap(onerel, vacrelstats); vacrelstats->num_index_scans++; } /* Do post-vacuum cleanup and statistics update for each index */ for (i = 0; i < nindexes; i++) lazy_cleanup_index(Irel[i], indstats[i], vacrelstats); /* If no indexes, make log report that lazy_vacuum_heap would've made */ if (vacuumed_pages) ereport(elevel, (errmsg("\"%s\": removed %.0f row versions in %u pages", RelationGetRelationName(onerel), tups_vacuumed, vacuumed_pages))); ereport(elevel, (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages", RelationGetRelationName(onerel), tups_vacuumed, num_tuples, vacrelstats->scanned_pages, nblocks), errdetail("%.0f dead row versions cannot be removed yet.\n" "There were %.0f unused item pointers.\n" "%u pages are entirely empty.\n" "%s.", nkeep, nunused, empty_pages, pg_rusage_show(&ru0)))); }
/* * Create a WAL record for vacuuming entry tree leaf page. */ static void xlogVacuumPage(Relation index, Buffer buffer) { Page page = BufferGetPage(buffer); XLogRecPtr recptr; XLogRecData rdata[3]; ginxlogVacuumPage xlrec; uint16 lower; uint16 upper; /* This is only used for entry tree leaf pages. */ Assert(!GinPageIsData(page)); Assert(GinPageIsLeaf(page)); if (!RelationNeedsWAL(index)) return; xlrec.node = index->rd_node; xlrec.blkno = BufferGetBlockNumber(buffer); /* Assume we can omit data between pd_lower and pd_upper */ lower = ((PageHeader) page)->pd_lower; upper = ((PageHeader) page)->pd_upper; Assert(lower < BLCKSZ); Assert(upper < BLCKSZ); if (lower >= SizeOfPageHeaderData && upper > lower && upper <= BLCKSZ) { xlrec.hole_offset = lower; xlrec.hole_length = upper - lower; } else { /* No "hole" to compress out */ xlrec.hole_offset = 0; xlrec.hole_length = 0; } rdata[0].data = (char *) &xlrec; rdata[0].len = sizeof(ginxlogVacuumPage); rdata[0].buffer = InvalidBuffer; rdata[0].next = &rdata[1]; if (xlrec.hole_length == 0) { rdata[1].data = (char *) page; rdata[1].len = BLCKSZ; rdata[1].buffer = InvalidBuffer; rdata[1].next = NULL; } else { /* must skip the hole */ rdata[1].data = (char *) page; rdata[1].len = xlrec.hole_offset; rdata[1].buffer = InvalidBuffer; rdata[1].next = &rdata[2]; rdata[2].data = (char *) page + (xlrec.hole_offset + xlrec.hole_length); rdata[2].len = BLCKSZ - (xlrec.hole_offset + xlrec.hole_length); rdata[2].buffer = InvalidBuffer; rdata[2].next = NULL; } recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_VACUUM_PAGE, rdata); PageSetLSN(page, recptr); }
/* * Deletes pending list pages up to (not including) newHead page. * If newHead == InvalidBlockNumber then function drops the whole list. * * metapage is pinned and exclusive-locked throughout this function. */ static void shiftList(Relation index, Buffer metabuffer, BlockNumber newHead, bool fill_fsm, IndexBulkDeleteResult *stats) { Page metapage; GinMetaPageData *metadata; BlockNumber blknoToDelete; metapage = BufferGetPage(metabuffer); metadata = GinPageGetMeta(metapage); blknoToDelete = metadata->head; do { Page page; int i; int64 nDeletedHeapTuples = 0; ginxlogDeleteListPages data; Buffer buffers[GIN_NDELETE_AT_ONCE]; BlockNumber freespace[GIN_NDELETE_AT_ONCE]; data.ndeleted = 0; while (data.ndeleted < GIN_NDELETE_AT_ONCE && blknoToDelete != newHead) { freespace[data.ndeleted] = blknoToDelete; buffers[data.ndeleted] = ReadBuffer(index, blknoToDelete); LockBuffer(buffers[data.ndeleted], GIN_EXCLUSIVE); page = BufferGetPage(buffers[data.ndeleted]); data.ndeleted++; Assert(!GinPageIsDeleted(page)); nDeletedHeapTuples += GinPageGetOpaque(page)->maxoff; blknoToDelete = GinPageGetOpaque(page)->rightlink; } if (stats) stats->pages_deleted += data.ndeleted; /* * This operation touches an unusually large number of pages, so * prepare the XLogInsert machinery for that before entering the * critical section. */ if (RelationNeedsWAL(index)) XLogEnsureRecordSpace(data.ndeleted, 0); START_CRIT_SECTION(); metadata->head = blknoToDelete; Assert(metadata->nPendingPages >= data.ndeleted); metadata->nPendingPages -= data.ndeleted; Assert(metadata->nPendingHeapTuples >= nDeletedHeapTuples); metadata->nPendingHeapTuples -= nDeletedHeapTuples; if (blknoToDelete == InvalidBlockNumber) { metadata->tail = InvalidBlockNumber; metadata->tailFreeSize = 0; metadata->nPendingPages = 0; metadata->nPendingHeapTuples = 0; } /* * Set pd_lower just past the end of the metadata. This is essential, * because without doing so, metadata will be lost if xlog.c * compresses the page. (We must do this here because pre-v11 * versions of PG did not set the metapage's pd_lower correctly, so a * pg_upgraded index might contain the wrong value.) */ ((PageHeader) metapage)->pd_lower = ((char *) metadata + sizeof(GinMetaPageData)) - (char *) metapage; MarkBufferDirty(metabuffer); for (i = 0; i < data.ndeleted; i++) { page = BufferGetPage(buffers[i]); GinPageGetOpaque(page)->flags = GIN_DELETED; MarkBufferDirty(buffers[i]); } if (RelationNeedsWAL(index)) { XLogRecPtr recptr; XLogBeginInsert(); XLogRegisterBuffer(0, metabuffer, REGBUF_WILL_INIT | REGBUF_STANDARD); for (i = 0; i < data.ndeleted; i++) XLogRegisterBuffer(i + 1, buffers[i], REGBUF_WILL_INIT); memcpy(&data.metadata, metadata, sizeof(GinMetaPageData)); XLogRegisterData((char *) &data, sizeof(ginxlogDeleteListPages)); recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_DELETE_LISTPAGE); PageSetLSN(metapage, recptr); for (i = 0; i < data.ndeleted; i++) { page = BufferGetPage(buffers[i]); PageSetLSN(page, recptr); } } for (i = 0; i < data.ndeleted; i++) UnlockReleaseBuffer(buffers[i]); END_CRIT_SECTION(); for (i = 0; fill_fsm && i < data.ndeleted; i++) RecordFreeIndexPage(index, freespace[i]); } while (blknoToDelete != newHead); }
/* * FreeSpaceMapTruncateRel - adjust for truncation of a relation. * * The caller must hold AccessExclusiveLock on the relation, to ensure that * other backends receive the smgr invalidation event that this function sends * before they access the FSM again. * * nblocks is the new size of the heap. */ void FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks) { BlockNumber new_nfsmblocks; FSMAddress first_removed_address; uint16 first_removed_slot; Buffer buf; RelationOpenSmgr(rel); /* * If no FSM has been created yet for this relation, there's nothing to * truncate. */ if (!smgrexists(rel->rd_smgr, FSM_FORKNUM)) return; /* Get the location in the FSM of the first removed heap block */ first_removed_address = fsm_get_location(nblocks, &first_removed_slot); /* * Zero out the tail of the last remaining FSM page. If the slot * representing the first removed heap block is at a page boundary, as the * first slot on the FSM page that first_removed_address points to, we can * just truncate that page altogether. */ if (first_removed_slot > 0) { buf = fsm_readbuf(rel, first_removed_address, false); if (!BufferIsValid(buf)) return; /* nothing to do; the FSM was already smaller */ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); /* NO EREPORT(ERROR) from here till changes are logged */ START_CRIT_SECTION(); fsm_truncate_avail(BufferGetPage(buf), first_removed_slot); /* * Truncation of a relation is WAL-logged at a higher-level, and we * will be called at WAL replay. But if checksums are enabled, we need * to still write a WAL record to protect against a torn page, if the * page is flushed to disk before the truncation WAL record. We cannot * use MarkBufferDirtyHint here, because that will not dirty the page * during recovery. */ MarkBufferDirty(buf); if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded()) log_newpage_buffer(buf, false); END_CRIT_SECTION(); UnlockReleaseBuffer(buf); new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1; } else { new_nfsmblocks = fsm_logical_to_physical(first_removed_address); if (smgrnblocks(rel->rd_smgr, FSM_FORKNUM) <= new_nfsmblocks) return; /* nothing to do; the FSM was already smaller */ } /* Truncate the unused FSM pages, and send smgr inval message */ smgrtruncate(rel->rd_smgr, FSM_FORKNUM, new_nfsmblocks); /* * We might as well update the local smgr_fsm_nblocks setting. * smgrtruncate sent an smgr cache inval message, which will cause other * backends to invalidate their copy of smgr_fsm_nblocks, and this one too * at the next command boundary. But this ensures it isn't outright wrong * until then. */ if (rel->rd_smgr) rel->rd_smgr->smgr_fsm_nblocks = new_nfsmblocks; /* * Update upper-level FSM pages to account for the truncation. This is * important because the just-truncated pages were likely marked as * all-free, and would be preferentially selected. */ FreeSpaceMapVacuumRange(rel, nblocks, InvalidBlockNumber); }
/* * Insert value (stored in GinBtree) to tree described by stack * * During an index build, buildStats is non-null and the counters * it contains should be incremented as needed. * * NB: the passed-in stack is freed, as though by freeGinBtreeStack. */ void ginInsertValue(GinBtree btree, GinBtreeStack *stack, GinStatsData *buildStats) { GinBtreeStack *parent = stack; BlockNumber rootBlkno = InvalidBuffer; Page page, rpage, lpage; /* remember root BlockNumber */ while (parent) { rootBlkno = parent->blkno; parent = parent->parent; } while (stack) { XLogRecData *rdata; BlockNumber savedRightLink; page = BufferGetPage(stack->buffer); savedRightLink = GinPageGetOpaque(page)->rightlink; if (btree->isEnoughSpace(btree, stack->buffer, stack->off)) { START_CRIT_SECTION(); btree->placeToPage(btree, stack->buffer, stack->off, &rdata); MarkBufferDirty(stack->buffer); if (RelationNeedsWAL(btree->index)) { XLogRecPtr recptr; recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT, rdata); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } LockBuffer(stack->buffer, GIN_UNLOCK); END_CRIT_SECTION(); freeGinBtreeStack(stack); return; } else { Buffer rbuffer = GinNewBuffer(btree->index); Page newlpage; /* * newlpage is a pointer to memory page, it doesn't associate with * buffer, stack->buffer should be untouched */ newlpage = btree->splitPage(btree, stack->buffer, rbuffer, stack->off, &rdata); ((ginxlogSplit *) (rdata->data))->rootBlkno = rootBlkno; /* During index build, count the newly-split page */ if (buildStats) { if (btree->isData) buildStats->nDataPages++; else buildStats->nEntryPages++; } parent = stack->parent; if (parent == NULL) { /* * split root, so we need to allocate new left page and place * pointer on root to left and right page */ Buffer lbuffer = GinNewBuffer(btree->index); ((ginxlogSplit *) (rdata->data))->isRootSplit = TRUE; ((ginxlogSplit *) (rdata->data))->rrlink = InvalidBlockNumber; page = BufferGetPage(stack->buffer); lpage = BufferGetPage(lbuffer); rpage = BufferGetPage(rbuffer); GinPageGetOpaque(rpage)->rightlink = InvalidBlockNumber; GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer); ((ginxlogSplit *) (rdata->data))->lblkno = BufferGetBlockNumber(lbuffer); START_CRIT_SECTION(); GinInitBuffer(stack->buffer, GinPageGetOpaque(newlpage)->flags & ~GIN_LEAF); PageRestoreTempPage(newlpage, lpage); btree->fillRoot(btree, stack->buffer, lbuffer, rbuffer); MarkBufferDirty(rbuffer); MarkBufferDirty(lbuffer); MarkBufferDirty(stack->buffer); if (RelationNeedsWAL(btree->index)) { XLogRecPtr recptr; recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT, rdata); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); PageSetLSN(lpage, recptr); PageSetTLI(lpage, ThisTimeLineID); PageSetLSN(rpage, recptr); PageSetTLI(rpage, ThisTimeLineID); } UnlockReleaseBuffer(rbuffer); UnlockReleaseBuffer(lbuffer); LockBuffer(stack->buffer, GIN_UNLOCK); END_CRIT_SECTION(); freeGinBtreeStack(stack); /* During index build, count the newly-added root page */ if (buildStats) { if (btree->isData) buildStats->nDataPages++; else buildStats->nEntryPages++; } return; } else { /* split non-root page */ ((ginxlogSplit *) (rdata->data))->isRootSplit = FALSE; ((ginxlogSplit *) (rdata->data))->rrlink = savedRightLink; lpage = BufferGetPage(stack->buffer); rpage = BufferGetPage(rbuffer); GinPageGetOpaque(rpage)->rightlink = savedRightLink; GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer); START_CRIT_SECTION(); PageRestoreTempPage(newlpage, lpage); MarkBufferDirty(rbuffer); MarkBufferDirty(stack->buffer); if (RelationNeedsWAL(btree->index)) { XLogRecPtr recptr; recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT, rdata); PageSetLSN(lpage, recptr); PageSetTLI(lpage, ThisTimeLineID); PageSetLSN(rpage, recptr); PageSetTLI(rpage, ThisTimeLineID); } UnlockReleaseBuffer(rbuffer); END_CRIT_SECTION(); } } btree->isDelete = FALSE; /* search parent to lock */ LockBuffer(parent->buffer, GIN_EXCLUSIVE); /* move right if it's needed */ page = BufferGetPage(parent->buffer); while ((parent->off = btree->findChildPtr(btree, page, stack->blkno, parent->off)) == InvalidOffsetNumber) { BlockNumber rightlink = GinPageGetOpaque(page)->rightlink; LockBuffer(parent->buffer, GIN_UNLOCK); if (rightlink == InvalidBlockNumber) { /* * rightmost page, but we don't find parent, we should use * plain search... */ ginFindParents(btree, stack, rootBlkno); parent = stack->parent; page = BufferGetPage(parent->buffer); break; } parent->blkno = rightlink; parent->buffer = ReleaseAndReadBuffer(parent->buffer, btree->index, parent->blkno); LockBuffer(parent->buffer, GIN_EXCLUSIVE); page = BufferGetPage(parent->buffer); } UnlockReleaseBuffer(stack->buffer); pfree(stack); stack = parent; } }
/* * Bulk deletion of all index entries pointing to a set of heap tuples and * check invalid tuples after crash recovery. * The set of target tuples is specified via a callback routine that tells * whether any given heap tuple (identified by ItemPointer) is being deleted. * * Result: a palloc'd struct containing statistical info for VACUUM displays. */ Datum gistbulkdelete(PG_FUNCTION_ARGS) { IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1); IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2); void *callback_state = (void *) PG_GETARG_POINTER(3); Relation rel = info->index; GistBDItem *stack, *ptr; /* first time through? */ if (stats == NULL) stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); /* we'll re-count the tuples each time */ stats->estimated_count = false; stats->num_index_tuples = 0; stack = (GistBDItem *) palloc0(sizeof(GistBDItem)); stack->blkno = GIST_ROOT_BLKNO; while (stack) { Buffer buffer; Page page; OffsetNumber i, maxoff; IndexTuple idxtuple; ItemId iid; buffer = ReadBufferExtended(rel, MAIN_FORKNUM, stack->blkno, RBM_NORMAL, info->strategy); LockBuffer(buffer, GIST_SHARE); gistcheckpage(rel, buffer); page = (Page) BufferGetPage(buffer); if (GistPageIsLeaf(page)) { OffsetNumber todelete[MaxOffsetNumber]; int ntodelete = 0; LockBuffer(buffer, GIST_UNLOCK); LockBuffer(buffer, GIST_EXCLUSIVE); page = (Page) BufferGetPage(buffer); if (stack->blkno == GIST_ROOT_BLKNO && !GistPageIsLeaf(page)) { /* only the root can become non-leaf during relock */ UnlockReleaseBuffer(buffer); /* one more check */ continue; } /* * check for split proceeded after look at parent, we should check * it after relock */ pushStackIfSplited(page, stack); /* * Remove deletable tuples from page */ maxoff = PageGetMaxOffsetNumber(page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { iid = PageGetItemId(page, i); idxtuple = (IndexTuple) PageGetItem(page, iid); if (callback(&(idxtuple->t_tid), callback_state)) { todelete[ntodelete] = i - ntodelete; ntodelete++; stats->tuples_removed += 1; } else stats->num_index_tuples += 1; } if (ntodelete) { START_CRIT_SECTION(); MarkBufferDirty(buffer); for (i = 0; i < ntodelete; i++) PageIndexTupleDelete(page, todelete[i]); GistMarkTuplesDeleted(page); if (RelationNeedsWAL(rel)) { XLogRecPtr recptr; recptr = gistXLogUpdate(rel->rd_node, buffer, todelete, ntodelete, NULL, 0, InvalidBuffer); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } else PageSetLSN(page, GetXLogRecPtrForTemp()); END_CRIT_SECTION(); } } else { /* check for split proceeded after look at parent */ pushStackIfSplited(page, stack); maxoff = PageGetMaxOffsetNumber(page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { iid = PageGetItemId(page, i); idxtuple = (IndexTuple) PageGetItem(page, iid); ptr = (GistBDItem *) palloc(sizeof(GistBDItem)); ptr->blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); ptr->parentlsn = PageGetLSN(page); ptr->next = stack->next; stack->next = ptr; if (GistTupleIsInvalid(idxtuple)) ereport(LOG, (errmsg("index \"%s\" contains an inner tuple marked as invalid", RelationGetRelationName(rel)), errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."), errhint("Please REINDEX it."))); } } UnlockReleaseBuffer(buffer); ptr = stack->next; pfree(stack); stack = ptr; vacuum_delay_point(); } PG_RETURN_POINTER(stats); }
/* * Workhouse routine for doing insertion into a GiST index. Note that * this routine assumes it is invoked in a short-lived memory context, * so it does not bother releasing palloc'd allocations. */ void gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate, Relation heapRel, bool is_build) { ItemId iid; IndexTuple idxtuple; GISTInsertStack firststack; GISTInsertStack *stack; GISTInsertState state; bool xlocked = false; memset(&state, 0, sizeof(GISTInsertState)); state.freespace = freespace; state.r = r; state.heapRel = heapRel; state.is_build = is_build; /* Start from the root */ firststack.blkno = GIST_ROOT_BLKNO; firststack.lsn = 0; firststack.parent = NULL; firststack.downlinkoffnum = InvalidOffsetNumber; state.stack = stack = &firststack; /* * Walk down along the path of smallest penalty, updating the parent * pointers with the key we're inserting as we go. If we crash in the * middle, the tree is consistent, although the possible parent updates * were a waste. */ for (;;) { if (XLogRecPtrIsInvalid(stack->lsn)) stack->buffer = ReadBuffer(state.r, stack->blkno); /* * Be optimistic and grab shared lock first. Swap it for an exclusive * lock later if we need to update the page. */ if (!xlocked) { LockBuffer(stack->buffer, GIST_SHARE); gistcheckpage(state.r, stack->buffer); } stack->page = (Page) BufferGetPage(stack->buffer); stack->lsn = xlocked ? PageGetLSN(stack->page) : BufferGetLSNAtomic(stack->buffer); Assert(!RelationNeedsWAL(state.r) || !XLogRecPtrIsInvalid(stack->lsn)); /* * If this page was split but the downlink was never inserted to the * parent because the inserting backend crashed before doing that, fix * that now. */ if (GistFollowRight(stack->page)) { if (!xlocked) { LockBuffer(stack->buffer, GIST_UNLOCK); LockBuffer(stack->buffer, GIST_EXCLUSIVE); xlocked = true; /* someone might've completed the split when we unlocked */ if (!GistFollowRight(stack->page)) continue; } gistfixsplit(&state, giststate); UnlockReleaseBuffer(stack->buffer); xlocked = false; state.stack = stack = stack->parent; continue; } if (stack->blkno != GIST_ROOT_BLKNO && stack->parent->lsn < GistPageGetNSN(stack->page)) { /* * Concurrent split detected. There's no guarantee that the * downlink for this page is consistent with the tuple we're * inserting anymore, so go back to parent and rechoose the best * child. */ UnlockReleaseBuffer(stack->buffer); xlocked = false; state.stack = stack = stack->parent; continue; } if (!GistPageIsLeaf(stack->page)) { /* * This is an internal page so continue to walk down the tree. * Find the child node that has the minimum insertion penalty. */ BlockNumber childblkno; IndexTuple newtup; GISTInsertStack *item; OffsetNumber downlinkoffnum; /* currently, internal pages are never deleted */ Assert(!GistPageIsDeleted(stack->page)); downlinkoffnum = gistchoose(state.r, stack->page, itup, giststate); iid = PageGetItemId(stack->page, downlinkoffnum); idxtuple = (IndexTuple) PageGetItem(stack->page, iid); childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); /* * Check that it's not a leftover invalid tuple from pre-9.1 */ if (GistTupleIsInvalid(idxtuple)) ereport(ERROR, (errmsg("index \"%s\" contains an inner tuple marked as invalid", RelationGetRelationName(r)), errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."), errhint("Please REINDEX it."))); /* * Check that the key representing the target child node is * consistent with the key we're inserting. Update it if it's not. */ newtup = gistgetadjusted(state.r, idxtuple, itup, giststate); if (newtup) { /* * Swap shared lock for an exclusive one. Beware, the page may * change while we unlock/lock the page... */ if (!xlocked) { LockBuffer(stack->buffer, GIST_UNLOCK); LockBuffer(stack->buffer, GIST_EXCLUSIVE); xlocked = true; stack->page = (Page) BufferGetPage(stack->buffer); if (PageGetLSN(stack->page) != stack->lsn) { /* the page was changed while we unlocked it, retry */ continue; } } /* * Update the tuple. * * We still hold the lock after gistinserttuple(), but it * might have to split the page to make the updated tuple fit. * In that case the updated tuple might migrate to the other * half of the split, so we have to go back to the parent and * descend back to the half that's a better fit for the new * tuple. */ if (gistinserttuple(&state, stack, giststate, newtup, downlinkoffnum)) { /* * If this was a root split, the root page continues to be * the parent and the updated tuple went to one of the * child pages, so we just need to retry from the root * page. */ if (stack->blkno != GIST_ROOT_BLKNO) { UnlockReleaseBuffer(stack->buffer); xlocked = false; state.stack = stack = stack->parent; } continue; } } LockBuffer(stack->buffer, GIST_UNLOCK); xlocked = false; /* descend to the chosen child */ item = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); item->blkno = childblkno; item->parent = stack; item->downlinkoffnum = downlinkoffnum; state.stack = stack = item; } else { /* * Leaf page. Insert the new key. We've already updated all the * parents on the way down, but we might have to split the page if * it doesn't fit. gistinserthere() will take care of that. */ /* * Swap shared lock for an exclusive one. Be careful, the page may * change while we unlock/lock the page... */ if (!xlocked) { LockBuffer(stack->buffer, GIST_UNLOCK); LockBuffer(stack->buffer, GIST_EXCLUSIVE); xlocked = true; stack->page = (Page) BufferGetPage(stack->buffer); stack->lsn = PageGetLSN(stack->page); if (stack->blkno == GIST_ROOT_BLKNO) { /* * the only page that can become inner instead of leaf is * the root page, so for root we should recheck it */ if (!GistPageIsLeaf(stack->page)) { /* * very rare situation: during unlock/lock index with * number of pages = 1 was increased */ LockBuffer(stack->buffer, GIST_UNLOCK); xlocked = false; continue; } /* * we don't need to check root split, because checking * leaf/inner is enough to recognize split for root */ } else if (GistFollowRight(stack->page) || stack->parent->lsn < GistPageGetNSN(stack->page)) { /* * The page was split while we momentarily unlocked the * page. Go back to parent. */ UnlockReleaseBuffer(stack->buffer); xlocked = false; state.stack = stack = stack->parent; continue; } } /* * The page might have been deleted after we scanned the parent * and saw the downlink. */ if (GistPageIsDeleted(stack->page)) { UnlockReleaseBuffer(stack->buffer); xlocked = false; state.stack = stack = stack->parent; continue; } /* now state.stack->(page, buffer and blkno) points to leaf page */ gistinserttuple(&state, stack, giststate, itup, InvalidOffsetNumber); LockBuffer(stack->buffer, GIST_UNLOCK); /* Release any pins we might still hold before exiting */ for (; stack; stack = stack->parent) ReleaseBuffer(stack->buffer); break; } } }
/* * Main entry point to GiST index build. Initially calls insert over and over, * but switches to more efficient buffering build algorithm after a certain * number of tuples (unless buffering mode is disabled). */ Datum gistbuild(PG_FUNCTION_ARGS) { Relation heap = (Relation) PG_GETARG_POINTER(0); Relation index = (Relation) PG_GETARG_POINTER(1); IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2); IndexBuildResult *result; double reltuples; GISTBuildState buildstate; Buffer buffer; Page page; MemoryContext oldcxt = CurrentMemoryContext; int fillfactor; buildstate.indexrel = index; if (index->rd_options) { /* Get buffering mode from the options string */ GiSTOptions *options = (GiSTOptions *) index->rd_options; char *bufferingMode = (char *) options + options->bufferingModeOffset; if (strcmp(bufferingMode, "on") == 0) buildstate.bufferingMode = GIST_BUFFERING_STATS; else if (strcmp(bufferingMode, "off") == 0) buildstate.bufferingMode = GIST_BUFFERING_DISABLED; else buildstate.bufferingMode = GIST_BUFFERING_AUTO; fillfactor = options->fillfactor; } else { /* * By default, switch to buffering mode when the index grows too large * to fit in cache. */ buildstate.bufferingMode = GIST_BUFFERING_AUTO; fillfactor = GIST_DEFAULT_FILLFACTOR; } /* Calculate target amount of free space to leave on pages */ buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100; /* * We expect to be called exactly once for any index relation. If that's * not the case, big trouble's what we have. */ if (RelationGetNumberOfBlocks(index) != 0) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); /* * We can't yet handle unlogged GiST indexes, because we depend on LSNs. * This is duplicative of an error in gistbuildempty, but we want to check * here so as to throw error before doing all the index-build work. */ if (heap->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("unlogged GiST indexes are not supported"))); /* no locking is needed */ buildstate.giststate = initGISTstate(index); /* * Create a temporary memory context that is reset once for each tuple * processed. (Note: we don't bother to make this a child of the * giststate's scanCxt, so we have to delete it separately at the end.) */ buildstate.giststate->tempCxt = createTempGistContext(); /* initialize the root page */ buffer = gistNewBuffer(index); Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO); page = BufferGetPage(buffer); START_CRIT_SECTION(); GISTInitBuffer(buffer, F_LEAF); MarkBufferDirty(buffer); if (RelationNeedsWAL(index)) { XLogRecPtr recptr; XLogRecData rdata; rdata.data = (char *) &(index->rd_node); rdata.len = sizeof(RelFileNode); rdata.buffer = InvalidBuffer; rdata.next = NULL; recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_CREATE_INDEX, &rdata); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } else PageSetLSN(page, GetXLogRecPtrForTemp()); UnlockReleaseBuffer(buffer); END_CRIT_SECTION(); /* build the index */ buildstate.indtuples = 0; buildstate.indtuplesSize = 0; /* * Do the heap scan. */ reltuples = IndexBuildHeapScan(heap, index, indexInfo, true, gistBuildCallback, (void *) &buildstate); /* * If buffering was used, flush out all the tuples that are still in the * buffers. */ if (buildstate.bufferingMode == GIST_BUFFERING_ACTIVE) { elog(DEBUG1, "all tuples processed, emptying buffers"); gistEmptyAllBuffers(&buildstate); } /* okay, all heap tuples are indexed */ MemoryContextSwitchTo(oldcxt); MemoryContextDelete(buildstate.giststate->tempCxt); freeGISTstate(buildstate.giststate); /* * Return statistics */ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); result->heap_tuples = reltuples; result->index_tuples = (double) buildstate.indtuples; PG_RETURN_POINTER(result); }
/* * get_relation_info - * Retrieves catalog information for a given relation. * * Given the Oid of the relation, return the following info into fields * of the RelOptInfo struct: * * min_attr lowest valid AttrNumber * max_attr highest valid AttrNumber * indexlist list of IndexOptInfos for relation's indexes * serverid if it's a foreign table, the server OID * fdwroutine if it's a foreign table, the FDW function pointers * pages number of pages * tuples number of tuples * * Also, initialize the attr_needed[] and attr_widths[] arrays. In most * cases these are left as zeroes, but sometimes we need to compute attr * widths here, and we may as well cache the results for costsize.c. * * If inhparent is true, all we need to do is set up the attr arrays: * the RelOptInfo actually represents the appendrel formed by an inheritance * tree, and so the parent rel's physical size and index information isn't * important for it. */ void get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, RelOptInfo *rel) { Index varno = rel->relid; Relation relation; bool hasindex; List *indexinfos = NIL; /* * We need not lock the relation since it was already locked, either by * the rewriter or when expand_inherited_rtentry() added it to the query's * rangetable. */ relation = heap_open(relationObjectId, NoLock); /* Temporary and unlogged relations are inaccessible during recovery. */ if (!RelationNeedsWAL(relation) && RecoveryInProgress()) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot access temporary or unlogged relations during recovery"))); rel->min_attr = FirstLowInvalidHeapAttributeNumber + 1; rel->max_attr = RelationGetNumberOfAttributes(relation); rel->reltablespace = RelationGetForm(relation)->reltablespace; Assert(rel->max_attr >= rel->min_attr); rel->attr_needed = (Relids *) palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(Relids)); rel->attr_widths = (int32 *) palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(int32)); /* * Estimate relation size --- unless it's an inheritance parent, in which * case the size will be computed later in set_append_rel_pathlist, and we * must leave it zero for now to avoid bollixing the total_table_pages * calculation. */ if (!inhparent) estimate_rel_size(relation, rel->attr_widths - rel->min_attr, &rel->pages, &rel->tuples, &rel->allvisfrac); /* Retrive the parallel_degree reloption, if set. */ rel->rel_parallel_degree = RelationGetParallelDegree(relation, -1); /* * Make list of indexes. Ignore indexes on system catalogs if told to. * Don't bother with indexes for an inheritance parent, either. */ if (inhparent || (IgnoreSystemIndexes && IsSystemRelation(relation))) hasindex = false; else hasindex = relation->rd_rel->relhasindex; if (hasindex) { List *indexoidlist; ListCell *l; LOCKMODE lmode; indexoidlist = RelationGetIndexList(relation); /* * For each index, we get the same type of lock that the executor will * need, and do not release it. This saves a couple of trips to the * shared lock manager while not creating any real loss of * concurrency, because no schema changes could be happening on the * index while we hold lock on the parent rel, and neither lock type * blocks any other kind of index operation. */ if (rel->relid == root->parse->resultRelation) lmode = RowExclusiveLock; else lmode = AccessShareLock; foreach(l, indexoidlist) { Oid indexoid = lfirst_oid(l); Relation indexRelation; Form_pg_index index; IndexAmRoutine *amroutine; IndexOptInfo *info; int ncolumns; int i; /* * Extract info from the relation descriptor for the index. */ indexRelation = index_open(indexoid, lmode); index = indexRelation->rd_index; /* * Ignore invalid indexes, since they can't safely be used for * queries. Note that this is OK because the data structure we * are constructing is only used by the planner --- the executor * still needs to insert into "invalid" indexes, if they're marked * IndexIsReady. */ if (!IndexIsValid(index)) { index_close(indexRelation, NoLock); continue; } /* * If the index is valid, but cannot yet be used, ignore it; but * mark the plan we are generating as transient. See * src/backend/access/heap/README.HOT for discussion. */ if (index->indcheckxmin && !TransactionIdPrecedes(HeapTupleHeaderGetXmin(indexRelation->rd_indextuple->t_data), TransactionXmin)) { root->glob->transientPlan = true; index_close(indexRelation, NoLock); continue; } info = makeNode(IndexOptInfo); info->indexoid = index->indexrelid; info->reltablespace = RelationGetForm(indexRelation)->reltablespace; info->rel = rel; info->ncolumns = ncolumns = index->indnatts; info->indexkeys = (int *) palloc(sizeof(int) * ncolumns); info->indexcollations = (Oid *) palloc(sizeof(Oid) * ncolumns); info->opfamily = (Oid *) palloc(sizeof(Oid) * ncolumns); info->opcintype = (Oid *) palloc(sizeof(Oid) * ncolumns); info->canreturn = (bool *) palloc(sizeof(bool) * ncolumns); for (i = 0; i < ncolumns; i++) { info->indexkeys[i] = index->indkey.values[i]; info->indexcollations[i] = indexRelation->rd_indcollation[i]; info->opfamily[i] = indexRelation->rd_opfamily[i]; info->opcintype[i] = indexRelation->rd_opcintype[i]; info->canreturn[i] = index_can_return(indexRelation, i + 1); } info->relam = indexRelation->rd_rel->relam; /* We copy just the fields we need, not all of rd_amroutine */ amroutine = indexRelation->rd_amroutine; info->amcanorderbyop = amroutine->amcanorderbyop; info->amoptionalkey = amroutine->amoptionalkey; info->amsearcharray = amroutine->amsearcharray; info->amsearchnulls = amroutine->amsearchnulls; info->amhasgettuple = (amroutine->amgettuple != NULL); info->amhasgetbitmap = (amroutine->amgetbitmap != NULL); info->amcostestimate = amroutine->amcostestimate; Assert(info->amcostestimate != NULL); /* * Fetch the ordering information for the index, if any. */ if (info->relam == BTREE_AM_OID) { /* * If it's a btree index, we can use its opfamily OIDs * directly as the sort ordering opfamily OIDs. */ Assert(amroutine->amcanorder); info->sortopfamily = info->opfamily; info->reverse_sort = (bool *) palloc(sizeof(bool) * ncolumns); info->nulls_first = (bool *) palloc(sizeof(bool) * ncolumns); for (i = 0; i < ncolumns; i++) { int16 opt = indexRelation->rd_indoption[i]; info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0; info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0; } } else if (amroutine->amcanorder) { /* * Otherwise, identify the corresponding btree opfamilies by * trying to map this index's "<" operators into btree. Since * "<" uniquely defines the behavior of a sort order, this is * a sufficient test. * * XXX This method is rather slow and also requires the * undesirable assumption that the other index AM numbers its * strategies the same as btree. It'd be better to have a way * to explicitly declare the corresponding btree opfamily for * each opfamily of the other index type. But given the lack * of current or foreseeable amcanorder index types, it's not * worth expending more effort on now. */ info->sortopfamily = (Oid *) palloc(sizeof(Oid) * ncolumns); info->reverse_sort = (bool *) palloc(sizeof(bool) * ncolumns); info->nulls_first = (bool *) palloc(sizeof(bool) * ncolumns); for (i = 0; i < ncolumns; i++) { int16 opt = indexRelation->rd_indoption[i]; Oid ltopr; Oid btopfamily; Oid btopcintype; int16 btstrategy; info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0; info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0; ltopr = get_opfamily_member(info->opfamily[i], info->opcintype[i], info->opcintype[i], BTLessStrategyNumber); if (OidIsValid(ltopr) && get_ordering_op_properties(ltopr, &btopfamily, &btopcintype, &btstrategy) && btopcintype == info->opcintype[i] && btstrategy == BTLessStrategyNumber) { /* Successful mapping */ info->sortopfamily[i] = btopfamily; } else { /* Fail ... quietly treat index as unordered */ info->sortopfamily = NULL; info->reverse_sort = NULL; info->nulls_first = NULL; break; } } } else { info->sortopfamily = NULL; info->reverse_sort = NULL; info->nulls_first = NULL; } /* * Fetch the index expressions and predicate, if any. We must * modify the copies we obtain from the relcache to have the * correct varno for the parent relation, so that they match up * correctly against qual clauses. */ info->indexprs = RelationGetIndexExpressions(indexRelation); info->indpred = RelationGetIndexPredicate(indexRelation); if (info->indexprs && varno != 1) ChangeVarNodes((Node *) info->indexprs, 1, varno, 0); if (info->indpred && varno != 1) ChangeVarNodes((Node *) info->indpred, 1, varno, 0); /* Build targetlist using the completed indexprs data */ info->indextlist = build_index_tlist(root, info, relation); info->indrestrictinfo = NIL; /* set later, in indxpath.c */ info->predOK = false; /* set later, in indxpath.c */ info->unique = index->indisunique; info->immediate = index->indimmediate; info->hypothetical = false; /* * Estimate the index size. If it's not a partial index, we lock * the number-of-tuples estimate to equal the parent table; if it * is partial then we have to use the same methods as we would for * a table, except we can be sure that the index is not larger * than the table. */ if (info->indpred == NIL) { info->pages = RelationGetNumberOfBlocks(indexRelation); info->tuples = rel->tuples; } else { double allvisfrac; /* dummy */ estimate_rel_size(indexRelation, NULL, &info->pages, &info->tuples, &allvisfrac); if (info->tuples > rel->tuples) info->tuples = rel->tuples; } if (info->relam == BTREE_AM_OID) { /* For btrees, get tree height while we have the index open */ info->tree_height = _bt_getrootheight(indexRelation); } else { /* For other index types, just set it to "unknown" for now */ info->tree_height = -1; } index_close(indexRelation, NoLock); indexinfos = lcons(info, indexinfos); } list_free(indexoidlist); }
/* * Insert an index tuple into the index relation. The revmap is updated to * mark the range containing the given page as pointing to the inserted entry. * A WAL record is written. * * The buffer, if valid, is first checked for free space to insert the new * entry; if there isn't enough, a new buffer is obtained and pinned. No * buffer lock must be held on entry, no buffer lock is held on exit. * * Return value is the offset number where the tuple was inserted. */ OffsetNumber brin_doinsert(Relation idxrel, BlockNumber pagesPerRange, BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk, BrinTuple *tup, Size itemsz) { Page page; BlockNumber blk; OffsetNumber off; Buffer revmapbuf; ItemPointerData tid; bool extended = false; itemsz = MAXALIGN(itemsz); /* Make sure the revmap is long enough to contain the entry we need */ brinRevmapExtend(revmap, heapBlk); /* * Obtain a locked buffer to insert the new tuple. Note * brin_getinsertbuffer ensures there's enough space in the returned * buffer. */ if (BufferIsValid(*buffer)) { /* * It's possible that another backend (or ourselves!) extended the * revmap over the page we held a pin on, so we cannot assume that * it's still a regular page. */ LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); if (br_page_get_freespace(BufferGetPage(*buffer)) < itemsz) { UnlockReleaseBuffer(*buffer); *buffer = InvalidBuffer; } } if (!BufferIsValid(*buffer)) { *buffer = brin_getinsertbuffer(idxrel, InvalidBuffer, itemsz, &extended); Assert(BufferIsValid(*buffer)); Assert(br_page_get_freespace(BufferGetPage(*buffer)) >= itemsz); } /* Now obtain lock on revmap buffer */ revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk); page = BufferGetPage(*buffer); blk = BufferGetBlockNumber(*buffer); START_CRIT_SECTION(); off = PageAddItem(page, (Item) tup, itemsz, InvalidOffsetNumber, false, false); if (off == InvalidOffsetNumber) elog(ERROR, "could not insert new index tuple to page"); MarkBufferDirty(*buffer); BRIN_elog((DEBUG2, "inserted tuple (%u,%u) for range starting at %u", blk, off, heapBlk)); ItemPointerSet(&tid, blk, off); brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, tid); MarkBufferDirty(revmapbuf); /* XLOG stuff */ if (RelationNeedsWAL(idxrel)) { xl_brin_insert xlrec; XLogRecPtr recptr; uint8 info; info = XLOG_BRIN_INSERT | (extended ? XLOG_BRIN_INIT_PAGE : 0); xlrec.heapBlk = heapBlk; xlrec.pagesPerRange = pagesPerRange; xlrec.offnum = off; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfBrinInsert); XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0)); XLogRegisterBufData(0, (char *) tup, itemsz); XLogRegisterBuffer(1, revmapbuf, 0); recptr = XLogInsert(RM_BRIN_ID, info); PageSetLSN(page, recptr); PageSetLSN(BufferGetPage(revmapbuf), recptr); } END_CRIT_SECTION(); /* Tuple is firmly on buffer; we can release our locks */ LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK); if (extended) FreeSpaceMapVacuum(idxrel); return off; }
/* * Creates new posting tree containing the given TIDs. Returns the page * number of the root of the new posting tree. * * items[] must be in sorted order with no duplicates. */ BlockNumber createPostingTree(Relation index, ItemPointerData *items, uint32 nitems, GinStatsData *buildStats) { BlockNumber blkno; Buffer buffer; Page page; int nrootitems; /* Calculate how many TIDs will fit on first page. */ nrootitems = Min(nitems, GinMaxLeafDataItems); /* * Create the root page. */ buffer = GinNewBuffer(index); page = BufferGetPage(buffer); blkno = BufferGetBlockNumber(buffer); START_CRIT_SECTION(); GinInitBuffer(buffer, GIN_DATA | GIN_LEAF); memcpy(GinDataPageGetData(page), items, sizeof(ItemPointerData) * nrootitems); GinPageGetOpaque(page)->maxoff = nrootitems; MarkBufferDirty(buffer); if (RelationNeedsWAL(index)) { XLogRecPtr recptr; XLogRecData rdata[2]; ginxlogCreatePostingTree data; data.node = index->rd_node; data.blkno = blkno; data.nitem = nrootitems; rdata[0].buffer = InvalidBuffer; rdata[0].data = (char *) &data; rdata[0].len = sizeof(ginxlogCreatePostingTree); rdata[0].next = &rdata[1]; rdata[1].buffer = InvalidBuffer; rdata[1].data = (char *) items; rdata[1].len = sizeof(ItemPointerData) * nrootitems; rdata[1].next = NULL; recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_PTREE, rdata); PageSetLSN(page, recptr); } UnlockReleaseBuffer(buffer); END_CRIT_SECTION(); /* During index build, count the newly-added data page */ if (buildStats) buildStats->nDataPages++; /* * Add any remaining TIDs to the newly-created posting tree. */ if (nitems > nrootitems) { ginInsertItemPointers(index, blkno, items + nrootitems, nitems - nrootitems, buildStats); } return blkno; }
/* * _bt_getroot() -- Get the root page of the btree. * * Since the root page can move around the btree file, we have to read * its location from the metadata page, and then read the root page * itself. If no root page exists yet, we have to create one. The * standard class of race conditions exists here; I think I covered * them all in the Hopi Indian rain dance of lock requests below. * * The access type parameter (BT_READ or BT_WRITE) controls whether * a new root page will be created or not. If access = BT_READ, * and no root page exists, we just return InvalidBuffer. For * BT_WRITE, we try to create the root page if it doesn't exist. * NOTE that the returned root page will have only a read lock set * on it even if access = BT_WRITE! * * The returned page is not necessarily the true root --- it could be * a "fast root" (a page that is alone in its level due to deletions). * Also, if the root page is split while we are "in flight" to it, * what we will return is the old root, which is now just the leftmost * page on a probably-not-very-wide level. For most purposes this is * as good as or better than the true root, so we do not bother to * insist on finding the true root. We do, however, guarantee to * return a live (not deleted or half-dead) page. * * On successful return, the root page is pinned and read-locked. * The metadata page is not locked or pinned on exit. */ Buffer _bt_getroot(Relation rel, int access) { Buffer metabuf; Page metapg; BTPageOpaque metaopaque; Buffer rootbuf; Page rootpage; BTPageOpaque rootopaque; BlockNumber rootblkno; uint32 rootlevel; BTMetaPageData *metad; /* * Try to use previously-cached metapage data to find the root. This * normally saves one buffer access per index search, which is a very * helpful savings in bufmgr traffic and hence contention. */ if (rel->rd_amcache != NULL) { metad = (BTMetaPageData *) rel->rd_amcache; /* We shouldn't have cached it if any of these fail */ Assert(metad->btm_magic == BTREE_MAGIC); Assert(metad->btm_version == BTREE_VERSION); Assert(metad->btm_root != P_NONE); rootblkno = metad->btm_fastroot; Assert(rootblkno != P_NONE); rootlevel = metad->btm_fastlevel; rootbuf = _bt_getbuf(rel, rootblkno, BT_READ); rootpage = BufferGetPage(rootbuf); rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); /* * Since the cache might be stale, we check the page more carefully * here than normal. We *must* check that it's not deleted. If it's * not alone on its level, then we reject too --- this may be overly * paranoid but better safe than sorry. Note we don't check P_ISROOT, * because that's not set in a "fast root". */ if (!P_IGNORE(rootopaque) && rootopaque->btpo.level == rootlevel && P_LEFTMOST(rootopaque) && P_RIGHTMOST(rootopaque)) { /* OK, accept cached page as the root */ return rootbuf; } _bt_relbuf(rel, rootbuf); /* Cache is stale, throw it away */ if (rel->rd_amcache) pfree(rel->rd_amcache); rel->rd_amcache = NULL; } metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); metapg = BufferGetPage(metabuf); metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg); metad = BTPageGetMeta(metapg); /* sanity-check the metapage */ if (!(metaopaque->btpo_flags & BTP_META) || metad->btm_magic != BTREE_MAGIC) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("index \"%s\" is not a btree", RelationGetRelationName(rel)))); if (metad->btm_version != BTREE_VERSION) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("version mismatch in index \"%s\": file version %d, code version %d", RelationGetRelationName(rel), metad->btm_version, BTREE_VERSION))); /* if no root page initialized yet, do it */ if (metad->btm_root == P_NONE) { /* If access = BT_READ, caller doesn't want us to create root yet */ if (access == BT_READ) { _bt_relbuf(rel, metabuf); return InvalidBuffer; } /* trade in our read lock for a write lock */ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); LockBuffer(metabuf, BT_WRITE); /* * Race condition: if someone else initialized the metadata between * the time we released the read lock and acquired the write lock, we * must avoid doing it again. */ if (metad->btm_root != P_NONE) { /* * Metadata initialized by someone else. In order to guarantee no * deadlocks, we have to release the metadata page and start all * over again. (Is that really true? But it's hardly worth trying * to optimize this case.) */ _bt_relbuf(rel, metabuf); return _bt_getroot(rel, access); } /* * Get, initialize, write, and leave a lock of the appropriate type on * the new root page. Since this is the first page in the tree, it's * a leaf as well as the root. */ rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); rootblkno = BufferGetBlockNumber(rootbuf); rootpage = BufferGetPage(rootbuf); rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE; rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT); rootopaque->btpo.level = 0; rootopaque->btpo_cycleid = 0; /* NO ELOG(ERROR) till meta is updated */ START_CRIT_SECTION(); metad->btm_root = rootblkno; metad->btm_level = 0; metad->btm_fastroot = rootblkno; metad->btm_fastlevel = 0; MarkBufferDirty(rootbuf); MarkBufferDirty(metabuf); /* XLOG stuff */ if (RelationNeedsWAL(rel)) { xl_btree_newroot xlrec; XLogRecPtr recptr; XLogRecData rdata; xlrec.node = rel->rd_node; xlrec.rootblk = rootblkno; xlrec.level = 0; rdata.data = (char *) &xlrec; rdata.len = SizeOfBtreeNewroot; rdata.buffer = InvalidBuffer; rdata.next = NULL; recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, &rdata); PageSetLSN(rootpage, recptr); PageSetTLI(rootpage, ThisTimeLineID); PageSetLSN(metapg, recptr); PageSetTLI(metapg, ThisTimeLineID); } END_CRIT_SECTION(); /* * Send out relcache inval for metapage change (probably unnecessary * here, but let's be safe). */ CacheInvalidateRelcache(rel); /* * swap root write lock for read lock. There is no danger of anyone * else accessing the new root page while it's unlocked, since no one * else knows where it is yet. */ LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK); LockBuffer(rootbuf, BT_READ); /* okay, metadata is correct, release lock on it */ _bt_relbuf(rel, metabuf); } else { rootblkno = metad->btm_fastroot; Assert(rootblkno != P_NONE); rootlevel = metad->btm_fastlevel; /* * Cache the metapage data for next time */ rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt, sizeof(BTMetaPageData)); memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData)); /* * We are done with the metapage; arrange to release it via first * _bt_relandgetbuf call */ rootbuf = metabuf; for (;;) { rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ); rootpage = BufferGetPage(rootbuf); rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); if (!P_IGNORE(rootopaque)) break; /* it's dead, Jim. step right one page */ if (P_RIGHTMOST(rootopaque)) elog(ERROR, "no live root page found in index \"%s\"", RelationGetRelationName(rel)); rootblkno = rootopaque->btpo_next; } /* Note: can't check btpo.level on deleted pages */ if (rootopaque->btpo.level != rootlevel) elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u", rootblkno, RelationGetRelationName(rel), rootopaque->btpo.level, rootlevel); } /* * By here, we have a pin and read lock on the root page, and no lock set * on the metadata page. Return the root page's buffer. */ return rootbuf; }