/*
 * Try to find parent for current stack position. Returns correct parent and
 * child's offset in stack->parent. The root page is never released, to
 * prevent conflict with the vacuum process.
 */
static void
ginFindParents(GinBtree btree, GinBtreeStack *stack)
{
    Page        page;
    Buffer      buffer;
    BlockNumber blkno,
                leftmostBlkno;
    OffsetNumber offset;
    GinBtreeStack *root;
    GinBtreeStack *ptr;

    /*
     * Unwind the stack all the way up to the root, leaving only the root
     * item.
     *
     * Be careful not to release the pin on the root page! The pin on root
     * page is required to lock out concurrent vacuums on the tree.
     */
    root = stack->parent;
    while (root->parent)
    {
        ReleaseBuffer(root->buffer);
        root = root->parent;
    }

    Assert(root->blkno == btree->rootBlkno);
    Assert(BufferGetBlockNumber(root->buffer) == btree->rootBlkno);
    root->off = InvalidOffsetNumber;

    blkno = root->blkno;
    buffer = root->buffer;
    offset = InvalidOffsetNumber;

    ptr = (GinBtreeStack *) palloc(sizeof(GinBtreeStack));

    for (;;)
    {
        LockBuffer(buffer, GIN_EXCLUSIVE);
        page = BufferGetPage(buffer);
        if (GinPageIsLeaf(page))
            elog(ERROR, "Lost path");

        if (GinPageIsIncompleteSplit(page))
        {
            Assert(blkno != btree->rootBlkno);
            ptr->blkno = blkno;
            ptr->buffer = buffer;

            /*
             * parent may be wrong, but if so, the ginFinishSplit call will
             * recurse to call ginFindParents again to fix it.
             */
            ptr->parent = root;
            ptr->off = InvalidOffsetNumber;

            ginFinishSplit(btree, ptr, false, NULL);
        }

        leftmostBlkno = btree->getLeftMostChild(btree, page);

        while ((offset = btree->findChildPtr(btree, page, stack->blkno,
                                             InvalidOffsetNumber)) == InvalidOffsetNumber)
        {
            blkno = GinPageGetOpaque(page)->rightlink;
            if (blkno == InvalidBlockNumber)
            {
                UnlockReleaseBuffer(buffer);
                break;
            }
            buffer = ginStepRight(buffer, btree->index, GIN_EXCLUSIVE);
            page = BufferGetPage(buffer);

            /* finish any incomplete splits, as above */
            if (GinPageIsIncompleteSplit(page))
            {
                Assert(blkno != btree->rootBlkno);
                ptr->blkno = blkno;
                ptr->buffer = buffer;
                ptr->parent = root;
                ptr->off = InvalidOffsetNumber;

                ginFinishSplit(btree, ptr, false, NULL);
            }
        }

        if (blkno != InvalidBlockNumber)
        {
            ptr->blkno = blkno;
            ptr->buffer = buffer;
            ptr->parent = root; /* it may be wrong, but the next call will
                                 * correct it */
            ptr->off = offset;
            stack->parent = ptr;
            return;
        }

        /* Descend to the next level */
        blkno = leftmostBlkno;
        buffer = ReadBuffer(btree->index, blkno);
    }
}
/* * Recursive guts of FreeSpaceMapVacuum * * Examine the FSM page indicated by addr, as well as its children, updating * upper-level nodes that cover the heap block range from start to end-1. * (It's okay if end is beyond the actual end of the map.) * Return the maximum freespace value on this page. * * If addr is past the end of the FSM, set *eof_p to true and return 0. * * This traverses the tree in depth-first order. The tree is stored * physically in depth-first order, so this should be pretty I/O efficient. */ static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr, BlockNumber start, BlockNumber end, bool *eof_p) { Buffer buf; Page page; uint8 max_avail; /* Read the page if it exists, or return EOF */ buf = fsm_readbuf(rel, addr, false); if (!BufferIsValid(buf)) { *eof_p = true; return 0; } else *eof_p = false; page = BufferGetPage(buf); /* * If we're above the bottom level, recurse into children, and fix the * information stored about them at this level. */ if (addr.level > FSM_BOTTOM_LEVEL) { FSMAddress fsm_start, fsm_end; uint16 fsm_start_slot, fsm_end_slot; int slot, start_slot, end_slot; bool eof = false; /* * Compute the range of slots we need to update on this page, given * the requested range of heap blocks to consider. The first slot to * update is the one covering the "start" block, and the last slot is * the one covering "end - 1". (Some of this work will be duplicated * in each recursive call, but it's cheap enough to not worry about.) */ fsm_start = fsm_get_location(start, &fsm_start_slot); fsm_end = fsm_get_location(end - 1, &fsm_end_slot); while (fsm_start.level < addr.level) { fsm_start = fsm_get_parent(fsm_start, &fsm_start_slot); fsm_end = fsm_get_parent(fsm_end, &fsm_end_slot); } Assert(fsm_start.level == addr.level); if (fsm_start.logpageno == addr.logpageno) start_slot = fsm_start_slot; else if (fsm_start.logpageno > addr.logpageno) start_slot = SlotsPerFSMPage; /* shouldn't get here... */ else start_slot = 0; if (fsm_end.logpageno == addr.logpageno) end_slot = fsm_end_slot; else if (fsm_end.logpageno > addr.logpageno) end_slot = SlotsPerFSMPage - 1; else end_slot = -1; /* shouldn't get here... */ for (slot = start_slot; slot <= end_slot; slot++) { int child_avail; CHECK_FOR_INTERRUPTS(); /* After we hit end-of-file, just clear the rest of the slots */ if (!eof) child_avail = fsm_vacuum_page(rel, fsm_get_child(addr, slot), start, end, &eof); else child_avail = 0; /* Update information about the child */ if (fsm_get_avail(page, slot) != child_avail) { LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); fsm_set_avail(page, slot, child_avail); MarkBufferDirtyHint(buf, false); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } } } /* Now get the maximum value on the page, to return to caller */ max_avail = fsm_get_max_avail(page); /* * Reset the next slot pointer. This encourages the use of low-numbered * pages, increasing the chances that a later vacuum can truncate the * relation. We don't bother with a lock here, nor with marking the page * dirty if it wasn't already, since this is just a hint. */ ((FSMPage) PageGetContents(page))->fp_next_slot = 0; ReleaseBuffer(buf); return max_avail; }
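/*
 * Illustration (not part of the original source): the slot-range computation
 * in fsm_vacuum_page() leans on fsm_get_location() and fsm_get_parent(),
 * which are not shown here.  The standalone sketch below models the
 * addressing arithmetic under the assumption that every FSM page holds a
 * fixed number of slots; MODEL_SLOTS_PER_FSM_PAGE and the model_* names are
 * placeholders, not the real definitions in freespace.c.
 */
#include <stdio.h>

#define MODEL_SLOTS_PER_FSM_PAGE 4000	/* placeholder; real value depends on BLCKSZ */
#define MODEL_BOTTOM_LEVEL 0

typedef struct
{
    int         level;          /* 0 = bottom (leaf) level */
    unsigned    logpageno;      /* page number within that level */
} ModelFSMAddress;

/* Heap block -> bottom-level FSM page and slot (cf. fsm_get_location). */
static ModelFSMAddress
model_get_location(unsigned heapblk, unsigned *slot)
{
    ModelFSMAddress addr;

    addr.level = MODEL_BOTTOM_LEVEL;
    addr.logpageno = heapblk / MODEL_SLOTS_PER_FSM_PAGE;
    *slot = heapblk % MODEL_SLOTS_PER_FSM_PAGE;
    return addr;
}

/* FSM page -> its parent page and the parent slot covering it (cf. fsm_get_parent). */
static ModelFSMAddress
model_get_parent(ModelFSMAddress child, unsigned *slot)
{
    ModelFSMAddress parent;

    parent.level = child.level + 1;
    parent.logpageno = child.logpageno / MODEL_SLOTS_PER_FSM_PAGE;
    *slot = child.logpageno % MODEL_SLOTS_PER_FSM_PAGE;
    return parent;
}

int
main(void)
{
    unsigned    slot;
    ModelFSMAddress addr = model_get_location(123456789, &slot);

    printf("heap block 123456789 -> level %d page %u, slot %u\n",
           addr.level, addr.logpageno, slot);

    addr = model_get_parent(addr, &slot);
    printf("parent: level %d page %u, slot %u\n",
           addr.level, addr.logpageno, slot);
    return 0;
}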
/* * visibilitymap_truncate - truncate the visibility map * * The caller must hold AccessExclusiveLock on the relation, to ensure that * other backends receive the smgr invalidation event that this function sends * before they access the VM again. * * nheapblocks is the new size of the heap. */ void visibilitymap_truncate(Relation rel, BlockNumber nheapblocks) { BlockNumber newnblocks; /* last remaining block, byte, and bit */ BlockNumber truncBlock = HEAPBLK_TO_MAPBLOCK(nheapblocks); uint32 truncByte = HEAPBLK_TO_MAPBYTE(nheapblocks); uint8 truncBit = HEAPBLK_TO_MAPBIT(nheapblocks); #ifdef TRACE_VISIBILITYMAP elog(DEBUG1, "vm_truncate %s %d", RelationGetRelationName(rel), nheapblocks); #endif RelationOpenSmgr(rel); /* * If no visibility map has been created yet for this relation, there's * nothing to truncate. */ if (!smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM)) return; /* * Unless the new size is exactly at a visibility map page boundary, the * tail bits in the last remaining map page, representing truncated heap * blocks, need to be cleared. This is not only tidy, but also necessary * because we don't get a chance to clear the bits if the heap is extended * again. */ if (truncByte != 0 || truncBit != 0) { Buffer mapBuffer; Page page; char *map; newnblocks = truncBlock + 1; mapBuffer = vm_readbuf(rel, truncBlock, false); if (!BufferIsValid(mapBuffer)) { /* nothing to do, the file was already smaller */ return; } page = BufferGetPage(mapBuffer); map = PageGetContents(page); LockBuffer(mapBuffer, BUFFER_LOCK_EXCLUSIVE); /* Clear out the unwanted bytes. */ MemSet(&map[truncByte + 1], 0, MAPSIZE - (truncByte + 1)); /* * Mask out the unwanted bits of the last remaining byte. * * ((1 << 0) - 1) = 00000000 ((1 << 1) - 1) = 00000001 ... ((1 << 6) - * 1) = 00111111 ((1 << 7) - 1) = 01111111 */ map[truncByte] &= (1 << truncBit) - 1; MarkBufferDirty(mapBuffer); UnlockReleaseBuffer(mapBuffer); } else newnblocks = truncBlock; if (smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM) <= newnblocks) { /* nothing to do, the file was already smaller than requested size */ return; } /* Truncate the unused VM pages, and send smgr inval message */ smgrtruncate(rel->rd_smgr, VISIBILITYMAP_FORKNUM, newnblocks); /* * We might as well update the local smgr_vm_nblocks setting. smgrtruncate * sent an smgr cache inval message, which will cause other backends to * invalidate their copy of smgr_vm_nblocks, and this one too at the next * command boundary. But this ensures it isn't outright wrong until then. */ if (rel->rd_smgr) rel->rd_smgr->smgr_vm_nblocks = newnblocks; }
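/*
 * Illustration (not part of the original source): visibilitymap_truncate()
 * relies on HEAPBLK_TO_MAPBLOCK/MAPBYTE/MAPBIT to split a heap block number
 * into (map page, byte, bit), and on the mask (1 << truncBit) - 1 to keep
 * only the bits for surviving heap blocks.  The standalone sketch below
 * assumes one visibility bit per heap block and a placeholder page payload
 * size; the real macros are defined in visibilitymap.c.
 */
#include <stdio.h>

#define BITS_PER_BYTE   8
#define MODEL_MAPSIZE   8000	/* placeholder for MAPSIZE */
#define BLOCKS_PER_PAGE (MODEL_MAPSIZE * BITS_PER_BYTE)

int
main(void)
{
    unsigned    nheapblocks = 100003;	/* new heap size, in blocks */
    unsigned    truncBlock = nheapblocks / BLOCKS_PER_PAGE;
    unsigned    truncByte = (nheapblocks % BLOCKS_PER_PAGE) / BITS_PER_BYTE;
    unsigned    truncBit = nheapblocks % BITS_PER_BYTE;
    unsigned char lastByte = 0xFF;		/* pretend all bits were set */

    /*
     * Keep only the low truncBit bits of the last surviving byte; bits for
     * truncated heap blocks are cleared, exactly like
     * map[truncByte] &= (1 << truncBit) - 1 above.
     */
    lastByte &= (unsigned char) ((1 << truncBit) - 1);

    printf("block %u -> map page %u, byte %u, bit %u; masked byte = 0x%02X\n",
           nheapblocks, truncBlock, truncByte, truncBit, lastByte);
    return 0;
}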
/* * Helper function to perform deletion of index entries from a bucket. * * This function expects that the caller has acquired a cleanup lock on the * primary bucket page, and will return with a write lock again held on the * primary bucket page. The lock won't necessarily be held continuously, * though, because we'll release it when visiting overflow pages. * * It would be very bad if this function cleaned a page while some other * backend was in the midst of scanning it, because hashgettuple assumes * that the next valid TID will be greater than or equal to the current * valid TID. There can't be any concurrent scans in progress when we first * enter this function because of the cleanup lock we hold on the primary * bucket page, but as soon as we release that lock, there might be. We * handle that by conspiring to prevent those scans from passing our cleanup * scan. To do that, we lock the next page in the bucket chain before * releasing the lock on the previous page. (This type of lock chaining is * not ideal, so we might want to look for a better solution at some point.) * * We need to retain a pin on the primary bucket to ensure that no concurrent * split can start. */ void hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, BlockNumber bucket_blkno, BufferAccessStrategy bstrategy, uint32 maxbucket, uint32 highmask, uint32 lowmask, double *tuples_removed, double *num_index_tuples, bool split_cleanup, IndexBulkDeleteCallback callback, void *callback_state) { BlockNumber blkno; Buffer buf; Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket; bool bucket_dirty = false; blkno = bucket_blkno; buf = bucket_buf; if (split_cleanup) new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket, lowmask, maxbucket); /* Scan each page in bucket */ for (;;) { HashPageOpaque opaque; OffsetNumber offno; OffsetNumber maxoffno; Buffer next_buf; Page page; OffsetNumber deletable[MaxOffsetNumber]; int ndeletable = 0; bool retain_pin = false; bool clear_dead_marking = false; vacuum_delay_point(); page = BufferGetPage(buf); opaque = (HashPageOpaque) PageGetSpecialPointer(page); /* Scan each tuple in page */ maxoffno = PageGetMaxOffsetNumber(page); for (offno = FirstOffsetNumber; offno <= maxoffno; offno = OffsetNumberNext(offno)) { ItemPointer htup; IndexTuple itup; Bucket bucket; bool kill_tuple = false; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offno)); htup = &(itup->t_tid); /* * To remove the dead tuples, we strictly want to rely on results * of callback function. refer btvacuumpage for detailed reason. */ if (callback && callback(htup, callback_state)) { kill_tuple = true; if (tuples_removed) *tuples_removed += 1; } else if (split_cleanup) { /* delete the tuples that are moved by split. */ bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup), maxbucket, highmask, lowmask); /* mark the item for deletion */ if (bucket != cur_bucket) { /* * We expect tuples to either belong to curent bucket or * new_bucket. This is ensured because we don't allow * further splits from bucket that contains garbage. See * comments in _hash_expandtable. 
*/ Assert(bucket == new_bucket); kill_tuple = true; } } if (kill_tuple) { /* mark the item for deletion */ deletable[ndeletable++] = offno; } else { /* we're keeping it, so count it */ if (num_index_tuples) *num_index_tuples += 1; } } /* retain the pin on primary bucket page till end of bucket scan */ if (blkno == bucket_blkno) retain_pin = true; else retain_pin = false; blkno = opaque->hasho_nextblkno; /* * Apply deletions, advance to next page and write page if needed. */ if (ndeletable > 0) { /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); PageIndexMultiDelete(page, deletable, ndeletable); bucket_dirty = true; /* * Let us mark the page as clean if vacuum removes the DEAD tuples * from an index page. We do this by clearing LH_PAGE_HAS_DEAD_TUPLES * flag. */ if (tuples_removed && *tuples_removed > 0 && opaque->hasho_flag & LH_PAGE_HAS_DEAD_TUPLES) { opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES; clear_dead_marking = true; } MarkBufferDirty(buf); /* XLOG stuff */ if (RelationNeedsWAL(rel)) { xl_hash_delete xlrec; XLogRecPtr recptr; xlrec.clear_dead_marking = clear_dead_marking; xlrec.is_primary_bucket_page = (buf == bucket_buf) ? true : false; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHashDelete); /* * bucket buffer needs to be registered to ensure that we can * acquire a cleanup lock on it during replay. */ if (!xlrec.is_primary_bucket_page) XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE); XLogRegisterBuffer(1, buf, REGBUF_STANDARD); XLogRegisterBufData(1, (char *) deletable, ndeletable * sizeof(OffsetNumber)); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE); PageSetLSN(BufferGetPage(buf), recptr); } END_CRIT_SECTION(); } /* bail out if there are no more pages to scan. */ if (!BlockNumberIsValid(blkno)) break; next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE, LH_OVERFLOW_PAGE, bstrategy); /* * release the lock on previous page after acquiring the lock on next * page */ if (retain_pin) LockBuffer(buf, BUFFER_LOCK_UNLOCK); else _hash_relbuf(rel, buf); buf = next_buf; } /* * lock the bucket page to clear the garbage flag and squeeze the bucket. * if the current buffer is same as bucket buffer, then we already have * lock on bucket page. */ if (buf != bucket_buf) { _hash_relbuf(rel, buf); LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE); } /* * Clear the garbage flag from bucket after deleting the tuples that are * moved by split. We purposefully clear the flag before squeeze bucket, * so that after restart, vacuum shouldn't again try to delete the moved * by split tuples. */ if (split_cleanup) { HashPageOpaque bucket_opaque; Page page; page = BufferGetPage(bucket_buf); bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page); /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP; MarkBufferDirty(bucket_buf); /* XLOG stuff */ if (RelationNeedsWAL(rel)) { XLogRecPtr recptr; XLogBeginInsert(); XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP); PageSetLSN(page, recptr); } END_CRIT_SECTION(); } /* * If we have deleted anything, try to compact free space. For squeezing * the bucket, we must have a cleanup lock, else it can impact the * ordering of tuples for a scan that has started before it. */ if (bucket_dirty && IsBufferCleanupOK(bucket_buf)) _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf, bstrategy); else LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK); }
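/*
 * Illustration (not part of the original source): hashbucketcleanup()
 * delegates the "is this tuple dead?" decision entirely to the callback,
 * whose type is IndexBulkDeleteCallback (access/genam.h).  The sketch below
 * shows the shape of such a callback; DemoDeadTids and demo_tid_is_dead are
 * hypothetical names, and the linear search only keeps the example short
 * (VACUUM's real callback looks up its dead-TID array far more efficiently).
 */
#include "postgres.h"
#include "access/genam.h"
#include "storage/itemptr.h"

/* Hypothetical callback state: the TIDs that VACUUM decided are dead. */
typedef struct DemoDeadTids
{
    ItemPointerData *tids;
    int         ntids;
} DemoDeadTids;

/*
 * Matches the IndexBulkDeleteCallback signature: return true if the index
 * entry pointing at 'itemptr' should be removed.
 */
static bool
demo_tid_is_dead(ItemPointer itemptr, void *state)
{
    DemoDeadTids *dead = (DemoDeadTids *) state;
    int         i;

    for (i = 0; i < dead->ntids; i++)
    {
        if (ItemPointerEquals(itemptr, &dead->tids[i]))
            return true;
    }
    return false;
}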
/* * Read a FSM page. * * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is * true, the FSM file is extended. */ static Buffer fsm_readbuf(Relation rel, FSMAddress addr, bool extend) { BlockNumber blkno = fsm_logical_to_physical(addr); Buffer buf; RelationOpenSmgr(rel); /* * If we haven't cached the size of the FSM yet, check it first. Also * recheck if the requested block seems to be past end, since our cached * value might be stale. (We send smgr inval messages on truncation, but * not on extension.) */ if (rel->rd_smgr->smgr_fsm_nblocks == InvalidBlockNumber || blkno >= rel->rd_smgr->smgr_fsm_nblocks) { if (smgrexists(rel->rd_smgr, FSM_FORKNUM)) rel->rd_smgr->smgr_fsm_nblocks = smgrnblocks(rel->rd_smgr, FSM_FORKNUM); else rel->rd_smgr->smgr_fsm_nblocks = 0; } /* Handle requests beyond EOF */ if (blkno >= rel->rd_smgr->smgr_fsm_nblocks) { if (extend) fsm_extend(rel, blkno + 1); else return InvalidBuffer; } /* * Use ZERO_ON_ERROR mode, and initialize the page if necessary. The FSM * information is not accurate anyway, so it's better to clear corrupt * pages than error out. Since the FSM changes are not WAL-logged, the * so-called torn page problem on crash can lead to pages with corrupt * headers, for example. * * The initialize-the-page part is trickier than it looks, because of the * possibility of multiple backends doing this concurrently, and our * desire to not uselessly take the buffer lock in the normal path where * the page is OK. We must take the lock to initialize the page, so * recheck page newness after we have the lock, in case someone else * already did it. Also, because we initially check PageIsNew with no * lock, it's possible to fall through and return the buffer while someone * else is still initializing the page (i.e., we might see pd_upper as set * but other page header fields are still zeroes). This is harmless for * callers that will take a buffer lock themselves, but some callers * inspect the page without any lock at all. The latter is OK only so * long as it doesn't depend on the page header having correct contents. * Current usage is safe because PageGetContents() does not require that. */ buf = ReadBufferExtended(rel, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL); if (PageIsNew(BufferGetPage(buf))) { LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); if (PageIsNew(BufferGetPage(buf))) PageInit(BufferGetPage(buf), BLCKSZ, 0); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } return buf; }
/* * Insert all matching tuples into a bitmap. */ int64 blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) { int64 ntids = 0; BlockNumber blkno = BLOOM_HEAD_BLKNO, npages; int i; BufferAccessStrategy bas; BloomScanOpaque so = (BloomScanOpaque) scan->opaque; if (so->sign == NULL) { /* New search: have to calculate search signature */ ScanKey skey = scan->keyData; so->sign = palloc0(sizeof(BloomSignatureWord) * so->state.opts.bloomLength); for (i = 0; i < scan->numberOfKeys; i++) { /* * Assume bloom-indexable operators to be strict, so nothing could * be found for NULL key. */ if (skey->sk_flags & SK_ISNULL) { pfree(so->sign); so->sign = NULL; return 0; } /* Add next value to the signature */ signValue(&so->state, so->sign, skey->sk_argument, skey->sk_attno - 1); skey++; } } /* * We're going to read the whole index. This is why we use appropriate * buffer access strategy. */ bas = GetAccessStrategy(BAS_BULKREAD); npages = RelationGetNumberOfBlocks(scan->indexRelation); for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++) { Buffer buffer; Page page; buffer = ReadBufferExtended(scan->indexRelation, MAIN_FORKNUM, blkno, RBM_NORMAL, bas); LockBuffer(buffer, BUFFER_LOCK_SHARE); page = BufferGetPage(buffer); TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page); if (!PageIsNew(page) && !BloomPageIsDeleted(page)) { OffsetNumber offset, maxOffset = BloomPageGetMaxOffset(page); for (offset = 1; offset <= maxOffset; offset++) { BloomTuple *itup = BloomPageGetTuple(&so->state, page, offset); bool res = true; /* Check index signature with scan signature */ for (i = 0; i < so->state.opts.bloomLength; i++) { if ((itup->sign[i] & so->sign[i]) != so->sign[i]) { res = false; break; } } /* Add matching tuples to bitmap */ if (res) { tbm_add_tuples(tbm, &itup->heapPtr, 1, true); ntids++; } } } UnlockReleaseBuffer(buffer); CHECK_FOR_INTERRUPTS(); } FreeAccessStrategy(bas); return ntids; }
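/*
 * Illustration (not part of the original source): the per-tuple test in
 * blgetbitmap() is a bitwise containment check -- a tuple can match only if
 * every bit set in the scan signature is also set in the tuple's signature.
 * The standalone model below uses a made-up word count; and because a bloom
 * signature is lossy, a "match" only means the heap tuple must be rechecked,
 * never that it certainly qualifies.
 */
#include <stdio.h>
#include <stdbool.h>

#define MODEL_SIGN_WORDS 4		/* stands in for so->state.opts.bloomLength */

/*
 * True if every bit of 'query' is present in 'indexed' -- the inverse of the
 * (itup->sign[i] & so->sign[i]) != so->sign[i] rejection test above.
 */
static bool
signature_may_match(const unsigned short *indexed, const unsigned short *query)
{
    int         i;

    for (i = 0; i < MODEL_SIGN_WORDS; i++)
    {
        if ((indexed[i] & query[i]) != query[i])
            return false;
    }
    return true;
}

int
main(void)
{
    unsigned short indexed[MODEL_SIGN_WORDS] = {0x00F3, 0x8001, 0x0000, 0x7777};
    unsigned short query[MODEL_SIGN_WORDS] = {0x0011, 0x8000, 0x0000, 0x0707};

    printf("may match: %d\n", signature_may_match(indexed, query));

    query[2] = 0x0004;			/* a bit the indexed signature lacks */
    printf("may match now: %d\n", signature_may_match(indexed, query));
    return 0;
}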
/*
 * hashgettuple() -- Get the next tuple in the scan.
 */
bool
hashgettuple(IndexScanDesc scan, ScanDirection dir)
{
    HashScanOpaque so = (HashScanOpaque) scan->opaque;
    Relation    rel = scan->indexRelation;
    Buffer      buf;
    Page        page;
    OffsetNumber offnum;
    ItemPointer current;
    bool        res;

    /* Hash indexes are always lossy since we store only the hash code */
    scan->xs_recheck = true;

    /*
     * We hold pin but not lock on current buffer while outside the hash AM.
     * Reacquire the read lock here.
     */
    if (BufferIsValid(so->hashso_curbuf))
        LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE);

    /*
     * If we've already initialized this scan, we can just advance it in the
     * appropriate direction.  If we haven't done so yet, we call a routine
     * to get the first item in the scan.
     */
    current = &(so->hashso_curpos);
    if (ItemPointerIsValid(current))
    {
        /*
         * An insertion into the current index page could have happened while
         * we didn't have read lock on it.  Re-find our position by looking
         * for the TID we previously returned.  (Because we hold a pin on the
         * primary bucket page, no deletions or splits could have occurred;
         * therefore we can expect that the TID still exists in the current
         * index page, at an offset >= where we were.)
         */
        OffsetNumber maxoffnum;

        buf = so->hashso_curbuf;
        Assert(BufferIsValid(buf));
        page = BufferGetPage(buf);

        /*
         * We don't need to test for an old snapshot here, as the current
         * buffer is pinned, so vacuum can't clean the page.
         */
        maxoffnum = PageGetMaxOffsetNumber(page);
        for (offnum = ItemPointerGetOffsetNumber(current);
             offnum <= maxoffnum;
             offnum = OffsetNumberNext(offnum))
        {
            IndexTuple  itup;

            itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
            if (ItemPointerEquals(&(so->hashso_heappos), &(itup->t_tid)))
                break;
        }
        if (offnum > maxoffnum)
            elog(ERROR, "failed to re-find scan position within index \"%s\"",
                 RelationGetRelationName(rel));
        ItemPointerSetOffsetNumber(current, offnum);

        /*
         * Check to see if we should kill the previously-fetched tuple.
         */
        if (scan->kill_prior_tuple)
        {
            /*
             * Yes, so remember it for later.  (We'll deal with all such
             * tuples at once right after leaving the index page or at end
             * of scan.)  If the caller reverses the indexscan direction, it
             * is quite possible that the same item might get entered
             * multiple times.  But we don't detect that; instead, we just
             * forget any excess entries.
             */
            if (so->killedItems == NULL)
                so->killedItems = palloc(MaxIndexTuplesPerPage *
                                         sizeof(HashScanPosItem));

            if (so->numKilled < MaxIndexTuplesPerPage)
            {
                so->killedItems[so->numKilled].heapTid = so->hashso_heappos;
                so->killedItems[so->numKilled].indexOffset =
                    ItemPointerGetOffsetNumber(&(so->hashso_curpos));
                so->numKilled++;
            }
        }

        /*
         * Now continue the scan.
         */
        res = _hash_next(scan, dir);
    }
    else
        res = _hash_first(scan, dir);

    /*
     * Skip killed tuples if asked to.
     */
    if (scan->ignore_killed_tuples)
    {
        while (res)
        {
            offnum = ItemPointerGetOffsetNumber(current);
            page = BufferGetPage(so->hashso_curbuf);
            if (!ItemIdIsDead(PageGetItemId(page, offnum)))
                break;
            res = _hash_next(scan, dir);
        }
    }

    /* Release read lock on current buffer, but keep it pinned */
    if (BufferIsValid(so->hashso_curbuf))
        LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK);

    /* Return current heap TID on success */
    scan->xs_ctup.t_self = so->hashso_heappos;

    return res;
}
/* * _hash_init() -- Initialize the metadata page of a hash index, * the initial buckets, and the initial bitmap page. * * The initial number of buckets is dependent on num_tuples, an estimate * of the number of tuples to be loaded into the index initially. The * chosen number of buckets is returned. * * We are fairly cavalier about locking here, since we know that no one else * could be accessing this index. In particular the rule about not holding * multiple buffer locks is ignored. */ uint32 _hash_init(Relation rel, double num_tuples, ForkNumber forkNum) { Buffer metabuf; Buffer buf; Buffer bitmapbuf; Page pg; HashMetaPage metap; RegProcedure procid; int32 data_width; int32 item_width; int32 ffactor; uint32 num_buckets; uint32 i; bool use_wal; /* safety check */ if (RelationGetNumberOfBlocksInFork(rel, forkNum) != 0) elog(ERROR, "cannot initialize non-empty hash index \"%s\"", RelationGetRelationName(rel)); /* * WAL log creation of pages if the relation is persistent, or this is the * init fork. Init forks for unlogged relations always need to be WAL * logged. */ use_wal = RelationNeedsWAL(rel) || forkNum == INIT_FORKNUM; /* * Determine the target fill factor (in tuples per bucket) for this index. * The idea is to make the fill factor correspond to pages about as full * as the user-settable fillfactor parameter says. We can compute it * exactly since the index datatype (i.e. uint32 hash key) is fixed-width. */ data_width = sizeof(uint32); item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) + sizeof(ItemIdData); /* include the line pointer */ ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / item_width; /* keep to a sane range */ if (ffactor < 10) ffactor = 10; procid = index_getprocid(rel, 1, HASHSTANDARD_PROC); /* * We initialize the metapage, the first N bucket pages, and the first * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend() * calls to occur. This ensures that the smgr level has the right idea of * the physical index length. * * Critical section not required, because on error the creation of the * whole relation will be rolled back. */ metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum); _hash_init_metabuffer(metabuf, num_tuples, procid, ffactor, false); MarkBufferDirty(metabuf); pg = BufferGetPage(metabuf); metap = HashPageGetMeta(pg); /* XLOG stuff */ if (use_wal) { xl_hash_init_meta_page xlrec; XLogRecPtr recptr; xlrec.num_tuples = num_tuples; xlrec.procid = metap->hashm_procid; xlrec.ffactor = metap->hashm_ffactor; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHashInitMetaPage); XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_META_PAGE); PageSetLSN(BufferGetPage(metabuf), recptr); } num_buckets = metap->hashm_maxbucket + 1; /* * Release buffer lock on the metapage while we initialize buckets. * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS * won't accomplish anything. It's a bad idea to hold buffer locks for * long intervals in any case, since that can block the bgwriter. 
*/ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); /* * Initialize and WAL Log the first N buckets */ for (i = 0; i < num_buckets; i++) { BlockNumber blkno; /* Allow interrupts, in case N is huge */ CHECK_FOR_INTERRUPTS(); blkno = BUCKET_TO_BLKNO(metap, i); buf = _hash_getnewbuf(rel, blkno, forkNum); _hash_initbuf(buf, metap->hashm_maxbucket, i, LH_BUCKET_PAGE, false); MarkBufferDirty(buf); if (use_wal) log_newpage(&rel->rd_node, forkNum, blkno, BufferGetPage(buf), true); _hash_relbuf(rel, buf); } /* Now reacquire buffer lock on metapage */ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); /* * Initialize bitmap page */ bitmapbuf = _hash_getnewbuf(rel, num_buckets + 1, forkNum); _hash_initbitmapbuffer(bitmapbuf, metap->hashm_bmsize, false); MarkBufferDirty(bitmapbuf); /* add the new bitmap page to the metapage's list of bitmaps */ /* metapage already has a write lock */ if (metap->hashm_nmaps >= HASH_MAX_BITMAPS) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("out of overflow pages in hash index \"%s\"", RelationGetRelationName(rel)))); metap->hashm_mapp[metap->hashm_nmaps] = num_buckets + 1; metap->hashm_nmaps++; MarkBufferDirty(metabuf); /* XLOG stuff */ if (use_wal) { xl_hash_init_bitmap_page xlrec; XLogRecPtr recptr; xlrec.bmsize = metap->hashm_bmsize; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHashInitBitmapPage); XLogRegisterBuffer(0, bitmapbuf, REGBUF_WILL_INIT); /* * This is safe only because nobody else can be modifying the index at * this stage; it's only visible to the transaction that is creating * it. */ XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_BITMAP_PAGE); PageSetLSN(BufferGetPage(bitmapbuf), recptr); PageSetLSN(BufferGetPage(metabuf), recptr); } /* all done */ _hash_relbuf(rel, bitmapbuf); _hash_relbuf(rel, metabuf); return num_buckets; }
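/*
 * Illustration (not part of the original source): the fill factor computed
 * in _hash_init() is simply the target page usage divided by the per-tuple
 * footprint.  The worked sketch below assumes 8 kB blocks, the usual default
 * hash fillfactor of 75, 8-byte MAXALIGN, an 8-byte IndexTupleData header
 * and a 4-byte line pointer; the real numbers come from
 * RelationGetTargetPageUsage and the compiled-in struct sizes, so treat
 * these values as illustrative.
 */
#include <stdio.h>

#define MODEL_BLCKSZ      8192	/* assumed block size */
#define MODEL_FILLFACTOR  75	/* assumed default hash fillfactor */
#define MODEL_MAXALIGN(x) (((x) + 7) & ~7L)		/* assumes 8-byte alignment */

int
main(void)
{
    long        data_width = 4;		/* sizeof(uint32) hash code */
    long        tuple_header = 8;	/* sizeof(IndexTupleData), assumed */
    long        line_pointer = 4;	/* sizeof(ItemIdData), assumed */
    long        item_width;
    long        target_usage;
    long        ffactor;

    item_width = MODEL_MAXALIGN(tuple_header) +
        MODEL_MAXALIGN(data_width) + line_pointer;
    target_usage = (MODEL_BLCKSZ * MODEL_FILLFACTOR) / 100;
    ffactor = target_usage / item_width;
    if (ffactor < 10)
        ffactor = 10;

    /* With these assumptions: item_width = 20, ffactor = 307 */
    printf("item_width = %ld bytes, ffactor = %ld tuples/bucket\n",
           item_width, ffactor);
    return 0;
}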
/* * Attempt to expand the hash table by creating one new bucket. * * This will silently do nothing if we don't get cleanup lock on old or * new bucket. * * Complete the pending splits and remove the tuples from old bucket, * if there are any left over from the previous split. * * The caller must hold a pin, but no lock, on the metapage buffer. * The buffer is returned in the same state. */ void _hash_expandtable(Relation rel, Buffer metabuf) { HashMetaPage metap; Bucket old_bucket; Bucket new_bucket; uint32 spare_ndx; BlockNumber start_oblkno; BlockNumber start_nblkno; Buffer buf_nblkno; Buffer buf_oblkno; Page opage; Page npage; HashPageOpaque oopaque; HashPageOpaque nopaque; uint32 maxbucket; uint32 highmask; uint32 lowmask; bool metap_update_masks = false; bool metap_update_splitpoint = false; restart_expand: /* * Write-lock the meta page. It used to be necessary to acquire a * heavyweight lock to begin a split, but that is no longer required. */ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); _hash_checkpage(rel, metabuf, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); /* * Check to see if split is still needed; someone else might have already * done one while we waited for the lock. * * Make sure this stays in sync with _hash_doinsert() */ if (metap->hashm_ntuples <= (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1)) goto fail; /* * Can't split anymore if maxbucket has reached its maximum possible * value. * * Ideally we'd allow bucket numbers up to UINT_MAX-1 (no higher because * the calculation maxbucket+1 mustn't overflow). Currently we restrict * to half that because of overflow looping in _hash_log2() and * insufficient space in hashm_spares[]. It's moot anyway because an * index with 2^32 buckets would certainly overflow BlockNumber and hence * _hash_alloc_buckets() would fail, but if we supported buckets smaller * than a disk block then this would be an independent constraint. * * If you change this, see also the maximum initial number of buckets in * _hash_init(). */ if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE) goto fail; /* * Determine which bucket is to be split, and attempt to take cleanup lock * on the old bucket. If we can't get the lock, give up. * * The cleanup lock protects us not only against other backends, but * against our own backend as well. * * The cleanup lock is mainly to protect the split from concurrent * inserts. See src/backend/access/hash/README, Lock Definitions for * further details. Due to this locking restriction, if there is any * pending scan, the split will give up which is not good, but harmless. */ new_bucket = metap->hashm_maxbucket + 1; old_bucket = (new_bucket & metap->hashm_lowmask); start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket); buf_oblkno = _hash_getbuf_with_condlock_cleanup(rel, start_oblkno, LH_BUCKET_PAGE); if (!buf_oblkno) goto fail; opage = BufferGetPage(buf_oblkno); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); /* * We want to finish the split from a bucket as there is no apparent * benefit by not doing so and it will make the code complicated to finish * the split that involves multiple buckets considering the case where new * split also fails. We don't need to consider the new bucket for * completing the split here as it is not possible that a re-split of new * bucket starts when there is still a pending split from old bucket. 
*/ if (H_BUCKET_BEING_SPLIT(oopaque)) { /* * Copy bucket mapping info now; refer the comment in code below where * we copy this information before calling _hash_splitbucket to see * why this is okay. */ maxbucket = metap->hashm_maxbucket; highmask = metap->hashm_highmask; lowmask = metap->hashm_lowmask; /* * Release the lock on metapage and old_bucket, before completing the * split. */ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); LockBuffer(buf_oblkno, BUFFER_LOCK_UNLOCK); _hash_finish_split(rel, metabuf, buf_oblkno, old_bucket, maxbucket, highmask, lowmask); /* release the pin on old buffer and retry for expand. */ _hash_dropbuf(rel, buf_oblkno); goto restart_expand; } /* * Clean the tuples remained from the previous split. This operation * requires cleanup lock and we already have one on the old bucket, so * let's do it. We also don't want to allow further splits from the bucket * till the garbage of previous split is cleaned. This has two * advantages; first, it helps in avoiding the bloat due to garbage and * second is, during cleanup of bucket, we are always sure that the * garbage tuples belong to most recently split bucket. On the contrary, * if we allow cleanup of bucket after meta page is updated to indicate * the new split and before the actual split, the cleanup operation won't * be able to decide whether the tuple has been moved to the newly created * bucket and ended up deleting such tuples. */ if (H_NEEDS_SPLIT_CLEANUP(oopaque)) { /* * Copy bucket mapping info now; refer to the comment in code below * where we copy this information before calling _hash_splitbucket to * see why this is okay. */ maxbucket = metap->hashm_maxbucket; highmask = metap->hashm_highmask; lowmask = metap->hashm_lowmask; /* Release the metapage lock. */ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); hashbucketcleanup(rel, old_bucket, buf_oblkno, start_oblkno, NULL, maxbucket, highmask, lowmask, NULL, NULL, true, NULL, NULL); _hash_dropbuf(rel, buf_oblkno); goto restart_expand; } /* * There shouldn't be any active scan on new bucket. * * Note: it is safe to compute the new bucket's blkno here, even though we * may still need to update the BUCKET_TO_BLKNO mapping. This is because * the current value of hashm_spares[hashm_ovflpoint] correctly shows * where we are going to put a new splitpoint's worth of buckets. */ start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket); /* * If the split point is increasing we need to allocate a new batch of * bucket pages. */ spare_ndx = _hash_spareindex(new_bucket + 1); if (spare_ndx > metap->hashm_ovflpoint) { uint32 buckets_to_add; Assert(spare_ndx == metap->hashm_ovflpoint + 1); /* * We treat allocation of buckets as a separate WAL-logged action. * Even if we fail after this operation, won't leak bucket pages; * rather, the next split will consume this space. In any case, even * without failure we don't use all the space in one split operation. */ buckets_to_add = _hash_get_totalbuckets(spare_ndx) - new_bucket; if (!_hash_alloc_buckets(rel, start_nblkno, buckets_to_add)) { /* can't split due to BlockNumber overflow */ _hash_relbuf(rel, buf_oblkno); goto fail; } } /* * Physically allocate the new bucket's primary page. We want to do this * before changing the metapage's mapping info, in case we can't get the * disk space. Ideally, we don't need to check for cleanup lock on new * bucket as no other backend could find this bucket unless meta page is * updated. However, it is good to be consistent with old bucket locking. 
*/ buf_nblkno = _hash_getnewbuf(rel, start_nblkno, MAIN_FORKNUM); if (!IsBufferCleanupOK(buf_nblkno)) { _hash_relbuf(rel, buf_oblkno); _hash_relbuf(rel, buf_nblkno); goto fail; } /* * Since we are scribbling on the pages in the shared buffers, establish a * critical section. Any failure in this next code leaves us with a big * problem: the metapage is effectively corrupt but could get written back * to disk. */ START_CRIT_SECTION(); /* * Okay to proceed with split. Update the metapage bucket mapping info. */ metap->hashm_maxbucket = new_bucket; if (new_bucket > metap->hashm_highmask) { /* Starting a new doubling */ metap->hashm_lowmask = metap->hashm_highmask; metap->hashm_highmask = new_bucket | metap->hashm_lowmask; metap_update_masks = true; } /* * If the split point is increasing we need to adjust the hashm_spares[] * array and hashm_ovflpoint so that future overflow pages will be created * beyond this new batch of bucket pages. */ if (spare_ndx > metap->hashm_ovflpoint) { metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint]; metap->hashm_ovflpoint = spare_ndx; metap_update_splitpoint = true; } MarkBufferDirty(metabuf); /* * Copy bucket mapping info now; this saves re-accessing the meta page * inside _hash_splitbucket's inner loop. Note that once we drop the * split lock, other splits could begin, so these values might be out of * date before _hash_splitbucket finishes. That's okay, since all it * needs is to tell which of these two buckets to map hashkeys into. */ maxbucket = metap->hashm_maxbucket; highmask = metap->hashm_highmask; lowmask = metap->hashm_lowmask; opage = BufferGetPage(buf_oblkno); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); /* * Mark the old bucket to indicate that split is in progress. (At * operation end, we will clear the split-in-progress flag.) Also, for a * primary bucket page, hasho_prevblkno stores the number of buckets that * existed as of the last split, so we must update that value here. */ oopaque->hasho_flag |= LH_BUCKET_BEING_SPLIT; oopaque->hasho_prevblkno = maxbucket; MarkBufferDirty(buf_oblkno); npage = BufferGetPage(buf_nblkno); /* * initialize the new bucket's primary page and mark it to indicate that * split is in progress. 
*/ nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); nopaque->hasho_prevblkno = maxbucket; nopaque->hasho_nextblkno = InvalidBlockNumber; nopaque->hasho_bucket = new_bucket; nopaque->hasho_flag = LH_BUCKET_PAGE | LH_BUCKET_BEING_POPULATED; nopaque->hasho_page_id = HASHO_PAGE_ID; MarkBufferDirty(buf_nblkno); /* XLOG stuff */ if (RelationNeedsWAL(rel)) { xl_hash_split_allocate_page xlrec; XLogRecPtr recptr; xlrec.new_bucket = maxbucket; xlrec.old_bucket_flag = oopaque->hasho_flag; xlrec.new_bucket_flag = nopaque->hasho_flag; xlrec.flags = 0; XLogBeginInsert(); XLogRegisterBuffer(0, buf_oblkno, REGBUF_STANDARD); XLogRegisterBuffer(1, buf_nblkno, REGBUF_WILL_INIT); XLogRegisterBuffer(2, metabuf, REGBUF_STANDARD); if (metap_update_masks) { xlrec.flags |= XLH_SPLIT_META_UPDATE_MASKS; XLogRegisterBufData(2, (char *) &metap->hashm_lowmask, sizeof(uint32)); XLogRegisterBufData(2, (char *) &metap->hashm_highmask, sizeof(uint32)); } if (metap_update_splitpoint) { xlrec.flags |= XLH_SPLIT_META_UPDATE_SPLITPOINT; XLogRegisterBufData(2, (char *) &metap->hashm_ovflpoint, sizeof(uint32)); XLogRegisterBufData(2, (char *) &metap->hashm_spares[metap->hashm_ovflpoint], sizeof(uint32)); } XLogRegisterData((char *) &xlrec, SizeOfHashSplitAllocPage); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_ALLOCATE_PAGE); PageSetLSN(BufferGetPage(buf_oblkno), recptr); PageSetLSN(BufferGetPage(buf_nblkno), recptr); PageSetLSN(BufferGetPage(metabuf), recptr); } END_CRIT_SECTION(); /* drop lock, but keep pin */ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); /* Relocate records to the new bucket */ _hash_splitbucket(rel, metabuf, old_bucket, new_bucket, buf_oblkno, buf_nblkno, NULL, maxbucket, highmask, lowmask); /* all done, now release the pins on primary buckets. */ _hash_dropbuf(rel, buf_oblkno); _hash_dropbuf(rel, buf_nblkno); return; /* Here if decide not to split or fail to acquire old bucket lock */ fail: /* We didn't write the metapage, so just drop lock */ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); }
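/*
 * Illustration (not part of the original source): the metapage update above
 * starts a "new doubling" whenever the bucket being added exceeds
 * hashm_highmask.  The standalone trace below repeats exactly that update,
 * starting from a four-bucket table whose masks are assumed to be
 * lowmask = 3 and highmask = 7 (illustrative values for that size).
 */
#include <stdio.h>

int
main(void)
{
    unsigned    maxbucket = 3;	/* buckets 0..3 exist */
    unsigned    lowmask = 3;
    unsigned    highmask = 7;
    int         i;

    for (i = 0; i < 6; i++)
    {
        unsigned    new_bucket = maxbucket + 1;

        /* same update as in _hash_expandtable above */
        maxbucket = new_bucket;
        if (new_bucket > highmask)
        {
            /* starting a new doubling */
            lowmask = highmask;
            highmask = new_bucket | lowmask;
        }
        printf("added bucket %u: lowmask=%u highmask=%u\n",
               new_bucket, lowmask, highmask);
    }
    /* The masks stay put until bucket 8 appears, then become 7 and 15. */
    return 0;
}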
/* * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket' * * This routine is used to partition the tuples between old and new bucket and * is used to finish the incomplete split operations. To finish the previously * interrupted split operation, the caller needs to fill htab. If htab is set, * then we skip the movement of tuples that exists in htab, otherwise NULL * value of htab indicates movement of all the tuples that belong to the new * bucket. * * We are splitting a bucket that consists of a base bucket page and zero * or more overflow (bucket chain) pages. We must relocate tuples that * belong in the new bucket. * * The caller must hold cleanup locks on both buckets to ensure that * no one else is trying to access them (see README). * * The caller must hold a pin, but no lock, on the metapage buffer. * The buffer is returned in the same state. (The metapage is only * touched if it becomes necessary to add or remove overflow pages.) * * Split needs to retain pin on primary bucket pages of both old and new * buckets till end of operation. This is to prevent vacuum from starting * while a split is in progress. * * In addition, the caller must have created the new bucket's base page, * which is passed in buffer nbuf, pinned and write-locked. The lock will be * released here and pin must be released by the caller. (The API is set up * this way because we must do _hash_getnewbuf() before releasing the metapage * write lock. So instead of passing the new bucket's start block number, we * pass an actual buffer.) */ static void _hash_splitbucket(Relation rel, Buffer metabuf, Bucket obucket, Bucket nbucket, Buffer obuf, Buffer nbuf, HTAB *htab, uint32 maxbucket, uint32 highmask, uint32 lowmask) { Buffer bucket_obuf; Buffer bucket_nbuf; Page opage; Page npage; HashPageOpaque oopaque; HashPageOpaque nopaque; OffsetNumber itup_offsets[MaxIndexTuplesPerPage]; IndexTuple itups[MaxIndexTuplesPerPage]; Size all_tups_size = 0; int i; uint16 nitups = 0; bucket_obuf = obuf; opage = BufferGetPage(obuf); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); bucket_nbuf = nbuf; npage = BufferGetPage(nbuf); nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); /* * Partition the tuples in the old bucket between the old bucket and the * new bucket, advancing along the old bucket's overflow bucket chain and * adding overflow pages to the new bucket as needed. Outer loop iterates * once per page in old bucket. */ for (;;) { BlockNumber oblkno; OffsetNumber ooffnum; OffsetNumber omaxoffnum; /* Scan each tuple in old page */ omaxoffnum = PageGetMaxOffsetNumber(opage); for (ooffnum = FirstOffsetNumber; ooffnum <= omaxoffnum; ooffnum = OffsetNumberNext(ooffnum)) { IndexTuple itup; Size itemsz; Bucket bucket; bool found = false; /* skip dead tuples */ if (ItemIdIsDead(PageGetItemId(opage, ooffnum))) continue; /* * Before inserting a tuple, probe the hash table containing TIDs * of tuples belonging to new bucket, if we find a match, then * skip that tuple, else fetch the item's hash key (conveniently * stored in the item) and determine which bucket it now belongs * in. */ itup = (IndexTuple) PageGetItem(opage, PageGetItemId(opage, ooffnum)); if (htab) (void) hash_search(htab, &itup->t_tid, HASH_FIND, &found); if (found) continue; bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup), maxbucket, highmask, lowmask); if (bucket == nbucket) { IndexTuple new_itup; /* * make a copy of index tuple as we have to scribble on it. 
*/ new_itup = CopyIndexTuple(itup); /* * mark the index tuple as moved by split, such tuples are * skipped by scan if there is split in progress for a bucket. */ new_itup->t_info |= INDEX_MOVED_BY_SPLIT_MASK; /* * insert the tuple into the new bucket. if it doesn't fit on * the current page in the new bucket, we must allocate a new * overflow page and place the tuple on that page instead. */ itemsz = IndexTupleDSize(*new_itup); itemsz = MAXALIGN(itemsz); if (PageGetFreeSpaceForMultipleTuples(npage, nitups + 1) < (all_tups_size + itemsz)) { /* * Change the shared buffer state in critical section, * otherwise any error could make it unrecoverable. */ START_CRIT_SECTION(); _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups); MarkBufferDirty(nbuf); /* log the split operation before releasing the lock */ log_split_page(rel, nbuf); END_CRIT_SECTION(); /* drop lock, but keep pin */ LockBuffer(nbuf, BUFFER_LOCK_UNLOCK); /* be tidy */ for (i = 0; i < nitups; i++) pfree(itups[i]); nitups = 0; all_tups_size = 0; /* chain to a new overflow page */ nbuf = _hash_addovflpage(rel, metabuf, nbuf, (nbuf == bucket_nbuf) ? true : false); npage = BufferGetPage(nbuf); nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); } itups[nitups++] = new_itup; all_tups_size += itemsz; } else { /* * the tuple stays on this page, so nothing to do. */ Assert(bucket == obucket); } } oblkno = oopaque->hasho_nextblkno; /* retain the pin on the old primary bucket */ if (obuf == bucket_obuf) LockBuffer(obuf, BUFFER_LOCK_UNLOCK); else _hash_relbuf(rel, obuf); /* Exit loop if no more overflow pages in old bucket */ if (!BlockNumberIsValid(oblkno)) { /* * Change the shared buffer state in critical section, otherwise * any error could make it unrecoverable. */ START_CRIT_SECTION(); _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups); MarkBufferDirty(nbuf); /* log the split operation before releasing the lock */ log_split_page(rel, nbuf); END_CRIT_SECTION(); if (nbuf == bucket_nbuf) LockBuffer(nbuf, BUFFER_LOCK_UNLOCK); else _hash_relbuf(rel, nbuf); /* be tidy */ for (i = 0; i < nitups; i++) pfree(itups[i]); break; } /* Else, advance to next old page */ obuf = _hash_getbuf(rel, oblkno, HASH_READ, LH_OVERFLOW_PAGE); opage = BufferGetPage(obuf); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); } /* * We're at the end of the old bucket chain, so we're done partitioning * the tuples. Mark the old and new buckets to indicate split is * finished. * * To avoid deadlocks due to locking order of buckets, first lock the old * bucket and then the new bucket. */ LockBuffer(bucket_obuf, BUFFER_LOCK_EXCLUSIVE); opage = BufferGetPage(bucket_obuf); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); LockBuffer(bucket_nbuf, BUFFER_LOCK_EXCLUSIVE); npage = BufferGetPage(bucket_nbuf); nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); START_CRIT_SECTION(); oopaque->hasho_flag &= ~LH_BUCKET_BEING_SPLIT; nopaque->hasho_flag &= ~LH_BUCKET_BEING_POPULATED; /* * After the split is finished, mark the old bucket to indicate that it * contains deletable tuples. We will clear split-cleanup flag after * deleting such tuples either at the end of split or at the next split * from old bucket or at the time of vacuum. */ oopaque->hasho_flag |= LH_BUCKET_NEEDS_SPLIT_CLEANUP; /* * now write the buffers, here we don't release the locks as caller is * responsible to release locks. 
*/ MarkBufferDirty(bucket_obuf); MarkBufferDirty(bucket_nbuf); if (RelationNeedsWAL(rel)) { XLogRecPtr recptr; xl_hash_split_complete xlrec; xlrec.old_bucket_flag = oopaque->hasho_flag; xlrec.new_bucket_flag = nopaque->hasho_flag; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHashSplitComplete); XLogRegisterBuffer(0, bucket_obuf, REGBUF_STANDARD); XLogRegisterBuffer(1, bucket_nbuf, REGBUF_STANDARD); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_COMPLETE); PageSetLSN(BufferGetPage(bucket_obuf), recptr); PageSetLSN(BufferGetPage(bucket_nbuf), recptr); } END_CRIT_SECTION(); /* * If possible, clean up the old bucket. We might not be able to do this * if someone else has a pin on it, but if not then we can go ahead. This * isn't absolutely necessary, but it reduces bloat; if we don't do it * now, VACUUM will do it eventually, but maybe not until new overflow * pages have been allocated. Note that there's no need to clean up the * new bucket. */ if (IsBufferCleanupOK(bucket_obuf)) { LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK); hashbucketcleanup(rel, obucket, bucket_obuf, BufferGetBlockNumber(bucket_obuf), NULL, maxbucket, highmask, lowmask, NULL, NULL, true, NULL, NULL); } else { LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK); LockBuffer(bucket_obuf, BUFFER_LOCK_UNLOCK); } }
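/*
 * Illustration (not part of the original source): both the split loop above
 * and hashbucketcleanup() decide a tuple's destination with
 * _hash_hashkey2bucket(hashkey, maxbucket, highmask, lowmask).  The model
 * below reflects the conventional mask-then-reduce approach of that helper
 * (mask with highmask; if the result names a bucket that doesn't exist yet,
 * fall back to lowmask); see hashutil.c for the authoritative version.  The
 * mask values and keys are made up.
 */
#include <stdio.h>

static unsigned
model_hashkey2bucket(unsigned hashkey, unsigned maxbucket,
                     unsigned highmask, unsigned lowmask)
{
    unsigned    bucket = hashkey & highmask;

    if (bucket > maxbucket)
        bucket = bucket & lowmask;
    return bucket;
}

int
main(void)
{
    /* state while bucket 5 (whose old bucket is 5 & lowmask = 1) is split */
    unsigned    maxbucket = 5;
    unsigned    highmask = 7;
    unsigned    lowmask = 3;
    unsigned    keys[] = {0x1D, 0x21, 0x0F};
    int         i;

    for (i = 0; i < 3; i++)
        printf("hashkey 0x%02X -> bucket %u\n", keys[i],
               model_hashkey2bucket(keys[i], maxbucket, highmask, lowmask));
    return 0;
}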
/* * _hash_finish_split() -- Finish the previously interrupted split operation * * To complete the split operation, we form the hash table of TIDs in new * bucket which is then used by split operation to skip tuples that are * already moved before the split operation was previously interrupted. * * The caller must hold a pin, but no lock, on the metapage and old bucket's * primary page buffer. The buffers are returned in the same state. (The * metapage is only touched if it becomes necessary to add or remove overflow * pages.) */ void _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket, uint32 maxbucket, uint32 highmask, uint32 lowmask) { HASHCTL hash_ctl; HTAB *tidhtab; Buffer bucket_nbuf = InvalidBuffer; Buffer nbuf; Page npage; BlockNumber nblkno; BlockNumber bucket_nblkno; HashPageOpaque npageopaque; Bucket nbucket; bool found; /* Initialize hash tables used to track TIDs */ memset(&hash_ctl, 0, sizeof(hash_ctl)); hash_ctl.keysize = sizeof(ItemPointerData); hash_ctl.entrysize = sizeof(ItemPointerData); hash_ctl.hcxt = CurrentMemoryContext; tidhtab = hash_create("bucket ctids", 256, /* arbitrary initial size */ &hash_ctl, HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); bucket_nblkno = nblkno = _hash_get_newblock_from_oldbucket(rel, obucket); /* * Scan the new bucket and build hash table of TIDs */ for (;;) { OffsetNumber noffnum; OffsetNumber nmaxoffnum; nbuf = _hash_getbuf(rel, nblkno, HASH_READ, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); /* remember the primary bucket buffer to acquire cleanup lock on it. */ if (nblkno == bucket_nblkno) bucket_nbuf = nbuf; npage = BufferGetPage(nbuf); npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage); /* Scan each tuple in new page */ nmaxoffnum = PageGetMaxOffsetNumber(npage); for (noffnum = FirstOffsetNumber; noffnum <= nmaxoffnum; noffnum = OffsetNumberNext(noffnum)) { IndexTuple itup; /* Fetch the item's TID and insert it in hash table. */ itup = (IndexTuple) PageGetItem(npage, PageGetItemId(npage, noffnum)); (void) hash_search(tidhtab, &itup->t_tid, HASH_ENTER, &found); Assert(!found); } nblkno = npageopaque->hasho_nextblkno; /* * release our write lock without modifying buffer and ensure to * retain the pin on primary bucket. */ if (nbuf == bucket_nbuf) LockBuffer(nbuf, BUFFER_LOCK_UNLOCK); else _hash_relbuf(rel, nbuf); /* Exit loop if no more overflow pages in new bucket */ if (!BlockNumberIsValid(nblkno)) break; } /* * Conditionally get the cleanup lock on old and new buckets to perform * the split operation. If we don't get the cleanup locks, silently give * up and next insertion on old bucket will try again to complete the * split. */ if (!ConditionalLockBufferForCleanup(obuf)) { hash_destroy(tidhtab); return; } if (!ConditionalLockBufferForCleanup(bucket_nbuf)) { LockBuffer(obuf, BUFFER_LOCK_UNLOCK); hash_destroy(tidhtab); return; } npage = BufferGetPage(bucket_nbuf); npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage); nbucket = npageopaque->hasho_bucket; _hash_splitbucket(rel, metabuf, obucket, nbucket, obuf, bucket_nbuf, tidhtab, maxbucket, highmask, lowmask); _hash_dropbuf(rel, bucket_nbuf); hash_destroy(tidhtab); }
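/*
 * Illustration (not part of the original source): _hash_finish_split() fills
 * a dynahash table keyed by ItemPointerData with HASH_ENTER, and
 * _hash_splitbucket() later probes it with HASH_FIND.  The minimal
 * backend-style sketch below shows that pairing in isolation; the table
 * name, sizes and TID value are arbitrary.
 */
#include "postgres.h"
#include "storage/itemptr.h"
#include "utils/hsearch.h"

static void
demo_tid_set(void)
{
    HASHCTL     ctl;
    HTAB       *tidhtab;
    ItemPointerData tid;
    bool        found;

    memset(&ctl, 0, sizeof(ctl));
    ctl.keysize = sizeof(ItemPointerData);
    ctl.entrysize = sizeof(ItemPointerData);
    ctl.hcxt = CurrentMemoryContext;

    tidhtab = hash_create("demo tid set", 256, &ctl,
                          HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);

    ItemPointerSet(&tid, 10, 1);	/* block 10, offset 1 */
    (void) hash_search(tidhtab, &tid, HASH_ENTER, &found);	/* insert */

    (void) hash_search(tidhtab, &tid, HASH_FIND, &found);	/* probe */
    Assert(found);

    hash_destroy(tidhtab);
}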
/* * Initialize a sequence's relation with the specified tuple as content */ static void fill_seq_with_data(Relation rel, HeapTuple tuple) { Buffer buf; Page page; sequence_magic *sm; OffsetNumber offnum; /* Initialize first page of relation with special magic number */ buf = ReadBuffer(rel, P_NEW); Assert(BufferGetBlockNumber(buf) == 0); page = BufferGetPage(buf); PageInit(page, BufferGetPageSize(buf), sizeof(sequence_magic)); sm = (sequence_magic *) PageGetSpecialPointer(page); sm->magic = SEQ_MAGIC; /* Now insert sequence tuple */ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); /* * Since VACUUM does not process sequences, we have to force the tuple to * have xmin = FrozenTransactionId now. Otherwise it would become * invisible to SELECTs after 2G transactions. It is okay to do this * because if the current transaction aborts, no other xact will ever * examine the sequence tuple anyway. */ HeapTupleHeaderSetXmin(tuple->t_data, FrozenTransactionId); HeapTupleHeaderSetXminFrozen(tuple->t_data); HeapTupleHeaderSetCmin(tuple->t_data, FirstCommandId); HeapTupleHeaderSetXmax(tuple->t_data, InvalidTransactionId); tuple->t_data->t_infomask |= HEAP_XMAX_INVALID; ItemPointerSet(&tuple->t_data->t_ctid, 0, FirstOffsetNumber); /* check the comment above nextval_internal()'s equivalent call. */ if (RelationNeedsWAL(rel)) GetTopTransactionId(); START_CRIT_SECTION(); MarkBufferDirty(buf); offnum = PageAddItem(page, (Item) tuple->t_data, tuple->t_len, InvalidOffsetNumber, false, false); if (offnum != FirstOffsetNumber) elog(ERROR, "failed to add sequence tuple to page"); /* XLOG stuff */ if (RelationNeedsWAL(rel)) { xl_seq_rec xlrec; XLogRecPtr recptr; XLogBeginInsert(); XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT); xlrec.node = rel->rd_node; XLogRegisterData((char *) &xlrec, sizeof(xl_seq_rec)); XLogRegisterData((char *) tuple->t_data, tuple->t_len); recptr = XLogInsert(RM_SEQ_ID, XLOG_SEQ_LOG); PageSetLSN(page, recptr); } END_CRIT_SECTION(); UnlockReleaseBuffer(buf); }
/* * Descend the tree to the leaf page that contains or would contain the key * we're searching for. The key should already be filled in 'btree', in * tree-type specific manner. If btree->fullScan is true, descends to the * leftmost leaf page. * * If 'searchmode' is false, on return stack->buffer is exclusively locked, * and the stack represents the full path to the root. Otherwise stack->buffer * is share-locked, and stack->parent is NULL. */ GinBtreeStack * ginFindLeafPage(GinBtree btree, bool searchMode) { GinBtreeStack *stack; stack = (GinBtreeStack *) palloc(sizeof(GinBtreeStack)); stack->blkno = btree->rootBlkno; stack->buffer = ReadBuffer(btree->index, btree->rootBlkno); stack->parent = NULL; stack->predictNumber = 1; for (;;) { Page page; BlockNumber child; int access; stack->off = InvalidOffsetNumber; page = BufferGetPage(stack->buffer); access = ginTraverseLock(stack->buffer, searchMode); /* * If we're going to modify the tree, finish any incomplete splits we * encounter on the way. */ if (!searchMode && GinPageIsIncompleteSplit(page)) ginFinishSplit(btree, stack, false, NULL); /* * ok, page is correctly locked, we should check to move right .., * root never has a right link, so small optimization */ while (btree->fullScan == FALSE && stack->blkno != btree->rootBlkno && btree->isMoveRight(btree, page)) { BlockNumber rightlink = GinPageGetOpaque(page)->rightlink; if (rightlink == InvalidBlockNumber) /* rightmost page */ break; stack->buffer = ginStepRight(stack->buffer, btree->index, access); stack->blkno = rightlink; page = BufferGetPage(stack->buffer); if (!searchMode && GinPageIsIncompleteSplit(page)) ginFinishSplit(btree, stack, false, NULL); } if (GinPageIsLeaf(page)) /* we found, return locked page */ return stack; /* now we have correct buffer, try to find child */ child = btree->findChildPage(btree, stack); LockBuffer(stack->buffer, GIN_UNLOCK); Assert(child != InvalidBlockNumber); Assert(stack->blkno != child); if (searchMode) { /* in search mode we may forget path to leaf */ stack->blkno = child; stack->buffer = ReleaseAndReadBuffer(stack->buffer, btree->index, stack->blkno); } else { GinBtreeStack *ptr = (GinBtreeStack *) palloc(sizeof(GinBtreeStack)); ptr->parent = stack; stack = ptr; stack->blkno = child; stack->buffer = ReadBuffer(btree->index, stack->blkno); stack->predictNumber = 1; } } }
/* * Finish a split by inserting the downlink for the new page to parent. * * On entry, stack->buffer is exclusively locked. * * If freestack is true, all the buffers are released and unlocked as we * crawl up the tree, and 'stack' is freed. Otherwise stack->buffer is kept * locked, and stack is unmodified, except for possibly moving right to find * the correct parent of page. */ static void ginFinishSplit(GinBtree btree, GinBtreeStack *stack, bool freestack, GinStatsData *buildStats) { Page page; bool done; bool first = true; /* * freestack == false when we encounter an incompletely split page during a * scan, while freestack == true is used in the normal scenario that a * split is finished right after the initial insert. */ if (!freestack) elog(DEBUG1, "finishing incomplete split of block %u in gin index \"%s\"", stack->blkno, RelationGetRelationName(btree->index)); /* this loop crawls up the stack until the insertion is complete */ do { GinBtreeStack *parent = stack->parent; void *insertdata; BlockNumber updateblkno; /* search parent to lock */ LockBuffer(parent->buffer, GIN_EXCLUSIVE); /* * If the parent page was incompletely split, finish that split first, * then continue with the current one. * * Note: we have to finish *all* incomplete splits we encounter, even * if we have to move right. Otherwise we might choose as the target * a page that has no downlink in the parent, and splitting it further * would fail. */ if (GinPageIsIncompleteSplit(BufferGetPage(parent->buffer))) ginFinishSplit(btree, parent, false, buildStats); /* move right if it's needed */ page = BufferGetPage(parent->buffer); while ((parent->off = btree->findChildPtr(btree, page, stack->blkno, parent->off)) == InvalidOffsetNumber) { if (GinPageRightMost(page)) { /* * rightmost page, but we don't find parent, we should use * plain search... */ LockBuffer(parent->buffer, GIN_UNLOCK); ginFindParents(btree, stack); parent = stack->parent; Assert(parent != NULL); break; } parent->buffer = ginStepRight(parent->buffer, btree->index, GIN_EXCLUSIVE); parent->blkno = BufferGetBlockNumber(parent->buffer); page = BufferGetPage(parent->buffer); if (GinPageIsIncompleteSplit(BufferGetPage(parent->buffer))) ginFinishSplit(btree, parent, false, buildStats); } /* insert the downlink */ insertdata = btree->prepareDownlink(btree, stack->buffer); updateblkno = GinPageGetOpaque(BufferGetPage(stack->buffer))->rightlink; done = ginPlaceToPage(btree, parent, insertdata, updateblkno, stack->buffer, buildStats); pfree(insertdata); /* * If the caller requested to free the stack, unlock and release the * child buffer now. Otherwise keep it pinned and locked, but if we * have to recurse up the tree, we can unlock the upper pages, only * keeping the page at the bottom of the stack locked. */ if (!first || freestack) LockBuffer(stack->buffer, GIN_UNLOCK); if (freestack) { ReleaseBuffer(stack->buffer); pfree(stack); } stack = parent; first = false; } while (!done); /* unlock the parent */ LockBuffer(stack->buffer, GIN_UNLOCK); if (freestack) freeGinBtreeStack(stack); }
/* ---------------- * index_getnext - get the next heap tuple from a scan * * The result is the next heap tuple satisfying the scan keys and the * snapshot, or NULL if no more matching tuples exist. On success, * the buffer containing the heap tuple is pinned (the pin will be dropped * at the next index_getnext or index_endscan). * * Note: caller must check scan->xs_recheck, and perform rechecking of the * scan keys if required. We do not do that here because we don't have * enough information to do it efficiently in the general case. * ---------------- */ HeapTuple index_getnext(IndexScanDesc scan, ScanDirection direction) { HeapTuple heapTuple = &scan->xs_ctup; ItemPointer tid = &heapTuple->t_self; FmgrInfo *procedure; bool all_dead = false; SCAN_CHECKS; GET_SCAN_PROCEDURE(amgettuple); Assert(TransactionIdIsValid(RecentGlobalXmin)); for (;;) { bool got_heap_tuple; if (scan->xs_continue_hot) { /* * We are resuming scan of a HOT chain after having returned an * earlier member. Must still hold pin on current heap page. */ Assert(BufferIsValid(scan->xs_cbuf)); Assert(ItemPointerGetBlockNumber(tid) == BufferGetBlockNumber(scan->xs_cbuf)); } else { bool found; Buffer prev_buf; /* * If we scanned a whole HOT chain and found only dead tuples, * tell index AM to kill its entry for that TID. We do not do this * when in recovery because it may violate MVCC to do so. see * comments in RelationGetIndexScan(). */ if (!scan->xactStartedInRecovery) scan->kill_prior_tuple = all_dead; /* * The AM's gettuple proc finds the next index entry matching the * scan keys, and puts the TID in xs_ctup.t_self (ie, *tid). It * should also set scan->xs_recheck, though we pay no attention to * that here. */ found = DatumGetBool(FunctionCall2(procedure, PointerGetDatum(scan), Int32GetDatum(direction))); /* Reset kill flag immediately for safety */ scan->kill_prior_tuple = false; /* If we're out of index entries, break out of outer loop */ if (!found) break; pgstat_count_index_tuples(scan->indexRelation, 1); /* Switch to correct buffer if we don't have it already */ prev_buf = scan->xs_cbuf; scan->xs_cbuf = ReleaseAndReadBuffer(scan->xs_cbuf, scan->heapRelation, ItemPointerGetBlockNumber(tid)); /* * Prune page, but only if we weren't already on this page */ if (prev_buf != scan->xs_cbuf) heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf, RecentGlobalXmin); } /* Obtain share-lock on the buffer so we can examine visibility */ LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE); got_heap_tuple = heap_hot_search_buffer(tid, scan->heapRelation, scan->xs_cbuf, scan->xs_snapshot, &scan->xs_ctup, &all_dead, !scan->xs_continue_hot); LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK); if (got_heap_tuple) { /* * Only in a non-MVCC snapshot can more than one member of the * HOT chain be visible. */ scan->xs_continue_hot = !IsMVCCSnapshot(scan->xs_snapshot); pgstat_count_heap_fetch(scan->indexRelation); return heapTuple; } /* Loop around to ask index AM for another TID */ scan->xs_continue_hot = false; } /* Release any held pin on a heap page */ if (BufferIsValid(scan->xs_cbuf)) { ReleaseBuffer(scan->xs_cbuf); scan->xs_cbuf = InvalidBuffer; } return NULL; /* failure exit */ }
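The HOT-chain walk that heap_hot_search_buffer performs on index_getnext's behalf hinges on one rule: the next chain member's xmin must equal the previous member's xmax, otherwise the chain is treated as broken. Below is a minimal in-memory model of that rule; ToyTuple and the integer "transaction ids" are inventions for illustration, with zero standing in for an invalid xid.

#include <stdio.h>

typedef struct ToyTuple
{
	unsigned xmin;   /* inserting transaction id, 0 means "invalid" here */
	unsigned xmax;   /* deleting/updating transaction id, 0 if none */
	int      next;   /* array index of the next chain member, -1 at end */
} ToyTuple;

static void
walk_hot_chain(const ToyTuple *page, int start)
{
	unsigned prev_xmax = 0;              /* unknown for the chain root */

	for (int off = start; off != -1; off = page[off].next)
	{
		if (prev_xmax != 0 && prev_xmax != page[off].xmin)
		{
			printf("member %d: xmin %u does not match prior xmax %u, chain broken\n",
				   off, page[off].xmin, prev_xmax);
			return;
		}
		printf("member %d: xmin=%u xmax=%u\n", off, page[off].xmin, page[off].xmax);
		prev_xmax = page[off].xmax;
	}
}

int
main(void)
{
	/* 0 -> 1 -> 2, where member 2 was written by an unrelated transaction */
	ToyTuple page[3] = {
		{100, 101, 1},
		{101, 102, 2},
		{999, 0, -1},
	};

	walk_hot_chain(page, 0);
	return 0;
}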
/* * Workhouse routine for doing insertion into a GiST index. Note that * this routine assumes it is invoked in a short-lived memory context, * so it does not bother releasing palloc'd allocations. */ void gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate) { ItemId iid; IndexTuple idxtuple; GISTInsertStack firststack; GISTInsertStack *stack; GISTInsertState state; bool xlocked = false; memset(&state, 0, sizeof(GISTInsertState)); state.freespace = freespace; state.r = r; /* Start from the root */ firststack.blkno = GIST_ROOT_BLKNO; firststack.lsn = 0; firststack.parent = NULL; firststack.downlinkoffnum = InvalidOffsetNumber; state.stack = stack = &firststack; /* * Walk down along the path of smallest penalty, updating the parent * pointers with the key we're inserting as we go. If we crash in the * middle, the tree is consistent, although the possible parent updates * were a waste. */ for (;;) { if (XLogRecPtrIsInvalid(stack->lsn)) stack->buffer = ReadBuffer(state.r, stack->blkno); /* * Be optimistic and grab shared lock first. Swap it for an exclusive * lock later if we need to update the page. */ if (!xlocked) { LockBuffer(stack->buffer, GIST_SHARE); gistcheckpage(state.r, stack->buffer); } stack->page = (Page) BufferGetPage(stack->buffer); stack->lsn = PageGetLSN(stack->page); Assert(!RelationNeedsWAL(state.r) || !XLogRecPtrIsInvalid(stack->lsn)); /* * If this page was split but the downlink was never inserted to the * parent because the inserting backend crashed before doing that, fix * that now. */ if (GistFollowRight(stack->page)) { if (!xlocked) { LockBuffer(stack->buffer, GIST_UNLOCK); LockBuffer(stack->buffer, GIST_EXCLUSIVE); xlocked = true; /* someone might've completed the split when we unlocked */ if (!GistFollowRight(stack->page)) continue; } gistfixsplit(&state, giststate); UnlockReleaseBuffer(stack->buffer); xlocked = false; state.stack = stack = stack->parent; continue; } if (stack->blkno != GIST_ROOT_BLKNO && stack->parent->lsn < GistPageGetNSN(stack->page)) { /* * Concurrent split detected. There's no guarantee that the * downlink for this page is consistent with the tuple we're * inserting anymore, so go back to parent and rechoose the best * child. */ UnlockReleaseBuffer(stack->buffer); xlocked = false; state.stack = stack = stack->parent; continue; } if (!GistPageIsLeaf(stack->page)) { /* * This is an internal page so continue to walk down the tree. * Find the child node that has the minimum insertion penalty. */ BlockNumber childblkno; IndexTuple newtup; GISTInsertStack *item; OffsetNumber downlinkoffnum; downlinkoffnum = gistchoose(state.r, stack->page, itup, giststate); iid = PageGetItemId(stack->page, downlinkoffnum); idxtuple = (IndexTuple) PageGetItem(stack->page, iid); childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); /* * Check that it's not a leftover invalid tuple from pre-9.1 */ if (GistTupleIsInvalid(idxtuple)) ereport(ERROR, (errmsg("index \"%s\" contains an inner tuple marked as invalid", RelationGetRelationName(r)), errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."), errhint("Please REINDEX it."))); /* * Check that the key representing the target child node is * consistent with the key we're inserting. Update it if it's not. */ newtup = gistgetadjusted(state.r, idxtuple, itup, giststate); if (newtup) { /* * Swap shared lock for an exclusive one. Beware, the page may * change while we unlock/lock the page... 
*/ if (!xlocked) { LockBuffer(stack->buffer, GIST_UNLOCK); LockBuffer(stack->buffer, GIST_EXCLUSIVE); xlocked = true; stack->page = (Page) BufferGetPage(stack->buffer); if (PageGetLSN(stack->page) != stack->lsn) { /* the page was changed while we unlocked it, retry */ continue; } } /* * Update the tuple. * * We still hold the lock after gistinserttuple(), but it * might have to split the page to make the updated tuple fit. * In that case the updated tuple might migrate to the other * half of the split, so we have to go back to the parent and * descend back to the half that's a better fit for the new * tuple. */ if (gistinserttuple(&state, stack, giststate, newtup, downlinkoffnum)) { /* * If this was a root split, the root page continues to be * the parent and the updated tuple went to one of the * child pages, so we just need to retry from the root * page. */ if (stack->blkno != GIST_ROOT_BLKNO) { UnlockReleaseBuffer(stack->buffer); xlocked = false; state.stack = stack = stack->parent; } continue; } } LockBuffer(stack->buffer, GIST_UNLOCK); xlocked = false; /* descend to the chosen child */ item = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); item->blkno = childblkno; item->parent = stack; item->downlinkoffnum = downlinkoffnum; state.stack = stack = item; } else { /* * Leaf page. Insert the new key. We've already updated all the * parents on the way down, but we might have to split the page if * it doesn't fit. gistinserthere() will take care of that. */ /* * Swap shared lock for an exclusive one. Be careful, the page may * change while we unlock/lock the page... */ if (!xlocked) { LockBuffer(stack->buffer, GIST_UNLOCK); LockBuffer(stack->buffer, GIST_EXCLUSIVE); xlocked = true; stack->page = (Page) BufferGetPage(stack->buffer); stack->lsn = PageGetLSN(stack->page); if (stack->blkno == GIST_ROOT_BLKNO) { /* * the only page that can become inner instead of leaf is * the root page, so for root we should recheck it */ if (!GistPageIsLeaf(stack->page)) { /* * very rare situation: during unlock/lock index with * number of pages = 1 was increased */ LockBuffer(stack->buffer, GIST_UNLOCK); xlocked = false; continue; } /* * we don't need to check root split, because checking * leaf/inner is enough to recognize split for root */ } else if (GistFollowRight(stack->page) || stack->parent->lsn < GistPageGetNSN(stack->page)) { /* * The page was split while we momentarily unlocked the * page. Go back to parent. */ UnlockReleaseBuffer(stack->buffer); xlocked = false; state.stack = stack = stack->parent; continue; } } /* now state.stack->(page, buffer and blkno) points to leaf page */ gistinserttuple(&state, stack, giststate, itup, InvalidOffsetNumber); LockBuffer(stack->buffer, GIST_UNLOCK); /* Release any pins we might still hold before exiting */ for (; stack; stack = stack->parent) ReleaseBuffer(stack->buffer); break; } } }
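A recurring pattern in gistdoinsert is to read under a shared lock, then trade it for an exclusive lock and recheck the page LSN before modifying anything. The sketch below models that pattern with a pthread read-write lock and a plain counter in place of the LSN; ToyPage and update_if_unchanged are hypothetical names, and real buffer locks and WAL LSNs differ in detail.

#include <pthread.h>
#include <stdio.h>

typedef struct ToyPage
{
	pthread_rwlock_t lock;
	unsigned long    lsn;    /* bumped on every modification */
	int              value;
} ToyPage;

static void
update_if_unchanged(ToyPage *page, int newvalue)
{
	unsigned long seen_lsn;

	/* optimistic read under the shared lock */
	pthread_rwlock_rdlock(&page->lock);
	seen_lsn = page->lsn;
	pthread_rwlock_unlock(&page->lock);

	/* "upgrade" by re-acquiring exclusively, then recheck before modifying */
	pthread_rwlock_wrlock(&page->lock);
	if (page->lsn != seen_lsn)
	{
		/* the page changed while we were unlocked: the caller would retry */
		printf("page changed (lsn %lu -> %lu), retry\n", seen_lsn, page->lsn);
	}
	else
	{
		page->value = newvalue;
		page->lsn++;
		printf("updated page, new lsn %lu\n", page->lsn);
	}
	pthread_rwlock_unlock(&page->lock);
}

int
main(void)
{
	ToyPage page;

	pthread_rwlock_init(&page.lock, NULL);
	page.lsn = 1;
	page.value = 0;

	update_if_unchanged(&page, 42);
	pthread_rwlock_destroy(&page.lock);
	return 0;
}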
/* * Insert new tuple to the bloom index. */ bool blinsert(Relation index, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, IndexInfo *indexInfo) { BloomState blstate; BloomTuple *itup; MemoryContext oldCtx; MemoryContext insertCtx; BloomMetaPageData *metaData; Buffer buffer, metaBuffer; Page page, metaPage; BlockNumber blkno = InvalidBlockNumber; OffsetNumber nStart; GenericXLogState *state; insertCtx = AllocSetContextCreate(CurrentMemoryContext, "Bloom insert temporary context", ALLOCSET_DEFAULT_SIZES); oldCtx = MemoryContextSwitchTo(insertCtx); initBloomState(&blstate, index); itup = BloomFormTuple(&blstate, ht_ctid, values, isnull); /* * At first, try to insert new tuple to the first page in notFullPage * array. If successful, we don't need to modify the meta page. */ metaBuffer = ReadBuffer(index, BLOOM_METAPAGE_BLKNO); LockBuffer(metaBuffer, BUFFER_LOCK_SHARE); metaData = BloomPageGetMeta(BufferGetPage(metaBuffer)); if (metaData->nEnd > metaData->nStart) { Page page; blkno = metaData->notFullPage[metaData->nStart]; Assert(blkno != InvalidBlockNumber); /* Don't hold metabuffer lock while doing insert */ LockBuffer(metaBuffer, BUFFER_LOCK_UNLOCK); buffer = ReadBuffer(index, blkno); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); state = GenericXLogStart(index); page = GenericXLogRegisterBuffer(state, buffer, 0); /* * We might have found a page that was recently deleted by VACUUM. If * so, we can reuse it, but we must reinitialize it. */ if (PageIsNew(page) || BloomPageIsDeleted(page)) BloomInitPage(page, 0); if (BloomPageAddItem(&blstate, page, itup)) { /* Success! Apply the change, clean up, and exit */ GenericXLogFinish(state); UnlockReleaseBuffer(buffer); ReleaseBuffer(metaBuffer); MemoryContextSwitchTo(oldCtx); MemoryContextDelete(insertCtx); return false; } /* Didn't fit, must try other pages */ GenericXLogAbort(state); UnlockReleaseBuffer(buffer); } else { /* No entries in notFullPage */ LockBuffer(metaBuffer, BUFFER_LOCK_UNLOCK); } /* * Try other pages in notFullPage array. We will have to change nStart in * metapage. Thus, grab exclusive lock on metapage. */ LockBuffer(metaBuffer, BUFFER_LOCK_EXCLUSIVE); /* nStart might have changed while we didn't have lock */ nStart = metaData->nStart; /* Skip first page if we already tried it above */ if (nStart < metaData->nEnd && blkno == metaData->notFullPage[nStart]) nStart++; /* * This loop iterates for each page we try from the notFullPage array, and * will also initialize a GenericXLogState for the fallback case of having * to allocate a new page. */ for (;;) { state = GenericXLogStart(index); /* get modifiable copy of metapage */ metaPage = GenericXLogRegisterBuffer(state, metaBuffer, 0); metaData = BloomPageGetMeta(metaPage); if (nStart >= metaData->nEnd) break; /* no more entries in notFullPage array */ blkno = metaData->notFullPage[nStart]; Assert(blkno != InvalidBlockNumber); buffer = ReadBuffer(index, blkno); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); page = GenericXLogRegisterBuffer(state, buffer, 0); /* Basically same logic as above */ if (PageIsNew(page) || BloomPageIsDeleted(page)) BloomInitPage(page, 0); if (BloomPageAddItem(&blstate, page, itup)) { /* Success! 
Apply the changes, clean up, and exit */ metaData->nStart = nStart; GenericXLogFinish(state); UnlockReleaseBuffer(buffer); UnlockReleaseBuffer(metaBuffer); MemoryContextSwitchTo(oldCtx); MemoryContextDelete(insertCtx); return false; } /* Didn't fit, must try other pages */ GenericXLogAbort(state); UnlockReleaseBuffer(buffer); nStart++; } /* * Didn't find place to insert in notFullPage array. Allocate new page. * (XXX is it good to do this while holding ex-lock on the metapage??) */ buffer = BloomNewBuffer(index); page = GenericXLogRegisterBuffer(state, buffer, GENERIC_XLOG_FULL_IMAGE); BloomInitPage(page, 0); if (!BloomPageAddItem(&blstate, page, itup)) { /* We shouldn't be here since we're inserting to an empty page */ elog(ERROR, "could not add new bloom tuple to empty page"); } /* Reset notFullPage array to contain just this new page */ metaData->nStart = 0; metaData->nEnd = 1; metaData->notFullPage[0] = BufferGetBlockNumber(buffer); /* Apply the changes, clean up, and exit */ GenericXLogFinish(state); UnlockReleaseBuffer(buffer); UnlockReleaseBuffer(metaBuffer); MemoryContextSwitchTo(oldCtx); MemoryContextDelete(insertCtx); return false; }
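The metapage bookkeeping that blinsert leans on can be summarized as a small array of not-full page numbers consumed from nStart upward, reset to a single freshly allocated page when nothing in it fits. This is a stripped-down model under that assumption; ToyMeta, page_has_room and the array size are placeholders, not bloom's actual layout.

#include <stdio.h>
#include <stdbool.h>

#define META_MAX_PAGES 8

typedef struct ToyMeta
{
	unsigned nStart;                      /* first candidate entry */
	unsigned nEnd;                        /* one past the last entry */
	unsigned notFullPage[META_MAX_PAGES]; /* block numbers with free room */
} ToyMeta;

/* placeholder for BloomPageAddItem: pretend only blocks >= 100 have room */
static bool
page_has_room(unsigned blkno)
{
	return blkno >= 100;
}

static unsigned
insert_somewhere(ToyMeta *meta, unsigned freshly_allocated_blkno)
{
	for (unsigned n = meta->nStart; n < meta->nEnd; n++)
	{
		if (page_has_room(meta->notFullPage[n]))
		{
			meta->nStart = n;             /* skip exhausted entries next time */
			return meta->notFullPage[n];
		}
	}

	/* nothing fits: the new page becomes the only candidate */
	meta->nStart = 0;
	meta->nEnd = 1;
	meta->notFullPage[0] = freshly_allocated_blkno;
	return freshly_allocated_blkno;
}

int
main(void)
{
	ToyMeta meta = {0, 3, {7, 8, 100}};

	printf("inserted on block %u\n", insert_somewhere(&meta, 200)); /* 100 */
	printf("inserted on block %u\n", insert_somewhere(&meta, 200)); /* 100 again */
	return 0;
}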
/* * Traverse the tree to find path from root page to specified "child" block. * * returns a new insertion stack, starting from the parent of "child", up * to the root. *downlinkoffnum is set to the offset of the downlink in the * direct parent of child. * * To prevent deadlocks, this should lock only one page at a time. */ static GISTInsertStack * gistFindPath(Relation r, BlockNumber child, OffsetNumber *downlinkoffnum) { Page page; Buffer buffer; OffsetNumber i, maxoff; ItemId iid; IndexTuple idxtuple; List *fifo; GISTInsertStack *top, *ptr; BlockNumber blkno; top = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); top->blkno = GIST_ROOT_BLKNO; top->downlinkoffnum = InvalidOffsetNumber; fifo = list_make1(top); while (fifo != NIL) { /* Get next page to visit */ top = linitial(fifo); fifo = list_delete_first(fifo); buffer = ReadBuffer(r, top->blkno); LockBuffer(buffer, GIST_SHARE); gistcheckpage(r, buffer); page = (Page) BufferGetPage(buffer); if (GistPageIsLeaf(page)) { /* * Because we scan the index top-down, all the rest of the pages * in the queue must be leaf pages as well. */ UnlockReleaseBuffer(buffer); break; } top->lsn = PageGetLSN(page); /* * If F_FOLLOW_RIGHT is set, the page to the right doesn't have a * downlink. This should not normally happen.. */ if (GistFollowRight(page)) elog(ERROR, "concurrent GiST page split was incomplete"); if (top->parent && top->parent->lsn < GistPageGetNSN(page) && GistPageGetOpaque(page)->rightlink != InvalidBlockNumber /* sanity check */ ) { /* * Page was split while we looked elsewhere. We didn't see the * downlink to the right page when we scanned the parent, so add * it to the queue now. * * Put the right page ahead of the queue, so that we visit it * next. That's important, because if this is the lowest internal * level, just above leaves, we might already have queued up some * leaf pages, and we assume that there can't be any non-leaf * pages behind leaf pages. */ ptr = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); ptr->blkno = GistPageGetOpaque(page)->rightlink; ptr->downlinkoffnum = InvalidOffsetNumber; ptr->parent = top->parent; fifo = lcons(ptr, fifo); } maxoff = PageGetMaxOffsetNumber(page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { iid = PageGetItemId(page, i); idxtuple = (IndexTuple) PageGetItem(page, iid); blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); if (blkno == child) { /* Found it! */ UnlockReleaseBuffer(buffer); *downlinkoffnum = i; return top; } else { /* Append this child to the list of pages to visit later */ ptr = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); ptr->blkno = blkno; ptr->downlinkoffnum = i; ptr->parent = top; fifo = lappend(fifo, ptr); } } UnlockReleaseBuffer(buffer); } elog(ERROR, "failed to re-find parent of a page in index \"%s\", block %u", RelationGetRelationName(r), child); return NULL; /* keep compiler quiet */ }
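gistFindPath is essentially a breadth-first search for the page whose downlink points at the child. The toy below reproduces only that shape: a FIFO of node numbers scanned top-down, returning the first node that lists the target as a child. ToyNode, the fixed-size queue, and the omission of right-link handling are simplifications for illustration.

#include <stdio.h>

#define MAXNODES 16
#define MAXFANOUT 4

typedef struct ToyNode
{
	int nchildren;
	int children[MAXFANOUT];
} ToyNode;

/* breadth-first search for the node whose child list contains "child" */
static int
find_parent(const ToyNode *tree, int root, int child)
{
	int fifo[MAXNODES];
	int head = 0, tail = 0;

	fifo[tail++] = root;
	while (head < tail)
	{
		int cur = fifo[head++];

		for (int i = 0; i < tree[cur].nchildren; i++)
		{
			int blk = tree[cur].children[i];

			if (blk == child)
				return cur;              /* this node holds the downlink */
			if (tail < MAXNODES)
				fifo[tail++] = blk;      /* visit that subtree later */
		}
	}
	return -1;                           /* not found */
}

int
main(void)
{
	/* node 0 is the root; nodes 3, 4 and 5 are leaves */
	ToyNode tree[MAXNODES] = {
		{2, {1, 2}},
		{2, {3, 4}},
		{1, {5}},
	};

	printf("parent of node 5 is node %d\n", find_parent(tree, 0, 5)); /* 2 */
	return 0;
}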
/* * visibilitymap_set - set a bit on a previously pinned page * * recptr is the LSN of the XLOG record we're replaying, if we're in recovery, * or InvalidXLogRecPtr in normal running. The page LSN is advanced to the * one provided; in normal running, we generate a new XLOG record and set the * page LSN to that value. cutoff_xid is the largest xmin on the page being * marked all-visible; it is needed for Hot Standby, and can be * InvalidTransactionId if the page contains no tuples. * * Caller is expected to set the heap page's PD_ALL_VISIBLE bit before calling * this function. Except in recovery, caller should also pass the heap * buffer. When checksums are enabled and we're not in recovery, we must add * the heap buffer to the WAL chain to protect it from being torn. * * You must pass a buffer containing the correct map page to this function. * Call visibilitymap_pin first to pin the right one. This function doesn't do * any I/O. */ void visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid) { BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk); uint8 mapBit = HEAPBLK_TO_MAPBIT(heapBlk); Page page; char *map; #ifdef TRACE_VISIBILITYMAP elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk); #endif Assert(InRecovery || XLogRecPtrIsInvalid(recptr)); Assert(InRecovery || BufferIsValid(heapBuf)); /* Check that we have the right heap page pinned, if present */ if (BufferIsValid(heapBuf) && BufferGetBlockNumber(heapBuf) != heapBlk) elog(ERROR, "wrong heap buffer passed to visibilitymap_set"); /* Check that we have the right VM page pinned */ if (!BufferIsValid(vmBuf) || BufferGetBlockNumber(vmBuf) != mapBlock) elog(ERROR, "wrong VM buffer passed to visibilitymap_set"); page = BufferGetPage(vmBuf); map = PageGetContents(page); LockBuffer(vmBuf, BUFFER_LOCK_EXCLUSIVE); if (!(map[mapByte] & (1 << mapBit))) { START_CRIT_SECTION(); map[mapByte] |= (1 << mapBit); MarkBufferDirty(vmBuf); if (RelationNeedsWAL(rel)) { if (XLogRecPtrIsInvalid(recptr)) { Assert(!InRecovery); recptr = log_heap_visible(rel->rd_node, heapBuf, vmBuf, cutoff_xid); /* * If data checksums are enabled (or wal_log_hints=on), we * need to protect the heap page from being torn. */ if (XLogHintBitIsNeeded()) { Page heapPage = BufferGetPage(heapBuf); /* caller is expected to set PD_ALL_VISIBLE first */ Assert(PageIsAllVisible(heapPage)); PageSetLSN(heapPage, recptr); } } PageSetLSN(page, recptr); } END_CRIT_SECTION(); } LockBuffer(vmBuf, BUFFER_LOCK_UNLOCK); }
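The HEAPBLK_TO_MAPBLOCK/HEAPBLK_TO_MAPBYTE/HEAPBLK_TO_MAPBIT macros used above boil down to division and modulo with one bit per heap block. The sketch below redoes that arithmetic with an assumed round number of usable bytes per map page; the real macros derive that figure from the page header size.

#include <stdio.h>

#define MAP_BYTES_PER_PAGE 8000   /* assumed usable bytes per map page */
#define HEAPBLOCKS_PER_BYTE 8     /* one bit per heap block in this version */
#define HEAPBLOCKS_PER_PAGE (MAP_BYTES_PER_PAGE * HEAPBLOCKS_PER_BYTE)

int
main(void)
{
	unsigned heapBlk = 1234567;
	unsigned mapBlock = heapBlk / HEAPBLOCKS_PER_PAGE;
	unsigned mapByte = (heapBlk % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE;
	unsigned mapBit = heapBlk % HEAPBLOCKS_PER_BYTE;
	unsigned char map[MAP_BYTES_PER_PAGE] = {0};

	printf("heap block %u -> map page %u, byte %u, bit %u\n",
		   heapBlk, mapBlock, mapByte, mapBit);

	/* the actual bit-set, done under the exclusive buffer lock above */
	map[mapByte] |= (1 << mapBit);
	printf("byte %u is now 0x%02x\n", mapByte, map[mapByte]);
	return 0;
}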
/* * Updates the stack so that child->parent is the correct parent of the * child. child->parent must be exclusively locked on entry, and will * remain so at exit, but it might not be the same page anymore. */ static void gistFindCorrectParent(Relation r, GISTInsertStack *child) { GISTInsertStack *parent = child->parent; gistcheckpage(r, parent->buffer); parent->page = (Page) BufferGetPage(parent->buffer); /* here we don't need to distinguish between split and page update */ if (child->downlinkoffnum == InvalidOffsetNumber || parent->lsn != PageGetLSN(parent->page)) { /* parent has changed; follow right-links until we find the child's downlink */ OffsetNumber i, maxoff; ItemId iid; IndexTuple idxtuple; GISTInsertStack *ptr; while (true) { maxoff = PageGetMaxOffsetNumber(parent->page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { iid = PageGetItemId(parent->page, i); idxtuple = (IndexTuple) PageGetItem(parent->page, iid); if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno) { /* found it */ child->downlinkoffnum = i; return; } } parent->blkno = GistPageGetOpaque(parent->page)->rightlink; UnlockReleaseBuffer(parent->buffer); if (parent->blkno == InvalidBlockNumber) { /* * End of chain and still didn't find the parent. This is a very * rare situation that can happen when the root was split. */ break; } parent->buffer = ReadBuffer(r, parent->blkno); LockBuffer(parent->buffer, GIST_EXCLUSIVE); gistcheckpage(r, parent->buffer); parent->page = (Page) BufferGetPage(parent->buffer); } /* * Following the right-links didn't work, so we have to search the whole * tree to re-find the parent. Before doing that, release all the old * parent buffers. */ ptr = child->parent->parent; /* child->parent was already released * above */ while (ptr) { ReleaseBuffer(ptr->buffer); ptr = ptr->parent; } /* ok, find the new path from the root */ ptr = parent = gistFindPath(r, child->blkno, &child->downlinkoffnum); /* read all buffers as expected by caller */ /* note we don't lock them or gistcheckpage them here! */ while (ptr) { ptr->buffer = ReadBuffer(r, ptr->blkno); ptr->page = (Page) BufferGetPage(ptr->buffer); ptr = ptr->parent; } /* install the new chain of parents into the stack */ child->parent = parent; /* recurse to do the normal processing with the corrected parent */ LockBuffer(child->parent->buffer, GIST_EXCLUSIVE); gistFindCorrectParent(r, child); } return; }
/* * Bulk deletion of all index entries pointing to a set of heap tuples. * The set of target tuples is specified via a callback routine that tells * whether any given heap tuple (identified by ItemPointer) is being deleted. * * This function also deletes the tuples that are moved by split to other * bucket. * * Result: a palloc'd struct containing statistical info for VACUUM displays. */ IndexBulkDeleteResult * hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state) { Relation rel = info->index; double tuples_removed; double num_index_tuples; double orig_ntuples; Bucket orig_maxbucket; Bucket cur_maxbucket; Bucket cur_bucket; Buffer metabuf = InvalidBuffer; HashMetaPage metap; HashMetaPage cachedmetap; tuples_removed = 0; num_index_tuples = 0; /* * We need a copy of the metapage so that we can use its hashm_spares[] * values to compute bucket page addresses, but a cached copy should be * good enough. (If not, we'll detect that further down and refresh the * cache as necessary.) */ cachedmetap = _hash_getcachedmetap(rel, &metabuf, false); Assert(cachedmetap != NULL); orig_maxbucket = cachedmetap->hashm_maxbucket; orig_ntuples = cachedmetap->hashm_ntuples; /* Scan the buckets that we know exist */ cur_bucket = 0; cur_maxbucket = orig_maxbucket; loop_top: while (cur_bucket <= cur_maxbucket) { BlockNumber bucket_blkno; BlockNumber blkno; Buffer bucket_buf; Buffer buf; HashPageOpaque bucket_opaque; Page page; bool split_cleanup = false; /* Get address of bucket's start page */ bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket); blkno = bucket_blkno; /* * We need to acquire a cleanup lock on the primary bucket page to out * wait concurrent scans before deleting the dead tuples. */ buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); LockBufferForCleanup(buf); _hash_checkpage(rel, buf, LH_BUCKET_PAGE); page = BufferGetPage(buf); bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page); /* * If the bucket contains tuples that are moved by split, then we need * to delete such tuples. We can't delete such tuples if the split * operation on bucket is not finished as those are needed by scans. */ if (!H_BUCKET_BEING_SPLIT(bucket_opaque) && H_NEEDS_SPLIT_CLEANUP(bucket_opaque)) { split_cleanup = true; /* * This bucket might have been split since we last held a lock on * the metapage. If so, hashm_maxbucket, hashm_highmask and * hashm_lowmask might be old enough to cause us to fail to remove * tuples left behind by the most recent split. To prevent that, * now that the primary page of the target bucket has been locked * (and thus can't be further split), check whether we need to * update our cached metapage data. * * NB: The check for InvalidBlockNumber is only needed for * on-disk compatibility with indexes created before we started * storing hashm_maxbucket in the primary page's hasho_prevblkno. 
*/ if (bucket_opaque->hasho_prevblkno != InvalidBlockNumber && bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket) { cachedmetap = _hash_getcachedmetap(rel, &metabuf, true); Assert(cachedmetap != NULL); } } bucket_buf = buf; hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy, cachedmetap->hashm_maxbucket, cachedmetap->hashm_highmask, cachedmetap->hashm_lowmask, &tuples_removed, &num_index_tuples, split_cleanup, callback, callback_state); _hash_dropbuf(rel, bucket_buf); /* Advance to next bucket */ cur_bucket++; } if (BufferIsInvalid(metabuf)) metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE); /* Write-lock metapage and check for split since we started */ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); metap = HashPageGetMeta(BufferGetPage(metabuf)); if (cur_maxbucket != metap->hashm_maxbucket) { /* There's been a split, so process the additional bucket(s) */ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); cachedmetap = _hash_getcachedmetap(rel, &metabuf, true); Assert(cachedmetap != NULL); cur_maxbucket = cachedmetap->hashm_maxbucket; goto loop_top; } /* Okay, we're really done. Update tuple count in metapage. */ START_CRIT_SECTION(); if (orig_maxbucket == metap->hashm_maxbucket && orig_ntuples == metap->hashm_ntuples) { /* * No one has split or inserted anything since start of scan, so * believe our count as gospel. */ metap->hashm_ntuples = num_index_tuples; } else { /* * Otherwise, our count is untrustworthy since we may have * double-scanned tuples in split buckets. Proceed by dead-reckoning. * (Note: we still return estimated_count = false, because using this * count is better than not updating reltuples at all.) */ if (metap->hashm_ntuples > tuples_removed) metap->hashm_ntuples -= tuples_removed; else metap->hashm_ntuples = 0; num_index_tuples = metap->hashm_ntuples; } MarkBufferDirty(metabuf); /* XLOG stuff */ if (RelationNeedsWAL(rel)) { xl_hash_update_meta_page xlrec; XLogRecPtr recptr; xlrec.ntuples = metap->hashm_ntuples; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, sizeof(SizeOfHashUpdateMetaPage)); XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE); PageSetLSN(BufferGetPage(metabuf), recptr); } END_CRIT_SECTION(); _hash_relbuf(rel, metabuf); /* return statistics */ if (stats == NULL) stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); stats->estimated_count = false; stats->num_index_tuples = num_index_tuples; stats->tuples_removed += tuples_removed; /* hashvacuumcleanup will fill in num_pages */ return stats; }
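hashbucketcleanup is handed hashm_maxbucket, hashm_highmask and hashm_lowmask above so it can recompute which bucket each tuple belongs to after a split. The standalone function below renders the usual bucket-addressing rule (mask with highmask, fold back with lowmask if that names a bucket beyond maxbucket); treat it as a sketch of that rule rather than a copy of the hash AM's internals.

#include <stdio.h>

/* mask with highmask; if that names a bucket that doesn't exist yet, fold
 * back into the existing buckets with lowmask */
static unsigned
hashkey_to_bucket(unsigned hashkey, unsigned maxbucket,
				  unsigned highmask, unsigned lowmask)
{
	unsigned bucket = hashkey & highmask;

	if (bucket > maxbucket)
		bucket = bucket & lowmask;

	return bucket;
}

int
main(void)
{
	/* buckets 0..9 exist: highmask covers 16 buckets, lowmask covers 8 */
	unsigned maxbucket = 9, highmask = 15, lowmask = 7;

	printf("hash 0x2c -> bucket %u\n",
		   hashkey_to_bucket(0x2c, maxbucket, highmask, lowmask)); /* 12 -> 4 */
	printf("hash 0x29 -> bucket %u\n",
		   hashkey_to_bucket(0x29, maxbucket, highmask, lowmask)); /* 9 */
	return 0;
}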
/* * Insert an index tuple into the index relation. The revmap is updated to * mark the range containing the given page as pointing to the inserted entry. * A WAL record is written. * * The buffer, if valid, is first checked for free space to insert the new * entry; if there isn't enough, a new buffer is obtained and pinned. No * buffer lock must be held on entry, no buffer lock is held on exit. * * Return value is the offset number where the tuple was inserted. */ OffsetNumber brin_doinsert(Relation idxrel, BlockNumber pagesPerRange, BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk, BrinTuple *tup, Size itemsz) { Page page; BlockNumber blk; OffsetNumber off; Buffer revmapbuf; ItemPointerData tid; bool extended; Assert(itemsz == MAXALIGN(itemsz)); /* If the item is oversized, don't even bother. */ if (itemsz > BrinMaxItemSize) { ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("index row size %lu exceeds maximum %lu for index \"%s\"", (unsigned long) itemsz, (unsigned long) BrinMaxItemSize, RelationGetRelationName(idxrel)))); return InvalidOffsetNumber; /* keep compiler quiet */ } /* Make sure the revmap is long enough to contain the entry we need */ brinRevmapExtend(revmap, heapBlk); /* * Acquire lock on buffer supplied by caller, if any. If it doesn't have * enough space, unpin it to obtain a new one below. */ if (BufferIsValid(*buffer)) { /* * It's possible that another backend (or ourselves!) extended the * revmap over the page we held a pin on, so we cannot assume that * it's still a regular page. */ LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); if (br_page_get_freespace(BufferGetPage(*buffer)) < itemsz) { UnlockReleaseBuffer(*buffer); *buffer = InvalidBuffer; } } /* * If we still don't have a usable buffer, have brin_getinsertbuffer * obtain one for us. */ if (!BufferIsValid(*buffer)) { do *buffer = brin_getinsertbuffer(idxrel, InvalidBuffer, itemsz, &extended); while (!BufferIsValid(*buffer)); } else extended = false; /* Now obtain lock on revmap buffer */ revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk); page = BufferGetPage(*buffer); blk = BufferGetBlockNumber(*buffer); /* Execute the actual insertion */ START_CRIT_SECTION(); if (extended) brin_page_init(BufferGetPage(*buffer), BRIN_PAGETYPE_REGULAR); off = PageAddItem(page, (Item) tup, itemsz, InvalidOffsetNumber, false, false); if (off == InvalidOffsetNumber) elog(ERROR, "could not insert new index tuple to page"); MarkBufferDirty(*buffer); BRIN_elog((DEBUG2, "inserted tuple (%u,%u) for range starting at %u", blk, off, heapBlk)); ItemPointerSet(&tid, blk, off); brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, tid); MarkBufferDirty(revmapbuf); /* XLOG stuff */ if (RelationNeedsWAL(idxrel)) { xl_brin_insert xlrec; XLogRecPtr recptr; uint8 info; info = XLOG_BRIN_INSERT | (extended ? XLOG_BRIN_INIT_PAGE : 0); xlrec.heapBlk = heapBlk; xlrec.pagesPerRange = pagesPerRange; xlrec.offnum = off; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfBrinInsert); XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0)); XLogRegisterBufData(0, (char *) tup, itemsz); XLogRegisterBuffer(1, revmapbuf, 0); recptr = XLogInsert(RM_BRIN_ID, info); PageSetLSN(page, recptr); PageSetLSN(BufferGetPage(revmapbuf), recptr); } END_CRIT_SECTION(); /* Tuple is firmly on buffer; we can release our locks */ LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK); if (extended) FreeSpaceMapVacuum(idxrel); return off; }
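The revmap update at the end of brin_doinsert addresses entries by block range. The arithmetic is just heapBlk / pagesPerRange, then a split into revmap page and slot; the items-per-page constant below is an assumed value for illustration, since the real figure depends on the block size.

#include <stdio.h>

#define REVMAP_ITEMS_PER_PAGE 1360   /* assumed; the real figure depends on BLCKSZ */

int
main(void)
{
	unsigned pagesPerRange = 128;    /* the index's pages_per_range setting */
	unsigned heapBlk = 1000000;

	unsigned rangeNo = heapBlk / pagesPerRange;              /* which block range */
	unsigned revmapPage = rangeNo / REVMAP_ITEMS_PER_PAGE;   /* logical revmap page */
	unsigned revmapSlot = rangeNo % REVMAP_ITEMS_PER_PAGE;   /* entry within it */

	printf("heap block %u: range %u -> revmap page %u, slot %u\n",
		   heapBlk, rangeNo, revmapPage, revmapSlot);
	return 0;
}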
/* * FreeSpaceMapTruncateRel - adjust for truncation of a relation. * * The caller must hold AccessExclusiveLock on the relation, to ensure that * other backends receive the smgr invalidation event that this function sends * before they access the FSM again. * * nblocks is the new size of the heap. */ void FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks) { BlockNumber new_nfsmblocks; FSMAddress first_removed_address; uint16 first_removed_slot; Buffer buf; RelationOpenSmgr(rel); /* * If no FSM has been created yet for this relation, there's nothing to * truncate. */ if (!smgrexists(rel->rd_smgr, FSM_FORKNUM)) return; /* Get the location in the FSM of the first removed heap block */ first_removed_address = fsm_get_location(nblocks, &first_removed_slot); /* * Zero out the tail of the last remaining FSM page. If the slot * representing the first removed heap block is at a page boundary, as the * first slot on the FSM page that first_removed_address points to, we can * just truncate that page altogether. */ if (first_removed_slot > 0) { buf = fsm_readbuf(rel, first_removed_address, false); if (!BufferIsValid(buf)) return; /* nothing to do; the FSM was already smaller */ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); /* NO EREPORT(ERROR) from here till changes are logged */ START_CRIT_SECTION(); fsm_truncate_avail(BufferGetPage(buf), first_removed_slot); /* * Truncation of a relation is WAL-logged at a higher-level, and we * will be called at WAL replay. But if checksums are enabled, we need * to still write a WAL record to protect against a torn page, if the * page is flushed to disk before the truncation WAL record. We cannot * use MarkBufferDirtyHint here, because that will not dirty the page * during recovery. */ MarkBufferDirty(buf); if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded()) log_newpage_buffer(buf, false); END_CRIT_SECTION(); UnlockReleaseBuffer(buf); new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1; } else { new_nfsmblocks = fsm_logical_to_physical(first_removed_address); if (smgrnblocks(rel->rd_smgr, FSM_FORKNUM) <= new_nfsmblocks) return; /* nothing to do; the FSM was already smaller */ } /* Truncate the unused FSM pages, and send smgr inval message */ smgrtruncate(rel->rd_smgr, FSM_FORKNUM, new_nfsmblocks); /* * We might as well update the local smgr_fsm_nblocks setting. * smgrtruncate sent an smgr cache inval message, which will cause other * backends to invalidate their copy of smgr_fsm_nblocks, and this one too * at the next command boundary. But this ensures it isn't outright wrong * until then. */ if (rel->rd_smgr) rel->rd_smgr->smgr_fsm_nblocks = new_nfsmblocks; /* * Update upper-level FSM pages to account for the truncation. This is * important because the just-truncated pages were likely marked as * all-free, and would be preferentially selected. */ FreeSpaceMapVacuumRange(rel, nblocks, InvalidBlockNumber); }
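The new_nfsmblocks computation above can be restated in isolation: locate the bottom-level slot of the first removed heap block, and keep that slot's page only if the cut falls strictly inside it. The sketch assumes an illustrative slots-per-page value and skips the logical-to-physical page translation.

#include <stdio.h>

#define SLOTS_PER_FSM_PAGE 4000      /* assumed for illustration */

static unsigned
bottom_level_fsm_pages(unsigned new_heap_nblocks)
{
	unsigned logpage = new_heap_nblocks / SLOTS_PER_FSM_PAGE;
	unsigned slot = new_heap_nblocks % SLOTS_PER_FSM_PAGE;

	/* keep the partially used page only if the cut falls inside it */
	return (slot > 0) ? logpage + 1 : logpage;
}

int
main(void)
{
	printf("heap truncated to 12000 blocks -> %u bottom-level FSM pages\n",
		   bottom_level_fsm_pages(12000));   /* 3: the boundary page goes too */
	printf("heap truncated to 10000 blocks -> %u bottom-level FSM pages\n",
		   bottom_level_fsm_pages(10000));   /* 3: page 2 survives, tail zeroed */
	return 0;
}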
/* * Update tuple origtup (size origsz), located in offset oldoff of buffer * oldbuf, to newtup (size newsz) as summary tuple for the page range starting * at heapBlk. oldbuf must not be locked on entry, and is not locked at exit. * * If samepage is true, attempt to put the new tuple in the same page, but if * there's no room, use some other one. * * If the update is successful, return true; the revmap is updated to point to * the new tuple. If the update is not done for whatever reason, return false. * Caller may retry the update if this happens. */ bool brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, BrinRevmap *revmap, BlockNumber heapBlk, Buffer oldbuf, OffsetNumber oldoff, const BrinTuple *origtup, Size origsz, const BrinTuple *newtup, Size newsz, bool samepage) { Page oldpage; ItemId oldlp; BrinTuple *oldtup; Size oldsz; Buffer newbuf; bool extended; Assert(newsz == MAXALIGN(newsz)); /* If the item is oversized, don't bother. */ if (newsz > BrinMaxItemSize) { ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("index row size %lu exceeds maximum %lu for index \"%s\"", (unsigned long) newsz, (unsigned long) BrinMaxItemSize, RelationGetRelationName(idxrel)))); return false; /* keep compiler quiet */ } /* make sure the revmap is long enough to contain the entry we need */ brinRevmapExtend(revmap, heapBlk); if (!samepage) { /* need a page on which to put the item */ newbuf = brin_getinsertbuffer(idxrel, oldbuf, newsz, &extended); if (!BufferIsValid(newbuf)) { Assert(!extended); return false; } /* * Note: it's possible (though unlikely) that the returned newbuf is * the same as oldbuf, if brin_getinsertbuffer determined that the old * buffer does in fact have enough space. */ if (newbuf == oldbuf) { Assert(!extended); newbuf = InvalidBuffer; } } else { LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE); newbuf = InvalidBuffer; extended = false; } oldpage = BufferGetPage(oldbuf); oldlp = PageGetItemId(oldpage, oldoff); /* * Check that the old tuple wasn't updated concurrently: it might have * moved someplace else entirely ... */ if (!ItemIdIsNormal(oldlp)) { LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); /* * If this happens, and the new buffer was obtained by extending the * relation, then we need to ensure we don't leave it uninitialized or * forget about it. */ if (BufferIsValid(newbuf)) { if (extended) brin_initialize_empty_new_buffer(idxrel, newbuf); UnlockReleaseBuffer(newbuf); if (extended) FreeSpaceMapVacuum(idxrel); } return false; } oldsz = ItemIdGetLength(oldlp); oldtup = (BrinTuple *) PageGetItem(oldpage, oldlp); /* * ... or it might have been updated in place to different contents. */ if (!brin_tuples_equal(oldtup, oldsz, origtup, origsz)) { LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); if (BufferIsValid(newbuf)) { if (extended) brin_initialize_empty_new_buffer(idxrel, newbuf); UnlockReleaseBuffer(newbuf); if (extended) FreeSpaceMapVacuum(idxrel); } return false; } /* * Great, the old tuple is intact. We can proceed with the update. * * If there's enough room in the old page for the new tuple, replace it. * * Note that there might now be enough space on the page even though the * caller told us there isn't, if a concurrent update moved another tuple * elsewhere or replaced a tuple with a smaller one. 
*/ if (((BrinPageFlags(oldpage) & BRIN_EVACUATE_PAGE) == 0) && brin_can_do_samepage_update(oldbuf, origsz, newsz)) { if (BufferIsValid(newbuf)) { /* as above */ if (extended) brin_initialize_empty_new_buffer(idxrel, newbuf); UnlockReleaseBuffer(newbuf); } START_CRIT_SECTION(); PageIndexDeleteNoCompact(oldpage, &oldoff, 1); if (PageAddItem(oldpage, (Item) newtup, newsz, oldoff, true, false) == InvalidOffsetNumber) elog(ERROR, "failed to add BRIN tuple"); MarkBufferDirty(oldbuf); /* XLOG stuff */ if (RelationNeedsWAL(idxrel)) { xl_brin_samepage_update xlrec; XLogRecPtr recptr; uint8 info = XLOG_BRIN_SAMEPAGE_UPDATE; xlrec.offnum = oldoff; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfBrinSamepageUpdate); XLogRegisterBuffer(0, oldbuf, REGBUF_STANDARD); XLogRegisterBufData(0, (char *) newtup, newsz); recptr = XLogInsert(RM_BRIN_ID, info); PageSetLSN(oldpage, recptr); } END_CRIT_SECTION(); LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); if (extended) FreeSpaceMapVacuum(idxrel); return true; } else if (newbuf == InvalidBuffer) { /* * Not enough space, but caller said that there was. Tell them to * start over. */ LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); return false; } else { /* * Not enough free space on the oldpage. Put the new tuple on the new * page, and update the revmap. */ Page newpage = BufferGetPage(newbuf); Buffer revmapbuf; ItemPointerData newtid; OffsetNumber newoff; BlockNumber newblk = InvalidBlockNumber; Size freespace = 0; revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk); START_CRIT_SECTION(); /* * We need to initialize the page if it's newly obtained. Note we * will WAL-log the initialization as part of the update, so we don't * need to do that here. */ if (extended) brin_page_init(BufferGetPage(newbuf), BRIN_PAGETYPE_REGULAR); PageIndexDeleteNoCompact(oldpage, &oldoff, 1); newoff = PageAddItem(newpage, (Item) newtup, newsz, InvalidOffsetNumber, false, false); if (newoff == InvalidOffsetNumber) elog(ERROR, "failed to add BRIN tuple to new page"); MarkBufferDirty(oldbuf); MarkBufferDirty(newbuf); /* needed to update FSM below */ if (extended) { newblk = BufferGetBlockNumber(newbuf); freespace = br_page_get_freespace(newpage); } ItemPointerSet(&newtid, BufferGetBlockNumber(newbuf), newoff); brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, newtid); MarkBufferDirty(revmapbuf); /* XLOG stuff */ if (RelationNeedsWAL(idxrel)) { xl_brin_update xlrec; XLogRecPtr recptr; uint8 info; info = XLOG_BRIN_UPDATE | (extended ? XLOG_BRIN_INIT_PAGE : 0); xlrec.insert.offnum = newoff; xlrec.insert.heapBlk = heapBlk; xlrec.insert.pagesPerRange = pagesPerRange; xlrec.oldOffnum = oldoff; XLogBeginInsert(); /* new page */ XLogRegisterData((char *) &xlrec, SizeOfBrinUpdate); XLogRegisterBuffer(0, newbuf, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0)); XLogRegisterBufData(0, (char *) newtup, newsz); /* revmap page */ XLogRegisterBuffer(1, revmapbuf, REGBUF_STANDARD); /* old page */ XLogRegisterBuffer(2, oldbuf, REGBUF_STANDARD); recptr = XLogInsert(RM_BRIN_ID, info); PageSetLSN(oldpage, recptr); PageSetLSN(newpage, recptr); PageSetLSN(BufferGetPage(revmapbuf), recptr); } END_CRIT_SECTION(); LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK); LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); UnlockReleaseBuffer(newbuf); if (extended) { Assert(BlockNumberIsValid(newblk)); RecordPageWithFreeSpace(idxrel, newblk, freespace); FreeSpaceMapVacuum(idxrel); } return true; } }
/* * Search the tree for a heap page with at least min_cat of free space */ static BlockNumber fsm_search(Relation rel, uint8 min_cat) { int restarts = 0; FSMAddress addr = FSM_ROOT_ADDRESS; for (;;) { int slot; Buffer buf; uint8 max_avail = 0; /* Read the FSM page. */ buf = fsm_readbuf(rel, addr, false); /* Search within the page */ if (BufferIsValid(buf)) { LockBuffer(buf, BUFFER_LOCK_SHARE); slot = fsm_search_avail(buf, min_cat, (addr.level == FSM_BOTTOM_LEVEL), false); if (slot == -1) max_avail = fsm_get_max_avail(BufferGetPage(buf)); UnlockReleaseBuffer(buf); } else slot = -1; if (slot != -1) { /* * Descend the tree, or return the found block if we're at the * bottom. */ if (addr.level == FSM_BOTTOM_LEVEL) return fsm_get_heap_blk(addr, slot); addr = fsm_get_child(addr, slot); } else if (addr.level == FSM_ROOT_LEVEL) { /* * At the root, failure means there's no page with enough free * space in the FSM. Give up. */ return InvalidBlockNumber; } else { uint16 parentslot; FSMAddress parent; /* * At lower level, failure can happen if the value in the upper- * level node didn't reflect the value on the lower page. Update * the upper node, to avoid falling into the same trap again, and * start over. * * There's a race condition here, if another backend updates this * page right after we release it, and gets the lock on the parent * page before us. We'll then update the parent page with the now * stale information we had. It's OK, because it should happen * rarely, and will be fixed by the next vacuum. */ parent = fsm_get_parent(addr, &parentslot); fsm_set_and_search(rel, parent, parentslot, max_avail, 0); /* * If the upper pages are badly out of date, we might need to loop * quite a few times, updating them as we go. Any inconsistencies * should eventually be corrected and the loop should end. Looping * indefinitely is nevertheless scary, so provide an emergency * valve. */ if (restarts++ > 10000) return InvalidBlockNumber; /* Start search all over from the root */ addr = FSM_ROOT_ADDRESS; } } }
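fsm_search works in terms of one-byte categories rather than raw byte counts. The sketch below shows the usual encoding (free bytes divided into 256 buckets of BLCKSZ/256 bytes each) and the bottom-level mapping from (logical page, slot) back to a heap block; the slots-per-page constant is an assumption for illustration.

#include <stdio.h>

#define BLCKSZ 8192
#define FSM_CATEGORIES 256
#define FSM_CAT_STEP (BLCKSZ / FSM_CATEGORIES)   /* 32 bytes per category */
#define SLOTS_PER_FSM_PAGE 4000                  /* assumed for illustration */

/* free bytes -> one-byte category, clamped to the top category */
static int
space_to_cat(int avail)
{
	int cat = avail / FSM_CAT_STEP;

	return (cat >= FSM_CATEGORIES) ? FSM_CATEGORIES - 1 : cat;
}

int
main(void)
{
	printf("500 free bytes  -> category %d\n", space_to_cat(500));   /* 15 */
	printf("8000 free bytes -> category %d\n", space_to_cat(8000));  /* 250 */

	/* bottom-level addressing: (logical page, slot) -> heap block */
	unsigned logpageno = 3, slot = 17;
	unsigned heapblk = logpageno * SLOTS_PER_FSM_PAGE + slot;

	printf("bottom-level page %u, slot %u -> heap block %u\n",
		   logpageno, slot, heapblk);
	return 0;
}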
/* * Return a pinned and exclusively locked buffer which can be used to insert an * index item of size itemsz (caller must ensure not to request sizes * impossible to fulfill). If oldbuf is a valid buffer, it is also locked (in * an order determined to avoid deadlocks.) * * If we find that the old page is no longer a regular index page (because * of a revmap extension), the old buffer is unlocked and we return * InvalidBuffer. * * If there's no existing page with enough free space to accommodate the new * item, the relation is extended. If this happens, *extended is set to true, * and it is the caller's responsibility to initialize the page (and WAL-log * that fact) prior to use. * * Note that in some corner cases it is possible for this routine to extend the * relation and then not return the buffer. It is this routine's * responsibility to WAL-log the page initialization and to record the page in * FSM if that happens. Such a buffer may later be reused by this routine. */ static Buffer brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz, bool *extended) { BlockNumber oldblk; BlockNumber newblk; Page page; int freespace; /* callers must have checked */ Assert(itemsz <= BrinMaxItemSize); *extended = false; if (BufferIsValid(oldbuf)) oldblk = BufferGetBlockNumber(oldbuf); else oldblk = InvalidBlockNumber; /* * Loop until we find a page with sufficient free space. By the time we * return to caller out of this loop, both buffers are valid and locked; * if we have to restart here, neither buffer is locked and buf is not a * pinned buffer. */ newblk = RelationGetTargetBlock(irel); if (newblk == InvalidBlockNumber) newblk = GetPageWithFreeSpace(irel, itemsz); for (;;) { Buffer buf; bool extensionLockHeld = false; CHECK_FOR_INTERRUPTS(); if (newblk == InvalidBlockNumber) { /* * There's not enough free space in any existing index page, * according to the FSM: extend the relation to obtain a shiny new * page. */ if (!RELATION_IS_LOCAL(irel)) { LockRelationForExtension(irel, ExclusiveLock); extensionLockHeld = true; } buf = ReadBuffer(irel, P_NEW); newblk = BufferGetBlockNumber(buf); *extended = true; BRIN_elog((DEBUG2, "brin_getinsertbuffer: extending to page %u", BufferGetBlockNumber(buf))); } else if (newblk == oldblk) { /* * There's an odd corner-case here where the FSM is out-of-date, * and gave us the old page. */ buf = oldbuf; } else { buf = ReadBuffer(irel, newblk); } /* * We lock the old buffer first, if it's earlier than the new one; but * before we do, we need to check that it hasn't been turned into a * revmap page concurrently; if we detect that it happened, give up * and tell caller to start over. */ if (BufferIsValid(oldbuf) && oldblk < newblk) { LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE); if (!BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf))) { LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); /* * It is possible that the new page was obtained from * extending the relation. In that case, we must be sure to * record it in the FSM before leaving, because otherwise the * space would be lost forever. However, we cannot let an * uninitialized page get in the FSM, so we need to initialize * it first. 
*/ if (*extended) { brin_initialize_empty_new_buffer(irel, buf); /* shouldn't matter, but don't confuse caller */ *extended = false; } if (extensionLockHeld) UnlockRelationForExtension(irel, ExclusiveLock); ReleaseBuffer(buf); return InvalidBuffer; } } LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); if (extensionLockHeld) UnlockRelationForExtension(irel, ExclusiveLock); page = BufferGetPage(buf); /* * We have a new buffer to insert into. Check that the new page has * enough free space, and return it if it does; otherwise start over. * Note that we allow for the FSM to be out of date here, and in that * case we update it and move on. * * (br_page_get_freespace also checks that the FSM didn't hand us a * page that has since been repurposed for the revmap.) */ freespace = *extended ? BrinMaxItemSize : br_page_get_freespace(page); if (freespace >= itemsz) { RelationSetTargetBlock(irel, BufferGetBlockNumber(buf)); /* * Since the target block specification can get lost on cache * invalidations, make sure we update the more permanent FSM with * data about it before going away. */ if (*extended) RecordPageWithFreeSpace(irel, BufferGetBlockNumber(buf), freespace); /* * Lock the old buffer if not locked already. Note that in this * case we know for sure it's a regular page: it's later than the * new page we just got, which is not a revmap page, and revmap * pages are always consecutive. */ if (BufferIsValid(oldbuf) && oldblk > newblk) { LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE); Assert(BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf))); } return buf; } /* This page is no good. */ /* * If an entirely new page does not contain enough free space for the * new item, then surely that item is oversized. Complain loudly; but * first make sure we initialize the page and record it as free, for * next time. */ if (*extended) { brin_initialize_empty_new_buffer(irel, buf); ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("index row size %lu exceeds maximum %lu for index \"%s\"", (unsigned long) itemsz, (unsigned long) freespace, RelationGetRelationName(irel)))); return InvalidBuffer; /* keep compiler quiet */ } if (newblk != oldblk) UnlockReleaseBuffer(buf); if (BufferIsValid(oldbuf) && oldblk <= newblk) LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); newblk = RecordAndGetPageWithFreeSpace(irel, newblk, freespace, itemsz); } }
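brin_getinsertbuffer avoids deadlock by always locking the lower-numbered block first, whichever of the old and new buffers that happens to be. The fragment below shows that ordering rule with plain pthread mutexes standing in for buffer locks; it illustrates the rule, not the buffer manager.

#include <pthread.h>
#include <stdio.h>

#define NPAGES 8

static pthread_mutex_t page_lock[NPAGES];

/* acquire two page locks in ascending block order, whatever order the caller
 * names them in, so that no two backends can deadlock on each other */
static void
lock_pair_in_order(int blkA, int blkB)
{
	int lo = (blkA < blkB) ? blkA : blkB;
	int hi = (blkA < blkB) ? blkB : blkA;

	pthread_mutex_lock(&page_lock[lo]);
	if (hi != lo)
		pthread_mutex_lock(&page_lock[hi]);

	printf("locked block %d then block %d\n", lo, hi);

	if (hi != lo)
		pthread_mutex_unlock(&page_lock[hi]);
	pthread_mutex_unlock(&page_lock[lo]);
}

int
main(void)
{
	for (int i = 0; i < NPAGES; i++)
		pthread_mutex_init(&page_lock[i], NULL);

	lock_pair_in_order(5, 2);   /* old block 5, new block 2: lock 2 first */
	lock_pair_in_order(1, 6);
	return 0;
}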
/* * Optionally prune and repair fragmentation in the specified page. * * This is an opportunistic function. It will perform housekeeping * only if the page heuristically looks like a candidate for pruning and we * can acquire buffer cleanup lock without blocking. * * Note: this is called quite often. It's important that it fall out quickly * if there's not any use in pruning. * * Caller must have pin on the buffer, and must *not* have a lock on it. * * OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum). */ void heap_page_prune_opt(Relation relation, Buffer buffer) { Page page = BufferGetPage(buffer); Size minfree; TransactionId OldestXmin; /* * We can't write WAL in recovery mode, so there's no point trying to * clean the page. The master will likely issue a cleaning WAL record soon * anyway, so this is no particular loss. */ if (RecoveryInProgress()) return; /* * Use the appropriate xmin horizon for this relation. If it's a proper * catalog relation or a user defined, additional, catalog relation, we * need to use the horizon that includes slots, otherwise the data-only * horizon can be used. Note that the toast relation of user defined * relations are *not* considered catalog relations. * * It is OK to apply the old snapshot limit before acquiring the cleanup * lock because the worst that can happen is that we are not quite as * aggressive about the cleanup (by however many transaction IDs are * consumed between this point and acquiring the lock). This allows us to * save significant overhead in the case where the page is found not to be * prunable. */ if (IsCatalogRelation(relation) || RelationIsAccessibleInLogicalDecoding(relation)) OldestXmin = RecentGlobalXmin; else OldestXmin = TransactionIdLimitedForOldSnapshots(RecentGlobalDataXmin, relation); Assert(TransactionIdIsValid(OldestXmin)); /* * Let's see if we really need pruning. * * Forget it if page is not hinted to contain something prunable that's * older than OldestXmin. */ if (!PageIsPrunable(page, OldestXmin)) return; /* * We prune when a previous UPDATE failed to find enough space on the page * for a new tuple version, or when free space falls below the relation's * fill-factor target (but not less than 10%). * * Checking free space here is questionable since we aren't holding any * lock on the buffer; in the worst case we could get a bogus answer. It's * unlikely to be *seriously* wrong, though, since reading either pd_lower * or pd_upper is probably atomic. Avoiding taking a lock seems more * important than sometimes getting a wrong answer in what is after all * just a heuristic estimate. */ minfree = RelationGetTargetPageFreeSpace(relation, HEAP_DEFAULT_FILLFACTOR); minfree = Max(minfree, BLCKSZ / 10); if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree) { /* OK, try to get exclusive buffer lock */ if (!ConditionalLockBufferForCleanup(buffer)) return; /* * Now that we have buffer lock, get accurate information about the * page's free space, and recheck the heuristic about whether to * prune. (We needn't recheck PageIsPrunable, since no one else could * have pruned while we hold pin.) */ if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree) { TransactionId ignore = InvalidTransactionId; /* return value not * needed */ /* OK to prune */ (void) heap_page_prune(relation, buffer, OldestXmin, true, &ignore); } /* And release buffer lock */ LockBuffer(buffer, BUFFER_LOCK_UNLOCK); } }
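The pruning heuristic above reduces to one number: prune once free space drops below the fillfactor target, but never demand less than a tenth of the block. A small standalone rendering of that computation, assuming the usual 8 kB block and the percentage form of fillfactor:

#include <stdio.h>

#define BLCKSZ 8192

/* free-space threshold below which opportunistic pruning kicks in */
static int
prune_minfree(int fillfactor)
{
	int target = BLCKSZ * (100 - fillfactor) / 100;  /* fillfactor's free target */
	int floor10 = BLCKSZ / 10;                       /* never less than 10% */

	return (target > floor10) ? target : floor10;
}

int
main(void)
{
	printf("fillfactor 100 -> prune when free space < %d bytes\n",
		   prune_minfree(100));    /* 819, the 10% floor */
	printf("fillfactor 70  -> prune when free space < %d bytes\n",
		   prune_minfree(70));     /* 2457 */
	return 0;
}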
/*
 * btvacuumpage --- VACUUM one page
 *
 * This processes a single page for btvacuumscan().  In some cases we
 * must go back and re-examine previously-scanned pages; this routine
 * recurses when necessary to handle that case.
 *
 * blkno is the page to process.  orig_blkno is the highest block number
 * reached by the outer btvacuumscan loop (the same as blkno, unless we
 * are recursing to re-examine a previous page).
 */
static void
btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
{
    IndexVacuumInfo *info = vstate->info;
    IndexBulkDeleteResult *stats = vstate->stats;
    IndexBulkDeleteCallback callback = vstate->callback;
    void       *callback_state = vstate->callback_state;
    Relation    rel = info->index;
    bool        delete_now;
    BlockNumber recurse_to;
    Buffer      buf;
    Page        page;
    BTPageOpaque opaque;

restart:
    delete_now = false;
    recurse_to = P_NONE;

    /* call vacuum_delay_point while not holding any buffer lock */
    vacuum_delay_point();

    /*
     * We can't use _bt_getbuf() here because it always applies
     * _bt_checkpage(), which will barf on an all-zero page.  We want to
     * recycle all-zero pages, not fail.  Also, we want to use a nondefault
     * buffer access strategy.
     */
    buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
                             info->strategy);
    LockBuffer(buf, BT_READ);
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    if (!PageIsNew(page))
        _bt_checkpage(rel, buf);

    /*
     * If we are recursing, the only case we want to do anything with is a
     * live leaf page having the current vacuum cycle ID.  Any other state
     * implies we already saw the page (eg, deleted it as being empty).
     */
    if (blkno != orig_blkno)
    {
        if (_bt_page_recyclable(page) ||
            P_IGNORE(opaque) ||
            !P_ISLEAF(opaque) ||
            opaque->btpo_cycleid != vstate->cycleid)
        {
            _bt_relbuf(rel, buf);
            return;
        }
    }

    /* Page is valid, see what to do with it */
    if (_bt_page_recyclable(page))
    {
        /* Okay to recycle this page */
        RecordFreeIndexPage(rel, blkno);
        vstate->totFreePages++;
        stats->pages_deleted++;
    }
    else if (P_ISDELETED(opaque))
    {
        /* Already deleted, but can't recycle yet */
        stats->pages_deleted++;
    }
    else if (P_ISHALFDEAD(opaque))
    {
        /* Half-dead, try to delete */
        delete_now = true;
    }
    else if (P_ISLEAF(opaque))
    {
        OffsetNumber deletable[MaxOffsetNumber];
        int         ndeletable;
        OffsetNumber offnum,
                    minoff,
                    maxoff;

        /*
         * Trade in the initial read lock for a super-exclusive write lock on
         * this page.  We must get such a lock on every leaf page over the
         * course of the vacuum scan, whether or not it actually contains any
         * deletable tuples --- see nbtree/README.
         */
        LockBuffer(buf, BUFFER_LOCK_UNLOCK);
        LockBufferForCleanup(buf);

        /*
         * Remember highest leaf page number we've taken cleanup lock on; see
         * notes in btvacuumscan.
         */
        if (blkno > vstate->lastBlockLocked)
            vstate->lastBlockLocked = blkno;

        /*
         * Check whether we need to recurse back to earlier pages.  What we
         * are concerned about is a page split that happened since we started
         * the vacuum scan.  If the split moved some tuples to a lower page
         * then we might have missed 'em.  If so, set up for tail recursion.
         * (Must do this before possibly clearing btpo_cycleid below!)
         */
        if (vstate->cycleid != 0 &&
            opaque->btpo_cycleid == vstate->cycleid &&
            !(opaque->btpo_flags & BTP_SPLIT_END) &&
            !P_RIGHTMOST(opaque) &&
            opaque->btpo_next < orig_blkno)
            recurse_to = opaque->btpo_next;

        /*
         * Scan over all items to see which ones need to be deleted according
         * to the callback function.
         */
        ndeletable = 0;
        minoff = P_FIRSTDATAKEY(opaque);
        maxoff = PageGetMaxOffsetNumber(page);
        if (callback)
        {
            for (offnum = minoff;
                 offnum <= maxoff;
                 offnum = OffsetNumberNext(offnum))
            {
                IndexTuple  itup;
                ItemPointer htup;

                itup = (IndexTuple) PageGetItem(page,
                                                PageGetItemId(page, offnum));
                htup = &(itup->t_tid);

                /*
                 * During Hot Standby we currently assume that
                 * XLOG_BTREE_VACUUM records do not produce conflicts.  That
                 * is only true as long as the callback function depends only
                 * upon whether the index tuple refers to heap tuples removed
                 * in the initial heap scan.  When vacuum starts it derives a
                 * value of OldestXmin.  Backends taking later snapshots
                 * could have a RecentGlobalXmin with a later xid than the
                 * vacuum's OldestXmin, so it is possible that row versions
                 * deleted after OldestXmin could be marked as killed by
                 * other backends.  The callback function *could* look at the
                 * index tuple state in isolation and decide to delete the
                 * index tuple, though currently it does not.  If it ever
                 * did, we would need to reconsider whether XLOG_BTREE_VACUUM
                 * records should cause conflicts.  If they did cause
                 * conflicts they would be fairly harsh conflicts, since we
                 * haven't yet worked out a way to pass a useful value for
                 * latestRemovedXid on the XLOG_BTREE_VACUUM records.  This
                 * applies to *any* type of index that marks index tuples as
                 * killed.
                 */
                if (callback(htup, callback_state))
                    deletable[ndeletable++] = offnum;
            }
        }

        /*
         * Apply any needed deletes.  We issue just one _bt_delitems_vacuum()
         * call per page, so as to minimize WAL traffic.
         */
        if (ndeletable > 0)
        {
            /*
             * Notice that the issued XLOG_BTREE_VACUUM WAL record includes
             * an instruction to the replay code to get cleanup lock on all
             * pages between the previous lastBlockVacuumed and this page.
             * This ensures that WAL replay locks all leaf pages at some
             * point.
             *
             * Since we can visit leaf pages out-of-order when recursing,
             * replay might end up locking such pages an extra time, but it
             * doesn't seem worth the amount of bookkeeping it'd take to
             * avoid that.
             */
            _bt_delitems_vacuum(rel, buf, deletable, ndeletable,
                                vstate->lastBlockVacuumed);

            /*
             * Remember highest leaf page number we've issued a
             * XLOG_BTREE_VACUUM WAL record for.
             */
            if (blkno > vstate->lastBlockVacuumed)
                vstate->lastBlockVacuumed = blkno;

            stats->tuples_removed += ndeletable;
            /* must recompute maxoff */
            maxoff = PageGetMaxOffsetNumber(page);
        }
        else
        {
            /*
             * If the page has been split during this vacuum cycle, it seems
             * worth expending a write to clear btpo_cycleid even if we don't
             * have any deletions to do.  (If we do, _bt_delitems_vacuum
             * takes care of this.)  This ensures we won't process the page
             * again.
             *
             * We treat this like a hint-bit update because there's no need
             * to WAL-log it.
             */
            if (vstate->cycleid != 0 &&
                opaque->btpo_cycleid == vstate->cycleid)
            {
                opaque->btpo_cycleid = 0;
                MarkBufferDirtyHint(buf, true);
            }
        }

        /*
         * If it's now empty, try to delete; else count the live tuples.  We
         * don't delete when recursing, though, to avoid putting entries into
         * freePages out-of-order (doesn't seem worth any extra code to
         * handle the case).
         */
        if (minoff > maxoff)
            delete_now = (blkno == orig_blkno);
        else
            stats->num_index_tuples += maxoff - minoff + 1;
    }

    if (delete_now)
    {
        MemoryContext oldcontext;
        int         ndel;

        /* Run pagedel in a temp context to avoid memory leakage */
        MemoryContextReset(vstate->pagedelcontext);
        oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);

        ndel = _bt_pagedel(rel, buf);

        /* count only this page, else may double-count parent */
        if (ndel)
            stats->pages_deleted++;

        MemoryContextSwitchTo(oldcontext);
        /* pagedel released buffer, so we shouldn't */
    }
    else
        _bt_relbuf(rel, buf);

    /*
     * This is really tail recursion, but if the compiler is too stupid to
     * optimize it as such, we'd eat an uncomfortably large amount of stack
     * space per recursion level (due to the deletable[] array).  A failure
     * is improbable since the number of levels isn't likely to be large ...
     * but just in case, let's hand-optimize into a loop.
     */
    if (recurse_to != P_NONE)
    {
        blkno = recurse_to;
        goto restart;
    }
}
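/*
 * Illustrative sketch only (not part of the PostgreSQL sources): the
 * hand-optimized tail recursion used at the end of btvacuumpage, reduced to
 * a self-contained toy.  process_block() and NO_BLOCK are hypothetical names
 * standing in for btvacuumpage and P_NONE; the point is that "recursing" to
 * an earlier block is done by looping back to the top of the same stack
 * frame, so the large on-stack scratch area (here work[], standing in for
 * deletable[]) exists only once however many times we revisit pages.
 */
#include <stdio.h>

#define NO_BLOCK ((unsigned) -1)        /* stands in for P_NONE */

static void
process_block(unsigned blkno, unsigned orig_blkno)
{
    unsigned    work[1024];             /* large per-call scratch area */
    unsigned    recurse_to;

restart:
    recurse_to = NO_BLOCK;
    work[0] = blkno;                    /* ... real per-block work here ... */

    /* Pretend a concurrent split moved tuples to an earlier block. */
    if (blkno == orig_blkno && blkno > 0)
        recurse_to = blkno - 1;

    printf("processed block %u (scan head %u)\n", work[0], orig_blkno);

    if (recurse_to != NO_BLOCK)
    {
        /* tail recursion, done by hand to avoid a second copy of work[] */
        blkno = recurse_to;
        goto restart;
    }
}

int
main(void)
{
    process_block(7, 7);
    return 0;
}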
/* ----------------
 *      index_getnext - get the next heap tuple from a scan
 *
 * The result is the next heap tuple satisfying the scan keys and the
 * snapshot, or NULL if no more matching tuples exist.  On success,
 * the buffer containing the heap tuple is pinned (the pin will be dropped
 * at the next index_getnext or index_endscan).
 *
 * Note: caller must check scan->xs_recheck, and perform rechecking of the
 * scan keys if required.  We do not do that here because we don't have
 * enough information to do it efficiently in the general case.
 * ----------------
 */
HeapTuple
index_getnext(IndexScanDesc scan, ScanDirection direction)
{
    HeapTuple   heapTuple = &scan->xs_ctup;
    ItemPointer tid = &heapTuple->t_self;
    FmgrInfo   *procedure;

    SCAN_CHECKS;
    GET_SCAN_PROCEDURE(amgettuple);

    Assert(TransactionIdIsValid(RecentGlobalXmin));

    /*
     * We always reset xs_hot_dead; if we are here then either we are just
     * starting the scan, or we previously returned a visible tuple, and in
     * either case it's inappropriate to kill the prior index entry.
     */
    scan->xs_hot_dead = false;

    for (;;)
    {
        OffsetNumber offnum;
        bool        at_chain_start;
        Page        dp;

        if (scan->xs_next_hot != InvalidOffsetNumber)
        {
            /*
             * We are resuming scan of a HOT chain after having returned an
             * earlier member.  Must still hold pin on current heap page.
             */
            Assert(BufferIsValid(scan->xs_cbuf));
            Assert(ItemPointerGetBlockNumber(tid) ==
                   BufferGetBlockNumber(scan->xs_cbuf));
            Assert(TransactionIdIsValid(scan->xs_prev_xmax));
            offnum = scan->xs_next_hot;
            at_chain_start = false;
            scan->xs_next_hot = InvalidOffsetNumber;
        }
        else
        {
            bool        found;
            Buffer      prev_buf;

            /*
             * If we scanned a whole HOT chain and found only dead tuples,
             * tell index AM to kill its entry for that TID.  We do not do
             * this when in recovery because it may violate MVCC to do so;
             * see the comments in RelationGetIndexScan().
             */
            if (!scan->xactStartedInRecovery)
                scan->kill_prior_tuple = scan->xs_hot_dead;

            /*
             * The AM's gettuple proc finds the next index entry matching the
             * scan keys, and puts the TID in xs_ctup.t_self (ie, *tid).  It
             * should also set scan->xs_recheck, though we pay no attention
             * to that here.
             */
            found = DatumGetBool(FunctionCall2(procedure,
                                               PointerGetDatum(scan),
                                               Int32GetDatum(direction)));

            /* Reset kill flag immediately for safety */
            scan->kill_prior_tuple = false;

            /* If we're out of index entries, break out of outer loop */
            if (!found)
                break;

            pgstat_count_index_tuples(scan->indexRelation, 1);

            /* Switch to correct buffer if we don't have it already */
            prev_buf = scan->xs_cbuf;
            scan->xs_cbuf = ReleaseAndReadBuffer(scan->xs_cbuf,
                                                 scan->heapRelation,
                                                 ItemPointerGetBlockNumber(tid));

            /*
             * Prune page, but only if we weren't already on this page
             */
            if (prev_buf != scan->xs_cbuf)
                heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf,
                                    RecentGlobalXmin);

            /* Prepare to scan HOT chain starting at index-referenced offnum */
            offnum = ItemPointerGetOffsetNumber(tid);
            at_chain_start = true;

            /* We don't know what the first tuple's xmin should be */
            scan->xs_prev_xmax = InvalidTransactionId;

            /* Initialize flag to detect if all entries are dead */
            scan->xs_hot_dead = true;
        }

        /* Obtain share-lock on the buffer so we can examine visibility */
        LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE);

        dp = (Page) BufferGetPage(scan->xs_cbuf);

        /* Scan through possible multiple members of HOT-chain */
        for (;;)
        {
            ItemId      lp;
            ItemPointer ctid;
            bool        valid;

            /* check for bogus TID */
            if (offnum < FirstOffsetNumber ||
                offnum > PageGetMaxOffsetNumber(dp))
                break;

            lp = PageGetItemId(dp, offnum);

            /* check for unused, dead, or redirected items */
            if (!ItemIdIsNormal(lp))
            {
                /* We should only see a redirect at start of chain */
                if (ItemIdIsRedirected(lp) && at_chain_start)
                {
                    /* Follow the redirect */
                    offnum = ItemIdGetRedirect(lp);
                    at_chain_start = false;
                    continue;
                }
                /* else must be end of chain */
                break;
            }

            /*
             * We must initialize all of *heapTuple (ie, scan->xs_ctup) since
             * it is returned to the executor on success.
             */
            heapTuple->t_data = (HeapTupleHeader) PageGetItem(dp, lp);
            heapTuple->t_len = ItemIdGetLength(lp);
            ItemPointerSetOffsetNumber(tid, offnum);
            heapTuple->t_tableOid = RelationGetRelid(scan->heapRelation);
            ctid = &heapTuple->t_data->t_ctid;

            /*
             * Shouldn't see a HEAP_ONLY tuple at chain start.  (This test
             * should be unnecessary, since the chain root can't be removed
             * while we have pin on the index entry, but let's make it
             * anyway.)
             */
            if (at_chain_start && HeapTupleIsHeapOnly(heapTuple))
                break;

            /*
             * The xmin should match the previous xmax value, else chain is
             * broken.  (Note: this test is not optional because it protects
             * us against the case where the prior chain member's xmax
             * aborted since we looked at it.)
             */
            if (TransactionIdIsValid(scan->xs_prev_xmax) &&
                !TransactionIdEquals(scan->xs_prev_xmax,
                                     HeapTupleHeaderGetXmin(heapTuple->t_data)))
                break;

            /* If it's visible per the snapshot, we must return it */
            valid = HeapTupleSatisfiesVisibility(heapTuple, scan->xs_snapshot,
                                                 scan->xs_cbuf);

            CheckForSerializableConflictOut(valid, scan->heapRelation,
                                            heapTuple, scan->xs_cbuf);

            if (valid)
            {
                /*
                 * If the snapshot is MVCC, we know that it could accept at
                 * most one member of the HOT chain, so we can skip examining
                 * any more members.  Otherwise, check for continuation of
                 * the HOT-chain, and set state for next time.
                 */
                if (IsMVCCSnapshot(scan->xs_snapshot) &&
                    !IsolationIsSerializable())
                    scan->xs_next_hot = InvalidOffsetNumber;
                else if (HeapTupleIsHotUpdated(heapTuple))
                {
                    Assert(ItemPointerGetBlockNumber(ctid) ==
                           ItemPointerGetBlockNumber(tid));
                    scan->xs_next_hot = ItemPointerGetOffsetNumber(ctid);
                    scan->xs_prev_xmax = HeapTupleHeaderGetXmax(heapTuple->t_data);
                }
                else
                    scan->xs_next_hot = InvalidOffsetNumber;

                PredicateLockTuple(scan->heapRelation, heapTuple);

                LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);

                pgstat_count_heap_fetch(scan->indexRelation);

                return heapTuple;
            }

            /*
             * If we can't see it, maybe no one else can either.  Check to
             * see if the tuple is dead to all transactions.  If we find that
             * all the tuples in the HOT chain are dead, we'll signal the
             * index AM to not return that TID on future indexscans.
             */
            if (scan->xs_hot_dead &&
                HeapTupleSatisfiesVacuum(heapTuple->t_data, RecentGlobalXmin,
                                         scan->xs_cbuf) != HEAPTUPLE_DEAD)
                scan->xs_hot_dead = false;

            /*
             * Check to see if HOT chain continues past this tuple; if so
             * fetch the next offnum (we don't bother storing it into
             * xs_next_hot, but must store xs_prev_xmax), and loop around.
             */
            if (HeapTupleIsHotUpdated(heapTuple))
            {
                Assert(ItemPointerGetBlockNumber(ctid) ==
                       ItemPointerGetBlockNumber(tid));
                offnum = ItemPointerGetOffsetNumber(ctid);
                at_chain_start = false;
                scan->xs_prev_xmax = HeapTupleHeaderGetXmax(heapTuple->t_data);
            }
            else
                break;          /* end of chain */
        }                       /* loop over a single HOT chain */

        LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);

        /* Loop around to ask index AM for another TID */
        scan->xs_next_hot = InvalidOffsetNumber;
    }

    /* Release any held pin on a heap page */
    if (BufferIsValid(scan->xs_cbuf))
    {
        ReleaseBuffer(scan->xs_cbuf);
        scan->xs_cbuf = InvalidBuffer;
    }

    return NULL;                /* failure exit */
}
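/*
 * Illustrative sketch only (hypothetical ToyTuple/ToyXid types, not
 * PostgreSQL code): the chain-validity rule index_getnext applies while
 * walking a HOT chain.  A chain member is trusted only if its xmin equals
 * the xmax of the member that linked to it; otherwise the earlier updater
 * must have aborted, and the chain is treated as ended.
 */
#include <stdio.h>

typedef unsigned int ToyXid;
#define TOY_INVALID_XID 0

typedef struct ToyTuple
{
    ToyXid      xmin;           /* inserting transaction */
    ToyXid      xmax;           /* updating transaction, or 0 if none */
    int         next;           /* index of next chain member, or -1 */
} ToyTuple;

/* Walk the chain starting at 'start', printing the members we trust. */
static void
walk_hot_chain(const ToyTuple *tuples, int start)
{
    ToyXid      prev_xmax = TOY_INVALID_XID;
    int         i = start;

    while (i >= 0)
    {
        const ToyTuple *tup = &tuples[i];

        /* xmin must match the previous xmax, else the chain is broken */
        if (prev_xmax != TOY_INVALID_XID && prev_xmax != tup->xmin)
            break;

        printf("chain member %d: xmin=%u xmax=%u\n", i, tup->xmin, tup->xmax);

        prev_xmax = tup->xmax;
        i = tup->next;
    }
}

int
main(void)
{
    /* 0 -> 1 is a valid link; 1 -> 2 is broken (xmax 12 vs. xmin 99). */
    ToyTuple    chain[] = {
        {10, 11, 1},
        {11, 12, 2},
        {99, 0, -1},
    };

    walk_hot_chain(chain, 0);
    return 0;
}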
/*
 * Optionally prune and repair fragmentation in the specified page.
 *
 * This is an opportunistic function.  It will perform housekeeping
 * only if the page heuristically looks like a candidate for pruning and we
 * can acquire buffer cleanup lock without blocking.
 *
 * Note: this is called quite often.  It's important that it fall out quickly
 * if there's not any use in pruning.
 *
 * Caller must have pin on the buffer, and must *not* have a lock on it.
 *
 * OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD
 * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum).
 */
void
heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin)
{
    Page        page = BufferGetPage(buffer);
    Size        minfree;

    /*
     * Let's see if we really need pruning.
     *
     * Forget it if page is not hinted to contain something prunable that's
     * older than OldestXmin.
     */
    if (!PageIsPrunable(page, OldestXmin))
        return;

    /*
     * We can't write WAL in recovery mode, so there's no point trying to
     * clean the page.  The master will likely issue a cleaning WAL record
     * soon anyway, so this is no particular loss.
     */
    if (RecoveryInProgress())
        return;

    /*
     * We prune when a previous UPDATE failed to find enough space on the
     * page for a new tuple version, or when free space falls below the
     * relation's fill-factor target (but not less than 10%).
     *
     * Checking free space here is questionable since we aren't holding any
     * lock on the buffer; in the worst case we could get a bogus answer.
     * It's unlikely to be *seriously* wrong, though, since reading either
     * pd_lower or pd_upper is probably atomic.  Avoiding taking a lock seems
     * more important than sometimes getting a wrong answer in what is after
     * all just a heuristic estimate.
     */
    minfree = RelationGetTargetPageFreeSpace(relation,
                                             HEAP_DEFAULT_FILLFACTOR);
    minfree = Max(minfree, BLCKSZ / 10);

    if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
    {
        /* OK, try to get exclusive buffer lock */
        if (!ConditionalLockBufferForCleanup(buffer))
            return;

        /*
         * Now that we have buffer lock, get accurate information about the
         * page's free space, and recheck the heuristic about whether to
         * prune.  (We needn't recheck PageIsPrunable, since no one else
         * could have pruned while we hold pin.)
         */
        if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
        {
            TransactionId ignore = InvalidTransactionId;    /* return value not
                                                             * needed */

            /* OK to prune */
            (void) heap_page_prune(relation, buffer, OldestXmin, true, &ignore);
        }

        /* And release buffer lock */
        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    }
}
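/*
 * Illustrative sketch only (not PostgreSQL code): the free-space threshold
 * heap_page_prune_opt compares against.  Pruning is considered once free
 * space drops below the fill-factor target, but never below 10% of the
 * block size.  TOY_BLCKSZ and prune_threshold() are hypothetical names for
 * this example; an 8 kB page and the stock fillfactor of 100 are assumed.
 */
#include <stdio.h>

#define TOY_BLCKSZ 8192

/* Space the fillfactor reserves for updates: BLCKSZ * (100 - ff) / 100 */
static size_t
prune_threshold(int fillfactor)
{
    size_t      target = (size_t) TOY_BLCKSZ * (100 - fillfactor) / 100;
    size_t      floor = TOY_BLCKSZ / 10;        /* never less than 10% */

    return (target > floor) ? target : floor;
}

int
main(void)
{
    printf("fillfactor 100 -> prune below %zu bytes free\n",
           prune_threshold(100));
    printf("fillfactor  70 -> prune below %zu bytes free\n",
           prune_threshold(70));
    return 0;
}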