/*
 *	_bt_relandgetbuf() -- release a locked buffer and get another one.
 *
 * This is equivalent to _bt_relbuf followed by _bt_getbuf, with the
 * exception that blkno may not be P_NEW.  Also, if obuf is InvalidBuffer
 * then it reduces to just _bt_getbuf; allowing this case simplifies some
 * callers.
 *
 * The original motivation for using this was to avoid two entries to the
 * bufmgr when one would do.  However, now it's mainly just a notational
 * convenience.  The only case where it saves work over _bt_relbuf/_bt_getbuf
 * is when the target page is the same one already in the buffer.
 */
Buffer
_bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
{
	Buffer		buf;

	Assert(blkno != P_NEW);
	if (BufferIsValid(obuf))
		LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
	buf = ReleaseAndReadBuffer(obuf, rel, blkno);
	LockBuffer(buf, access);
	_bt_checkpage(rel, buf);
	return buf;
}
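A minimal caller sketch (hypothetical, not taken from nbtree itself) showing the intended usage pattern: each step gives up the old lock and trades the old pin for the next page's, so only one bufmgr entry is needed per hop. The right-link walk below assumes the standard nbtree page opaque data.

/*
 * Hypothetical sketch: walk right along one B-tree level using
 * _bt_relandgetbuf().  BT_READ is nbtree's share-lock mode; btpo_next is
 * the right link; P_NONE marks the rightmost page.
 */
static void
walk_right(Relation rel, BlockNumber start_blkno)
{
	Buffer		buf = _bt_relandgetbuf(rel, InvalidBuffer, start_blkno,
									   BT_READ);

	for (;;)
	{
		Page		page = BufferGetPage(buf);
		BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		BlockNumber next_blkno = opaque->btpo_next;

		if (next_blkno == P_NONE)
			break;				/* rightmost page: stop */

		/* releases our lock, trades our pin for the next page's pin */
		buf = _bt_relandgetbuf(rel, buf, next_blkno, BT_READ);
	}
	_bt_relbuf(rel, buf);		/* drop the final lock and pin */
}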
/*
 * Re-locate the leaf page containing the requested tuple.
 */
RumBtreeStack *
rumReFindLeafPage(RumBtree btree, RumBtreeStack *stack)
{
	/*
	 * Traverse the tree upwards until we are sure that the requested leaf
	 * page is in this subtree, or until we reach the root page.
	 */
	while (stack->parent)
	{
		RumBtreeStack *ptr;
		Page		page;
		OffsetNumber maxoff;

		LockBuffer(stack->buffer, RUM_UNLOCK);
		stack->parent->buffer =
			ReleaseAndReadBuffer(stack->buffer, btree->index,
								 stack->parent->blkno);
		LockBuffer(stack->parent->buffer, RUM_SHARE);

		ptr = stack;
		stack = stack->parent;
		pfree(ptr);

		page = BufferGetPage(stack->buffer);
		maxoff = RumPageGetOpaque(page)->maxoff;

		/*
		 * We don't know the right bound of the rightmost pointer, so we can
		 * be sure that the requested leaf page is in this subtree only when
		 * the requested item pointer is less than the item pointer just
		 * before the rightmost one.
		 */
		if (compareRumItem(btree->rumstate, btree->entryAttnum,
						   &(((PostingItem *) RumDataPageGetItem(page, maxoff - 1))->item),
						   &btree->items[btree->curitem]) >= 0)
		{
			break;
		}
	}

	/* Traverse the tree downwards. */
	stack = rumFindLeafPage(btree, stack);
	return stack;
}
/*
 * Descend the tree to the leaf page that contains, or would contain, the key
 * we're searching for.  The key should already be filled in 'btree', in a
 * tree-type-specific manner.  If btree->fullScan is true, descend to the
 * leftmost leaf page.
 *
 * If 'searchMode' is false, on return stack->buffer is exclusively locked,
 * and the stack represents the full path to the root.  Otherwise stack->buffer
 * is share-locked, and stack->parent is NULL.
 */
GinBtreeStack *
ginFindLeafPage(GinBtree btree, bool searchMode)
{
	GinBtreeStack *stack;

	stack = (GinBtreeStack *) palloc(sizeof(GinBtreeStack));
	stack->blkno = btree->rootBlkno;
	stack->buffer = ReadBuffer(btree->index, btree->rootBlkno);
	stack->parent = NULL;
	stack->predictNumber = 1;

	for (;;)
	{
		Page		page;
		BlockNumber child;
		int			access;

		stack->off = InvalidOffsetNumber;

		page = BufferGetPage(stack->buffer);

		access = ginTraverseLock(stack->buffer, searchMode);

		/*
		 * If we're going to modify the tree, finish any incomplete splits we
		 * encounter on the way.
		 */
		if (!searchMode && GinPageIsIncompleteSplit(page))
			ginFinishSplit(btree, stack, false, NULL);

		/*
		 * The page is now correctly locked; check whether we need to move
		 * right.  The root never has a right link, so this is a small
		 * optimization.
		 */
		while (btree->fullScan == FALSE && stack->blkno != btree->rootBlkno &&
			   btree->isMoveRight(btree, page))
		{
			BlockNumber rightlink = GinPageGetOpaque(page)->rightlink;

			if (rightlink == InvalidBlockNumber)
				break;			/* rightmost page */

			stack->buffer = ginStepRight(stack->buffer, btree->index, access);
			stack->blkno = rightlink;
			page = BufferGetPage(stack->buffer);

			if (!searchMode && GinPageIsIncompleteSplit(page))
				ginFinishSplit(btree, stack, false, NULL);
		}

		if (GinPageIsLeaf(page))	/* found it; return the locked page */
			return stack;

		/* now we have the correct buffer, try to find the child */
		child = btree->findChildPage(btree, stack);

		LockBuffer(stack->buffer, GIN_UNLOCK);
		Assert(child != InvalidBlockNumber);
		Assert(stack->blkno != child);

		if (searchMode)
		{
			/* in search mode we may forget the path to the leaf */
			stack->blkno = child;
			stack->buffer = ReleaseAndReadBuffer(stack->buffer, btree->index,
												 stack->blkno);
		}
		else
		{
			GinBtreeStack *ptr = (GinBtreeStack *) palloc(sizeof(GinBtreeStack));

			ptr->parent = stack;
			stack = ptr;
			stack->blkno = child;
			stack->buffer = ReadBuffer(btree->index, stack->blkno);
			stack->predictNumber = 1;
		}
	}
}
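A hedged sketch of how this descent pairs with an insertion (the flow mirrors ginInsertValue shown further below; the wrapper function is an assumption, and the GinBtreeData setup is elided):

/*
 * Hypothetical sketch: in non-search mode the leaf comes back exclusively
 * locked with a full parent stack, which ginInsertValue then consumes and
 * frees (see its header comment below).
 */
static void
insert_one_value(GinBtree btree, GinStatsData *buildStats)
{
	GinBtreeStack *stack;

	stack = ginFindLeafPage(btree, false);		/* exclusive lock on leaf */
	ginInsertValue(btree, stack, buildStats);	/* frees the stack */
}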
/*
 * bitgetpage - subroutine for BitmapHeapNext()
 *
 * This routine reads and pins the specified page of the relation, then
 * builds an array indicating which tuples on the page are both potentially
 * interesting according to the bitmap, and visible according to the snapshot.
 */
static void
bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres)
{
	BlockNumber page = tbmres->blockno;
	Buffer		buffer;
	Snapshot	snapshot;
	int			ntup;

	/*
	 * Acquire pin on the target heap page, trading in any pin we held before.
	 */
	Assert(page < scan->rs_nblocks);
	scan->rs_cbuf = ReleaseAndReadBuffer(scan->rs_cbuf,
										 scan->rs_rd,
										 page);
	buffer = scan->rs_cbuf;
	snapshot = scan->rs_snapshot;

	ntup = 0;

	/*
	 * Prune and repair fragmentation for the whole page, if possible.
	 */
	Assert(TransactionIdIsValid(RecentGlobalXmin));
	heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin);

	/*
	 * We must hold share lock on the buffer content while examining tuple
	 * visibility.  Afterwards, however, the tuples we have found to be
	 * visible are guaranteed good as long as we hold the buffer pin.
	 */
	LockBuffer(buffer, BUFFER_LOCK_SHARE);

	/*
	 * We need two separate strategies for lossy and non-lossy cases.
	 */
	if (tbmres->ntuples >= 0)
	{
		/*
		 * Bitmap is non-lossy, so we just look through the offsets listed in
		 * tbmres; but we have to follow any HOT chain starting at each such
		 * offset.
		 */
		int			curslot;

		for (curslot = 0; curslot < tbmres->ntuples; curslot++)
		{
			OffsetNumber offnum = tbmres->offsets[curslot];
			ItemPointerData tid;

			ItemPointerSet(&tid, page, offnum);
			if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot,
									   NULL))
				scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
		}
	}
	else
	{
		/*
		 * Bitmap is lossy, so we must examine each item pointer on the page.
		 * But we can ignore HOT chains, since we'll check each tuple anyway.
		 */
		Page		dp = (Page) BufferGetPage(buffer);
		OffsetNumber maxoff = PageGetMaxOffsetNumber(dp);
		OffsetNumber offnum;

		for (offnum = FirstOffsetNumber; offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			ItemId		lp;
			HeapTupleData loctup;
			bool		valid;

			lp = PageGetItemId(dp, offnum);
			if (!ItemIdIsNormal(lp))
				continue;
			loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
			loctup.t_len = ItemIdGetLength(lp);
			loctup.t_tableOid = scan->rs_rd->rd_id;
			ItemPointerSet(&loctup.t_self, page, offnum);
			valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
			if (valid)
			{
				scan->rs_vistuples[ntup++] = offnum;
				PredicateLockTuple(scan->rs_rd, &loctup, snapshot);
			}
			CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup,
											buffer, snapshot);
		}
	}

	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

	Assert(ntup <= MaxHeapTuplesPerPage);
	scan->rs_ntuples = ntup;
}
static void
rtdoinsert(Relation r, IndexTuple itup, RTSTATE *rtstate)
{
	Page		page;
	Buffer		buffer;
	BlockNumber blk;
	IndexTuple	which;
	OffsetNumber l;
	RTSTACK    *stack;
	RTreePageOpaque opaque;
	Datum		datum;

	blk = P_ROOT;
	buffer = InvalidBuffer;
	stack = NULL;

	do
	{
		/* release the current buffer, read in the next one */
		buffer = ReleaseAndReadBuffer(buffer, r, blk);
		page = (Page) BufferGetPage(buffer);

		opaque = (RTreePageOpaque) PageGetSpecialPointer(page);
		if (!(opaque->flags & F_LEAF))
		{
			RTSTACK    *n;
			ItemId		iid;

			n = (RTSTACK *) palloc(sizeof(RTSTACK));
			n->rts_parent = stack;
			n->rts_blk = blk;
			n->rts_child = choose(r, page, itup, rtstate);
			stack = n;

			iid = PageGetItemId(page, n->rts_child);
			which = (IndexTuple) PageGetItem(page, iid);
			blk = ItemPointerGetBlockNumber(&(which->t_tid));
		}
	} while (!(opaque->flags & F_LEAF));

	if (nospace(page, itup))
	{
		/* need to do a split */
		rtdosplit(r, buffer, stack, itup, rtstate);
		freestack(stack);
		WriteBuffer(buffer);	/* don't forget to release buffer! */
		return;
	}

	/* add the item and write the buffer */
	if (PageIsEmpty(page))
	{
		l = PageAddItem(page, (Item) itup, IndexTupleSize(itup),
						FirstOffsetNumber, LP_USED);
	}
	else
	{
		l = PageAddItem(page, (Item) itup, IndexTupleSize(itup),
						OffsetNumberNext(PageGetMaxOffsetNumber(page)),
						LP_USED);
	}
	if (l == InvalidOffsetNumber)
		elog(ERROR, "failed to add index item to \"%s\"",
			 RelationGetRelationName(r));

	WriteBuffer(buffer);

	datum = IndexTupleGetDatum(itup);

	/* now expand the page boundary in the parent to include the new child */
	rttighten(r, stack, datum, IndexTupleAttSize(itup), rtstate);
	freestack(stack);
}
Datum
ginbulkdelete(PG_FUNCTION_ARGS)
{
	MIRROREDLOCK_BUFMGR_DECLARE;

	IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
	IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
	IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2);
	void	   *callback_state = (void *) PG_GETARG_POINTER(3);
	Relation	index = info->index;
	BlockNumber blkno = GIN_ROOT_BLKNO;
	GinVacuumState gvs;
	Buffer		buffer;
	BlockNumber rootOfPostingTree[BLCKSZ / (sizeof(IndexTupleData) + sizeof(ItemId))];
	uint32		nRoot;

	/* first time through? */
	if (stats == NULL)
		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
	/* we'll re-count the tuples each time */
	stats->num_index_tuples = 0;

	gvs.index = index;
	gvs.result = stats;
	gvs.callback = callback;
	gvs.callback_state = callback_state;
	initGinState(&gvs.ginstate, index);

	// -------- MirroredLock ----------
	MIRROREDLOCK_BUFMGR_LOCK;

	buffer = ReadBuffer(index, blkno);

	/* find leaf page */
	for (;;)
	{
		Page		page = BufferGetPage(buffer);
		IndexTuple	itup;

		LockBuffer(buffer, GIN_SHARE);

		Assert(!GinPageIsData(page));

		if (GinPageIsLeaf(page))
		{
			LockBuffer(buffer, GIN_UNLOCK);
			LockBuffer(buffer, GIN_EXCLUSIVE);

			if (blkno == GIN_ROOT_BLKNO && !GinPageIsLeaf(page))
			{
				LockBuffer(buffer, GIN_UNLOCK);
				continue;		/* check it once more */
			}
			break;
		}

		Assert(PageGetMaxOffsetNumber(page) >= FirstOffsetNumber);

		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, FirstOffsetNumber));
		blkno = GinItemPointerGetBlockNumber(&(itup)->t_tid);
		Assert(blkno != InvalidBlockNumber);

		LockBuffer(buffer, GIN_UNLOCK);
		buffer = ReleaseAndReadBuffer(buffer, index, blkno);
	}

	/* we have now found the leftmost page in the entry B-tree */
	for (;;)
	{
		Page		page = BufferGetPage(buffer);
		Page		resPage;
		uint32		i;

		Assert(!GinPageIsData(page));

		resPage = ginVacuumEntryPage(&gvs, buffer, rootOfPostingTree, &nRoot);

		blkno = GinPageGetOpaque(page)->rightlink;

		if (resPage)
		{
			START_CRIT_SECTION();
			PageRestoreTempPage(resPage, page);
			MarkBufferDirty(buffer);
			xlogVacuumPage(gvs.index, buffer);
			UnlockReleaseBuffer(buffer);
			END_CRIT_SECTION();
		}
		else
		{
			UnlockReleaseBuffer(buffer);
		}

		vacuum_delay_point();

		for (i = 0; i < nRoot; i++)
		{
			ginVacuumPostingTree(&gvs, rootOfPostingTree[i]);
			vacuum_delay_point();
		}

		if (blkno == InvalidBlockNumber)	/* rightmost page */
			break;

		buffer = ReadBuffer(index, blkno);
		LockBuffer(buffer, GIN_EXCLUSIVE);
	}

	MIRROREDLOCK_BUFMGR_UNLOCK;
	// -------- MirroredLock ----------

	PG_RETURN_POINTER(gvs.result);
}
/* ----------------
 *		index_fetch_heap - get the scan's next heap tuple
 *
 * The result is a visible heap tuple associated with the index TID most
 * recently fetched by index_getnext_tid, or NULL if no more matching tuples
 * exist.  (There can be more than one matching tuple because of HOT chains,
 * although when using an MVCC snapshot it should be impossible for more than
 * one such tuple to exist.)
 *
 * On success, the buffer containing the heap tuple is pinned (the pin will
 * be dropped in a future index_getnext_tid, index_fetch_heap or
 * index_endscan call).
 *
 * Note: caller must check scan->xs_recheck, and perform rechecking of the
 * scan keys if required.  We do not do that here because we don't have
 * enough information to do it efficiently in the general case.
 * ----------------
 */
HeapTuple
index_fetch_heap(IndexScanDesc scan)
{
	ItemPointer tid = &scan->xs_ctup.t_self;
	bool		all_dead = false;
	bool		got_heap_tuple;

	/* We can skip the buffer-switching logic if we're in mid-HOT chain. */
	if (!scan->xs_continue_hot)
	{
		/* Switch to correct buffer if we don't have it already */
		Buffer		prev_buf = scan->xs_cbuf;

		scan->xs_cbuf = ReleaseAndReadBuffer(scan->xs_cbuf,
											 scan->heapRelation,
											 ItemPointerGetBlockNumber(tid));

		/*
		 * Prune page, but only if we weren't already on this page
		 */
		if (prev_buf != scan->xs_cbuf)
			heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf,
								RecentGlobalXmin);
	}

	/* Obtain share-lock on the buffer so we can examine visibility */
	LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE);
	got_heap_tuple = heap_hot_search_buffer(tid, scan->heapRelation,
											scan->xs_cbuf,
											scan->xs_snapshot,
											&scan->xs_ctup,
											&all_dead,
											!scan->xs_continue_hot);
	LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);

	if (got_heap_tuple)
	{
		/*
		 * Only in a non-MVCC snapshot can more than one member of the HOT
		 * chain be visible.
		 */
		scan->xs_continue_hot = !IsMVCCSnapshot(scan->xs_snapshot);
		pgstat_count_heap_fetch(scan->indexRelation);
		return &scan->xs_ctup;
	}

	/* We've reached the end of the HOT chain. */
	scan->xs_continue_hot = false;

	/*
	 * If we scanned a whole HOT chain and found only dead tuples, tell index
	 * AM to kill its entry for that TID (this will take effect in the next
	 * amgettuple call, in index_getnext_tid).  We do not do this when in
	 * recovery because it may violate MVCC to do so.  See comments in
	 * RelationGetIndexScan().
	 */
	if (!scan->xactStartedInRecovery)
		scan->kill_prior_tuple = all_dead;

	return NULL;
}
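A hedged caller sketch of the two-step API this function belongs to: index_getnext_tid() asks the index AM for the next TID, and index_fetch_heap() resolves it (and any HOT chain) to visible heap tuples. The driver function itself is hypothetical; scan setup and teardown are assumed to happen elsewhere.

/*
 * Hypothetical sketch: fetch every visible heap tuple for every matching
 * index TID.  Under a non-MVCC snapshot a single TID can yield several
 * tuples, so we call index_fetch_heap() until it returns NULL.
 */
static void
scan_all_matches(IndexScanDesc scan)
{
	while (index_getnext_tid(scan, ForwardScanDirection) != NULL)
	{
		HeapTuple	tup;

		while ((tup = index_fetch_heap(scan)) != NULL)
		{
			/* ... process tup; its buffer stays pinned until the next call ... */
		}
	}
}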
/*
 * bitgetpage - subroutine for BitmapHeapNext()
 *
 * This routine reads and pins the specified page of the relation, then
 * builds an array indicating which tuples on the page are both potentially
 * interesting according to the bitmap, and visible according to the snapshot.
 */
static void
bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres)
{
	BlockNumber page = tbmres->blockno;
	Buffer		buffer;
	Snapshot	snapshot;
	Page		dp;
	int			ntup;
	int			curslot;
	int			minslot;
	int			maxslot;
	int			maxoff;

	/*
	 * Acquire pin on the target heap page, trading in any pin we held before.
	 */
	Assert(page < scan->rs_nblocks);
	scan->rs_cbuf = ReleaseAndReadBuffer(scan->rs_cbuf,
										 scan->rs_rd,
										 page);
	buffer = scan->rs_cbuf;
	snapshot = scan->rs_snapshot;

	/*
	 * We must hold share lock on the buffer content while examining tuple
	 * visibility.  Afterwards, however, the tuples we have found to be
	 * visible are guaranteed good as long as we hold the buffer pin.
	 */
	LockBuffer(buffer, BUFFER_LOCK_SHARE);

	dp = (Page) BufferGetPage(buffer);
	maxoff = PageGetMaxOffsetNumber(dp);

	/*
	 * Determine how many entries we need to look at on this page.  If the
	 * bitmap is lossy then we need to look at each physical item pointer;
	 * otherwise we just look through the offsets listed in tbmres.
	 */
	if (tbmres->ntuples >= 0)
	{
		/* non-lossy case */
		minslot = 0;
		maxslot = tbmres->ntuples - 1;
	}
	else
	{
		/* lossy case */
		minslot = FirstOffsetNumber;
		maxslot = maxoff;
	}

	ntup = 0;
	for (curslot = minslot; curslot <= maxslot; curslot++)
	{
		OffsetNumber targoffset;
		ItemId		lp;
		HeapTupleData loctup;
		bool		valid;

		if (tbmres->ntuples >= 0)
		{
			/* non-lossy case */
			targoffset = tbmres->offsets[curslot];
		}
		else
		{
			/* lossy case */
			targoffset = (OffsetNumber) curslot;
		}

		/*
		 * We'd better check for an out-of-range offnum in case a VACUUM has
		 * run since the TID was obtained.
		 */
		if (targoffset < FirstOffsetNumber || targoffset > maxoff)
			continue;

		lp = PageGetItemId(dp, targoffset);

		/*
		 * Must check for deleted tuple.
		 */
		if (!ItemIdIsUsed(lp))
			continue;

		/*
		 * check time qualification of tuple, remember it if valid
		 */
		loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
		loctup.t_len = ItemIdGetLength(lp);
		ItemPointerSet(&(loctup.t_self), page, targoffset);

		valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
		if (valid)
			scan->rs_vistuples[ntup++] = targoffset;
	}

	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

	Assert(ntup <= MaxHeapTuplesPerPage);
	scan->rs_ntuples = ntup;
}
/* ----------------
 *		index_getnext - get the next heap tuple from a scan
 *
 * The result is the next heap tuple satisfying the scan keys and the
 * snapshot, or NULL if no more matching tuples exist.  On success,
 * the buffer containing the heap tuple is pinned (the pin will be dropped
 * at the next index_getnext or index_endscan).
 *
 * Note: caller must check scan->xs_recheck, and perform rechecking of the
 * scan keys if required.  We do not do that here because we don't have
 * enough information to do it efficiently in the general case.
 * ----------------
 */
HeapTuple
index_getnext(IndexScanDesc scan, ScanDirection direction)
{
	HeapTuple	heapTuple = &scan->xs_ctup;
	ItemPointer tid = &heapTuple->t_self;
	FmgrInfo   *procedure;

	SCAN_CHECKS;
	GET_SCAN_PROCEDURE(amgettuple);

	Assert(TransactionIdIsValid(RecentGlobalXmin));

	/*
	 * We always reset xs_hot_dead; if we are here then either we are just
	 * starting the scan, or we previously returned a visible tuple, and in
	 * either case it's inappropriate to kill the prior index entry.
	 */
	scan->xs_hot_dead = false;

	for (;;)
	{
		OffsetNumber offnum;
		bool		at_chain_start;
		Page		dp;

		if (scan->xs_next_hot != InvalidOffsetNumber)
		{
			/*
			 * We are resuming scan of a HOT chain after having returned an
			 * earlier member.  Must still hold pin on current heap page.
			 */
			Assert(BufferIsValid(scan->xs_cbuf));
			Assert(ItemPointerGetBlockNumber(tid) ==
				   BufferGetBlockNumber(scan->xs_cbuf));
			Assert(TransactionIdIsValid(scan->xs_prev_xmax));
			offnum = scan->xs_next_hot;
			at_chain_start = false;
			scan->xs_next_hot = InvalidOffsetNumber;
		}
		else
		{
			bool		found;
			Buffer		prev_buf;

			/*
			 * If we scanned a whole HOT chain and found only dead tuples,
			 * tell index AM to kill its entry for that TID.  We do not do
			 * this when in recovery because it may violate MVCC to do so.
			 * See comments in RelationGetIndexScan().
			 */
			if (!scan->xactStartedInRecovery)
				scan->kill_prior_tuple = scan->xs_hot_dead;

			/*
			 * The AM's gettuple proc finds the next index entry matching the
			 * scan keys, and puts the TID in xs_ctup.t_self (ie, *tid).  It
			 * should also set scan->xs_recheck, though we pay no attention
			 * to that here.
			 */
			found = DatumGetBool(FunctionCall2(procedure,
											   PointerGetDatum(scan),
											   Int32GetDatum(direction)));

			/* Reset kill flag immediately for safety */
			scan->kill_prior_tuple = false;

			/* If we're out of index entries, break out of outer loop */
			if (!found)
				break;

			pgstat_count_index_tuples(scan->indexRelation, 1);

			/* Switch to correct buffer if we don't have it already */
			prev_buf = scan->xs_cbuf;
			scan->xs_cbuf = ReleaseAndReadBuffer(scan->xs_cbuf,
												 scan->heapRelation,
												 ItemPointerGetBlockNumber(tid));

			/*
			 * Prune page, but only if we weren't already on this page
			 */
			if (prev_buf != scan->xs_cbuf)
				heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf,
									RecentGlobalXmin);

			/* Prepare to scan HOT chain starting at index-referenced offnum */
			offnum = ItemPointerGetOffsetNumber(tid);
			at_chain_start = true;

			/* We don't know what the first tuple's xmin should be */
			scan->xs_prev_xmax = InvalidTransactionId;

			/* Initialize flag to detect if all entries are dead */
			scan->xs_hot_dead = true;
		}

		/* Obtain share-lock on the buffer so we can examine visibility */
		LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE);

		dp = (Page) BufferGetPage(scan->xs_cbuf);

		/* Scan through possible multiple members of HOT-chain */
		for (;;)
		{
			ItemId		lp;
			ItemPointer ctid;
			bool		valid;

			/* check for bogus TID */
			if (offnum < FirstOffsetNumber ||
				offnum > PageGetMaxOffsetNumber(dp))
				break;

			lp = PageGetItemId(dp, offnum);

			/* check for unused, dead, or redirected items */
			if (!ItemIdIsNormal(lp))
			{
				/* We should only see a redirect at start of chain */
				if (ItemIdIsRedirected(lp) && at_chain_start)
				{
					/* Follow the redirect */
					offnum = ItemIdGetRedirect(lp);
					at_chain_start = false;
					continue;
				}
				/* else must be end of chain */
				break;
			}

			/*
			 * We must initialize all of *heapTuple (ie, scan->xs_ctup) since
			 * it is returned to the executor on success.
			 */
			heapTuple->t_data = (HeapTupleHeader) PageGetItem(dp, lp);
			heapTuple->t_len = ItemIdGetLength(lp);
			ItemPointerSetOffsetNumber(tid, offnum);
			heapTuple->t_tableOid = RelationGetRelid(scan->heapRelation);
			ctid = &heapTuple->t_data->t_ctid;

			/*
			 * Shouldn't see a HEAP_ONLY tuple at chain start.  (This test
			 * should be unnecessary, since the chain root can't be removed
			 * while we have pin on the index entry, but let's make it
			 * anyway.)
			 */
			if (at_chain_start && HeapTupleIsHeapOnly(heapTuple))
				break;

			/*
			 * The xmin should match the previous xmax value, else chain is
			 * broken.  (Note: this test is not optional because it protects
			 * us against the case where the prior chain member's xmax
			 * aborted since we looked at it.)
			 */
			if (TransactionIdIsValid(scan->xs_prev_xmax) &&
				!TransactionIdEquals(scan->xs_prev_xmax,
									 HeapTupleHeaderGetXmin(heapTuple->t_data)))
				break;

			/* If it's visible per the snapshot, we must return it */
			valid = HeapTupleSatisfiesVisibility(heapTuple, scan->xs_snapshot,
												 scan->xs_cbuf);

			CheckForSerializableConflictOut(valid, scan->heapRelation,
											heapTuple, scan->xs_cbuf);

			if (valid)
			{
				/*
				 * If the snapshot is MVCC, we know that it could accept at
				 * most one member of the HOT chain, so we can skip examining
				 * any more members.  Otherwise, check for continuation of
				 * the HOT-chain, and set state for next time.
				 */
				if (IsMVCCSnapshot(scan->xs_snapshot)
					&& !IsolationIsSerializable())
					scan->xs_next_hot = InvalidOffsetNumber;
				else if (HeapTupleIsHotUpdated(heapTuple))
				{
					Assert(ItemPointerGetBlockNumber(ctid) ==
						   ItemPointerGetBlockNumber(tid));
					scan->xs_next_hot = ItemPointerGetOffsetNumber(ctid);
					scan->xs_prev_xmax = HeapTupleHeaderGetXmax(heapTuple->t_data);
				}
				else
					scan->xs_next_hot = InvalidOffsetNumber;

				PredicateLockTuple(scan->heapRelation, heapTuple);

				LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);

				pgstat_count_heap_fetch(scan->indexRelation);

				return heapTuple;
			}

			/*
			 * If we can't see it, maybe no one else can either.  Check to
			 * see if the tuple is dead to all transactions.  If we find that
			 * all the tuples in the HOT chain are dead, we'll signal the
			 * index AM to not return that TID on future indexscans.
			 */
			if (scan->xs_hot_dead &&
				HeapTupleSatisfiesVacuum(heapTuple->t_data, RecentGlobalXmin,
										 scan->xs_cbuf) != HEAPTUPLE_DEAD)
				scan->xs_hot_dead = false;

			/*
			 * Check to see if HOT chain continues past this tuple; if so
			 * fetch the next offnum (we don't bother storing it into
			 * xs_next_hot, but must store xs_prev_xmax), and loop around.
			 */
			if (HeapTupleIsHotUpdated(heapTuple))
			{
				Assert(ItemPointerGetBlockNumber(ctid) ==
					   ItemPointerGetBlockNumber(tid));
				offnum = ItemPointerGetOffsetNumber(ctid);
				at_chain_start = false;
				scan->xs_prev_xmax = HeapTupleHeaderGetXmax(heapTuple->t_data);
			}
			else
				break;			/* end of chain */
		}						/* loop over a single HOT chain */

		LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);

		/* Loop around to ask index AM for another TID */
		scan->xs_next_hot = InvalidOffsetNumber;
	}

	/* Release any held pin on a heap page */
	if (BufferIsValid(scan->xs_cbuf))
	{
		ReleaseBuffer(scan->xs_cbuf);
		scan->xs_cbuf = InvalidBuffer;
	}

	return NULL;				/* failure exit */
}
static bool
rtnext(IndexScanDesc s, ScanDirection dir)
{
	Page		p;
	OffsetNumber n;
	RTreePageOpaque po;
	RTreeScanOpaque so;

	so = (RTreeScanOpaque) s->opaque;

	if (!ItemPointerIsValid(&(s->currentItemData)))
	{
		/* first call: start at the root */
		Assert(BufferIsValid(so->curbuf) == false);
		so->curbuf = ReadBuffer(s->indexRelation, P_ROOT);
		pgstat_count_index_scan(&s->xs_pgstat_info);
	}

	p = BufferGetPage(so->curbuf);
	po = (RTreePageOpaque) PageGetSpecialPointer(p);

	if (!ItemPointerIsValid(&(s->currentItemData)))
	{
		/* first call: start at first/last offset */
		if (ScanDirectionIsForward(dir))
			n = FirstOffsetNumber;
		else
			n = PageGetMaxOffsetNumber(p);
	}
	else
	{
		/* go on to the next offset */
		n = ItemPointerGetOffsetNumber(&(s->currentItemData));
		if (ScanDirectionIsForward(dir))
			n = OffsetNumberNext(n);
		else
			n = OffsetNumberPrev(n);
	}

	for (;;)
	{
		IndexTuple	it;
		RTSTACK    *stk;

		n = findnext(s, n, dir);

		/* no match on this page, so read in the next stack entry */
		if (n == InvalidOffsetNumber)
		{
			/* if out of stack entries, we're done */
			if (so->s_stack == NULL)
			{
				ReleaseBuffer(so->curbuf);
				so->curbuf = InvalidBuffer;
				return false;
			}

			stk = so->s_stack;
			so->curbuf = ReleaseAndReadBuffer(so->curbuf, s->indexRelation,
											  stk->rts_blk);
			p = BufferGetPage(so->curbuf);
			po = (RTreePageOpaque) PageGetSpecialPointer(p);

			if (ScanDirectionIsBackward(dir))
				n = OffsetNumberPrev(stk->rts_child);
			else
				n = OffsetNumberNext(stk->rts_child);
			so->s_stack = stk->rts_parent;
			pfree(stk);

			continue;
		}

		if (po->flags & F_LEAF)
		{
			ItemPointerSet(&(s->currentItemData),
						   BufferGetBlockNumber(so->curbuf),
						   n);
			it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
			s->xs_ctup.t_self = it->t_tid;
			return true;
		}
		else
		{
			BlockNumber blk;

			stk = (RTSTACK *) palloc(sizeof(RTSTACK));
			stk->rts_child = n;
			stk->rts_blk = BufferGetBlockNumber(so->curbuf);
			stk->rts_parent = so->s_stack;
			so->s_stack = stk;

			it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
			blk = ItemPointerGetBlockNumber(&(it->t_tid));

			/*
			 * Note that we release the pin on the page as we descend down
			 * the tree, even though there's a good chance we'll eventually
			 * need to re-read the buffer later in this scan.  This may or
			 * may not be optimal, but it doesn't seem likely to make a huge
			 * performance difference either way.
			 */
			so->curbuf = ReleaseAndReadBuffer(so->curbuf, s->indexRelation,
											  blk);
			p = BufferGetPage(so->curbuf);
			po = (RTreePageOpaque) PageGetSpecialPointer(p);

			if (ScanDirectionIsBackward(dir))
				n = PageGetMaxOffsetNumber(p);
			else
				n = FirstOffsetNumber;
		}
	}
}
/*
 * Fetch tuples that match the search key; this can be invoked either to
 * fetch the first such tuple or subsequent matching tuples.  Returns the
 * number of matching tuples placed in tids[].
 */
static int
gistnext(IndexScanDesc scan, ScanDirection dir, ItemPointer tids,
		 int maxtids, bool ignore_killed_tuples)
{
	MIRROREDLOCK_BUFMGR_DECLARE;

	Page		p;
	OffsetNumber n;
	GISTScanOpaque so;
	GISTSearchStack *stk;
	IndexTuple	it;
	GISTPageOpaque opaque;
	int			ntids = 0;

	so = (GISTScanOpaque) scan->opaque;

	// -------- MirroredLock ----------
	MIRROREDLOCK_BUFMGR_LOCK;

	if (so->qual_ok == false)
	{
		MIRROREDLOCK_BUFMGR_UNLOCK;
		// -------- MirroredLock ----------

		return 0;
	}

	if (ItemPointerIsValid(&so->curpos) == false)
	{
		/* Being asked to fetch the first entry, so start at the root */
		Assert(so->curbuf == InvalidBuffer);
		Assert(so->stack == NULL);

		so->curbuf = ReadBuffer(scan->indexRelation, GIST_ROOT_BLKNO);

		stk = so->stack = (GISTSearchStack *) palloc0(sizeof(GISTSearchStack));

		stk->next = NULL;
		stk->block = GIST_ROOT_BLKNO;

		pgstat_count_index_scan(scan->indexRelation);
	}
	else if (so->curbuf == InvalidBuffer)
	{
		MIRROREDLOCK_BUFMGR_UNLOCK;
		// -------- MirroredLock ----------

		return 0;
	}

	/*
	 * check stored pointers from last visit
	 */
	if (so->nPageData > 0)
	{
		while (ntids < maxtids && so->curPageData < so->nPageData)
		{
			tids[ntids] = scan->xs_ctup.t_self =
				so->pageData[so->curPageData].heapPtr;

			ItemPointerSet(&(so->curpos),
						   BufferGetBlockNumber(so->curbuf),
						   so->pageData[so->curPageData].pageOffset);

			so->curPageData++;
			ntids++;
		}

		if (ntids == maxtids)
		{
			MIRROREDLOCK_BUFMGR_UNLOCK;
			// -------- MirroredLock ----------

			return ntids;
		}

		/*
		 * Go to the next page
		 */
		stk = so->stack->next;
		pfree(so->stack);
		so->stack = stk;

		/* If we're out of stack entries, we're done */
		if (so->stack == NULL)
		{
			ReleaseBuffer(so->curbuf);
			so->curbuf = InvalidBuffer;

			MIRROREDLOCK_BUFMGR_UNLOCK;
			// -------- MirroredLock ----------

			return ntids;
		}

		so->curbuf = ReleaseAndReadBuffer(so->curbuf, scan->indexRelation,
										  stk->block);
	}

	for (;;)
	{
		/* First of all, we need to lock the buffer */
		Assert(so->curbuf != InvalidBuffer);
		LockBuffer(so->curbuf, GIST_SHARE);
		gistcheckpage(scan->indexRelation, so->curbuf);
		p = BufferGetPage(so->curbuf);
		opaque = GistPageGetOpaque(p);

		/* remember the LSN so we can detect page changes when killing tuples */
		so->stack->lsn = PageGetLSN(p);

		/*
		 * Check for a page split that occurred since the last visit, or
		 * since the visit to the parent.
		 */
		if (!XLogRecPtrIsInvalid(so->stack->parentlsn) &&
			XLByteLT(so->stack->parentlsn, opaque->nsn) &&
			opaque->rightlink != InvalidBlockNumber /* sanity check */ &&
			(so->stack->next == NULL ||
			 so->stack->next->block != opaque->rightlink)	/* check if already
															 * added */ )
		{
			/* detected a page split; follow the right link to add pages */
			stk = (GISTSearchStack *) palloc(sizeof(GISTSearchStack));
			stk->next = so->stack->next;
			stk->block = opaque->rightlink;
			stk->parentlsn = so->stack->parentlsn;
			memset(&(stk->lsn), 0, sizeof(GistNSN));
			so->stack->next = stk;
		}

		/* if the page is empty, then just skip it */
		if (PageIsEmpty(p))
		{
			LockBuffer(so->curbuf, GIST_UNLOCK);
			stk = so->stack->next;
			pfree(so->stack);
			so->stack = stk;

			if (so->stack == NULL)
			{
				ReleaseBuffer(so->curbuf);
				so->curbuf = InvalidBuffer;

				MIRROREDLOCK_BUFMGR_UNLOCK;
				// -------- MirroredLock ----------

				return ntids;
			}

			so->curbuf = ReleaseAndReadBuffer(so->curbuf, scan->indexRelation,
											  stk->block);
			continue;
		}

		if (ScanDirectionIsBackward(dir))
			n = PageGetMaxOffsetNumber(p);
		else
			n = FirstOffsetNumber;

		/* now we can examine the page */
		so->nPageData = so->curPageData = 0;

		for (;;)
		{
			n = gistfindnext(scan, n, dir);

			if (!OffsetNumberIsValid(n))
			{
				while (ntids < maxtids && so->curPageData < so->nPageData)
				{
					tids[ntids] = scan->xs_ctup.t_self =
						so->pageData[so->curPageData].heapPtr;

					ItemPointerSet(&(so->curpos),
								   BufferGetBlockNumber(so->curbuf),
								   so->pageData[so->curPageData].pageOffset);

					so->curPageData++;
					ntids++;
				}

				if (ntids == maxtids)
				{
					LockBuffer(so->curbuf, GIST_UNLOCK);

					MIRROREDLOCK_BUFMGR_UNLOCK;
					// -------- MirroredLock ----------

					return ntids;
				}

				/*
				 * We ran out of matching index entries on the current page,
				 * so pop the top stack entry and use it to continue the
				 * search.
				 */
				LockBuffer(so->curbuf, GIST_UNLOCK);
				stk = so->stack->next;
				pfree(so->stack);
				so->stack = stk;

				/* If we're out of stack entries, we're done */
				if (so->stack == NULL)
				{
					ReleaseBuffer(so->curbuf);
					so->curbuf = InvalidBuffer;

					MIRROREDLOCK_BUFMGR_UNLOCK;
					// -------- MirroredLock ----------

					return ntids;
				}

				so->curbuf = ReleaseAndReadBuffer(so->curbuf,
												  scan->indexRelation,
												  stk->block);
				/* XXX go up */
				break;
			}

			if (GistPageIsLeaf(p))
			{
				/*
				 * We've found a matching index entry in a leaf page, so
				 * remember it.  Note that we keep "curbuf" pinned so that we
				 * can efficiently resume the index scan later.
				 */
				if (!(ignore_killed_tuples &&
					  ItemIdIsDead(PageGetItemId(p, n))))
				{
					it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
					so->pageData[so->nPageData].heapPtr = it->t_tid;
					so->pageData[so->nPageData].pageOffset = n;
					so->nPageData++;
				}
			}
			else
			{
				/*
				 * We've found an entry in an internal node whose key is
				 * consistent with the search key, so push it onto the stack.
				 */
				stk = (GISTSearchStack *) palloc(sizeof(GISTSearchStack));

				it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
				stk->block = ItemPointerGetBlockNumber(&(it->t_tid));
				memset(&(stk->lsn), 0, sizeof(GistNSN));
				stk->parentlsn = so->stack->lsn;

				stk->next = so->stack->next;
				so->stack->next = stk;
			}

			if (ScanDirectionIsBackward(dir))
				n = OffsetNumberPrev(n);
			else
				n = OffsetNumberNext(n);
		}
	}

	MIRROREDLOCK_BUFMGR_UNLOCK;
	// -------- MirroredLock ----------

	return ntids;
}
/* ----------------
 *		index_getnext - get the next heap tuple from a scan
 *
 * The result is the next heap tuple satisfying the scan keys and the
 * snapshot, or NULL if no more matching tuples exist.  On success,
 * the buffer containing the heap tuple is pinned (the pin will be dropped
 * at the next index_getnext or index_endscan).
 *
 * Note: caller must check scan->xs_recheck, and perform rechecking of the
 * scan keys if required.  We do not do that here because we don't have
 * enough information to do it efficiently in the general case.
 * ----------------
 */
HeapTuple
index_getnext(IndexScanDesc scan, ScanDirection direction)
{
	HeapTuple	heapTuple = &scan->xs_ctup;
	ItemPointer tid = &heapTuple->t_self;
	FmgrInfo   *procedure;
	bool		all_dead = false;

	SCAN_CHECKS;
	GET_SCAN_PROCEDURE(amgettuple);

	Assert(TransactionIdIsValid(RecentGlobalXmin));

	for (;;)
	{
		bool		got_heap_tuple;

		if (scan->xs_continue_hot)
		{
			/*
			 * We are resuming scan of a HOT chain after having returned an
			 * earlier member.  Must still hold pin on current heap page.
			 */
			Assert(BufferIsValid(scan->xs_cbuf));
			Assert(ItemPointerGetBlockNumber(tid) ==
				   BufferGetBlockNumber(scan->xs_cbuf));
		}
		else
		{
			bool		found;
			Buffer		prev_buf;

			/*
			 * If we scanned a whole HOT chain and found only dead tuples,
			 * tell index AM to kill its entry for that TID.  We do not do
			 * this when in recovery because it may violate MVCC to do so.
			 * See comments in RelationGetIndexScan().
			 */
			if (!scan->xactStartedInRecovery)
				scan->kill_prior_tuple = all_dead;

			/*
			 * The AM's gettuple proc finds the next index entry matching the
			 * scan keys, and puts the TID in xs_ctup.t_self (ie, *tid).  It
			 * should also set scan->xs_recheck, though we pay no attention
			 * to that here.
			 */
			found = DatumGetBool(FunctionCall2(procedure,
											   PointerGetDatum(scan),
											   Int32GetDatum(direction)));

			/* Reset kill flag immediately for safety */
			scan->kill_prior_tuple = false;

			/* If we're out of index entries, break out of outer loop */
			if (!found)
				break;

			pgstat_count_index_tuples(scan->indexRelation, 1);

			/* Switch to correct buffer if we don't have it already */
			prev_buf = scan->xs_cbuf;
			scan->xs_cbuf = ReleaseAndReadBuffer(scan->xs_cbuf,
												 scan->heapRelation,
												 ItemPointerGetBlockNumber(tid));

			/*
			 * Prune page, but only if we weren't already on this page
			 */
			if (prev_buf != scan->xs_cbuf)
				heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf,
									RecentGlobalXmin);
		}

		/* Obtain share-lock on the buffer so we can examine visibility */
		LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE);
		got_heap_tuple = heap_hot_search_buffer(tid, scan->heapRelation,
												scan->xs_cbuf,
												scan->xs_snapshot,
												&scan->xs_ctup,
												&all_dead,
												!scan->xs_continue_hot);
		LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);

		if (got_heap_tuple)
		{
			/*
			 * Only in a non-MVCC snapshot can more than one member of the
			 * HOT chain be visible.
			 */
			scan->xs_continue_hot = !IsMVCCSnapshot(scan->xs_snapshot);
			pgstat_count_heap_fetch(scan->indexRelation);
			return heapTuple;
		}

		/* Loop around to ask index AM for another TID */
		scan->xs_continue_hot = false;
	}

	/* Release any held pin on a heap page */
	if (BufferIsValid(scan->xs_cbuf))
	{
		ReleaseBuffer(scan->xs_cbuf);
		scan->xs_cbuf = InvalidBuffer;
	}

	return NULL;				/* failure exit */
}
/*
 * Gets the next ItemPointer from a posting tree.  Note that we copy the
 * page into the GinScanEntry->list array and unlock the page, but keep it
 * pinned to prevent interference with vacuum.
 */
static void
entryGetNextItem(Relation index, GinScanEntry entry)
{
	Page		page;
	BlockNumber blkno;

	for (;;)
	{
		entry->offset++;

		if (entry->offset <= entry->nlist)
		{
			entry->curItem = entry->list[entry->offset - 1];
			return;
		}

		LockBuffer(entry->buffer, GIN_SHARE);
		page = BufferGetPage(entry->buffer);

		for (;;)
		{
			/*
			 * We need to follow the right link.  While doing so, we must
			 * re-find the first ItemPointer greater than the one stored.
			 */
			blkno = GinPageGetOpaque(page)->rightlink;

			LockBuffer(entry->buffer, GIN_UNLOCK);
			if (blkno == InvalidBlockNumber)
			{
				ReleaseBuffer(entry->buffer);
				ItemPointerSet(&entry->curItem,
							   InvalidBlockNumber, InvalidOffsetNumber);
				entry->buffer = InvalidBuffer;
				entry->isFinished = TRUE;
				return;
			}

			entry->buffer = ReleaseAndReadBuffer(entry->buffer, index, blkno);
			LockBuffer(entry->buffer, GIN_SHARE);
			page = BufferGetPage(entry->buffer);

			entry->offset = InvalidOffsetNumber;
			if (!ItemPointerIsValid(&entry->curItem) ||
				findItemInPage(page, &entry->curItem, &entry->offset))
			{
				/*
				 * Found a position equal to or greater than the stored one.
				 */
				entry->nlist = GinPageGetOpaque(page)->maxoff;
				memcpy(entry->list,
					   GinDataPageGetItem(page, FirstOffsetNumber),
					   GinPageGetOpaque(page)->maxoff * sizeof(ItemPointerData));

				LockBuffer(entry->buffer, GIN_UNLOCK);

				if (!ItemPointerIsValid(&entry->curItem) ||
					compareItemPointers(&entry->curItem,
										entry->list + entry->offset - 1) == 0)
				{
					/*
					 * The first pages are deleted or empty, or we found an
					 * exact match, so break out of the inner loop and
					 * continue the outer one.
					 */
					break;
				}

				/*
				 * Found a position greater than entry->curItem; store it.
				 */
				entry->curItem = entry->list[entry->offset - 1];
				return;
			}
		}
	}
}
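The contract above is implicit: each call either advances entry->curItem or sets entry->isFinished. A hedged driver sketch follows (the loop shape is an assumption, not code from the snippet):

/*
 * Hypothetical sketch: drain a posting-tree scan entry.  Assumes "entry"
 * has already been positioned on its first item by the usual GIN scan
 * setup; each iteration consumes the current item, then advances.
 */
static void
drain_entry(Relation index, GinScanEntry entry)
{
	while (!entry->isFinished)
	{
		/* ... consume entry->curItem here ... */
		entryGetNextItem(index, entry);
	}
}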
/*
 * Insert a value (stored in GinBtree) into the tree described by stack
 *
 * During an index build, buildStats is non-null and the counters it contains
 * should be incremented as needed.
 *
 * NB: the passed-in stack is freed, as though by freeGinBtreeStack.
 */
void
ginInsertValue(GinBtree btree, GinBtreeStack *stack, GinStatsData *buildStats)
{
	GinBtreeStack *parent = stack;
	BlockNumber rootBlkno = InvalidBlockNumber;
	Page		page,
				rpage,
				lpage;

	/* remember the root BlockNumber */
	while (parent)
	{
		rootBlkno = parent->blkno;
		parent = parent->parent;
	}

	while (stack)
	{
		XLogRecData *rdata;
		BlockNumber savedRightLink;

		page = BufferGetPage(stack->buffer);
		savedRightLink = GinPageGetOpaque(page)->rightlink;

		if (btree->isEnoughSpace(btree, stack->buffer, stack->off))
		{
			START_CRIT_SECTION();
			btree->placeToPage(btree, stack->buffer, stack->off, &rdata);

			MarkBufferDirty(stack->buffer);

			if (RelationNeedsWAL(btree->index))
			{
				XLogRecPtr	recptr;

				recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT, rdata);
				PageSetLSN(page, recptr);
				PageSetTLI(page, ThisTimeLineID);
			}

			LockBuffer(stack->buffer, GIN_UNLOCK);
			END_CRIT_SECTION();

			freeGinBtreeStack(stack);

			return;
		}
		else
		{
			Buffer		rbuffer = GinNewBuffer(btree->index);
			Page		newlpage;

			/*
			 * newlpage is a pointer to an in-memory page, not associated
			 * with a buffer; stack->buffer should be left untouched.
			 */
			newlpage = btree->splitPage(btree, stack->buffer, rbuffer,
										stack->off, &rdata);

			((ginxlogSplit *) (rdata->data))->rootBlkno = rootBlkno;

			/* During index build, count the newly-split page */
			if (buildStats)
			{
				if (btree->isData)
					buildStats->nDataPages++;
				else
					buildStats->nEntryPages++;
			}

			parent = stack->parent;

			if (parent == NULL)
			{
				/*
				 * We are splitting the root, so we need to allocate a new
				 * left page and make the root point to the left and right
				 * pages.
				 */
				Buffer		lbuffer = GinNewBuffer(btree->index);

				((ginxlogSplit *) (rdata->data))->isRootSplit = TRUE;
				((ginxlogSplit *) (rdata->data))->rrlink = InvalidBlockNumber;

				page = BufferGetPage(stack->buffer);
				lpage = BufferGetPage(lbuffer);
				rpage = BufferGetPage(rbuffer);

				GinPageGetOpaque(rpage)->rightlink = InvalidBlockNumber;
				GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer);
				((ginxlogSplit *) (rdata->data))->lblkno = BufferGetBlockNumber(lbuffer);

				START_CRIT_SECTION();

				GinInitBuffer(stack->buffer,
							  GinPageGetOpaque(newlpage)->flags & ~GIN_LEAF);
				PageRestoreTempPage(newlpage, lpage);
				btree->fillRoot(btree, stack->buffer, lbuffer, rbuffer);

				MarkBufferDirty(rbuffer);
				MarkBufferDirty(lbuffer);
				MarkBufferDirty(stack->buffer);

				if (RelationNeedsWAL(btree->index))
				{
					XLogRecPtr	recptr;

					recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT, rdata);
					PageSetLSN(page, recptr);
					PageSetTLI(page, ThisTimeLineID);
					PageSetLSN(lpage, recptr);
					PageSetTLI(lpage, ThisTimeLineID);
					PageSetLSN(rpage, recptr);
					PageSetTLI(rpage, ThisTimeLineID);
				}

				UnlockReleaseBuffer(rbuffer);
				UnlockReleaseBuffer(lbuffer);
				LockBuffer(stack->buffer, GIN_UNLOCK);
				END_CRIT_SECTION();

				freeGinBtreeStack(stack);

				/* During index build, count the newly-added root page */
				if (buildStats)
				{
					if (btree->isData)
						buildStats->nDataPages++;
					else
						buildStats->nEntryPages++;
				}

				return;
			}
			else
			{
				/* split a non-root page */
				((ginxlogSplit *) (rdata->data))->isRootSplit = FALSE;
				((ginxlogSplit *) (rdata->data))->rrlink = savedRightLink;

				lpage = BufferGetPage(stack->buffer);
				rpage = BufferGetPage(rbuffer);

				GinPageGetOpaque(rpage)->rightlink = savedRightLink;
				GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer);

				START_CRIT_SECTION();
				PageRestoreTempPage(newlpage, lpage);

				MarkBufferDirty(rbuffer);
				MarkBufferDirty(stack->buffer);

				if (RelationNeedsWAL(btree->index))
				{
					XLogRecPtr	recptr;

					recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT, rdata);
					PageSetLSN(lpage, recptr);
					PageSetTLI(lpage, ThisTimeLineID);
					PageSetLSN(rpage, recptr);
					PageSetTLI(rpage, ThisTimeLineID);
				}

				UnlockReleaseBuffer(rbuffer);
				END_CRIT_SECTION();
			}
		}

		btree->isDelete = FALSE;

		/* search parent to lock */
		LockBuffer(parent->buffer, GIN_EXCLUSIVE);

		/* move right if needed */
		page = BufferGetPage(parent->buffer);
		while ((parent->off = btree->findChildPtr(btree, page, stack->blkno,
												  parent->off)) == InvalidOffsetNumber)
		{
			BlockNumber rightlink = GinPageGetOpaque(page)->rightlink;

			LockBuffer(parent->buffer, GIN_UNLOCK);

			if (rightlink == InvalidBlockNumber)
			{
				/*
				 * We are at the rightmost page but haven't found the parent;
				 * fall back to a plain search.
				 */
				ginFindParents(btree, stack, rootBlkno);
				parent = stack->parent;
				page = BufferGetPage(parent->buffer);
				break;
			}

			parent->blkno = rightlink;
			parent->buffer = ReleaseAndReadBuffer(parent->buffer,
												  btree->index,
												  parent->blkno);
			LockBuffer(parent->buffer, GIN_EXCLUSIVE);
			page = BufferGetPage(parent->buffer);
		}

		UnlockReleaseBuffer(stack->buffer);
		pfree(stack);
		stack = parent;
	}
}
/*
 * Locate the leaf page containing the requested tuple.
 */
RumBtreeStack *
rumFindLeafPage(RumBtree btree, RumBtreeStack *stack)
{
	bool		isfirst = true;
	BlockNumber rootBlkno;

	if (!stack)
		stack = rumPrepareFindLeafPage(btree, RUM_ROOT_BLKNO);
	rootBlkno = stack->blkno;

	for (;;)
	{
		Page		page;
		BlockNumber child;
		int			access = RUM_SHARE;

		stack->off = InvalidOffsetNumber;

		page = BufferGetPage(stack->buffer);

		if (isfirst)
		{
			if (RumPageIsLeaf(page) && !btree->searchMode)
				access = RUM_EXCLUSIVE;
			isfirst = false;
		}
		else
			access = rumTraverseLock(stack->buffer, btree->searchMode);

		/*
		 * The page is now correctly locked; check whether we need to move
		 * right.  The root never has a right link, so this is a small
		 * optimization.
		 */
		while (btree->fullScan == false && stack->blkno != rootBlkno &&
			   btree->isMoveRight(btree, page))
		{
			BlockNumber rightlink = RumPageGetOpaque(page)->rightlink;

			if (rightlink == InvalidBlockNumber)
				break;			/* rightmost page */

			stack->buffer = rumStep(stack->buffer, btree->index, access,
									ForwardScanDirection);
			stack->blkno = rightlink;
			page = BufferGetPage(stack->buffer);
		}

		if (RumPageIsLeaf(page))	/* found it; return the locked page */
			return stack;

		/* now we have the correct buffer, try to find the child */
		child = btree->findChildPage(btree, stack);

		LockBuffer(stack->buffer, RUM_UNLOCK);
		Assert(child != InvalidBlockNumber);
		Assert(stack->blkno != child);

		if (btree->searchMode)
		{
			/* in search mode we may forget the path to the leaf */
			RumBtreeStack *ptr = (RumBtreeStack *) palloc(sizeof(RumBtreeStack));
			Buffer		buffer = ReleaseAndReadBuffer(stack->buffer,
													  btree->index, child);

			ptr->parent = stack;
			ptr->predictNumber = stack->predictNumber;
			stack->buffer = InvalidBuffer;
			stack = ptr;
			stack->blkno = child;
			stack->buffer = buffer;
		}
		else
		{
			RumBtreeStack *ptr = (RumBtreeStack *) palloc(sizeof(RumBtreeStack));

			ptr->parent = stack;
			stack = ptr;
			stack->blkno = child;
			stack->buffer = ReadBuffer(btree->index, stack->blkno);
			stack->predictNumber = 1;
		}
	}
}
/*
 * Locate the leaf page containing the requested tuple.
 */
GinBtreeStack *
ginFindLeafPage(GinBtree btree, GinBtreeStack *stack)
{
	bool		isfirst = TRUE;
	BlockNumber rootBlkno;

	if (!stack)
		stack = ginPrepareFindLeafPage(btree, GIN_ROOT_BLKNO);
	rootBlkno = stack->blkno;

	for (;;)
	{
		Page		page;
		BlockNumber child;
		int			access = GIN_SHARE;

		stack->off = InvalidOffsetNumber;

		page = BufferGetPage(stack->buffer);

		if (isfirst)
		{
			if (GinPageIsLeaf(page) && !btree->searchMode)
				access = GIN_EXCLUSIVE;
			isfirst = FALSE;
		}
		else
			access = ginTraverseLock(stack->buffer, btree->searchMode);

		/*
		 * The page is now correctly locked; check whether we need to move
		 * right.  The root never has a right link, so this is a small
		 * optimization.
		 */
		while (btree->fullScan == FALSE && stack->blkno != rootBlkno &&
			   btree->isMoveRight(btree, page))
		{
			BlockNumber rightlink = GinPageGetOpaque(page)->rightlink;

			if (rightlink == InvalidBlockNumber)
				break;			/* rightmost page */

			stack->blkno = rightlink;
			LockBuffer(stack->buffer, GIN_UNLOCK);
			stack->buffer = ReleaseAndReadBuffer(stack->buffer, btree->index,
												 stack->blkno);
			LockBuffer(stack->buffer, access);
			page = BufferGetPage(stack->buffer);
		}

		if (GinPageIsLeaf(page))	/* found it; return the locked page */
			return stack;

		/* now we have the correct buffer, try to find the child */
		child = btree->findChildPage(btree, stack);

		LockBuffer(stack->buffer, GIN_UNLOCK);
		Assert(child != InvalidBlockNumber);
		Assert(stack->blkno != child);

		if (btree->searchMode)
		{
			/* in search mode we may forget the path to the leaf */
			stack->blkno = child;
			stack->buffer = ReleaseAndReadBuffer(stack->buffer, btree->index,
												 stack->blkno);
		}
		else
		{
			GinBtreeStack *ptr = (GinBtreeStack *) palloc(sizeof(GinBtreeStack));

			ptr->parent = stack;
			stack = ptr;
			stack->blkno = child;
			stack->buffer = ReadBuffer(btree->index, stack->blkno);
			stack->predictNumber = 1;
		}
	}

	/* keep compiler happy */
	return NULL;
}
/* ----------------------------------------------------------------
 *		BitmapHeapNext
 *
 *		Retrieve next tuple from the BitmapHeapScan node's currentRelation
 * ----------------------------------------------------------------
 */
static TupleTableSlot *
BitmapHeapNext(BitmapHeapScanState *node)
{
	EState	   *estate;
	ExprContext *econtext;
	HeapScanDesc scandesc;
	Index		scanrelid;
	TIDBitmap  *tbm;
	TBMIterateResult *tbmres;
	OffsetNumber targoffset;
	TupleTableSlot *slot;
	OnDiskBitmapWords *odbm;
	ODBMIterateResult *odbmres;
	bool		inmem = false;

	/*
	 * extract necessary information from index scan node
	 */
	estate = node->ss.ps.state;
	econtext = node->ss.ps.ps_ExprContext;
	slot = node->ss.ss_ScanTupleSlot;
	scandesc = node->ss.ss_currentScanDesc;
	scanrelid = ((BitmapHeapScan *) node->ss.ps.plan)->scan.scanrelid;
	tbm = node->tbm;
	tbmres = node->tbmres;
	odbm = node->odbm;
	odbmres = node->odbmres;

	/*
	 * Clear any reference to the previously returned tuple.  The idea here
	 * is to not have the tuple slot be the last holder of a pin on that
	 * tuple's buffer; if it is, we'll need a separate visit to the bufmgr to
	 * release the buffer.  By clearing here, we get to have the release done
	 * by ReleaseAndReadBuffer, below.
	 */
	ExecClearTuple(slot);

	/*
	 * Check if we are evaluating PlanQual for a tuple of this relation.
	 * Additional checking is not good, but there is no other way for now. We
	 * could introduce new nodes for this case and handle IndexScan -->
	 * NewNode switching in Init/ReScan plan...
	 */
	if (estate->es_evTuple != NULL &&
		estate->es_evTuple[scanrelid - 1] != NULL)
	{
		if (estate->es_evTupleNull[scanrelid - 1])
			return slot;		/* return empty slot */

		ExecStoreTuple(estate->es_evTuple[scanrelid - 1],
					   slot, InvalidBuffer, false);

		/* Does the tuple meet the original qual conditions? */
		econtext->ecxt_scantuple = slot;
		ResetExprContext(econtext);

		if (!ExecQual(node->bitmapqualorig, econtext, false))
			ExecClearTuple(slot);		/* would not be returned by scan */

		/* Flag for the next call that no more tuples */
		estate->es_evTupleNull[scanrelid - 1] = true;

		return slot;
	}

	/* check whether this requires an in-memory bitmap scan or an on-disk bitmap index */
	inmem = ((BitmapHeapScan *) (((PlanState *) node)->plan))->inmem;

	/*
	 * If the underlying indexes are on-disk bitmap indexes
	 */
	if (!inmem)
	{
		uint64		nextTid = 0;

		if (odbm == NULL)
		{
			odbm = odbm_create(ODBM_MAX_WORDS);
			node->odbm = odbm;
		}

		if (odbmres == NULL)
		{
			odbmres = odbm_res_create(odbm);
			node->odbmres = odbmres;
		}

		for (;;)
		{
			/*
			 * If we have used up the words from the previous scan, or we
			 * haven't yet run the underlying index scan for words, do it
			 * now.
			 */
			if (odbm->numOfWords == 0 &&
				odbmres->nextTidLoc >= odbmres->numOfTids)
			{
				Plan	   *outerPlan = (((PlanState *) node)->lefttree)->plan;

				odbm_set_bitmaptype(outerPlan, false);

				odbm->firstTid = odbmres->nextTid;
				odbm->startNo = 0;
				odbm_set_child_resultnode(((PlanState *) node)->lefttree,
										  odbm);
				odbm = (OnDiskBitmapWords *)
					MultiExecProcNode(outerPlanState(node));

				if (!odbm || !IsA(odbm, OnDiskBitmapWords))
					elog(ERROR, "unrecognized result from subplan");

				odbm_begin_iterate(node->odbm, node->odbmres);
			}

			/* If we can not find more words, then this scan is over. */
			if (odbm == NULL ||
				(odbm->numOfWords == 0 &&
				 odbmres->nextTidLoc >= odbmres->numOfTids))
				return ExecClearTuple(slot);

			nextTid = odbm_findnexttid(odbm, odbmres);

			if (nextTid == 0)
				continue;

			ItemPointerSet(&scandesc->rs_ctup.t_self,
						   (nextTid - 1) / MaxNumHeapTuples,
						   ((nextTid - 1) % MaxNumHeapTuples) + 1);

			/* fetch the heap tuple and see if it matches the snapshot */
			if (heap_release_fetch(scandesc->rs_rd,
								   scandesc->rs_snapshot,
								   &scandesc->rs_ctup,
								   &scandesc->rs_cbuf,
								   true,
								   &scandesc->rs_pgstat_info))
			{
				/*
				 * Set up the result slot to point to this tuple.  Note that
				 * the slot acquires a pin on the buffer.
				 */
				ExecStoreTuple(&scandesc->rs_ctup,
							   slot,
							   scandesc->rs_cbuf,
							   false);

				/* return this tuple */
				return slot;
			}
		}
	}

	/*
	 * If we haven't yet performed the underlying index scan, do it, and
	 * prepare the bitmap to be iterated over.
	 */
	if (tbm == NULL)
	{
		tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));

		if (!tbm || !IsA(tbm, TIDBitmap))
			elog(ERROR, "unrecognized result from subplan");

		node->tbm = tbm;
		node->tbmres = tbmres = NULL;
		tbm_begin_iterate(tbm);
	}

	for (;;)
	{
		/*
		 * Get next page of results if needed
		 */
		if (tbmres == NULL)
		{
			node->tbmres = tbmres = tbm_iterate(tbm);
			if (tbmres == NULL)
			{
				/* no more entries in the bitmap */
				break;
			}

			/*
			 * Ignore any claimed entries past what we think is the end of
			 * the relation.  (This is probably not necessary given that we
			 * got AccessShareLock before performing any of the indexscans,
			 * but let's be safe.)
			 */
			if (tbmres->blockno >= scandesc->rs_nblocks)
			{
				node->tbmres = tbmres = NULL;
				continue;
			}

			/*
			 * Acquire pin on the current heap page.  We'll hold the pin
			 * until done looking at the page.  We trade in any pin we held
			 * before.
			 */
			scandesc->rs_cbuf = ReleaseAndReadBuffer(scandesc->rs_cbuf,
													 scandesc->rs_rd,
													 tbmres->blockno);

			/*
			 * Determine how many entries we need to look at on this page.
			 * If the bitmap is lossy then we need to look at each physical
			 * item pointer; otherwise we just look through the offsets
			 * listed in tbmres.
			 */
			if (tbmres->ntuples >= 0)
			{
				/* non-lossy case */
				node->minslot = 0;
				node->maxslot = tbmres->ntuples - 1;
			}
			else
			{
				/* lossy case */
				Page		dp;

				LockBuffer(scandesc->rs_cbuf, BUFFER_LOCK_SHARE);
				dp = (Page) BufferGetPage(scandesc->rs_cbuf);

				node->minslot = FirstOffsetNumber;
				node->maxslot = PageGetMaxOffsetNumber(dp);

				LockBuffer(scandesc->rs_cbuf, BUFFER_LOCK_UNLOCK);
			}

			/*
			 * Set curslot to first slot to examine
			 */
			node->curslot = node->minslot;
		}
		else
		{
			/*
			 * Continuing in previously obtained page; advance curslot
			 */
			node->curslot++;
		}

		/*
		 * Out of range?  If so, nothing more to look at on this page
		 */
		if (node->curslot < node->minslot || node->curslot > node->maxslot)
		{
			node->tbmres = tbmres = NULL;
			continue;
		}

		/*
		 * Okay to try to fetch the tuple
		 */
		if (tbmres->ntuples >= 0)
		{
			/* non-lossy case */
			targoffset = tbmres->offsets[node->curslot];
		}
		else
		{
			/* lossy case */
			targoffset = (OffsetNumber) node->curslot;
		}

		ItemPointerSet(&scandesc->rs_ctup.t_self, tbmres->blockno, targoffset);

		/*
		 * Fetch the heap tuple and see if it matches the snapshot.  We use
		 * heap_release_fetch to avoid useless bufmgr traffic.
		 */
		if (heap_release_fetch(scandesc->rs_rd,
							   scandesc->rs_snapshot,
							   &scandesc->rs_ctup,
							   &scandesc->rs_cbuf,
							   true,
							   &scandesc->rs_pgstat_info))
		{
			/*
			 * Set up the result slot to point to this tuple.  Note that the
			 * slot acquires a pin on the buffer.
			 */
			ExecStoreTuple(&scandesc->rs_ctup,
						   slot,
						   scandesc->rs_cbuf,
						   false);

			/*
			 * If we are using lossy info, we have to recheck the qual
			 * conditions at every tuple.
			 */
			if (tbmres->ntuples < 0)
			{
				econtext->ecxt_scantuple = slot;
				ResetExprContext(econtext);

				if (!ExecQual(node->bitmapqualorig, econtext, false))
				{
					/* Fails recheck, so drop it and loop back for another */
					ExecClearTuple(slot);
					continue;
				}
			}

			/* OK to return this tuple */
			return slot;
		}

		/*
		 * Failed the snapshot check, so loop back and try again.
		 */
	}

	/*
	 * if we get here it means we are at the end of the scan..
	 */
	return ExecClearTuple(slot);
}