/*
 * Return the offset of the next matching index entry.  We begin the
 * search at offset "n" and search for matches in the direction
 * "dir".  If no more matching entries are found on the page,
 * InvalidOffsetNumber is returned.
 */
static OffsetNumber
findnext(IndexScanDesc s, OffsetNumber n, ScanDirection dir)
{
    OffsetNumber maxoff;
    IndexTuple  it;
    RTreePageOpaque po;
    RTreeScanOpaque so;
    Page        p;

    so = (RTreeScanOpaque) s->opaque;
    p = BufferGetPage(so->curbuf);

    maxoff = PageGetMaxOffsetNumber(p);
    po = (RTreePageOpaque) PageGetSpecialPointer(p);

    /*
     * If we modified the index during the scan, we may now be holding a
     * pointer to a ghost tuple from before the modification.  If this is
     * the case, back up one.
     */
    if (so->s_flags & RTS_CURBEFORE)
    {
        so->s_flags &= ~RTS_CURBEFORE;
        n = OffsetNumberPrev(n);
    }

    while (n >= FirstOffsetNumber && n <= maxoff)
    {
        it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
        if (po->flags & F_LEAF)
        {
            if (index_keytest(it,
                              RelationGetDescr(s->indexRelation),
                              s->numberOfKeys, s->keyData))
                break;
        }
        else
        {
            if (index_keytest(it,
                              RelationGetDescr(s->indexRelation),
                              so->s_internalNKey, so->s_internalKey))
                break;
        }

        if (ScanDirectionIsBackward(dir))
            n = OffsetNumberPrev(n);
        else
            n = OffsetNumberNext(n);
    }

    if (n >= FirstOffsetNumber && n <= maxoff)
        return n;               /* found a match on this page */
    else
        return InvalidOffsetNumber;     /* no match, go to next page */
}
/*
 * Return the offset of the first index entry that is consistent with
 * the search key after offset 'n' in the current page.  If there are
 * no more consistent entries, return InvalidOffsetNumber.
 * The page should be locked by the caller.
 */
static OffsetNumber
gistfindnext(IndexScanDesc scan, OffsetNumber n, ScanDirection dir)
{
    OffsetNumber maxoff;
    IndexTuple  it;
    GISTScanOpaque so;
    MemoryContext oldcxt;
    Page        p;

    so = (GISTScanOpaque) scan->opaque;
    p = BufferGetPage(so->curbuf);
    maxoff = PageGetMaxOffsetNumber(p);

    /*
     * Make sure we're in a short-lived memory context when we invoke a
     * user-supplied GiST method in gistindex_keytest(), so we don't leak
     * memory.
     */
    oldcxt = MemoryContextSwitchTo(so->tempCxt);

    /*
     * If we modified the index during the scan, we may now be holding a
     * pointer to a ghost tuple from before the modification.  If this is
     * the case, back up one.
     */
    if (so->flags & GS_CURBEFORE)
    {
        so->flags &= ~GS_CURBEFORE;
        n = OffsetNumberPrev(n);
    }

    while (n >= FirstOffsetNumber && n <= maxoff)
    {
        it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
        if (gistindex_keytest(it, scan, n))
            break;

        if (ScanDirectionIsBackward(dir))
            n = OffsetNumberPrev(n);
        else
            n = OffsetNumberNext(n);
    }

    MemoryContextSwitchTo(oldcxt);
    MemoryContextReset(so->tempCxt);

    /*
     * If we found a matching entry, return its offset; otherwise return
     * InvalidOffsetNumber to inform the caller to go to the next page.
     */
    if (n >= FirstOffsetNumber && n <= maxoff)
        return n;
    else
        return InvalidOffsetNumber;
}
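/*
 * The temp-context dance above is a standard PostgreSQL idiom for calling
 * code that may palloc scratch memory: switch to a short-lived context,
 * call the untrusted code, switch back, and reset the context to free
 * everything at once.  A minimal sketch of the pattern follows;
 * evaluate_user_function() is a hypothetical stand-in for a user-supplied
 * GiST support function and is not part of this file.
 */
static bool evaluate_user_function(void);   /* hypothetical */

static bool
call_in_temp_context(MemoryContext tempCxt)
{
    MemoryContext oldcxt = MemoryContextSwitchTo(tempCxt);
    bool        result = evaluate_user_function();  /* may palloc freely */

    MemoryContextSwitchTo(oldcxt);
    MemoryContextReset(tempCxt);    /* frees all scratch allocations at once */
    return result;
}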
/*
 *  adjustiptr() -- adjust current and marked item pointers in the scan
 *
 *      Depending on the type of update and the place it happened, we
 *      need to do nothing, to back up one record, or to start over on
 *      the same page.
 */
static void
adjustiptr(IndexScanDesc s,
           ItemPointer iptr,
           int op,
           BlockNumber blkno,
           OffsetNumber offnum)
{
    OffsetNumber curoff;
    RTreeScanOpaque so;

    if (ItemPointerIsValid(iptr))
    {
        if (ItemPointerGetBlockNumber(iptr) == blkno)
        {
            curoff = ItemPointerGetOffsetNumber(iptr);
            so = (RTreeScanOpaque) s->opaque;

            switch (op)
            {
                case RTOP_DEL:
                    /* back up one if we need to */
                    if (curoff >= offnum)
                    {
                        if (curoff > FirstOffsetNumber)
                        {
                            /* just adjust the item pointer */
                            ItemPointerSet(iptr, blkno, OffsetNumberPrev(curoff));
                        }
                        else
                        {
                            /* remember that we're before the current tuple */
                            ItemPointerSet(iptr, blkno, FirstOffsetNumber);
                            if (iptr == &(s->currentItemData))
                                so->s_flags |= RTS_CURBEFORE;
                            else
                                so->s_flags |= RTS_MRKBEFORE;
                        }
                    }
                    break;
                case RTOP_SPLIT:
                    /* back to start of page on split */
                    ItemPointerSet(iptr, blkno, FirstOffsetNumber);
                    if (iptr == &(s->currentItemData))
                        so->s_flags &= ~RTS_CURBEFORE;
                    else
                        so->s_flags &= ~RTS_MRKBEFORE;
                    break;
                default:
                    elog(ERROR, "unrecognized operation in rtree scan adjust: %d", op);
            }
        }
    }
}
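/*
 * Why adjustiptr() backs up on RTOP_DEL: PageIndexTupleDelete() compacts
 * the item array, so every offset at or after the deleted slot slides
 * down by one.  A minimal model of that cursor adjustment over plain
 * integers (illustration only; not part of this file):
 */
static int
adjust_cursor_after_delete(int cursor, int deleted_slot)
{
    if (cursor >= deleted_slot)
        cursor--;               /* slots above the hole moved down by one */
    return cursor;
}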
/*
 *  _bt_first() -- Find the first item in a scan.
 *
 *      We need to be clever about the direction of scan, the search
 *      conditions, and the tree ordering.  We find the first item (or,
 *      if backwards scan, the last item) in the tree that satisfies the
 *      qualifications in the scan key.  On success exit, the page containing
 *      the current index tuple is pinned but not locked, and data about
 *      the matching tuple(s) on the page has been loaded into so->currPos.
 *      scan->xs_ctup.t_self is set to the heap TID of the current tuple,
 *      and if requested, scan->xs_itup points to a copy of the index tuple.
 *
 * If there are no matching items in the index, we return FALSE, with no
 * pins or locks held.
 *
 * Note that scan->keyData[], and the so->keyData[] scankey built from it,
 * are both search-type scankeys (see nbtree/README for more about this).
 * Within this routine, we build a temporary insertion-type scankey to use
 * in locating the scan start position.
 */
bool
_bt_first(IndexScanDesc scan, ScanDirection dir)
{
    Relation    rel = scan->indexRelation;
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    Buffer      buf;
    BTStack     stack;
    OffsetNumber offnum;
    StrategyNumber strat;
    bool        nextkey;
    bool        goback;
    ScanKey     startKeys[INDEX_MAX_KEYS];
    ScanKeyData scankeys[INDEX_MAX_KEYS];
    ScanKeyData notnullkeys[INDEX_MAX_KEYS];
    int         keysCount = 0;
    int         i;
    StrategyNumber strat_total;
    BTScanPosItem *currItem;

    pgstat_count_index_scan(rel);

    /*
     * Examine the scan keys and eliminate any redundant keys; also mark the
     * keys that must be matched to continue the scan.
     */
    _bt_preprocess_keys(scan);

    /*
     * Quit now if _bt_preprocess_keys() discovered that the scan keys can
     * never be satisfied (eg, x == 1 AND x > 2).
     */
    if (!so->qual_ok)
        return false;

    /*----------
     * Examine the scan keys to discover where we need to start the scan.
     *
     * We want to identify the keys that can be used as starting boundaries;
     * these are =, >, or >= keys for a forward scan or =, <, <= keys for
     * a backwards scan.  We can use keys for multiple attributes so long as
     * the prior attributes had only =, >= (resp. =, <=) keys.  Once we accept
     * a > or < boundary or find an attribute with no boundary (which can be
     * thought of as the same as "> -infinity"), we can't use keys for any
     * attributes to its right, because it would break our simplistic notion
     * of what initial positioning strategy to use.
     *
     * When the scan keys include cross-type operators, _bt_preprocess_keys
     * may not be able to eliminate redundant keys; in such cases we will
     * arbitrarily pick a usable one for each attribute.  This is correct
     * but possibly not optimal behavior.  (For example, with keys like
     * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
     * x=5 would be more efficient.)  Since the situation only arises given
     * a poorly-worded query plus an incomplete opfamily, live with it.
     *
     * When both equality and inequality keys appear for a single attribute
     * (again, only possible when cross-type operators appear), we *must*
     * select one of the equality keys for the starting point, because
     * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
     * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
     * start at x=4, we will fail and stop before reaching x=10.  If multiple
     * equality quals survive preprocessing, however, it doesn't matter which
     * one we use --- by definition, they are either redundant or
     * contradictory.
     *
     * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
     * If the index stores nulls at the end of the index we'll be starting
     * from, and we have no boundary key for the column (which means the key
     * we deduced NOT NULL from is an inequality key that constrains the other
     * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
     * use as a boundary key.  If we didn't do this, we might find ourselves
     * traversing a lot of null entries at the start of the scan.
     *
     * In this loop, row-comparison keys are treated the same as keys on their
     * first (leftmost) columns.  We'll add on lower-order columns of the row
     * comparison below, if possible.
     *
     * The selected scan keys (at most one per index column) are remembered by
     * storing their addresses into the local startKeys[] array.
     *----------
     */
    strat_total = BTEqualStrategyNumber;
    if (so->numberOfKeys > 0)
    {
        AttrNumber  curattr;
        ScanKey     chosen;
        ScanKey     impliesNN;
        ScanKey     cur;

        /*
         * chosen is the so-far-chosen key for the current attribute, if any.
         * We don't cast the decision in stone until we reach keys for the
         * next attribute.
         */
        curattr = 1;
        chosen = NULL;
        /* Also remember any scankey that implies a NOT NULL constraint */
        impliesNN = NULL;

        /*
         * Loop iterates from 0 to numberOfKeys inclusive; we use the last
         * pass to handle after-last-key processing.  Actual exit from the
         * loop is at one of the "break" statements below.
         */
        for (cur = so->keyData, i = 0;; cur++, i++)
        {
            if (i >= so->numberOfKeys || cur->sk_attno != curattr)
            {
                /*
                 * Done looking at keys for curattr.  If we didn't find a
                 * usable boundary key, see if we can deduce a NOT NULL key.
                 */
                if (chosen == NULL && impliesNN != NULL &&
                    ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
                     ScanDirectionIsForward(dir) :
                     ScanDirectionIsBackward(dir)))
                {
                    /* Yes, so build the key in notnullkeys[keysCount] */
                    chosen = &notnullkeys[keysCount];
                    ScanKeyEntryInitialize(chosen,
                                           (SK_SEARCHNOTNULL | SK_ISNULL |
                                            (impliesNN->sk_flags &
                                             (SK_BT_DESC | SK_BT_NULLS_FIRST))),
                                           curattr,
                                           ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
                                            BTGreaterStrategyNumber :
                                            BTLessStrategyNumber),
                                           InvalidOid,
                                           InvalidOid,
                                           InvalidOid,
                                           (Datum) 0);
                }

                /*
                 * If we still didn't find a usable boundary key, quit; else
                 * save the boundary key pointer in startKeys.
                 */
                if (chosen == NULL)
                    break;
                startKeys[keysCount++] = chosen;

                /*
                 * Adjust strat_total, and quit if we have stored a > or <
                 * key.
                 */
                strat = chosen->sk_strategy;
                if (strat != BTEqualStrategyNumber)
                {
                    strat_total = strat;
                    if (strat == BTGreaterStrategyNumber ||
                        strat == BTLessStrategyNumber)
                        break;
                }

                /*
                 * Done if that was the last attribute, or if next key is not
                 * in sequence (implying no boundary key is available for the
                 * next attribute).
                 */
                if (i >= so->numberOfKeys ||
                    cur->sk_attno != curattr + 1)
                    break;

                /*
                 * Reset for next attr.
                 */
                curattr = cur->sk_attno;
                chosen = NULL;
                impliesNN = NULL;
            }

            /*
             * Can we use this key as a starting boundary for this attr?
             *
             * If not, does it imply a NOT NULL constraint?  (Because
             * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
             * *any* inequality key works for that; we need not test.)
             */
            switch (cur->sk_strategy)
            {
                case BTLessStrategyNumber:
                case BTLessEqualStrategyNumber:
                    if (chosen == NULL)
                    {
                        if (ScanDirectionIsBackward(dir))
                            chosen = cur;
                        else
                            impliesNN = cur;
                    }
                    break;
                case BTEqualStrategyNumber:
                    /* override any non-equality choice */
                    chosen = cur;
                    break;
                case BTGreaterEqualStrategyNumber:
                case BTGreaterStrategyNumber:
                    if (chosen == NULL)
                    {
                        if (ScanDirectionIsForward(dir))
                            chosen = cur;
                        else
                            impliesNN = cur;
                    }
                    break;
            }
        }
    }

    /*
     * If we found no usable boundary keys, we have to start from one end of
     * the tree.  Walk down that edge to the first or last key, and scan from
     * there.
     */
    if (keysCount == 0)
        return _bt_endpoint(scan, dir);

    /*
     * We want to start the scan somewhere within the index.  Set up an
     * insertion scankey we can use to search for the boundary point we
     * identified above.  The insertion scankey is built in the local
     * scankeys[] array, using the keys identified by startKeys[].
     */
    Assert(keysCount <= INDEX_MAX_KEYS);
    for (i = 0; i < keysCount; i++)
    {
        ScanKey     cur = startKeys[i];

        Assert(cur->sk_attno == i + 1);

        if (cur->sk_flags & SK_ROW_HEADER)
        {
            /*
             * Row comparison header: look to the first row member instead.
             *
             * The member scankeys are already in insertion format (ie, they
             * have sk_func = 3-way-comparison function), but we have to watch
             * out for nulls, which _bt_preprocess_keys didn't check.  A null
             * in the first row member makes the condition unmatchable, just
             * like qual_ok = false.
             */
            ScanKey     subkey = (ScanKey) DatumGetPointer(cur->sk_argument);

            Assert(subkey->sk_flags & SK_ROW_MEMBER);
            if (subkey->sk_flags & SK_ISNULL)
                return false;
            memcpy(scankeys + i, subkey, sizeof(ScanKeyData));

            /*
             * If the row comparison is the last positioning key we accepted,
             * try to add additional keys from the lower-order row members.
             * (If we accepted independent conditions on additional index
             * columns, we use those instead --- doesn't seem worth trying to
             * determine which is more restrictive.)  Note that this is OK
             * even if the row comparison is of ">" or "<" type, because the
             * condition applied to all but the last row member is effectively
             * ">=" or "<=", and so the extra keys don't break the positioning
             * scheme.  But, by the same token, if we aren't able to use all
             * the row members, then the part of the row comparison that we
             * did use has to be treated as just a ">=" or "<=" condition, and
             * so we'd better adjust strat_total accordingly.
             */
            if (i == keysCount - 1)
            {
                bool        used_all_subkeys = false;

                Assert(!(subkey->sk_flags & SK_ROW_END));
                for (;;)
                {
                    subkey++;
                    Assert(subkey->sk_flags & SK_ROW_MEMBER);
                    if (subkey->sk_attno != keysCount + 1)
                        break;  /* out-of-sequence, can't use it */
                    if (subkey->sk_strategy != cur->sk_strategy)
                        break;  /* wrong direction, can't use it */
                    if (subkey->sk_flags & SK_ISNULL)
                        break;  /* can't use null keys */
                    Assert(keysCount < INDEX_MAX_KEYS);
                    memcpy(scankeys + keysCount, subkey, sizeof(ScanKeyData));
                    keysCount++;
                    if (subkey->sk_flags & SK_ROW_END)
                    {
                        used_all_subkeys = true;
                        break;
                    }
                }
                if (!used_all_subkeys)
                {
                    switch (strat_total)
                    {
                        case BTLessStrategyNumber:
                            strat_total = BTLessEqualStrategyNumber;
                            break;
                        case BTGreaterStrategyNumber:
                            strat_total = BTGreaterEqualStrategyNumber;
                            break;
                    }
                }
                break;          /* done with outer loop */
            }
        }
        else
        {
            /*
             * Ordinary comparison key.  Transform the search-style scan key
             * to an insertion scan key by replacing the sk_func with the
             * appropriate btree comparison function.
             *
             * If scankey operator is not a cross-type comparison, we can use
             * the cached comparison function; otherwise gotta look it up in
             * the catalogs.  (That can't lead to infinite recursion, since no
             * indexscan initiated by syscache lookup will use cross-data-type
             * operators.)
             *
             * We support the convention that sk_subtype == InvalidOid means
             * the opclass input type; this is a hack to simplify life for
             * ScanKeyInit().
             */
            if (cur->sk_subtype == rel->rd_opcintype[i] ||
                cur->sk_subtype == InvalidOid)
            {
                FmgrInfo   *procinfo;

                procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
                ScanKeyEntryInitializeWithInfo(scankeys + i,
                                               cur->sk_flags,
                                               cur->sk_attno,
                                               InvalidStrategy,
                                               cur->sk_subtype,
                                               cur->sk_collation,
                                               procinfo,
                                               cur->sk_argument);
            }
            else
            {
                RegProcedure cmp_proc;

                cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
                                             rel->rd_opcintype[i],
                                             cur->sk_subtype,
                                             BTORDER_PROC);
                if (!RegProcedureIsValid(cmp_proc))
                    elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
                         BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
                         cur->sk_attno, RelationGetRelationName(rel));
                ScanKeyEntryInitialize(scankeys + i,
                                       cur->sk_flags,
                                       cur->sk_attno,
                                       InvalidStrategy,
                                       cur->sk_subtype,
                                       cur->sk_collation,
                                       cmp_proc,
                                       cur->sk_argument);
            }
        }
    }

    /*----------
     * Examine the selected initial-positioning strategy to determine exactly
     * where we need to start the scan, and set flag variables to control the
     * code below.
     *
     * If nextkey = false, _bt_search and _bt_binsrch will locate the first
     * item >= scan key.  If nextkey = true, they will locate the first
     * item > scan key.
     *
     * If goback = true, we will then step back one item, while if
     * goback = false, we will start the scan on the located item.
     *----------
     */
    switch (strat_total)
    {
        case BTLessStrategyNumber:

            /*
             * Find first item >= scankey, then back up one to arrive at last
             * item < scankey.  (Note: this positioning strategy is only used
             * for a backward scan, so that is always the correct starting
             * position.)
             */
            nextkey = false;
            goback = true;
            break;

        case BTLessEqualStrategyNumber:

            /*
             * Find first item > scankey, then back up one to arrive at last
             * item <= scankey.  (Note: this positioning strategy is only used
             * for a backward scan, so that is always the correct starting
             * position.)
             */
            nextkey = true;
            goback = true;
            break;

        case BTEqualStrategyNumber:

            /*
             * If a backward scan was specified, need to start with last equal
             * item not first one.
             */
            if (ScanDirectionIsBackward(dir))
            {
                /*
                 * This is the same as the <= strategy.  We will check at the
                 * end whether the found item is actually =.
                 */
                nextkey = true;
                goback = true;
            }
            else
            {
                /*
                 * This is the same as the >= strategy.  We will check at the
                 * end whether the found item is actually =.
                 */
                nextkey = false;
                goback = false;
            }
            break;

        case BTGreaterEqualStrategyNumber:

            /*
             * Find first item >= scankey.  (This is only used for forward
             * scans.)
             */
            nextkey = false;
            goback = false;
            break;

        case BTGreaterStrategyNumber:

            /*
             * Find first item > scankey.  (This is only used for forward
             * scans.)
             */
            nextkey = true;
            goback = false;
            break;

        default:
            /* can't get here, but keep compiler quiet */
            elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
            return false;
    }

    /*
     * Use the manufactured insertion scan key to descend the tree and
     * position ourselves on the target leaf page.
     */
    stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ);

    /* don't need to keep the stack around... */
    _bt_freestack(stack);

    /* remember which buffer we have pinned, if any */
    so->currPos.buf = buf;

    if (!BufferIsValid(buf))
    {
        /*
         * We only get here if the index is completely empty.  Lock relation
         * because nothing finer to lock exists.
         */
        PredicateLockRelation(rel, scan->xs_snapshot);
        return false;
    }
    else
        PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);

    /* initialize moreLeft/moreRight appropriately for scan direction */
    if (ScanDirectionIsForward(dir))
    {
        so->currPos.moreLeft = false;
        so->currPos.moreRight = true;
    }
    else
    {
        so->currPos.moreLeft = true;
        so->currPos.moreRight = false;
    }
    so->numKilled = 0;          /* just paranoia */
    so->markItemIndex = -1;     /* ditto */

    /* position to the precise item on the page */
    offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey);

    /*
     * If nextkey = false, we are positioned at the first item >= scan key, or
     * possibly at the end of a page on which all the existing items are less
     * than the scan key and we know that everything on later pages is greater
     * than or equal to scan key.
     *
     * If nextkey = true, we are positioned at the first item > scan key, or
     * possibly at the end of a page on which all the existing items are less
     * than or equal to the scan key and we know that everything on later
     * pages is greater than scan key.
     *
     * The actually desired starting point is either this item or the prior
     * one, or in the end-of-page case it's the first item on the next page or
     * the last item on this page.  Adjust the starting offset if needed.  (If
     * this results in an offset before the first item or after the last one,
     * _bt_readpage will report no items found, and then we'll step to the
     * next page as needed.)
     */
    if (goback)
        offnum = OffsetNumberPrev(offnum);

    /*
     * Now load data from the first page of the scan.
     */
    if (!_bt_readpage(scan, dir, offnum))
    {
        /*
         * There's no actually-matching data on this page.  Try to advance to
         * the next page.  Return false if there's no matching data at all.
         */
        if (!_bt_steppage(scan, dir))
            return false;
    }

    /* Drop the lock, but not pin, on the current page */
    LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);

    /* OK, itemIndex says what to return */
    currItem = &so->currPos.items[so->currPos.itemIndex];
    scan->xs_ctup.t_self = currItem->heapTid;
    if (scan->xs_want_itup)
        scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);

    return true;
}
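/*
 * The initial-positioning logic above, condensed for reference.  Each
 * usable starting strategy maps to a (nextkey, goback) pair telling
 * _bt_search and _bt_binsrch which boundary to locate and whether to step
 * back one item.  (This is a restatement of the switch in _bt_first, not
 * additional behavior.)
 *
 *  strategy   scan dir   nextkey   goback   scan starts on
 *  <          backward   false     true     last item < key
 *  <=         backward   true      true     last item <= key
 *  =          backward   true      true     last item <= key (= re-checked)
 *  =          forward    false     false    first item >= key (= re-checked)
 *  >=         forward    false     false    first item >= key
 *  >          forward    true      false    first item > key
 */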
/*
 *  _bt_binsrch() -- Do a binary search for a key on a particular page.
 *
 * The passed scankey must be an insertion-type scankey (see nbtree/README),
 * but it can omit the rightmost column(s) of the index.
 *
 * When nextkey is false (the usual case), we are looking for the first
 * item >= scankey.  When nextkey is true, we are looking for the first
 * item strictly greater than scankey.
 *
 * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
 * key >= given scankey, or > scankey if nextkey is true.  (NOTE: in
 * particular, this means it is possible to return a value 1 greater than the
 * number of keys on the page, if the scankey is > all keys on the page.)
 *
 * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
 * of the last key < given scankey, or last key <= given scankey if nextkey
 * is true.  (Since _bt_compare treats the first data key of such a page as
 * minus infinity, there will be at least one key < scankey, so the result
 * always points at one of the keys on the page.)  This key indicates the
 * right place to descend to be sure we find all leaf keys >= given scankey
 * (or leaf keys > given scankey when nextkey is true).
 *
 * This procedure is not responsible for walking right, it just examines
 * the given page.  _bt_binsrch() has no lock or refcount side effects
 * on the buffer.
 */
OffsetNumber
_bt_binsrch(Relation rel,
            Buffer buf,
            int keysz,
            ScanKey scankey,
            bool nextkey)
{
    Page        page;
    BTPageOpaque opaque;
    OffsetNumber low,
                high;
    int32       result,
                cmpval;

    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    low = P_FIRSTDATAKEY(opaque);
    high = PageGetMaxOffsetNumber(page);

    /*
     * If there are no keys on the page, return the first available slot.
     * Note this covers two cases: the page is really empty (no keys), or it
     * contains only a high key.  The latter case is possible after
     * vacuuming.  This can never happen on an internal page, however, since
     * they are never empty (an internal page must have children).
     */
    if (high < low)
        return low;

    /*
     * Binary search to find the first key on the page >= scan key, or first
     * key > scankey when nextkey is true.
     *
     * For nextkey=false (cmpval=1), the loop invariant is: all slots before
     * 'low' are < scan key, all slots at or after 'high' are >= scan key.
     *
     * For nextkey=true (cmpval=0), the loop invariant is: all slots before
     * 'low' are <= scan key, all slots at or after 'high' are > scan key.
     *
     * We can fall out when high == low.
     */
    high++;                     /* establish the loop invariant for high */

    cmpval = nextkey ? 0 : 1;   /* select comparison value */

    while (high > low)
    {
        OffsetNumber mid = low + ((high - low) / 2);

        /* We have low <= mid < high, so mid points at a real slot */

        result = _bt_compare(rel, keysz, scankey, page, mid);

        if (result >= cmpval)
            low = mid + 1;
        else
            high = mid;
    }

    /*
     * At this point we have high == low, but be careful: they could point
     * past the last slot on the page.
     *
     * On a leaf page, we always return the first key >= scan key (resp. >
     * scan key), which could be the last slot + 1.
     */
    if (P_ISLEAF(opaque))
        return low;

    /*
     * On a non-leaf page, return the last key < scan key (resp. <= scan
     * key).  There must be one if _bt_compare() is playing by the rules.
     */
    Assert(low > P_FIRSTDATAKEY(opaque));

    return OffsetNumberPrev(low);
}
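/*
 * The nextkey/cmpval trick above folds two searches into one loop: with
 * cmpval = 1 the loop finds the first slot comparing >= the scankey, and
 * with cmpval = 0 it finds the first slot comparing > the scankey.  A
 * self-contained sketch of the same invariant over a plain int array
 * (illustration only; not part of this file):
 */
static int
lower_bound_search(const int *keys, int nkeys, int target, bool nextkey)
{
    int         low = 0;
    int         high = nkeys;           /* one past the last slot */
    int         cmpval = nextkey ? 0 : 1;

    while (high > low)
    {
        int         mid = low + (high - low) / 2;
        /* 3-way compare: >0 if target > keys[mid], 0 if equal, <0 if less */
        int         result = (target > keys[mid]) - (target < keys[mid]);

        if (result >= cmpval)
            low = mid + 1;
        else
            high = mid;
    }
    return low;     /* first index >= target, or > target when nextkey */
}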
/*
 *  _bt_readpage() -- Load data from current index page into so->currPos
 *
 * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
 * is not changed here.  Also, currPos.moreLeft and moreRight must be valid;
 * they are updated as appropriate.  All other fields of so->currPos are
 * initialized from scratch here.
 *
 * We scan the current page starting at offnum and moving in the indicated
 * direction.  All items matching the scan keys are loaded into currPos.items.
 * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
 * that there can be no more matching tuples in the current scan direction.
 *
 * Returns true if any matching items found on the page, false if none.
 */
static bool
_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    Page        page;
    BTPageOpaque opaque;
    OffsetNumber minoff;
    OffsetNumber maxoff;
    int         itemIndex;
    IndexTuple  itup;
    bool        continuescan;

    /* we must have the buffer pinned and locked */
    Assert(BufferIsValid(so->currPos.buf));

    page = BufferGetPage(so->currPos.buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    minoff = P_FIRSTDATAKEY(opaque);
    maxoff = PageGetMaxOffsetNumber(page);

    /*
     * We must save the page's right-link while scanning it; this tells us
     * where to step right to after we're done with these items.  There is no
     * corresponding need for the left-link, since splits always go right.
     */
    so->currPos.nextPage = opaque->btpo_next;

    /* initialize tuple workspace to empty */
    so->currPos.nextTupleOffset = 0;

    if (ScanDirectionIsForward(dir))
    {
        /* load items[] in ascending order */
        itemIndex = 0;

        offnum = Max(offnum, minoff);

        while (offnum <= maxoff)
        {
            itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
            if (itup != NULL)
            {
                /* tuple passes all scan key conditions, so remember it */
                _bt_saveitem(so, itemIndex, offnum, itup);
                itemIndex++;
            }
            if (!continuescan)
            {
                /* there can't be any more matches, so stop */
                so->currPos.moreRight = false;
                break;
            }

            offnum = OffsetNumberNext(offnum);
        }

        Assert(itemIndex <= MaxIndexTuplesPerPage);
        so->currPos.firstItem = 0;
        so->currPos.lastItem = itemIndex - 1;
        so->currPos.itemIndex = 0;
    }
    else
    {
        /* load items[] in descending order */
        itemIndex = MaxIndexTuplesPerPage;

        offnum = Min(offnum, maxoff);

        while (offnum >= minoff)
        {
            itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
            if (itup != NULL)
            {
                /* tuple passes all scan key conditions, so remember it */
                itemIndex--;
                _bt_saveitem(so, itemIndex, offnum, itup);
            }
            if (!continuescan)
            {
                /* there can't be any more matches, so stop */
                so->currPos.moreLeft = false;
                break;
            }

            offnum = OffsetNumberPrev(offnum);
        }

        Assert(itemIndex >= 0);
        so->currPos.firstItem = itemIndex;
        so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
        so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
    }

    return (so->currPos.firstItem <= so->currPos.lastItem);
}
/*
 *  _bt_step() -- Step one item in the requested direction in a scan on
 *                the tree.
 *
 *      *bufP is the current buffer (read-locked and pinned).  If we change
 *      pages, it's updated appropriately.
 *
 *      If successful, update scan's currentItemData and return true.
 *      If no adjacent record exists in the requested direction,
 *      release buffer pin/locks and return false.
 */
bool
_bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
{
    Relation    rel = scan->indexRelation;
    ItemPointer current = &(scan->currentItemData);
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    Page        page;
    BTPageOpaque opaque;
    OffsetNumber offnum,
                maxoff;
    BlockNumber blkno;

    /*
     * Don't use ItemPointerGetOffsetNumber here: ip_posid can legitimately
     * be zero at this point, and that macro would trigger an assertion
     * failure.
     */
    offnum = current->ip_posid;

    page = BufferGetPage(*bufP);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    maxoff = PageGetMaxOffsetNumber(page);

    if (ScanDirectionIsForward(dir))
    {
        if (!PageIsEmpty(page) && offnum < maxoff)
            offnum = OffsetNumberNext(offnum);
        else
        {
            /* Walk right to the next page with data */
            for (;;)
            {
                /* if we're at end of scan, release the buffer and return */
                if (P_RIGHTMOST(opaque))
                {
                    _bt_relbuf(rel, *bufP);
                    ItemPointerSetInvalid(current);
                    *bufP = so->btso_curbuf = InvalidBuffer;
                    return false;
                }
                /* step right one page */
                blkno = opaque->btpo_next;
                _bt_relbuf(rel, *bufP);
                *bufP = _bt_getbuf(rel, blkno, BT_READ);
                page = BufferGetPage(*bufP);
                opaque = (BTPageOpaque) PageGetSpecialPointer(page);
                if (!P_IGNORE(opaque))
                {
                    maxoff = PageGetMaxOffsetNumber(page);
                    /* done if it's not empty */
                    offnum = P_FIRSTDATAKEY(opaque);
                    if (!PageIsEmpty(page) && offnum <= maxoff)
                        break;
                }
            }
        }
    }
    else
    {
        /* backwards scan */
        if (offnum > P_FIRSTDATAKEY(opaque))
            offnum = OffsetNumberPrev(offnum);
        else
        {
            /*
             * Walk left to the next page with data.  This is much more
             * complex than the walk-right case because of the possibility
             * that the page to our left splits while we are in flight to it,
             * plus the possibility that the page we were on gets deleted
             * after we leave it.  See nbtree/README for details.
             */
            for (;;)
            {
                *bufP = _bt_walk_left(rel, *bufP);

                /* if we're at end of scan, return failure */
                if (*bufP == InvalidBuffer)
                {
                    ItemPointerSetInvalid(current);
                    so->btso_curbuf = InvalidBuffer;
                    return false;
                }
                page = BufferGetPage(*bufP);
                opaque = (BTPageOpaque) PageGetSpecialPointer(page);

                /*
                 * Okay, we managed to move left to a non-deleted page.  Done
                 * if it's not half-dead and not empty.  Else loop back and
                 * do it all again.
                 */
                if (!P_IGNORE(opaque))
                {
                    maxoff = PageGetMaxOffsetNumber(page);
                    offnum = maxoff;
                    if (!PageIsEmpty(page) &&
                        maxoff >= P_FIRSTDATAKEY(opaque))
                        break;
                }
            }
        }
    }

    /* Update scan state */
    so->btso_curbuf = *bufP;
    blkno = BufferGetBlockNumber(*bufP);
    ItemPointerSet(current, blkno, offnum);

    return true;
}
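/*
 * Why stepping right is the simple case: B-tree pages only split rightward,
 * so following btpo_next links always makes progress.  A stripped-down
 * restatement of the walk-right loop in _bt_step, factored into a
 * hypothetical helper (illustration only; not part of this file):
 */
static Buffer
walk_right_to_data(Relation rel, Buffer buf)
{
    for (;;)
    {
        Page        page = BufferGetPage(buf);
        BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        BlockNumber blkno;

        if (P_RIGHTMOST(opaque))
        {
            _bt_relbuf(rel, buf);       /* end of the index */
            return InvalidBuffer;
        }
        blkno = opaque->btpo_next;      /* step right one page */
        _bt_relbuf(rel, buf);
        buf = _bt_getbuf(rel, blkno, BT_READ);
        page = BufferGetPage(buf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        /* stop on the first live page that actually has data items */
        if (!P_IGNORE(opaque) &&
            PageGetMaxOffsetNumber(page) >= P_FIRSTDATAKEY(opaque))
            return buf;
    }
}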
/*
 *  _bt_binsrch() -- Do a binary search for a key on a particular page.
 *
 * The scankey we get has the compare function stored in the procedure
 * entry of each data struct.  We invoke this regproc to do the
 * comparison for every key in the scankey.
 *
 * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
 * key >= given scankey.  (NOTE: in particular, this means it is possible
 * to return a value 1 greater than the number of keys on the page,
 * if the scankey is > all keys on the page.)
 *
 * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
 * of the last key < given scankey.  (Since _bt_compare treats the first
 * data key of such a page as minus infinity, there will be at least one
 * key < scankey, so the result always points at one of the keys on the
 * page.)  This key indicates the right place to descend to be sure we
 * find all leaf keys >= given scankey.
 *
 * This procedure is not responsible for walking right, it just examines
 * the given page.  _bt_binsrch() has no lock or refcount side effects
 * on the buffer.
 */
OffsetNumber
_bt_binsrch(Relation rel,
            Buffer buf,
            int keysz,
            ScanKey scankey)
{
    TupleDesc   itupdesc;
    Page        page;
    BTPageOpaque opaque;
    OffsetNumber low,
                high;
    int32       result;

    itupdesc = RelationGetDescr(rel);
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    low = P_FIRSTDATAKEY(opaque);
    high = PageGetMaxOffsetNumber(page);

    /*
     * If there are no keys on the page, return the first available slot.
     * Note this covers two cases: the page is really empty (no keys), or
     * it contains only a high key.  The latter case is possible after
     * vacuuming.  This can never happen on an internal page, however,
     * since they are never empty (an internal page must have children).
     */
    if (high < low)
        return low;

    /*
     * Binary search to find the first key on the page >= scan key.  Loop
     * invariant: all slots before 'low' are < scan key, all slots at or
     * after 'high' are >= scan key.  We can fall out when high == low.
     */
    high++;                     /* establish the loop invariant for high */

    while (high > low)
    {
        OffsetNumber mid = low + ((high - low) / 2);

        /* We have low <= mid < high, so mid points at a real slot */

        result = _bt_compare(rel, keysz, scankey, page, mid);

        if (result > 0)
            low = mid + 1;
        else
            high = mid;
    }

    /*
     * At this point we have high == low, but be careful: they could point
     * past the last slot on the page.
     *
     * On a leaf page, we always return the first key >= scan key (which
     * could be the last slot + 1).
     */
    if (P_ISLEAF(opaque))
        return low;

    /*
     * On a non-leaf page, return the last key < scan key.  There must be
     * one if _bt_compare() is playing by the rules.
     */
    Assert(low > P_FIRSTDATAKEY(opaque));

    return OffsetNumberPrev(low);
}
/*
 *  _hash_step() -- step to the next valid item in a scan in the bucket.
 *
 *      If no valid record exists in the requested direction, return
 *      false.  Else, return true and set the CurrentItemData for the
 *      scan to the right thing.
 *
 *      'bufP' points to the current buffer, which is pinned and read-locked.
 *      On success exit, we have pin and read-lock on whichever page
 *      contains the right item; on failure, we have released all buffers.
 */
bool
_hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
{
    Relation    rel = scan->indexRelation;
    HashScanOpaque so = (HashScanOpaque) scan->opaque;
    ItemPointer current;
    Buffer      buf;
    Page        page;
    HashPageOpaque opaque;
    OffsetNumber maxoff;
    OffsetNumber offnum;
    BlockNumber blkno;
    IndexTuple  itup;

    current = &(scan->currentItemData);

    buf = *bufP;
    _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
    page = BufferGetPage(buf);
    opaque = (HashPageOpaque) PageGetSpecialPointer(page);

    /*
     * If _hash_step is called from _hash_first, current will not be valid,
     * so we can't dereference it.  However, in that case, we presumably want
     * to start at the beginning/end of the page...
     */
    maxoff = PageGetMaxOffsetNumber(page);
    if (ItemPointerIsValid(current))
        offnum = ItemPointerGetOffsetNumber(current);
    else
        offnum = InvalidOffsetNumber;

    /*
     * 'offnum' now points to the last tuple we examined (if any).
     *
     * continue to step through tuples until: 1) we get to the end of the
     * bucket chain or 2) we find a valid tuple.
     */
    do
    {
        switch (dir)
        {
            case ForwardScanDirection:
                if (offnum != InvalidOffsetNumber)
                    offnum = OffsetNumberNext(offnum);  /* move forward */
                else
                    offnum = FirstOffsetNumber;         /* new page */

                while (offnum > maxoff)
                {
                    /*
                     * either this page is empty (maxoff ==
                     * InvalidOffsetNumber) or we ran off the end.
                     */
                    _hash_readnext(rel, &buf, &page, &opaque);
                    if (BufferIsValid(buf))
                    {
                        maxoff = PageGetMaxOffsetNumber(page);
                        offnum = FirstOffsetNumber;
                    }
                    else
                    {
                        /* end of bucket */
                        maxoff = offnum = InvalidOffsetNumber;
                        break;  /* exit while */
                    }
                }
                break;

            case BackwardScanDirection:
                if (offnum != InvalidOffsetNumber)
                    offnum = OffsetNumberPrev(offnum);  /* move back */
                else
                    offnum = maxoff;                    /* new page */

                while (offnum < FirstOffsetNumber)
                {
                    /*
                     * either this page is empty (offnum ==
                     * InvalidOffsetNumber) or we ran off the end.
                     */
                    _hash_readprev(rel, &buf, &page, &opaque);
                    if (BufferIsValid(buf))
                        maxoff = offnum = PageGetMaxOffsetNumber(page);
                    else
                    {
                        /* end of bucket */
                        maxoff = offnum = InvalidOffsetNumber;
                        break;  /* exit while */
                    }
                }
                break;

            default:
                /* NoMovementScanDirection */
                /* this should not be reached */
                break;
        }

        if (offnum == InvalidOffsetNumber)
        {
            /* we ran off the end of the bucket without finding a match */
            *bufP = so->hashso_curbuf = InvalidBuffer;
            ItemPointerSetInvalid(current);
            return false;
        }

        /* get ready to check this tuple */
        itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
    } while (!_hash_checkqual(scan, itup));

    /* if we made it to here, we've found a valid tuple */
    blkno = BufferGetBlockNumber(buf);
    *bufP = so->hashso_curbuf = buf;
    ItemPointerSet(current, blkno, offnum);
    return true;
}
/*
 *  _hash_step() -- step to the next valid item in a scan in the bucket.
 *
 *      If no valid record exists in the requested direction, return
 *      false.  Else, return true and set the hashso_curpos for the
 *      scan to the right thing.
 *
 *      'bufP' points to the current buffer, which is pinned and read-locked.
 *      On success exit, we have pin and read-lock on whichever page
 *      contains the right item; on failure, we have released all buffers.
 */
bool
_hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
{
    Relation    rel = scan->indexRelation;
    HashScanOpaque so = (HashScanOpaque) scan->opaque;
    ItemPointer current;
    Buffer      buf;
    Page        page;
    HashPageOpaque opaque;
    OffsetNumber maxoff;
    OffsetNumber offnum;
    BlockNumber blkno;
    IndexTuple  itup;

    current = &(so->hashso_curpos);

    buf = *bufP;
    _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
    page = BufferGetPage(buf);
    opaque = (HashPageOpaque) PageGetSpecialPointer(page);

    /*
     * If _hash_step is called from _hash_first, current will not be valid,
     * so we can't dereference it.  However, in that case, we presumably want
     * to start at the beginning/end of the page...
     */
    maxoff = PageGetMaxOffsetNumber(page);
    if (ItemPointerIsValid(current))
        offnum = ItemPointerGetOffsetNumber(current);
    else
        offnum = InvalidOffsetNumber;

    /*
     * 'offnum' now points to the last tuple we examined (if any).
     *
     * continue to step through tuples until: 1) we get to the end of the
     * bucket chain or 2) we find a valid tuple.
     */
    do
    {
        switch (dir)
        {
            case ForwardScanDirection:
                if (offnum != InvalidOffsetNumber)
                    offnum = OffsetNumberNext(offnum);  /* move forward */
                else
                {
                    /* new page, locate starting position by binary search */
                    offnum = _hash_binsearch(page, so->hashso_sk_hash);
                }

                for (;;)
                {
                    /*
                     * check if we're still in the range of items with the
                     * target hash key
                     */
                    if (offnum <= maxoff)
                    {
                        Assert(offnum >= FirstOffsetNumber);
                        itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
                        if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
                            break;      /* yes, so exit for-loop */
                    }

                    /*
                     * ran off the end of this page, try the next
                     */
                    _hash_readnext(rel, &buf, &page, &opaque);
                    if (BufferIsValid(buf))
                    {
                        maxoff = PageGetMaxOffsetNumber(page);
                        offnum = _hash_binsearch(page, so->hashso_sk_hash);
                    }
                    else
                    {
                        /* end of bucket */
                        itup = NULL;
                        break;  /* exit for-loop */
                    }
                }
                break;

            case BackwardScanDirection:
                if (offnum != InvalidOffsetNumber)
                    offnum = OffsetNumberPrev(offnum);  /* move back */
                else
                {
                    /* new page, locate starting position by binary search */
                    offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
                }

                for (;;)
                {
                    /*
                     * check if we're still in the range of items with the
                     * target hash key
                     */
                    if (offnum >= FirstOffsetNumber)
                    {
                        Assert(offnum <= maxoff);
                        itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
                        if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
                            break;      /* yes, so exit for-loop */
                    }

                    /*
                     * ran off the end of this page, try the next
                     */
                    _hash_readprev(rel, &buf, &page, &opaque);
                    if (BufferIsValid(buf))
                    {
                        maxoff = PageGetMaxOffsetNumber(page);
                        offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
                    }
                    else
                    {
                        /* end of bucket */
                        itup = NULL;
                        break;  /* exit for-loop */
                    }
                }
                break;

            default:
                /* NoMovementScanDirection */
                /* this should not be reached */
                itup = NULL;
                break;
        }

        if (itup == NULL)
        {
            /* we ran off the end of the bucket without finding a match */
            *bufP = so->hashso_curbuf = InvalidBuffer;
            ItemPointerSetInvalid(current);
            return false;
        }

        /* check the tuple quals, loop around if not met */
    } while (!_hash_checkqual(scan, itup));

    /* if we made it to here, we've found a valid tuple */
    blkno = BufferGetBlockNumber(buf);
    *bufP = so->hashso_curbuf = buf;
    ItemPointerSet(current, blkno, offnum);
    return true;
}
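/*
 * _hash_binsearch() relies on hash pages keeping their tuples ordered by
 * hash value, and returns the first offset whose stored hash key is >= the
 * search hash.  A self-contained sketch of that contract over a plain
 * uint32 array (illustration only; see hashutil.c for the authoritative
 * version):
 */
static int
hash_lower_bound(const uint32 *hashes, int n, uint32 target)
{
    int         low = 0;
    int         high = n;       /* one past the last slot */

    while (high > low)
    {
        int         mid = low + (high - low) / 2;

        if (hashes[mid] < target)
            low = mid + 1;
        else
            high = mid;
    }
    return low;                 /* may be n if all hashes < target */
}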
/*
 * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
 *
 * We are splitting a bucket that consists of a base bucket page and zero
 * or more overflow (bucket chain) pages.  We must relocate tuples that
 * belong in the new bucket, and compress out any free space in the old
 * bucket.
 *
 * The caller must hold exclusive locks on both buckets to ensure that
 * no one else is trying to access them (see README).
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * The buffer is returned in the same state.  (The metapage is only
 * touched if it becomes necessary to add or remove overflow pages.)
 */
static void
_hash_splitbucket(Relation rel,
                  Buffer metabuf,
                  Bucket obucket,
                  Bucket nbucket,
                  BlockNumber start_oblkno,
                  BlockNumber start_nblkno,
                  uint32 maxbucket,
                  uint32 highmask,
                  uint32 lowmask)
{
    Bucket      bucket;
    Buffer      obuf;
    Buffer      nbuf;
    BlockNumber oblkno;
    BlockNumber nblkno;
    bool        null;
    Datum       datum;
    HashItem    hitem;
    HashPageOpaque oopaque;
    HashPageOpaque nopaque;
    IndexTuple  itup;
    Size        itemsz;
    OffsetNumber ooffnum;
    OffsetNumber noffnum;
    OffsetNumber omaxoffnum;
    Page        opage;
    Page        npage;
    TupleDesc   itupdesc = RelationGetDescr(rel);

    /*
     * It should be okay to simultaneously write-lock pages from each bucket,
     * since no one else can be trying to acquire buffer lock on pages of
     * either bucket.
     */
    oblkno = start_oblkno;
    nblkno = start_nblkno;
    obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
    nbuf = _hash_getbuf(rel, nblkno, HASH_WRITE);
    opage = BufferGetPage(obuf);
    npage = BufferGetPage(nbuf);

    _hash_checkpage(rel, opage, LH_BUCKET_PAGE);
    oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);

    /* initialize the new bucket's primary page */
    _hash_pageinit(npage, BufferGetPageSize(nbuf));
    nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
    nopaque->hasho_prevblkno = InvalidBlockNumber;
    nopaque->hasho_nextblkno = InvalidBlockNumber;
    nopaque->hasho_bucket = nbucket;
    nopaque->hasho_flag = LH_BUCKET_PAGE;
    nopaque->hasho_filler = HASHO_FILL;

    /*
     * Partition the tuples in the old bucket between the old bucket and the
     * new bucket, advancing along the old bucket's overflow bucket chain and
     * adding overflow pages to the new bucket as needed.
     */
    ooffnum = FirstOffsetNumber;
    omaxoffnum = PageGetMaxOffsetNumber(opage);
    for (;;)
    {
        /*
         * at each iteration through this loop, each of these variables
         * should be up-to-date: obuf opage oopaque ooffnum omaxoffnum
         */

        /* check if we're at the end of the page */
        if (ooffnum > omaxoffnum)
        {
            /* at end of page, but check for an(other) overflow page */
            oblkno = oopaque->hasho_nextblkno;
            if (!BlockNumberIsValid(oblkno))
                break;

            /*
             * we ran out of tuples on this particular page, but we have more
             * overflow pages; advance to next page.
             */
            _hash_wrtbuf(rel, obuf);

            obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
            opage = BufferGetPage(obuf);
            _hash_checkpage(rel, opage, LH_OVERFLOW_PAGE);
            oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
            ooffnum = FirstOffsetNumber;
            omaxoffnum = PageGetMaxOffsetNumber(opage);
            continue;
        }

        /*
         * Re-hash the tuple to determine which bucket it now belongs in.
         *
         * It is annoying to call the hash function while holding locks, but
         * releasing and relocking the page for each tuple is unappealing
         * too.
         */
        hitem = (HashItem) PageGetItem(opage, PageGetItemId(opage, ooffnum));
        itup = &(hitem->hash_itup);
        datum = index_getattr(itup, 1, itupdesc, &null);
        Assert(!null);

        bucket = _hash_hashkey2bucket(_hash_datum2hashkey(rel, datum),
                                      maxbucket, highmask, lowmask);

        if (bucket == nbucket)
        {
            /*
             * insert the tuple into the new bucket.  if it doesn't fit on
             * the current page in the new bucket, we must allocate a new
             * overflow page and place the tuple on that page instead.
             */
            itemsz = IndexTupleDSize(hitem->hash_itup)
                + (sizeof(HashItemData) - sizeof(IndexTupleData));

            itemsz = MAXALIGN(itemsz);

            if (PageGetFreeSpace(npage) < itemsz)
            {
                /* write out nbuf and drop lock, but keep pin */
                _hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK);
                /* chain to a new overflow page */
                nbuf = _hash_addovflpage(rel, metabuf, nbuf);
                npage = BufferGetPage(nbuf);
                _hash_checkpage(rel, npage, LH_OVERFLOW_PAGE);
                /* we don't need nopaque within the loop */
            }

            noffnum = OffsetNumberNext(PageGetMaxOffsetNumber(npage));
            if (PageAddItem(npage, (Item) hitem, itemsz, noffnum, LP_USED)
                == InvalidOffsetNumber)
                elog(ERROR, "failed to add index item to \"%s\"",
                     RelationGetRelationName(rel));

            /*
             * now delete the tuple from the old bucket.  after this section
             * of code, 'ooffnum' will actually point to the ItemId to which
             * we would point if we had advanced it before the deletion
             * (PageIndexTupleDelete repacks the ItemId array).  this also
             * means that 'omaxoffnum' is exactly one less than it used to
             * be, so we really can just decrement it instead of calling
             * PageGetMaxOffsetNumber.
             */
            PageIndexTupleDelete(opage, ooffnum);
            omaxoffnum = OffsetNumberPrev(omaxoffnum);
        }
        else
        {
            /*
             * the tuple stays on this page.  we didn't move anything, so we
             * didn't delete anything and therefore we don't have to change
             * 'omaxoffnum'.
             */
            Assert(bucket == obucket);
            ooffnum = OffsetNumberNext(ooffnum);
        }
    }

    /*
     * We're at the end of the old bucket chain, so we're done partitioning
     * the tuples.  Before quitting, call _hash_squeezebucket to ensure the
     * tuples remaining in the old bucket (including the overflow pages) are
     * packed as tightly as possible.  The new bucket is already tight.
     */
    _hash_wrtbuf(rel, obuf);
    _hash_wrtbuf(rel, nbuf);

    _hash_squeezebucket(rel, obucket, start_oblkno);
}
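/*
 * How the re-hash above decides between obucket and nbucket: the bucket
 * number is the hash value masked down to the current table size, falling
 * back to the smaller mask for buckets that have not been split yet.  This
 * sketch mirrors the mask arithmetic the split relies on (see
 * _hash_hashkey2bucket in hashutil.c for the authoritative version):
 */
static Bucket
hashkey_to_bucket(uint32 hashkey, uint32 maxbucket,
                  uint32 highmask, uint32 lowmask)
{
    Bucket      bucket = hashkey & highmask;

    if (bucket > maxbucket)
        bucket = bucket & lowmask;  /* wrap into the not-yet-split range */
    return bucket;
}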
/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
Datum
hashbulkdelete(PG_FUNCTION_ARGS)
{
    IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
    IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
    IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2);
    void       *callback_state = (void *) PG_GETARG_POINTER(3);
    Relation    rel = info->index;
    double      tuples_removed;
    double      num_index_tuples;
    double      orig_ntuples;
    Bucket      orig_maxbucket;
    Bucket      cur_maxbucket;
    Bucket      cur_bucket;
    Buffer      metabuf;
    HashMetaPage metap;
    HashMetaPageData local_metapage;

    tuples_removed = 0;
    num_index_tuples = 0;

    /*
     * Read the metapage to fetch original bucket and tuple counts.  Also, we
     * keep a copy of the last-seen metapage so that we can use its
     * hashm_spares[] values to compute bucket page addresses.  This is a bit
     * hokey but perfectly safe, since the interesting entries in the spares
     * array cannot change under us; and it beats rereading the metapage for
     * each bucket.
     */
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
    _hash_checkpage(rel, metabuf, LH_META_PAGE);
    metap = (HashMetaPage) BufferGetPage(metabuf);
    orig_maxbucket = metap->hashm_maxbucket;
    orig_ntuples = metap->hashm_ntuples;
    memcpy(&local_metapage, metap, sizeof(local_metapage));
    _hash_relbuf(rel, metabuf);

    /* Scan the buckets that we know exist */
    cur_bucket = 0;
    cur_maxbucket = orig_maxbucket;

loop_top:
    while (cur_bucket <= cur_maxbucket)
    {
        BlockNumber bucket_blkno;
        BlockNumber blkno;
        bool        bucket_dirty = false;

        /* Get address of bucket's start page */
        bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket);

        /* Exclusive-lock the bucket so we can shrink it */
        _hash_getlock(rel, bucket_blkno, HASH_EXCLUSIVE);

        /* Shouldn't have any active scans locally, either */
        if (_hash_has_active_scan(rel, cur_bucket))
            elog(ERROR, "hash index has active scan during VACUUM");

        /* Scan each page in bucket */
        blkno = bucket_blkno;
        while (BlockNumberIsValid(blkno))
        {
            Buffer      buf;
            Page        page;
            HashPageOpaque opaque;
            OffsetNumber offno;
            OffsetNumber maxoffno;
            bool        page_dirty = false;

            vacuum_delay_point();

            buf = _hash_getbuf(rel, blkno, HASH_WRITE);
            _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
            page = BufferGetPage(buf);
            opaque = (HashPageOpaque) PageGetSpecialPointer(page);
            Assert(opaque->hasho_bucket == cur_bucket);

            /* Scan each tuple in page */
            offno = FirstOffsetNumber;
            maxoffno = PageGetMaxOffsetNumber(page);
            while (offno <= maxoffno)
            {
                IndexTuple  itup;
                ItemPointer htup;

                itup = (IndexTuple) PageGetItem(page,
                                                PageGetItemId(page, offno));
                htup = &(itup->t_tid);
                if (callback(htup, callback_state))
                {
                    /* delete the item from the page */
                    PageIndexTupleDelete(page, offno);
                    bucket_dirty = page_dirty = true;
                    /* don't increment offno, instead decrement maxoffno */
                    maxoffno = OffsetNumberPrev(maxoffno);
                    tuples_removed += 1;
                }
                else
                {
                    offno = OffsetNumberNext(offno);
                    num_index_tuples += 1;
                }
            }

            /*
             * Write page if needed, advance to next page.
             */
            blkno = opaque->hasho_nextblkno;

            if (page_dirty)
                _hash_wrtbuf(rel, buf);
            else
                _hash_relbuf(rel, buf);
        }

        /* If we deleted anything, try to compact free space */
        if (bucket_dirty)
            _hash_squeezebucket(rel, cur_bucket, bucket_blkno);

        /* Release bucket lock */
        _hash_droplock(rel, bucket_blkno, HASH_EXCLUSIVE);

        /* Advance to next bucket */
        cur_bucket++;
    }

    /* Write-lock metapage and check for split since we started */
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
    _hash_checkpage(rel, metabuf, LH_META_PAGE);
    metap = (HashMetaPage) BufferGetPage(metabuf);

    if (cur_maxbucket != metap->hashm_maxbucket)
    {
        /* There's been a split, so process the additional bucket(s) */
        cur_maxbucket = metap->hashm_maxbucket;
        memcpy(&local_metapage, metap, sizeof(local_metapage));
        _hash_relbuf(rel, metabuf);
        goto loop_top;
    }

    /* Okay, we're really done.  Update tuple count in metapage. */

    if (orig_maxbucket == metap->hashm_maxbucket &&
        orig_ntuples == metap->hashm_ntuples)
    {
        /*
         * No one has split or inserted anything since start of scan, so
         * believe our count as gospel.
         */
        metap->hashm_ntuples = num_index_tuples;
    }
    else
    {
        /*
         * Otherwise, our count is untrustworthy since we may have
         * double-scanned tuples in split buckets.  Proceed by
         * dead-reckoning.
         */
        if (metap->hashm_ntuples > tuples_removed)
            metap->hashm_ntuples -= tuples_removed;
        else
            metap->hashm_ntuples = 0;
        num_index_tuples = metap->hashm_ntuples;
    }

    _hash_wrtbuf(rel, metabuf);

    /* return statistics */
    if (stats == NULL)
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
    stats->num_index_tuples = num_index_tuples;
    stats->tuples_removed += tuples_removed;
    /* hashvacuumcleanup will fill in num_pages */

    PG_RETURN_POINTER(stats);
}
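/*
 * The page-scan loop above deliberately does not advance offno after a
 * deletion: PageIndexTupleDelete() repacks the item array, so the next
 * tuple slides into the slot just examined, and only the upper bound
 * shrinks.  The same pattern over a plain array (illustration only; not
 * part of this file):
 */
static int
delete_matching(int *items, int nitems, bool (*doomed) (int))
{
    int         i = 0;

    while (i < nitems)
    {
        if (doomed(items[i]))
        {
            int         j;

            /* shift the tail down one slot; the cursor stays put */
            for (j = i; j < nitems - 1; j++)
                items[j] = items[j + 1];
            nitems--;
        }
        else
            i++;
    }
    return nitems;              /* new item count */
}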
static bool
rtnext(IndexScanDesc s, ScanDirection dir)
{
    Page        p;
    OffsetNumber n;
    RTreePageOpaque po;
    RTreeScanOpaque so;

    so = (RTreeScanOpaque) s->opaque;

    if (!ItemPointerIsValid(&(s->currentItemData)))
    {
        /* first call: start at the root */
        Assert(BufferIsValid(so->curbuf) == false);
        so->curbuf = ReadBuffer(s->indexRelation, P_ROOT);
        pgstat_count_index_scan(&s->xs_pgstat_info);
    }

    p = BufferGetPage(so->curbuf);
    po = (RTreePageOpaque) PageGetSpecialPointer(p);

    if (!ItemPointerIsValid(&(s->currentItemData)))
    {
        /* first call: start at first/last offset */
        if (ScanDirectionIsForward(dir))
            n = FirstOffsetNumber;
        else
            n = PageGetMaxOffsetNumber(p);
    }
    else
    {
        /* go on to the next offset */
        n = ItemPointerGetOffsetNumber(&(s->currentItemData));
        if (ScanDirectionIsForward(dir))
            n = OffsetNumberNext(n);
        else
            n = OffsetNumberPrev(n);
    }

    for (;;)
    {
        IndexTuple  it;
        RTSTACK    *stk;

        n = findnext(s, n, dir);

        /* no match on this page, so read in the next stack entry */
        if (n == InvalidOffsetNumber)
        {
            /* if out of stack entries, we're done */
            if (so->s_stack == NULL)
            {
                ReleaseBuffer(so->curbuf);
                so->curbuf = InvalidBuffer;
                return false;
            }

            stk = so->s_stack;
            so->curbuf = ReleaseAndReadBuffer(so->curbuf, s->indexRelation,
                                              stk->rts_blk);
            p = BufferGetPage(so->curbuf);
            po = (RTreePageOpaque) PageGetSpecialPointer(p);

            if (ScanDirectionIsBackward(dir))
                n = OffsetNumberPrev(stk->rts_child);
            else
                n = OffsetNumberNext(stk->rts_child);
            so->s_stack = stk->rts_parent;
            pfree(stk);

            continue;
        }

        if (po->flags & F_LEAF)
        {
            ItemPointerSet(&(s->currentItemData),
                           BufferGetBlockNumber(so->curbuf),
                           n);
            it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
            s->xs_ctup.t_self = it->t_tid;
            return true;
        }
        else
        {
            BlockNumber blk;

            stk = (RTSTACK *) palloc(sizeof(RTSTACK));
            stk->rts_child = n;
            stk->rts_blk = BufferGetBlockNumber(so->curbuf);
            stk->rts_parent = so->s_stack;
            so->s_stack = stk;

            it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
            blk = ItemPointerGetBlockNumber(&(it->t_tid));

            /*
             * Note that we release the pin on the page as we descend down
             * the tree, even though there's a good chance we'll eventually
             * need to re-read the buffer later in this scan.  This may or
             * may not be optimal, but it doesn't seem likely to make a huge
             * performance difference either way.
             */
            so->curbuf = ReleaseAndReadBuffer(so->curbuf, s->indexRelation, blk);
            p = BufferGetPage(so->curbuf);
            po = (RTreePageOpaque) PageGetSpecialPointer(p);

            if (ScanDirectionIsBackward(dir))
                n = PageGetMaxOffsetNumber(p);
            else
                n = FirstOffsetNumber;
        }
    }
}
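/*
 * rtnext() drives a depth-first traversal with an explicit stack of
 * (block, child-offset) pairs so the scan can later resume just past the
 * downlink it descended through.  A stripped-down model of that push/pop
 * protocol, using plain malloc/free from <stdlib.h> instead of
 * palloc/pfree (illustration only; not part of this file):
 */
typedef struct TraverseStack
{
    BlockNumber blk;            /* parent page we came from */
    OffsetNumber child;         /* offset of the downlink we followed */
    struct TraverseStack *parent;
} TraverseStack;

static TraverseStack *
push_parent(TraverseStack *top, BlockNumber blk, OffsetNumber child)
{
    TraverseStack *stk = malloc(sizeof(TraverseStack));

    stk->blk = blk;
    stk->child = child;
    stk->parent = top;
    return stk;
}

static TraverseStack *
pop_parent(TraverseStack *top, BlockNumber *blk, OffsetNumber *child)
{
    TraverseStack *parent = top->parent;

    *blk = top->blk;            /* page to return to */
    *child = top->child;        /* resume scanning just past this offset */
    free(top);
    return parent;
}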
/*
 * Fetch tuples that match the search key; this can be invoked either to
 * fetch the first such tuple or subsequent matching tuples.  Returns the
 * number of matching tuples found, up to maxtids; their heap pointers are
 * stored in tids[].
 */
static int
gistnext(IndexScanDesc scan, ScanDirection dir, ItemPointer tids,
         int maxtids, bool ignore_killed_tuples)
{
    MIRROREDLOCK_BUFMGR_DECLARE;

    Page        p;
    OffsetNumber n;
    GISTScanOpaque so;
    GISTSearchStack *stk;
    IndexTuple  it;
    GISTPageOpaque opaque;
    int         ntids = 0;

    so = (GISTScanOpaque) scan->opaque;

    // -------- MirroredLock ----------
    MIRROREDLOCK_BUFMGR_LOCK;

    if (!so->qual_ok)
    {
        /* must not return while still holding the mirrored lock */
        MIRROREDLOCK_BUFMGR_UNLOCK;
        // -------- MirroredLock ----------

        return 0;
    }

    if (!ItemPointerIsValid(&so->curpos))
    {
        /* Being asked to fetch the first entry, so start at the root */
        Assert(so->curbuf == InvalidBuffer);
        Assert(so->stack == NULL);

        so->curbuf = ReadBuffer(scan->indexRelation, GIST_ROOT_BLKNO);

        stk = so->stack = (GISTSearchStack *) palloc0(sizeof(GISTSearchStack));

        stk->next = NULL;
        stk->block = GIST_ROOT_BLKNO;

        pgstat_count_index_scan(scan->indexRelation);
    }
    else if (so->curbuf == InvalidBuffer)
    {
        MIRROREDLOCK_BUFMGR_UNLOCK;
        // -------- MirroredLock ----------

        return 0;
    }

    /*
     * check stored pointers from last visit
     */
    if (so->nPageData > 0)
    {
        while (ntids < maxtids && so->curPageData < so->nPageData)
        {
            tids[ntids] = scan->xs_ctup.t_self =
                so->pageData[so->curPageData].heapPtr;
            ItemPointerSet(&(so->curpos),
                           BufferGetBlockNumber(so->curbuf),
                           so->pageData[so->curPageData].pageOffset);

            so->curPageData++;
            ntids++;
        }

        if (ntids == maxtids)
        {
            MIRROREDLOCK_BUFMGR_UNLOCK;
            // -------- MirroredLock ----------

            return ntids;
        }

        /*
         * Go to the next page
         */
        stk = so->stack->next;
        pfree(so->stack);
        so->stack = stk;

        /* If we're out of stack entries, we're done */
        if (so->stack == NULL)
        {
            ReleaseBuffer(so->curbuf);
            so->curbuf = InvalidBuffer;

            MIRROREDLOCK_BUFMGR_UNLOCK;
            // -------- MirroredLock ----------

            return ntids;
        }

        so->curbuf = ReleaseAndReadBuffer(so->curbuf,
                                          scan->indexRelation,
                                          stk->block);
    }

    for (;;)
    {
        /* First of all, we need lock buffer */
        Assert(so->curbuf != InvalidBuffer);
        LockBuffer(so->curbuf, GIST_SHARE);
        gistcheckpage(scan->indexRelation, so->curbuf);
        p = BufferGetPage(so->curbuf);
        opaque = GistPageGetOpaque(p);

        /* remember lsn to identify page changed for tuple's killing */
        so->stack->lsn = PageGetLSN(p);

        /* check for a page split that occurred since the last visit or the
         * visit to the parent */
        if (!XLogRecPtrIsInvalid(so->stack->parentlsn) &&
            XLByteLT(so->stack->parentlsn, opaque->nsn) &&
            opaque->rightlink != InvalidBlockNumber /* sanity check */ &&
            (so->stack->next == NULL ||
             so->stack->next->block != opaque->rightlink)   /* check if already
                                                             * added */ )
        {
            /* detected a page split, follow right link to add pages */
            stk = (GISTSearchStack *) palloc(sizeof(GISTSearchStack));
            stk->next = so->stack->next;
            stk->block = opaque->rightlink;
            stk->parentlsn = so->stack->parentlsn;
            memset(&(stk->lsn), 0, sizeof(GistNSN));
            so->stack->next = stk;
        }

        /* if page is empty, then just skip it */
        if (PageIsEmpty(p))
        {
            LockBuffer(so->curbuf, GIST_UNLOCK);
            stk = so->stack->next;
            pfree(so->stack);
            so->stack = stk;

            if (so->stack == NULL)
            {
                ReleaseBuffer(so->curbuf);
                so->curbuf = InvalidBuffer;

                MIRROREDLOCK_BUFMGR_UNLOCK;
                // -------- MirroredLock ----------

                return ntids;
            }

            so->curbuf = ReleaseAndReadBuffer(so->curbuf, scan->indexRelation,
                                              stk->block);
            continue;
        }

        if (ScanDirectionIsBackward(dir))
            n = PageGetMaxOffsetNumber(p);
        else
            n = FirstOffsetNumber;

        /* wonderful, we can look at page */
        so->nPageData = so->curPageData = 0;

        for (;;)
        {
            n = gistfindnext(scan, n, dir);

            if (!OffsetNumberIsValid(n))
            {
                while (ntids < maxtids && so->curPageData < so->nPageData)
                {
                    tids[ntids] = scan->xs_ctup.t_self =
                        so->pageData[so->curPageData].heapPtr;
                    ItemPointerSet(&(so->curpos),
                                   BufferGetBlockNumber(so->curbuf),
                                   so->pageData[so->curPageData].pageOffset);

                    so->curPageData++;
                    ntids++;
                }

                if (ntids == maxtids)
                {
                    LockBuffer(so->curbuf, GIST_UNLOCK);

                    MIRROREDLOCK_BUFMGR_UNLOCK;
                    // -------- MirroredLock ----------

                    return ntids;
                }

                /*
                 * We ran out of matching index entries on the current page,
                 * so pop the top stack entry and use it to continue the
                 * search.
                 */
                LockBuffer(so->curbuf, GIST_UNLOCK);
                stk = so->stack->next;
                pfree(so->stack);
                so->stack = stk;

                /* If we're out of stack entries, we're done */
                if (so->stack == NULL)
                {
                    ReleaseBuffer(so->curbuf);
                    so->curbuf = InvalidBuffer;

                    MIRROREDLOCK_BUFMGR_UNLOCK;
                    // -------- MirroredLock ----------

                    return ntids;
                }

                so->curbuf = ReleaseAndReadBuffer(so->curbuf,
                                                  scan->indexRelation,
                                                  stk->block);
                /* XXX  go up */
                break;
            }

            if (GistPageIsLeaf(p))
            {
                /*
                 * We've found a matching index entry in a leaf page, so
                 * remember it.  Note that we keep "curbuf" pinned so that we
                 * can efficiently resume the index scan later.
                 */
                if (!(ignore_killed_tuples &&
                      ItemIdIsDead(PageGetItemId(p, n))))
                {
                    it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
                    so->pageData[so->nPageData].heapPtr = it->t_tid;
                    so->pageData[so->nPageData].pageOffset = n;
                    so->nPageData++;
                }
            }
            else
            {
                /*
                 * We've found an entry in an internal node whose key is
                 * consistent with the search key, so push it to stack
                 */
                stk = (GISTSearchStack *) palloc(sizeof(GISTSearchStack));

                it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
                stk->block = ItemPointerGetBlockNumber(&(it->t_tid));
                memset(&(stk->lsn), 0, sizeof(GistNSN));
                stk->parentlsn = so->stack->lsn;

                stk->next = so->stack->next;
                so->stack->next = stk;
            }

            if (ScanDirectionIsBackward(dir))
                n = OffsetNumberPrev(n);
            else
                n = OffsetNumberNext(n);
        }
    }

    MIRROREDLOCK_BUFMGR_UNLOCK;
    // -------- MirroredLock ----------

    return ntids;
}
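/*
 * The split check in gistnext(), isolated for clarity: a page was split
 * after we read its parent iff the page's NSN (stamped at split time) is
 * newer than the LSN we recorded when visiting the parent; in that case
 * the scan must also visit the right sibling, since the parent had no
 * downlink for it yet.  (A restatement of the condition above as a
 * predicate; illustration only, not part of this file.)
 */
static bool
page_split_since_parent_visit(GistNSN parentlsn, GistNSN pagensn,
                              BlockNumber rightlink)
{
    return !XLogRecPtrIsInvalid(parentlsn) &&
        XLByteLT(parentlsn, pagensn) &&
        rightlink != InvalidBlockNumber;    /* sanity check */
}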