/**
 * @brief Fetch the next live index tuple from the old index.
 *
 * Process flow
 *	 -# Advance reader->offnum: to the first data key of the page when it is
 *		InvalidOffsetNumber, otherwise to the following offset.
 *	 -# If the offset runs past the max offset of the page, read the leaf page
 *		on the right side and search again; stop when the page is rightmost.
 *	 -# If the item at the offset is flagged dead, search the next one.
 *	 -# Otherwise return the item.
 *
 * These members are updated:
 *	 - page   : page which includes the picked-up item
 *	 - offnum : item offset number of the picked-up item
 *
 * @param reader [in/out] BTReader structure
 * @return next index tuple, or NULL if no more tuples
 */
static IndexTuple
BTReaderGetNextItem(BTReader *reader)
{
	OffsetNumber	last;
	BTPageOpaque	special;

	/* No leaf page has been read: treat that exactly like end-of-index. */
	if (reader->blkno == InvalidBlockNumber)
		return NULL;

	last = PageGetMaxOffsetNumber(reader->page);

	for (;;)
	{
		ItemId		slot;

		/* Step the cursor; InvalidOffsetNumber means "nothing picked yet". */
		if (reader->offnum != InvalidOffsetNumber)
			reader->offnum = OffsetNumberNext(reader->offnum);
		else
		{
			special = (BTPageOpaque) PageGetSpecialPointer(reader->page);
			reader->offnum = P_FIRSTDATAKEY(special);
		}

		if (reader->offnum > last)
		{
			BlockNumber right;

			/* Ran off the end of this leaf page: move to the right sibling. */
			special = (BTPageOpaque) PageGetSpecialPointer(reader->page);
			if (P_RIGHTMOST(special))
				return NULL;	/* No more index tuples */

			/*
			 * Remember the sibling before the page buffer is reloaded.
			 * NOTE(review): presumably BTReaderReadPage() also resets
			 * reader->offnum so the new page is scanned from its first data
			 * key -- confirm against its definition.
			 */
			right = special->btpo_next;
			BTReaderReadPage(reader, right);
			last = PageGetMaxOffsetNumber(reader->page);
			continue;
		}

		/* Still on the page: hand back the item unless it is dead. */
		slot = PageGetItemId(reader->page, reader->offnum);
		if (!ItemIdIsDead(slot))
			return (IndexTuple) PageGetItem(reader->page, slot);
	}
}
/*
 * pgstat_btree_page -- account for one btree page in the statistics
 *
 * Reads block "blkno" of "rel" with the given buffer access strategy and
 * read-locks it.  A new page, or one flagged BTP_DELETED or BTP_HALF_DEAD,
 * adds a whole BLCKSZ to stat->free_space.  A leaf page is passed to
 * pgstat_index_page() together with its first data key offset and max
 * offset.  Any other page (root or internal node) contributes nothing.
 * The buffer is released before returning.
 */
static void
pgstat_btree_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno,
				  BufferAccessStrategy bstrategy)
{
	Buffer		buffer;
	Page		pg;

	buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
								bstrategy);
	LockBuffer(buffer, BT_READ);
	pg = BufferGetPage(buffer);

	if (!PageIsNew(pg))
	{
		BTPageOpaque special = (BTPageOpaque) PageGetSpecialPointer(pg);

		if ((special->btpo_flags & (BTP_DELETED | BTP_HALF_DEAD)) != 0)
		{
			/* recyclable page: count it as entirely free */
			stat->free_space += BLCKSZ;
		}
		else if (P_ISLEAF(special))
		{
			pgstat_index_page(stat, pg,
							  P_FIRSTDATAKEY(special),
							  PageGetMaxOffsetNumber(pg));
		}
		/* root or internal node: nothing to count */
	}
	else
	{
		/* fully empty page */
		stat->free_space += BLCKSZ;
	}

	_bt_relbuf(rel, buffer);
}
/* * _bt_step() -- Step one item in the requested direction in a scan on * the tree. * * *bufP is the current buffer (read-locked and pinned). If we change * pages, it's updated appropriately. * * If successful, update scan's currentItemData and return true. * If no adjacent record exists in the requested direction, * release buffer pin/locks and return false. */ bool _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) { Relation rel = scan->indexRelation; ItemPointer current = &(scan->currentItemData); BTScanOpaque so = (BTScanOpaque) scan->opaque; Page page; BTPageOpaque opaque; OffsetNumber offnum, maxoff; BlockNumber blkno; /* * Don't use ItemPointerGetOffsetNumber or you risk to get assertion * due to ability of ip_posid to be equal 0. */ offnum = current->ip_posid; page = BufferGetPage(*bufP); opaque = (BTPageOpaque) PageGetSpecialPointer(page); maxoff = PageGetMaxOffsetNumber(page); if (ScanDirectionIsForward(dir)) { if (!PageIsEmpty(page) && offnum < maxoff) offnum = OffsetNumberNext(offnum); else { /* Walk right to the next page with data */ for (;;) { /* if we're at end of scan, release the buffer and return */ if (P_RIGHTMOST(opaque)) { _bt_relbuf(rel, *bufP); ItemPointerSetInvalid(current); *bufP = so->btso_curbuf = InvalidBuffer; return false; } /* step right one page */ blkno = opaque->btpo_next; _bt_relbuf(rel, *bufP); *bufP = _bt_getbuf(rel, blkno, BT_READ); page = BufferGetPage(*bufP); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (!P_IGNORE(opaque)) { maxoff = PageGetMaxOffsetNumber(page); /* done if it's not empty */ offnum = P_FIRSTDATAKEY(opaque); if (!PageIsEmpty(page) && offnum <= maxoff) break; } } } } else /* backwards scan */ { if (offnum > P_FIRSTDATAKEY(opaque)) offnum = OffsetNumberPrev(offnum); else { /* * Walk left to the next page with data. 
This is much more * complex than the walk-right case because of the possibility * that the page to our left splits while we are in flight to * it, plus the possibility that the page we were on gets * deleted after we leave it. See nbtree/README for details. */ for (;;) { *bufP = _bt_walk_left(rel, *bufP); /* if we're at end of scan, return failure */ if (*bufP == InvalidBuffer) { ItemPointerSetInvalid(current); so->btso_curbuf = InvalidBuffer; return false; } page = BufferGetPage(*bufP); opaque = (BTPageOpaque) PageGetSpecialPointer(page); /* * Okay, we managed to move left to a non-deleted page. * Done if it's not half-dead and not empty. Else loop * back and do it all again. */ if (!P_IGNORE(opaque)) { maxoff = PageGetMaxOffsetNumber(page); offnum = maxoff; if (!PageIsEmpty(page) && maxoff >= P_FIRSTDATAKEY(opaque)) break; } } } } /* Update scan state */ so->btso_curbuf = *bufP; blkno = BufferGetBlockNumber(*bufP); ItemPointerSet(current, blkno, offnum); return true; }
Datum readindex(PG_FUNCTION_ARGS) { FuncCallContext *funcctx; readindexinfo *info; MIRROREDLOCK_BUFMGR_DECLARE; if (SRF_IS_FIRSTCALL()) { Oid irelid = PG_GETARG_OID(0); TupleDesc tupdesc; MemoryContext oldcontext; AttrNumber outattnum; Relation irel; TupleDesc itupdesc; int i; AttrNumber attno; irel = index_open(irelid, AccessShareLock); itupdesc = RelationGetDescr(irel); outattnum = FIXED_COLUMN + itupdesc->natts; funcctx = SRF_FIRSTCALL_INIT(); oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); tupdesc = CreateTemplateTupleDesc(outattnum, false); attno = 1; TupleDescInitEntry(tupdesc, attno++, "ictid", TIDOID, -1, 0); TupleDescInitEntry(tupdesc, attno++, "hctid", TIDOID, -1, 0); TupleDescInitEntry(tupdesc, attno++, "aotid", TEXTOID, -1, 0); TupleDescInitEntry(tupdesc, attno++, "istatus", TEXTOID, -1, 0); TupleDescInitEntry(tupdesc, attno++, "hstatus", TEXTOID, -1, 0); for (i = 0; i < itupdesc->natts; i++) { Form_pg_attribute attr = itupdesc->attrs[i]; TupleDescInitEntry(tupdesc, attno++, NameStr(attr->attname), attr->atttypid, attr->atttypmod, 0); } funcctx->tuple_desc = BlessTupleDesc(tupdesc); info = (readindexinfo *) palloc(sizeof(readindexinfo)); funcctx->user_fctx = (void *) info; info->outattnum = outattnum; info->irel = irel; info->hrel = relation_open(irel->rd_index->indrelid, AccessShareLock); if (info->hrel->rd_rel != NULL && (info->hrel->rd_rel->relstorage == 'a' || info->hrel->rd_rel->relstorage == 'c')) { relation_close(info->hrel, AccessShareLock); info->hrel = NULL; } info->num_pages = RelationGetNumberOfBlocks(irel); info->blkno = BTREE_METAPAGE + 1; info->page = NULL; MemoryContextSwitchTo(oldcontext); } funcctx = SRF_PERCALL_SETUP(); info = (readindexinfo *) funcctx->user_fctx; while (info->blkno < info->num_pages) { Datum values[255]; bool nulls[255]; ItemPointerData itid; HeapTuple tuple; Datum result; if (info->page == NULL) { MIRROREDLOCK_BUFMGR_LOCK; info->buf = ReadBuffer(info->irel, info->blkno); info->page = 
BufferGetPage(info->buf); info->opaque = (BTPageOpaque) PageGetSpecialPointer(info->page); info->minoff = P_FIRSTDATAKEY(info->opaque); info->maxoff = PageGetMaxOffsetNumber(info->page); info->offnum = info->minoff; MIRROREDLOCK_BUFMGR_UNLOCK; } if (!P_ISLEAF(info->opaque) || info->offnum > info->maxoff) { ReleaseBuffer(info->buf); info->page = NULL; info->blkno++; continue; } MemSet(nulls, false, info->outattnum * sizeof(bool)); ItemPointerSet(&itid, info->blkno, info->offnum); values[0] = ItemPointerGetDatum(&itid); readindextuple(info, values, nulls); info->offnum = OffsetNumberNext(info->offnum); tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); result = HeapTupleGetDatum(tuple); SRF_RETURN_NEXT(funcctx, result); } if (info->hrel != NULL) relation_close(info->hrel, AccessShareLock); index_close(info->irel, AccessShareLock); SRF_RETURN_DONE(funcctx); }
/*
 * _bt_validate_tid() -- verify that a heap tid is not already in an index.
 *
 * For a newly inserted heap tid, scan every block of the index after the
 * metapage (BTREE_METAPAGE + 1 up to the relation's current block count) and
 * inspect every data item on each leaf page.  If an index tuple already
 * points at h_tid, raise an ERROR, which aborts the inserting transaction.
 * When the first index column is of type oid, its value is reported too.
 *
 * irel  -- index relation to scan
 * h_tid -- heap tuple identifier that was just inserted
 *
 * Block numbers and OIDs are unsigned, so they are printed with %u; the OID
 * key is fetched with DatumGetObjectId so values above INT_MAX are reported
 * correctly instead of as negative numbers.
 *
 * NOTE(review): pages are examined with only a buffer pin (no LockBuffer()
 * call here); presumably the MirroredLock and the caller's context make
 * that safe -- confirm.
 */
static void
_bt_validate_tid(Relation irel, ItemPointer h_tid)
{
	MIRROREDLOCK_BUFMGR_DECLARE;

	BlockNumber blkno;
	BlockNumber num_pages;
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque;
	IndexTuple	itup;
	OffsetNumber maxoff,
				minoff,
				offnum;

	elog(DEBUG1, "validating tid (%u,%d) for index (%s)",
		 ItemPointerGetBlockNumber(h_tid), ItemPointerGetOffsetNumber(h_tid),
		 RelationGetRelationName(irel));

	/* Block 0 is the metapage; every later block may be a leaf page */
	blkno = BTREE_METAPAGE + 1;
	num_pages = RelationGetNumberOfBlocks(irel);

	MIRROREDLOCK_BUFMGR_LOCK;
	for (; blkno < num_pages; blkno++)
	{
		buf = ReadBuffer(irel, blkno);
		page = BufferGetPage(buf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);

		/* _bt_checkpage() is skipped for new (never initialized) pages */
		if (!PageIsNew(page))
			_bt_checkpage(irel, buf);

		if (P_ISLEAF(opaque))
		{
			minoff = P_FIRSTDATAKEY(opaque);
			maxoff = PageGetMaxOffsetNumber(page);
			for (offnum = minoff;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				Form_pg_attribute key_att;
				Oid			key = InvalidOid;
				bool		isnull;

				itup = (IndexTuple) PageGetItem(page,
												PageGetItemId(page, offnum));
				if (!ItemPointerEquals(&itup->t_tid, h_tid))
					continue;

				/* Duplicate found: report it, with the key when it is an oid */
				key_att = RelationGetDescr(irel)->attrs[0];
				if (key_att->atttypid == OIDOID)
				{
					key = DatumGetObjectId(
						index_getattr(itup, 1, RelationGetDescr(irel),
									  &isnull));
					elog(ERROR,
						 "found tid (%u,%d), %s (%u) already in index (%s)",
						 ItemPointerGetBlockNumber(h_tid),
						 ItemPointerGetOffsetNumber(h_tid),
						 NameStr(key_att->attname), key,
						 RelationGetRelationName(irel));
				}
				else
				{
					elog(ERROR,
						 "found tid (%u,%d) already in index (%s)",
						 ItemPointerGetBlockNumber(h_tid),
						 ItemPointerGetOffsetNumber(h_tid),
						 RelationGetRelationName(irel));
				}
			}
		}
		ReleaseBuffer(buf);
	}
	MIRROREDLOCK_BUFMGR_UNLOCK;
}
/*
 * btvacuumpage --- VACUUM one page
 *
 * Processes a single page on behalf of btvacuumscan().  Sometimes a page
 * that was already scanned has to be looked at again; that case is handled
 * here by looping back with a new block number.
 *
 * blkno is the page to process.  orig_blkno is the highest block number
 * reached by the outer btvacuumscan loop (equal to blkno unless we are
 * re-examining an earlier page).
 */
static void
btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
{
	MIRROREDLOCK_BUFMGR_DECLARE;

	IndexVacuumInfo *vacinfo = vstate->info;
	IndexBulkDeleteResult *result = vstate->stats;
	IndexBulkDeleteCallback is_dead = vstate->callback;
	void	   *is_dead_arg = vstate->callback_state;
	Relation	index = vacinfo->index;

	/*
	 * Re-examining an earlier page is logically tail recursion.  It is
	 * written as a loop so that each level does not consume another copy of
	 * the (large) doomed[] array on the stack.
	 */
	for (;;)
	{
		bool		try_delete = false;
		BlockNumber revisit = P_NONE;
		Buffer		buffer;
		Page		pg;
		BTPageOpaque special;

		/* call vacuum_delay_point while not holding any buffer lock */
		vacuum_delay_point();

		/*
		 * _bt_getbuf() is not usable here: it always applies
		 * _bt_checkpage(), which rejects an all-zero page, whereas we want
		 * to recycle such pages.  We also want a nondefault buffer access
		 * strategy.
		 */
		MIRROREDLOCK_BUFMGR_LOCK;	/* ---- MirroredLock taken ---- */

		buffer = ReadBufferWithStrategy(index, blkno, vacinfo->strategy);
		LockBuffer(buffer, BT_READ);
		pg = BufferGetPage(buffer);
		special = (BTPageOpaque) PageGetSpecialPointer(pg);
		if (!PageIsNew(pg))
			_bt_checkpage(index, buffer);

		/*
		 * When re-examining an earlier page, the only interesting case is a
		 * live leaf page carrying the current vacuum cycle ID.  Any other
		 * state means the page was already dealt with (e.g. deleted as
		 * empty); in particular it must not be added to freePages twice.
		 */
		if (blkno != orig_blkno &&
			(_bt_page_recyclable(pg) ||
			 P_IGNORE(special) ||
			 !P_ISLEAF(special) ||
			 special->btpo_cycleid != vstate->cycleid))
		{
			_bt_relbuf(index, buffer);
			MIRROREDLOCK_BUFMGR_UNLOCK; /* ---- MirroredLock released ---- */
			return;
		}

		/* Page is valid, decide what to do with it */
		if (_bt_page_recyclable(pg))
		{
			/* Okay to recycle this page */
			if (vstate->nFreePages < vstate->maxFreePages)
				vstate->freePages[vstate->nFreePages++] = blkno;
			vstate->totFreePages++;
			result->pages_deleted++;
		}
		else if (P_ISDELETED(special))
		{
			/* Already deleted, but not recyclable yet */
			result->pages_deleted++;
		}
		else if (P_ISHALFDEAD(special))
		{
			/* Half-dead: attempt the deletion below */
			try_delete = true;
		}
		else if (P_ISLEAF(special))
		{
			OffsetNumber doomed[MaxOffsetNumber];
			int			ndoomed = 0;
			OffsetNumber cur;
			OffsetNumber first;
			OffsetNumber last;

			/*
			 * Swap the read lock for a super-exclusive (cleanup) lock.  Such
			 * a lock must be obtained on every leaf page during the vacuum
			 * scan, whether or not the page has anything to delete --- see
			 * nbtree/README.
			 */
			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
			LockBufferForCleanup(buffer);

			/*
			 * Do we need to go back to an earlier page?  The concern is a
			 * page split since the vacuum scan started: if it moved tuples
			 * to a lower-numbered page we may have missed them.  This has to
			 * be decided before btpo_cycleid is possibly cleared below.
			 */
			if (vstate->cycleid != 0 &&
				special->btpo_cycleid == vstate->cycleid &&
				!(special->btpo_flags & BTP_SPLIT_END) &&
				!P_RIGHTMOST(special) &&
				special->btpo_next < orig_blkno)
				revisit = special->btpo_next;

			/* Ask the callback about every data item on the page */
			first = P_FIRSTDATAKEY(special);
			last = PageGetMaxOffsetNumber(pg);
			if (is_dead)
			{
				for (cur = first; cur <= last; cur = OffsetNumberNext(cur))
				{
					IndexTuple	itup;

					itup = (IndexTuple) PageGetItem(pg,
													PageGetItemId(pg, cur));
					if (is_dead(&(itup->t_tid), is_dead_arg))
						doomed[ndoomed++] = cur;
				}
			}

			if (ndoomed > 0)
			{
				/*
				 * Remove everything in a single _bt_delitems() call per
				 * page, to keep WAL traffic down.
				 */
				_bt_delitems(index, buffer, doomed, ndoomed, true);
				result->tuples_removed += ndoomed;
				/* the page changed, so its max offset must be re-read */
				last = PageGetMaxOffsetNumber(pg);
			}
			else if (vstate->cycleid != 0 &&
					 special->btpo_cycleid == vstate->cycleid)
			{
				/*
				 * The page was split during this vacuum cycle.  Even with no
				 * deletions it is worth a write to clear btpo_cycleid, so
				 * the page is not processed again.  (When there are
				 * deletions, _bt_delitems takes care of this.)  No WAL
				 * logging is needed, so treat it like a hint-bit update.
				 */
				special->btpo_cycleid = 0;
				SetBufferCommitInfoNeedsSave(buffer);
			}

			/*
			 * An empty page becomes a deletion candidate, but not while
			 * re-examining an earlier page: that would put entries into
			 * freePages out of order.  Otherwise count the live tuples.
			 */
			if (first > last)
				try_delete = (blkno == orig_blkno);
			else
				result->num_index_tuples += last - first + 1;
		}

		if (try_delete)
		{
			MemoryContext saved_ctx;
			int			ndel;

			/* Run pagedel in a temp context to avoid memory leakage */
			MemoryContextReset(vstate->pagedelcontext);
			saved_ctx = MemoryContextSwitchTo(vstate->pagedelcontext);

			ndel = _bt_pagedel(index, buffer, NULL, vacinfo->vacuum_full);

			if (ndel)
			{
				/* count only this page, else may double-count parent */
				result->pages_deleted++;

				/*
				 * During VACUUM FULL a deleted page may be recycled at once,
				 * since no other transaction can be scanning the index.
				 * Only the current page is recycled, not any parent pages
				 * _bt_pagedel may have recursed to; doing more would require
				 * sorting the list of recyclable pages being built.
				 */
				if (vacinfo->vacuum_full)
				{
					if (vstate->nFreePages < vstate->maxFreePages)
						vstate->freePages[vstate->nFreePages++] = blkno;
					vstate->totFreePages++;
				}
			}

			MemoryContextSwitchTo(saved_ctx);
			/* pagedel released buffer, so we shouldn't */
		}
		else
			_bt_relbuf(index, buffer);

		MIRROREDLOCK_BUFMGR_UNLOCK;	/* ---- MirroredLock released ---- */

		/* Finished unless a split sent tuples to an earlier page */
		if (revisit == P_NONE)
			break;
		blkno = revisit;
	}
}
/* * Subroutine to pre-check whether a page deletion is safe, that is, its * parent page would be left in a valid or deletable state. * * "target" is the page we wish to delete, and "stack" is a search stack * leading to it (approximately). Note that we will update the stack * entry(s) to reflect current downlink positions --- this is harmless and * indeed saves later search effort in _bt_pagedel. * * Note: it's OK to release page locks after checking, because a safe * deletion can't become unsafe due to concurrent activity. A non-rightmost * page cannot become rightmost unless there's a concurrent page deletion, * but only VACUUM does page deletion and we only allow one VACUUM on an index * at a time. An only child could acquire a sibling (of the same parent) only * by being split ... but that would make it a non-rightmost child so the * deletion is still safe. */ static bool _bt_parent_deletion_safe(Relation rel, BlockNumber target, BTStack stack) { BlockNumber parent; OffsetNumber poffset, maxoff; Buffer pbuf; Page page; BTPageOpaque opaque; MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD; /* * In recovery mode, assume the deletion being replayed is valid. We * can't always check it because we won't have a full search stack, and we * should complain if there's a problem, anyway. */ if (InRecovery) return true; /* Locate the parent's downlink (updating the stack entry if needed) */ ItemPointerSet(&(stack->bts_btentry.t_tid), target, P_HIKEY); pbuf = _bt_getstackbuf(rel, stack, BT_READ); if (pbuf == InvalidBuffer) elog(ERROR, "failed to re-find parent key in index \"%s\" for deletion target page %u", RelationGetRelationName(rel), target); parent = stack->bts_blkno; poffset = stack->bts_offset; page = BufferGetPage(pbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); maxoff = PageGetMaxOffsetNumber(page); /* * If the target is the rightmost child of its parent, then we can't * delete, unless it's also the only child. 
*/ if (poffset >= maxoff) { /* It's rightmost child... */ if (poffset == P_FIRSTDATAKEY(opaque)) { /* * It's only child, so safe if parent would itself be removable. * We have to check the parent itself, and then recurse to test * the conditions at the parent's parent. */ if (P_RIGHTMOST(opaque) || P_ISROOT(opaque)) { _bt_relbuf(rel, pbuf); return false; } _bt_relbuf(rel, pbuf); return _bt_parent_deletion_safe(rel, parent, stack->bts_parent); } else { /* Unsafe to delete */ _bt_relbuf(rel, pbuf); return false; } } else { /* Not rightmost child, so safe to delete */ _bt_relbuf(rel, pbuf); return true; } }
/* * _bt_binsrch() -- Do a binary search for a key on a particular page. * * The passed scankey must be an insertion-type scankey (see nbtree/README), * but it can omit the rightmost column(s) of the index. * * When nextkey is false (the usual case), we are looking for the first * item >= scankey. When nextkey is true, we are looking for the first * item strictly greater than scankey. * * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first * key >= given scankey, or > scankey if nextkey is true. (NOTE: in * particular, this means it is possible to return a value 1 greater than the * number of keys on the page, if the scankey is > all keys on the page.) * * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber * of the last key < given scankey, or last key <= given scankey if nextkey * is true. (Since _bt_compare treats the first data key of such a page as * minus infinity, there will be at least one key < scankey, so the result * always points at one of the keys on the page.) This key indicates the * right place to descend to be sure we find all leaf keys >= given scankey * (or leaf keys > given scankey when nextkey is true). * * This procedure is not responsible for walking right, it just examines * the given page. _bt_binsrch() has no lock or refcount side effects * on the buffer. */ OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz, ScanKey scankey, bool nextkey) { Page page; BTPageOpaque opaque; OffsetNumber low, high; int32 result, cmpval; page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); low = P_FIRSTDATAKEY(opaque); high = PageGetMaxOffsetNumber(page); /* * If there are no keys on the page, return the first available slot. Note * this covers two cases: the page is really empty (no keys), or it * contains only a high key. The latter case is possible after vacuuming. * This can never happen on an internal page, however, since they are * never empty (an internal page must have children). 
*/ if (high < low) return low; /* * Binary search to find the first key on the page >= scan key, or first * key > scankey when nextkey is true. * * For nextkey=false (cmpval=1), the loop invariant is: all slots before * 'low' are < scan key, all slots at or after 'high' are >= scan key. * * For nextkey=true (cmpval=0), the loop invariant is: all slots before * 'low' are <= scan key, all slots at or after 'high' are > scan key. * * We can fall out when high == low. */ high++; /* establish the loop invariant for high */ cmpval = nextkey ? 0 : 1; /* select comparison value */ while (high > low) { OffsetNumber mid = low + ((high - low) / 2); /* We have low <= mid < high, so mid points at a real slot */ result = _bt_compare(rel, keysz, scankey, page, mid); if (result >= cmpval) low = mid + 1; else high = mid; } /* * At this point we have high == low, but be careful: they could point * past the last slot on the page. * * On a leaf page, we always return the first key >= scan key (resp. > * scan key), which could be the last slot + 1. */ if (P_ISLEAF(opaque)) return low; /* * On a non-leaf page, return the last key < scan key (resp. <= scan key). * There must be one if _bt_compare() is playing by the rules. */ Assert(low > P_FIRSTDATAKEY(opaque)); return OffsetNumberPrev(low); }
/*
 * _bt_get_endpoint() -- Locate the first or last page on a given tree level
 *
 * rel is the index, level the tree level wanted (0 = leaf), and rightmost
 * selects the last page on that level instead of the first.
 *
 * Returns InvalidBuffer when the root fetch yields no valid buffer (empty
 * index); every other failure is reported with elog(ERROR).  A dead page is
 * never returned.
 *
 * The returned buffer is pinned and read-locked.
 */
Buffer
_bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
{
	Buffer		buffer;
	Page		pg;
	BTPageOpaque special;

	/*
	 * For a leaf-level request it is fine to start from the fast root;
	 * otherwise start from the true root.  (Being smarter about
	 * intermediate levels is not worthwhile.)
	 */
	buffer = (level == 0) ? _bt_getroot(rel, BT_READ) : _bt_gettrueroot(rel);

	if (!BufferIsValid(buffer))
		return InvalidBuffer;

	pg = BufferGetPage(buffer);
	special = (BTPageOpaque) PageGetSpecialPointer(pg);

	for (;;)
	{
		OffsetNumber downlink_off;
		IndexTuple	downlink;
		BlockNumber child;

		/*
		 * Step right past pages that must be ignored (a live one has to
		 * exist).  When the rightmost page is wanted, also keep stepping
		 * right until we reach it; that can be necessary if the page split
		 * after we obtained a pointer to it.
		 */
		while (P_IGNORE(special) || (rightmost && !P_RIGHTMOST(special)))
		{
			BlockNumber sibling = special->btpo_next;

			if (sibling == P_NONE)
				elog(ERROR, "fell off the end of index \"%s\"",
					 RelationGetRelationName(rel));
			buffer = _bt_relandgetbuf(rel, buffer, sibling, BT_READ);
			pg = BufferGetPage(buffer);
			special = (BTPageOpaque) PageGetSpecialPointer(pg);
		}

		/* Reached the requested level: this is the endpoint page */
		if (special->btpo.level == level)
			return buffer;

		if (special->btpo.level < level)
			elog(ERROR, "btree level %u not found in index \"%s\"",
				 level, RelationGetRelationName(rel));

		/* Follow the leftmost or rightmost downlink one level down */
		downlink_off = rightmost ? PageGetMaxOffsetNumber(pg)
			: P_FIRSTDATAKEY(special);

		downlink = (IndexTuple) PageGetItem(pg,
											PageGetItemId(pg, downlink_off));
		child = ItemPointerGetBlockNumber(&(downlink->t_tid));

		buffer = _bt_relandgetbuf(rel, buffer, child, BT_READ);
		pg = BufferGetPage(buffer);
		special = (BTPageOpaque) PageGetSpecialPointer(pg);
	}
}
/**
 * @brief Read the left-most leaf page by walking down the index tree
 * structure from the root node.
 *
 * Process flow
 *	 -# Set up an smgr entry for the index file and read the meta page
 *	 -# Validate the meta page (meta flag, magic number, version)
 *	 -# Read the "fast root" page
 *	 -# Read the left child page until reaching the left-most leaf page,
 *		stepping right past pages that must be ignored
 *
 * After calling this function, the members of BTReader are the following:
 *	 - smgr   : Smgr relation of the existing index file.
 *	 - blkno  : block number of left-most leaf page. If there is no leaf page,
 *				InvalidBlockNumber is set.
 *	 - offnum : InvalidOffsetNumber is set.
 *	 - page   : Left-most leaf page, or undefined if no leaf page.
 *
 * An ERROR with ERRCODE_INDEX_CORRUPTED is raised when the meta page is not
 * a btree meta page or its version does not match BTREE_VERSION.
 *
 * @param reader [in/out] B-Tree index reader
 * @param rel [in] index relation to read
 * @return true iff there are some tuples
 */
static bool
BTReaderInit(BTReader *reader, Relation rel)
{
	BTPageOpaque	metaopaque;
	BTMetaPageData *metad;
	BTPageOpaque	opaque;
	BlockNumber		blkno;

	/*
	 * HACK: We cannot use smgropen because smgrs returned from it
	 * will be closed automatically when we assign a new file node.
	 *
	 * XXX: It might be better to open the previous relfilenode with
	 * smgropen *after* RelationSetNewRelfilenode.
	 */
	memset(&reader->smgr, 0, sizeof(reader->smgr));
#if PG_VERSION_NUM >= 90100
	reader->smgr.smgr_rnode.node = rel->rd_node;
	reader->smgr.smgr_rnode.backend =
		rel->rd_backend == MyBackendId ? MyBackendId : InvalidBackendId;
#else
	reader->smgr.smgr_rnode = rel->rd_node;
#endif
	reader->smgr.smgr_which = 0;	/* md.c */

	reader->blkno = InvalidBlockNumber;
	reader->offnum = InvalidOffsetNumber;
	reader->page = palloc(BLCKSZ);

	/*
	 * Read meta page and check sanity of it.
	 *
	 * XXX: It might be better to do REINDEX against corrupted indexes
	 * instead of raising errors because we've spent long time for data
	 * loading...
	 */
	BTReaderReadPage(reader, BTREE_METAPAGE);
	metaopaque = (BTPageOpaque) PageGetSpecialPointer(reader->page);
	metad = BTPageGetMeta(reader->page);

	/* The meta flag and the magic number identify a btree meta page */
	if (!(metaopaque->btpo_flags & BTP_META) ||
		metad->btm_magic != BTREE_MAGIC)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("index \"%s\" is not a btree",
						RelationGetRelationName(rel))));

	if (metad->btm_version != BTREE_VERSION)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("version mismatch in index \"%s\": file version %d,"
						" code version %d",
						RelationGetRelationName(rel),
						metad->btm_version, BTREE_VERSION)));

	if (metad->btm_root == P_NONE)
	{
		/* No root page; We ignore the index in the subsequent build. */
		reader->blkno = InvalidBlockNumber;
		return false;
	}

	/*
	 * Go to the fast root page.  The block number is copied out first
	 * because metad points into reader->page, which the read replaces.
	 */
	blkno = metad->btm_fastroot;
	BTReaderReadPage(reader, blkno);
	opaque = (BTPageOpaque) PageGetSpecialPointer(reader->page);

	/* Walk down to the left-most leaf page */
	while (!P_ISLEAF(opaque))
	{
		ItemId		firstid;
		IndexTuple	itup;

		/* Get the block number of the left child */
		firstid = PageGetItemId(reader->page, P_FIRSTDATAKEY(opaque));
		itup = (IndexTuple) PageGetItem(reader->page, firstid);
		blkno = ItemPointerGetBlockNumber(&(itup->t_tid));

		/* Go down to children, moving right past pages to be ignored */
		for (;;)
		{
			BTReaderReadPage(reader, blkno);
			opaque = (BTPageOpaque) PageGetSpecialPointer(reader->page);

			if (!P_IGNORE(opaque))
				break;

			if (P_RIGHTMOST(opaque))
			{
				/* We reach end of the index without any valid leaves. */
				reader->blkno = InvalidBlockNumber;
				return false;
			}
			blkno = opaque->btpo_next;
		}
	}

	return true;
}
/*
 * btree_xlog_split -- replay a B-tree page split WAL record.
 *
 * Block references used from the record:
 *	 0 - left (original) page, rebuilt only if XLogReadBufferForRedo says
 *		 it needs redo
 *	 1 - right (new) page, always re-initialized from the logged data
 *	 2 - page to the right of the new right page, if any; its left-link is
 *		 repointed at the new right page
 *	 3 - passed to _bt_clear_incomplete_split() for non-leaf splits
 *
 * onleft: true if the newly inserted item went to the left page; in that
 *		case the item is taken from the front of block 0's data.
 * isroot: not referenced anywhere in this function body.
 * record: the WAL record being replayed.
 */
static void
btree_xlog_split(bool onleft, bool isroot, XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record);
	bool		isleaf = (xlrec->level == 0);
	Buffer		lbuf;
	Buffer		rbuf;
	Page		rpage;
	BTPageOpaque ropaque;
	char	   *datapos;
	Size		datalen;
	Item		left_hikey = NULL;
	Size		left_hikeysz = 0;
	BlockNumber leftsib;
	BlockNumber rightsib;
	BlockNumber rnext;

	XLogRecGetBlockTag(record, 0, NULL, NULL, &leftsib);
	XLogRecGetBlockTag(record, 1, NULL, NULL, &rightsib);
	/* Block 2 is optional: without it the new right page has no right sibling */
	if (!XLogRecGetBlockTag(record, 2, NULL, NULL, &rnext))
		rnext = P_NONE;

	/*
	 * Clear the incomplete split flag on the left sibling of the child page
	 * this is a downlink for.  (Like in btree_xlog_insert, this can be done
	 * before locking the other pages)
	 */
	if (!isleaf)
		_bt_clear_incomplete_split(record, 3);

	/* Reconstruct right (new) sibling page from scratch */
	rbuf = XLogInitBufferForRedo(record, 1);
	datapos = XLogRecGetBlockData(record, 1, &datalen);
	rpage = (Page) BufferGetPage(rbuf);

	_bt_pageinit(rpage, BufferGetPageSize(rbuf));
	ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage);

	ropaque->btpo_prev = leftsib;
	ropaque->btpo_next = rnext;
	ropaque->btpo.level = xlrec->level;
	ropaque->btpo_flags = isleaf ? BTP_LEAF : 0;
	ropaque->btpo_cycleid = 0;

	_bt_restore_page(rpage, datapos, datalen);

	/*
	 * On leaf level, the high key of the left page is equal to the first key
	 * on the right page.  Note that left_hikey then points into rpage, which
	 * is why rbuf stays locked until the left page has been rebuilt.
	 */
	if (isleaf)
	{
		ItemId		hiItemId = PageGetItemId(rpage, P_FIRSTDATAKEY(ropaque));

		left_hikey = PageGetItem(rpage, hiItemId);
		left_hikeysz = ItemIdGetLength(hiItemId);
	}

	PageSetLSN(rpage, lsn);
	MarkBufferDirty(rbuf);

	/* don't release the buffer yet; we touch right page's first item below */

	/* Now reconstruct left (original) sibling page */
	if (XLogReadBufferForRedo(record, 0, &lbuf) == BLK_NEEDS_REDO)
	{
		/*
		 * To retain the same physical order of the tuples that they had, we
		 * initialize a temporary empty page for the left page and add all the
		 * items to that in item number order.  This mirrors how _bt_split()
		 * works.  It's not strictly required to retain the same physical
		 * order, as long as the items are in the correct item number order,
		 * but it helps debugging.  See also _bt_restore_page(), which does
		 * the same for the right page.
		 */
		Page		lpage = (Page) BufferGetPage(lbuf);
		BTPageOpaque lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
		OffsetNumber off;
		Item		newitem = NULL;
		Size		newitemsz = 0;
		Page		newlpage;
		OffsetNumber leftoff;

		datapos = XLogRecGetBlockData(record, 0, &datalen);

		/* The new item, when it belongs on the left, comes first in the data */
		if (onleft)
		{
			newitem = (Item) datapos;
			newitemsz = MAXALIGN(IndexTupleSize(newitem));
			datapos += newitemsz;
			datalen -= newitemsz;
		}

		/*
		 * For non-leaf splits the left high key is stored explicitly in the
		 * record; its size is rounded up with MAXALIGN.
		 */
		if (!isleaf)
		{
			left_hikey = (Item) datapos;
			left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey));
			datapos += left_hikeysz;
			datalen -= left_hikeysz;
		}
		/* All of block 0's data must have been consumed by now */
		Assert(datalen == 0);

		newlpage = PageGetTempPageCopySpecial(lpage);

		/* Set high key */
		leftoff = P_HIKEY;
		if (PageAddItem(newlpage, left_hikey, left_hikeysz,
						P_HIKEY, false, false) == InvalidOffsetNumber)
			elog(PANIC, "failed to add high key to left page after split");
		leftoff = OffsetNumberNext(leftoff);

		/* Copy the items that stay on the left, i.e. those before firstright */
		for (off = P_FIRSTDATAKEY(lopaque); off < xlrec->firstright; off++)
		{
			ItemId		itemid;
			Size		itemsz;
			Item		item;

			/* add the new item if it was inserted on left page */
			if (onleft && off == xlrec->newitemoff)
			{
				if (PageAddItem(newlpage, newitem, newitemsz, leftoff,
								false, false) == InvalidOffsetNumber)
					elog(ERROR, "failed to add new item to left page after split");
				leftoff = OffsetNumberNext(leftoff);
			}

			itemid = PageGetItemId(lpage, off);
			itemsz = ItemIdGetLength(itemid);
			item = PageGetItem(lpage, itemid);
			if (PageAddItem(newlpage, item, itemsz, leftoff,
							false, false) == InvalidOffsetNumber)
				elog(ERROR, "failed to add old item to left page after split");
			leftoff = OffsetNumberNext(leftoff);
		}

		/* cope with possibility that newitem goes at the end */
		if (onleft && off == xlrec->newitemoff)
		{
			if (PageAddItem(newlpage, newitem, newitemsz, leftoff,
							false, false) == InvalidOffsetNumber)
				elog(ERROR, "failed to add new item to left page after split");
			leftoff = OffsetNumberNext(leftoff);
		}

		/* Replace the old left page contents with the rebuilt temp page */
		PageRestoreTempPage(newlpage, lpage);

		/* Fix opaque fields */
		lopaque->btpo_flags = BTP_INCOMPLETE_SPLIT;
		if (isleaf)
			lopaque->btpo_flags |= BTP_LEAF;
		lopaque->btpo_next = rightsib;
		lopaque->btpo_cycleid = 0;

		PageSetLSN(lpage, lsn);
		MarkBufferDirty(lbuf);
	}

	/* We no longer need the buffers */
	if (BufferIsValid(lbuf))
		UnlockReleaseBuffer(lbuf);
	UnlockReleaseBuffer(rbuf);

	/*
	 * Fix left-link of the page to the right of the new right sibling.
	 *
	 * Note: in normal operation, we do this while still holding lock on the
	 * two split pages.  However, that's not necessary for correctness in WAL
	 * replay, because no other index update can be in progress, and readers
	 * will cope properly when following an obsolete left-link.
	 */
	if (rnext != P_NONE)
	{
		Buffer		buffer;

		if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO)
		{
			Page		page = (Page) BufferGetPage(buffer);
			BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page);

			pageop->btpo_prev = rightsib;

			PageSetLSN(page, lsn);
			MarkBufferDirty(buffer);
		}
		if (BufferIsValid(buffer))
			UnlockReleaseBuffer(buffer);
	}
}
Datum readindex(PG_FUNCTION_ARGS) { FuncCallContext *funcctx; readindexinfo *info; Relation irel = NULL; Relation hrel = NULL; MIRROREDLOCK_BUFMGR_DECLARE; if (SRF_IS_FIRSTCALL()) { Oid irelid = PG_GETARG_OID(0); TupleDesc tupdesc; MemoryContext oldcontext; AttrNumber outattnum; TupleDesc itupdesc; int i; AttrNumber attno; irel = index_open(irelid, AccessShareLock); itupdesc = RelationGetDescr(irel); outattnum = FIXED_COLUMN + itupdesc->natts; funcctx = SRF_FIRSTCALL_INIT(); oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); tupdesc = CreateTemplateTupleDesc(outattnum, false); attno = 1; TupleDescInitEntry(tupdesc, attno++, "ictid", TIDOID, -1, 0); TupleDescInitEntry(tupdesc, attno++, "hctid", TIDOID, -1, 0); TupleDescInitEntry(tupdesc, attno++, "aotid", TEXTOID, -1, 0); TupleDescInitEntry(tupdesc, attno++, "istatus", TEXTOID, -1, 0); TupleDescInitEntry(tupdesc, attno++, "hstatus", TEXTOID, -1, 0); for (i = 0; i < itupdesc->natts; i++) { Form_pg_attribute attr = itupdesc->attrs[i]; TupleDescInitEntry(tupdesc, attno++, NameStr(attr->attname), attr->atttypid, attr->atttypmod, 0); } funcctx->tuple_desc = BlessTupleDesc(tupdesc); info = (readindexinfo *) palloc(sizeof(readindexinfo)); funcctx->user_fctx = (void *) info; info->outattnum = outattnum; info->ireloid = irelid; hrel = relation_open(irel->rd_index->indrelid, AccessShareLock); if (hrel->rd_rel != NULL && (hrel->rd_rel->relstorage == 'a' || hrel->rd_rel->relstorage == 'c')) { relation_close(hrel, AccessShareLock); hrel = NULL; info->hreloid = InvalidOid; } else info->hreloid = irel->rd_index->indrelid; info->num_pages = RelationGetNumberOfBlocks(irel); info->blkno = BTREE_METAPAGE + 1; info->page = NULL; MemoryContextSwitchTo(oldcontext); } funcctx = SRF_PERCALL_SETUP(); info = (readindexinfo *) funcctx->user_fctx; /* * Open the relations (on first call, we did that above already). 
* We unfortunately have to look up the relcache entry on every call, * because if we store it in the cross-call context, we won't get a * chance to release it if the function isn't run to completion, * e.g. because of a LIMIT clause. We only lock the relation on the * first call, and keep the lock until completion, however. */ if (!irel) irel = index_open(info->ireloid, NoLock); if (!hrel && info->hreloid != InvalidOid) hrel = heap_open(info->hreloid, NoLock); while (info->blkno < info->num_pages) { Datum values[255]; bool nulls[255]; ItemPointerData itid; HeapTuple tuple; Datum result; if (info->page == NULL) { Buffer buf; /* * Make copy of the page, because we cannot hold a buffer pin * across calls (we wouldn't have a chance to release it, if the * function isn't run to completion.) */ info->page = palloc(BLCKSZ); MIRROREDLOCK_BUFMGR_LOCK; buf = ReadBuffer(irel, info->blkno); memcpy(info->page, BufferGetPage(buf), BLCKSZ); ReleaseBuffer(buf); MIRROREDLOCK_BUFMGR_UNLOCK; info->opaque = (BTPageOpaque) PageGetSpecialPointer(info->page); info->minoff = P_FIRSTDATAKEY(info->opaque); info->maxoff = PageGetMaxOffsetNumber(info->page); info->offnum = info->minoff; } if (!P_ISLEAF(info->opaque) || info->offnum > info->maxoff) { pfree(info->page); info->page = NULL; info->blkno++; continue; } MemSet(nulls, false, info->outattnum * sizeof(bool)); ItemPointerSet(&itid, info->blkno, info->offnum); values[0] = ItemPointerGetDatum(&itid); readindextuple(info, irel, hrel, values, nulls); info->offnum = OffsetNumberNext(info->offnum); tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); result = HeapTupleGetDatum(tuple); if (hrel != NULL) heap_close(hrel, NoLock); index_close(irel, NoLock); SRF_RETURN_NEXT(funcctx, result); } if (hrel != NULL) heap_close(hrel, AccessShareLock); index_close(irel, AccessShareLock); SRF_RETURN_DONE(funcctx); }
/*
 * _bt_pagedel() -- Delete a page from the b-tree.
 *
 * This action unlinks the page from the b-tree structure, removing all
 * pointers leading to it --- but not touching its own left and right links.
 * The page cannot be physically reclaimed right away, since other processes
 * may currently be trying to follow links leading to the page; they have to
 * be allowed to use its right-link to recover.  See nbtree/README.
 *
 * On entry, the target buffer must be pinned and read-locked.  This lock and
 * pin will be dropped before exiting.
 *
 * Parameters:
 *	 rel		 - the index relation
 *	 buf		 - buffer holding the page to delete
 *	 vacuum_full - if true the deleted page is stamped with
 *				   FrozenTransactionId, otherwise with the value returned
 *				   by ReadNewTransactionId(); it is also passed unchanged to
 *				   recursive calls
 *
 * Returns the number of pages successfully deleted (zero on failure; could
 * be more than one if parent blocks were deleted).
 *
 * NOTE: this leaks memory.  Rather than trying to clean up everything
 * carefully, it's better to run it in a temp context that can be reset
 * frequently.
 */
int
_bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
{
	BlockNumber target,
				leftsib,
				rightsib,
				parent;
	OffsetNumber poffset,
				maxoff;
	uint32		targetlevel,
				ilevel;
	ItemId		itemid;
	BTItem		targetkey,
				btitem;
	ScanKey		itup_scankey;
	BTStack		stack;
	Buffer		lbuf,
				rbuf,
				pbuf;
	bool		parent_half_dead;
	bool		parent_one_child;
	bool		rightsib_empty;
	Buffer		metabuf = InvalidBuffer;
	Page		metapg = NULL;
	BTMetaPageData *metad = NULL;
	Page		page;
	BTPageOpaque opaque;

	/*
	 * We can never delete rightmost pages nor root pages.  While at it, check
	 * that page is not already deleted and is empty.
	 */
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
		P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
	{
		_bt_relbuf(rel, buf);
		return 0;
	}

	/*
	 * Save info about page, including a copy of its high key (it must have
	 * one, being non-rightmost).
	 */
	target = BufferGetBlockNumber(buf);
	targetlevel = opaque->btpo.level;
	leftsib = opaque->btpo_prev;
	itemid = PageGetItemId(page, P_HIKEY);
	targetkey = CopyBTItem((BTItem) PageGetItem(page, itemid));

	/*
	 * We need to get an approximate pointer to the page's parent page. Use
	 * the standard search mechanism to search for the page's high key; this
	 * will give us a link to either the current parent or someplace to its
	 * left (if there are multiple equal high keys).  To avoid deadlocks, we'd
	 * better drop the target page lock first.
	 */
	_bt_relbuf(rel, buf);
	/* we need a scan key to do our search, so build one */
	itup_scankey = _bt_mkscankey(rel, &(targetkey->bti_itup));
	/* find the leftmost leaf page containing this key */
	stack = _bt_search(rel, rel->rd_rel->relnatts, itup_scankey, false,
					   &lbuf, BT_READ);
	/* don't need a pin on that either */
	_bt_relbuf(rel, lbuf);

	/*
	 * If we are trying to delete an interior page, _bt_search did more than
	 * we needed.  Locate the stack item pointing to our parent level.
	 * (One bts_parent step is taken per level of the target above level 0.)
	 */
	ilevel = 0;
	for (;;)
	{
		if (stack == NULL)
			elog(ERROR, "not enough stack items");
		if (ilevel == targetlevel)
			break;
		stack = stack->bts_parent;
		ilevel++;
	}

	/*
	 * We have to lock the pages we need to modify in the standard order:
	 * moving right, then up.  Else we will deadlock against other writers.
	 *
	 * So, we need to find and write-lock the current left sibling of the
	 * target page.  The sibling that was current a moment ago could have
	 * split, so we may have to move right.  This search could fail if either
	 * the sibling or the target page was deleted by someone else meanwhile;
	 * if so, give up.  (Right now, that should never happen, since page
	 * deletion is only done in VACUUM and there shouldn't be multiple VACUUMs
	 * concurrently on the same table.)
	 */
	if (leftsib != P_NONE)
	{
		lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
		page = BufferGetPage(lbuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		while (P_ISDELETED(opaque) || opaque->btpo_next != target)
		{
			/* step right one page */
			leftsib = opaque->btpo_next;
			_bt_relbuf(rel, lbuf);
			if (leftsib == P_NONE)
			{
				/* no buffers are held at this point, so just give up */
				elog(LOG, "no left sibling (concurrent deletion?) in \"%s\"",
					 RelationGetRelationName(rel));
				return 0;
			}
			lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
			page = BufferGetPage(lbuf);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		}
	}
	else
		lbuf = InvalidBuffer;

	/*
	 * Next write-lock the target page itself.  It should be okay to take just
	 * a write lock not a superexclusive lock, since no scans would stop on an
	 * empty page.
	 */
	buf = _bt_getbuf(rel, target, BT_WRITE);
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	/*
	 * Check page is still empty etc, else abandon deletion.  The empty check
	 * is necessary since someone else might have inserted into it while we
	 * didn't have it locked; the others are just for paranoia's sake.
	 */
	if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
		P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
	{
		_bt_relbuf(rel, buf);
		if (BufferIsValid(lbuf))
			_bt_relbuf(rel, lbuf);
		return 0;
	}
	if (opaque->btpo_prev != leftsib)
		elog(ERROR, "left link changed unexpectedly in block %u of \"%s\"",
			 target, RelationGetRelationName(rel));

	/*
	 * And next write-lock the (current) right sibling.
	 */
	rightsib = opaque->btpo_next;
	rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);

	/*
	 * Next find and write-lock the current parent of the target page. This is
	 * essentially the same as the corresponding step of splitting.  However,
	 * it's possible for the search to fail (for reasons explained in README).
	 * If that happens, we recover by searching the whole parent level, which
	 * is a tad inefficient but doesn't happen often enough to be a problem.
	 */
	ItemPointerSet(&(stack->bts_btitem.bti_itup.t_tid),
				   target, P_HIKEY);
	pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
	if (pbuf == InvalidBuffer)
	{
		/* Find the leftmost page in the parent level */
		pbuf = _bt_get_endpoint(rel, opaque->btpo.level + 1, false);
		stack->bts_blkno = BufferGetBlockNumber(pbuf);
		stack->bts_offset = InvalidOffsetNumber;
		_bt_relbuf(rel, pbuf);
		/* and repeat search from there */
		pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
		if (pbuf == InvalidBuffer)
			elog(ERROR, "failed to re-find parent key in \"%s\" for deletion target page %u",
				 RelationGetRelationName(rel), target);
	}
	parent = stack->bts_blkno;
	poffset = stack->bts_offset;

	/*
	 * If the target is the rightmost child of its parent, then we can't
	 * delete, unless it's also the only child --- in which case the parent
	 * changes to half-dead status.
	 */
	page = BufferGetPage(pbuf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	maxoff = PageGetMaxOffsetNumber(page);
	parent_half_dead = false;
	parent_one_child = false;
	if (poffset >= maxoff)
	{
		if (poffset == P_FIRSTDATAKEY(opaque))
			parent_half_dead = true;
		else
		{
			/* rightmost but not only child: release everything and give up */
			_bt_relbuf(rel, pbuf);
			_bt_relbuf(rel, rbuf);
			_bt_relbuf(rel, buf);
			if (BufferIsValid(lbuf))
				_bt_relbuf(rel, lbuf);
			return 0;
		}
	}
	else
	{
		/* Will there be exactly one child left in this parent? */
		if (OffsetNumberNext(P_FIRSTDATAKEY(opaque)) == maxoff)
			parent_one_child = true;
	}

	/*
	 * If we are deleting the next-to-last page on the target's level, then
	 * the rightsib is a candidate to become the new fast root. (In theory, it
	 * might be possible to push the fast root even further down, but the odds
	 * of doing so are slim, and the locking considerations daunting.)
	 *
	 * We can safely acquire a lock on the metapage here --- see comments for
	 * _bt_newroot().
	 */
	if (leftsib == P_NONE)
	{
		page = BufferGetPage(rbuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		Assert(opaque->btpo.level == targetlevel);
		if (P_RIGHTMOST(opaque))
		{
			/* rightsib will be the only one left on the level */
			metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
			metapg = BufferGetPage(metabuf);
			metad = BTPageGetMeta(metapg);

			/*
			 * The expected case here is btm_fastlevel == targetlevel+1; if
			 * the fastlevel is <= targetlevel, something is wrong, and we
			 * choose to overwrite it to fix it.
			 */
			if (metad->btm_fastlevel > targetlevel + 1)
			{
				/* no update wanted */
				_bt_relbuf(rel, metabuf);
				metabuf = InvalidBuffer;
			}
		}
	}

	/*
	 * Here we begin doing the deletion.
	 */

	/* No ereport(ERROR) until changes are logged */
	START_CRIT_SECTION();

	/*
	 * Update parent.  The normal case is a tad tricky because we want to
	 * delete the target's downlink and the *following* key.  Easiest way is
	 * to copy the right sibling's downlink over the target downlink, and then
	 * delete the following item.
	 */
	page = BufferGetPage(pbuf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	if (parent_half_dead)
	{
		/* target was the parent's only child: drop its downlink outright */
		PageIndexTupleDelete(page, poffset);
		opaque->btpo_flags |= BTP_HALF_DEAD;
	}
	else
	{
		OffsetNumber nextoffset;

		itemid = PageGetItemId(page, poffset);
		btitem = (BTItem) PageGetItem(page, itemid);
		Assert(ItemPointerGetBlockNumber(&(btitem->bti_itup.t_tid)) == target);
		ItemPointerSet(&(btitem->bti_itup.t_tid), rightsib, P_HIKEY);

		nextoffset = OffsetNumberNext(poffset);
		/* This part is just for double-checking */
		itemid = PageGetItemId(page, nextoffset);
		btitem = (BTItem) PageGetItem(page, itemid);
		if (ItemPointerGetBlockNumber(&(btitem->bti_itup.t_tid)) != rightsib)
			elog(PANIC, "right sibling is not next child in \"%s\"",
				 RelationGetRelationName(rel));
		PageIndexTupleDelete(page, nextoffset);
	}

	/*
	 * Update siblings' side-links.  Note the target page's side-links will
	 * continue to point to the siblings.
	 */
	if (BufferIsValid(lbuf))
	{
		page = BufferGetPage(lbuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		Assert(opaque->btpo_next == target);
		opaque->btpo_next = rightsib;
	}
	page = BufferGetPage(rbuf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	Assert(opaque->btpo_prev == target);
	opaque->btpo_prev = leftsib;
	/* remembered for the recursion decision at the end of the function */
	rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));

	/*
	 * Mark the page itself deleted.  It can be recycled when all current
	 * transactions are gone; or immediately if we're doing VACUUM FULL.
	 */
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	opaque->btpo_flags |= BTP_DELETED;
	opaque->btpo.xact =
		vacuum_full ? FrozenTransactionId : ReadNewTransactionId();

	/* And update the metapage, if needed */
	if (BufferIsValid(metabuf))
	{
		metad->btm_fastroot = rightsib;
		metad->btm_fastlevel = targetlevel;
	}

	/* XLOG stuff */
	if (!rel->rd_istemp)
	{
		xl_btree_delete_page xlrec;
		xl_btree_metadata xlmeta;
		uint8		xlinfo;
		XLogRecPtr	recptr;
		/* up to: fixed part, metadata, parent, right sibling, left sibling */
		XLogRecData rdata[5];
		XLogRecData *nextrdata;

		xlrec.target.node = rel->rd_node;
		ItemPointerSet(&(xlrec.target.tid), parent, poffset);
		xlrec.deadblk = target;
		xlrec.leftblk = leftsib;
		xlrec.rightblk = rightsib;

		rdata[0].data = (char *) &xlrec;
		rdata[0].len = SizeOfBtreeDeletePage;
		rdata[0].buffer = InvalidBuffer;
		rdata[0].next = nextrdata = &(rdata[1]);

		/* Optional metadata payload; its presence selects the record type */
		if (BufferIsValid(metabuf))
		{
			xlmeta.root = metad->btm_root;
			xlmeta.level = metad->btm_level;
			xlmeta.fastroot = metad->btm_fastroot;
			xlmeta.fastlevel = metad->btm_fastlevel;

			nextrdata->data = (char *) &xlmeta;
			nextrdata->len = sizeof(xl_btree_metadata);
			nextrdata->buffer = InvalidBuffer;
			nextrdata->next = nextrdata + 1;
			nextrdata++;
			xlinfo = XLOG_BTREE_DELETE_PAGE_META;
		}
		else
			xlinfo = XLOG_BTREE_DELETE_PAGE;

		/* Data-less entry that attaches the parent buffer to the record */
		nextrdata->data = NULL;
		nextrdata->len = 0;
		nextrdata->next = nextrdata + 1;
		nextrdata->buffer = pbuf;
		nextrdata->buffer_std = true;
		nextrdata++;

		/* Likewise for the right sibling; end of chain unless lbuf follows */
		nextrdata->data = NULL;
		nextrdata->len = 0;
		nextrdata->buffer = rbuf;
		nextrdata->buffer_std = true;
		nextrdata->next = NULL;

		if (BufferIsValid(lbuf))
		{
			nextrdata->next = nextrdata + 1;
			nextrdata++;
			nextrdata->data = NULL;
			nextrdata->len = 0;
			nextrdata->buffer = lbuf;
			nextrdata->buffer_std = true;
			nextrdata->next = NULL;
		}

		recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata);

		/* Stamp every modified page with the record's LSN and timeline */
		if (BufferIsValid(metabuf))
		{
			PageSetLSN(metapg, recptr);
			PageSetTLI(metapg, ThisTimeLineID);
		}
		page = BufferGetPage(pbuf);
		PageSetLSN(page, recptr);
		PageSetTLI(page, ThisTimeLineID);
		page = BufferGetPage(rbuf);
		PageSetLSN(page, recptr);
		PageSetTLI(page, ThisTimeLineID);
		page = BufferGetPage(buf);
		PageSetLSN(page, recptr);
		PageSetTLI(page, ThisTimeLineID);
		if (BufferIsValid(lbuf))
		{
			page = BufferGetPage(lbuf);
			PageSetLSN(page, recptr);
			PageSetTLI(page, ThisTimeLineID);
		}
	}

	END_CRIT_SECTION();

	/* Write and release buffers */
	if (BufferIsValid(metabuf))
		_bt_wrtbuf(rel, metabuf);
	_bt_wrtbuf(rel, pbuf);
	_bt_wrtbuf(rel, rbuf);
	_bt_wrtbuf(rel, buf);
	if (BufferIsValid(lbuf))
		_bt_wrtbuf(rel, lbuf);

	/*
	 * If parent became half dead, recurse to try to delete it. Otherwise, if
	 * right sibling is empty and is now the last child of the parent, recurse
	 * to try to delete it.  (These cases cannot apply at the same time,
	 * though the second case might itself recurse to the first.)
	 */
	if (parent_half_dead)
	{
		buf = _bt_getbuf(rel, parent, BT_READ);
		return _bt_pagedel(rel, buf, vacuum_full) + 1;
	}
	if (parent_one_child && rightsib_empty)
	{
		buf = _bt_getbuf(rel, rightsib, BT_READ);
		return _bt_pagedel(rel, buf, vacuum_full) + 1;
	}

	return 1;
}
/*
 * Decide whether the index tuple at (page, offnum) satisfies every scan key.
 *
 * On a match, the tuple's TID is stored in scan->xs_ctup.t_self and true is
 * returned.  Otherwise false is returned and xs_ctup is left untouched.
 *
 * *continuescan is always set: it starts out true and is cleared when a
 * failed key proves that no later tuple in the scan direction can match.
 * See the comments for bt_preproc_keys() about how required keys are marked.
 *
 * scan: index scan descriptor (containing a search-type scankey)
 * page: buffer page containing index tuple
 * offnum: offset number of index tuple (must be a valid item!)
 * dir: direction we are scanning in
 * continuescan: output parameter (will be set correctly in all cases)
 */
bool bt_check_keys(
	struct index_scan* scan,
	page_p page,
	item_id_t offnum,
	enum scandir dir,
	bool* continuescan)
{
	struct item_id *lp;
	struct index_tuple *itup;
	struct tuple *desc;
	struct bt_scan_opaque *bso;
	bool report_match;
	int nkeys;
	int k;

	lp = PAGE_ITEM_ID(page, offnum);

	/* Until proven otherwise, the scan goes on. */
	*continuescan = true;

	/*
	 * A killed tuple is never reported when the scan ignores killed tuples.
	 * Usually we can skip it without looking at its keys.  The exception is
	 * the last tuple of the page in scan direction: there the keys are still
	 * evaluated so that *continuescan can spare us a useless step to the
	 * next page.
	 */
	report_match = !(scan->ignore_killed_tuples && ITEMID_DEAD(lp));
	if (!report_match) {
		if (SCANDIR_FORWARD(dir)) {
			if (offnum < PAGE_MAX_ITEM_ID(page))
				return false;
		} else {
			struct bt_page_opaque *popaque;

			popaque = (struct bt_page_opaque *) PAGE_SPECIAL_PTR(page);
			if (offnum > P_FIRSTDATAKEY(popaque))
				return false;
		}
		/* Fall through: keys are tested, but the result stays false. */
	}

	itup = (struct index_tuple*) PAGE_GET_ITEM(page, lp);
	desc = REL_DESC(scan->indexRelation);
	bso = (struct bt_scan_opaque*) scan->opaque;
	nkeys = bso->numberOfKeys;

	for (k = 0; k < nkeys; k++) {
		struct scankey *sk = &bso->keyData[k];
		datum_t attr;
		datum_t verdict;
		bool attr_is_null;
		bool stop_fwd;
		bool stop_bkwd;

		/* Row comparisons are delegated to their own checker. */
		if (sk->sk_flags & SK_ROW_HEADER) {
			if (!bt_check_rowcompare(sk, itup, desc, dir, continuescan))
				return false;
			continue;
		}

		attr = index_getattr(itup, sk->sk_attno, desc, &attr_is_null);

		/*
		 * Would failing this key end the scan?  Only if the key is marked
		 * as required for the direction we are actually moving in.
		 */
		stop_fwd = (sk->sk_flags & SK_BT_REQFWD) && SCANDIR_FORWARD(dir);
		stop_bkwd = (sk->sk_flags & SK_BT_REQBKWD) && SCANDIR_BACKWARD(dir);

		if (sk->sk_flags & SK_ISNULL) {
			/* IS NULL / IS NOT NULL tests */
			if (sk->sk_flags & SK_SEARCHNULL) {
				if (attr_is_null)
					continue;	/* qual satisfied */
			} else {
				ASSERT(sk->sk_flags & SK_SEARCHNOTNULL);
				if (!attr_is_null)
					continue;	/* qual satisfied */
			}

			/* Qual failed; a required qual also ends the scan. */
			if (stop_fwd || stop_bkwd)
				*continuescan = false;
			return false;
		}

		if (attr_is_null) {
			/*
			 * An ordinary operator never matches a NULL.  Whether we can
			 * stop depends on where NULLs sort: with NULLS FIRST we have hit
			 * the lower end of the value range, which only ends a backward
			 * scan; otherwise we have hit the upper end, which only ends a
			 * forward scan.
			 */
			if (sk->sk_flags & SK_BT_NULLS_FIRST) {
				if (stop_bkwd)
					*continuescan = false;
			} else {
				if (stop_fwd)
					*continuescan = false;
			}
			return false;
		}

		verdict = fc_2coll(&sk->sk_func, sk->sk_collation, attr, sk->sk_argument);
		if (D_TO_BOOL(verdict))
			continue;

		/*
		 * Qual failed; a required qual also ends the scan.
		 *
		 * Note: because the scan stops as soon as any required equality qual
		 * fails, it is critical that equality quals be used for the initial
		 * positioning in _bt_first() when they are available.  See comments
		 * in _bt_first().
		 */
		if (stop_fwd || stop_bkwd)
			*continuescan = false;
		return false;
	}

	/* Every key passed; report the TID unless the tuple was killed. */
	if (report_match)
		scan->xs_ctup.t_self = itup->t_tid;

	return report_match;
}
/* * _bt_killitems - set LP_DEAD state for items an indexscan caller has * told us were killed * * scan->so contains information about the current page and killed tuples * thereon (generally, this should only be called if so->numKilled > 0). * * The caller must have pin on so->currPos.buf, but may or may not have * read-lock, as indicated by haveLock. Note that we assume read-lock * is sufficient for setting LP_DEAD status (which is only a hint). * * We match items by heap TID before assuming they are the right ones to * delete. We cope with cases where items have moved right due to insertions. * If an item has moved off the current page due to a split, we'll fail to * find it and do nothing (this is not an error case --- we assume the item * will eventually get marked in a future indexscan). Note that because we * hold pin on the target page continuously from initially reading the items * until applying this function, VACUUM cannot have deleted any items from * the page, and so there is no need to search left from the recorded offset. * (This observation also guarantees that the item is still the right one * to delete, which might otherwise be questionable since heap TIDs can get * recycled.) 
*/ void _bt_killitems(struct index_scan *scan, bool haveLock) { struct bt_scan_opaque *so; page_p page; struct bt_page_opaque *opaque; item_id_t minoff; item_id_t maxoff; int i; bool killedsomething = false; so = (struct bt_scan_opaque *)scan->opaque; ASSERT(BUF_VALID(so->currPos.buf)); if (!haveLock) lock_buf(so->currPos.buf, BT_READ); page = BUF_PAGE(so->currPos.buf); opaque = (struct bt_page_opaque *)PAGE_SPECIAL_PTR(page); minoff = P_FIRSTDATAKEY(opaque); maxoff = PAGE_MAX_ITEM_ID(page); for (i = 0; i < so->numKilled; i++) { int itemIndex = so->killedItems[i]; struct bt_scan_item *kitem = &so->currPos.items[itemIndex]; item_id_t offnum = kitem->indexOffset; ASSERT(itemIndex >= so->currPos.firstItem && itemIndex <= so->currPos.lastItem); if (offnum < minoff) continue; /* pure paranoia */ while (offnum <= maxoff) { struct item_id *iid; struct index_tuple *ituple; iid = PAGE_ITEM_ID(page, offnum); ituple = (struct index_tuple *)PAGE_GET_ITEM(page, iid); if (item_ptr_eq(&ituple->t_tid, &kitem->heapTid)) { /* found the item */ ITEMID_MARK_DEAD(iid); killedsomething = true; break; /* out of inner search loop */ } offnum = ITEM_ID_NEXT(offnum); } } /* * Since this can be redone later if needed, it's treated the same as a * commit-hint-bit status update for heap tuples: we mark the buffer dirty * but don't make a WAL log entry. * * Whenever we mark anything LP_DEAD, we also set the page's * BTP_HAS_GARBAGE flag, which is likewise just a hint. */ if (killedsomething) { opaque->btpo_flags |= BTP_HAS_GARBAGE; set_buf_commit_needs_save(so->currPos.buf); } if (!haveLock) lock_buf(so->currPos.buf, BUF_LOCK_UNLOCK); /* * Always reset the scan state, so we don't look for same items on other * pages. */ so->numKilled = 0; }
/*
 *	_bt_readpage() -- Collect the matching items of the current index page
 *	into so->currPos
 *
 * The caller must hold a pin and a read lock on so->currPos.buf; neither is
 * changed here.  currPos.moreLeft and currPos.moreRight must be valid on
 * entry and are updated as appropriate; every other currPos field is set up
 * from scratch.
 *
 * The page is scanned from offnum in the given direction, and each item that
 * passes _bt_checkkeys is stored in currPos.items.  When _bt_checkkeys
 * reports that no further match is possible in the scan direction, moreRight
 * (forward) or moreLeft (backward) is cleared.
 *
 * Returns true if at least one matching item was found on the page.
 */
static bool
_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Page		pg;
	BTPageOpaque pgopaque;
	OffsetNumber lowoff;
	OffsetNumber highoff;
	OffsetNumber cur;
	IndexTuple	match;
	bool		keepgoing;
	int			slot;

	/* we must have the buffer pinned and locked */
	Assert(BufferIsValid(so->currPos.buf));

	pg = BufferGetPage(so->currPos.buf);
	pgopaque = (BTPageOpaque) PageGetSpecialPointer(pg);
	lowoff = P_FIRSTDATAKEY(pgopaque);
	highoff = PageGetMaxOffsetNumber(pg);

	/*
	 * Remember the right-link now: it says where to go once these items are
	 * used up.  The left-link needs no such treatment because splits always
	 * go right.
	 */
	so->currPos.nextPage = pgopaque->btpo_next;

	/* the tuple workspace starts out empty */
	so->currPos.nextTupleOffset = 0;

	if (!ScanDirectionIsForward(dir))
	{
		/* backward scan: fill items[] from the top end downwards */
		slot = MaxIndexTuplesPerPage;

		for (cur = Min(offnum, highoff);
			 cur >= lowoff;
			 cur = OffsetNumberPrev(cur))
		{
			match = _bt_checkkeys(scan, pg, cur, dir, &keepgoing);
			if (match != NULL)
			{
				/* all scan keys satisfied: keep the item */
				slot--;
				_bt_saveitem(so, slot, cur, match);
			}
			if (!keepgoing)
			{
				/* nothing further to the left can match */
				so->currPos.moreLeft = false;
				break;
			}
		}

		Assert(slot >= 0);
		so->currPos.firstItem = slot;
		so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
		so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
	}
	else
	{
		/* forward scan: fill items[] from slot 0 upwards */
		slot = 0;

		for (cur = Max(offnum, lowoff);
			 cur <= highoff;
			 cur = OffsetNumberNext(cur))
		{
			match = _bt_checkkeys(scan, pg, cur, dir, &keepgoing);
			if (match != NULL)
			{
				/* all scan keys satisfied: keep the item */
				_bt_saveitem(so, slot, cur, match);
				slot++;
			}
			if (!keepgoing)
			{
				/* nothing further to the right can match */
				so->currPos.moreRight = false;
				break;
			}
		}

		Assert(slot <= MaxIndexTuplesPerPage);
		so->currPos.firstItem = 0;
		so->currPos.lastItem = slot - 1;
		so->currPos.itemIndex = 0;
	}

	/* a non-empty [firstItem, lastItem] range means something matched */
	return (so->currPos.firstItem <= so->currPos.lastItem);
}
/*
 *	_bt_steppage() -- Advance the scan to the next page holding matches
 *
 * Entry condition: so->currPos.buf is pinned and read-locked.  Both the
 * lock and the pin are given up before the next page is visited.
 *
 * On success (true), the next interesting page is pinned and read-locked
 * and so->currPos describes the matching items found on it.
 *
 * If nothing more can match in direction dir, every lock and pin is
 * released, so->currPos.buf is set to InvalidBuffer, and false is returned.
 */
static bool
_bt_steppage(IndexScanDesc scan, ScanDirection dir)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    Relation    rel;
    Page        page;
    BTPageOpaque opaque;

    /* the buffer has to be pinned and locked by our caller */
    Assert(BufferIsValid(so->currPos.buf));

    /* flush pending killed-item hints while we still hold the page */
    if (so->numKilled > 0)
        _bt_killitems(scan, true);

    /*
     * If a mark position refers to this page, snapshot the page data into
     * markPos now, before currPos gets overwritten.
     */
    if (so->markItemIndex >= 0)
    {
        size_t      posBytes;

        /* the mark gets its own pin on the buffer */
        IncrBufferRefCount(so->currPos.buf);

        /* copy the header plus items[0 .. lastItem] only */
        posBytes = offsetof(BTScanPosData, items[1]) +
            so->currPos.lastItem * sizeof(BTScanPosItem);
        memcpy(&so->markPos, &so->currPos, posBytes);

        if (so->markTuples)
            memcpy(so->markTuples, so->currTuples,
                   so->currPos.nextTupleOffset);

        so->markPos.itemIndex = so->markItemIndex;
        so->markItemIndex = -1;
    }

    rel = scan->indexRelation;

    if (!ScanDirectionIsForward(dir))
    {
        /* we are leaving a page that had data, so there is data to the right */
        so->currPos.moreRight = true;

        /*
         * Moving left is far trickier than moving right: the left neighbour
         * may split while we are on our way to it, and the page we just
         * left may be deleted behind us.  nbtree/README has the details.
         */
        for (;;)
        {
            /* stop if it is already known that nothing to the left matches */
            if (!so->currPos.moreLeft)
            {
                _bt_relbuf(rel, so->currPos.buf);
                so->currPos.buf = InvalidBuffer;
                return false;
            }

            /* move to the physically preceding page */
            so->currPos.buf = _bt_walk_left(rel, so->currPos.buf);

            /* ran off the left end of the index */
            if (so->currPos.buf == InvalidBuffer)
                return false;

            /*
             * We arrived on a non-deleted page.  It is the one we want if
             * it is not half-dead and has matching tuples; otherwise go
             * around again.
             */
            page = BufferGetPage(so->currPos.buf);
            opaque = (BTPageOpaque) PageGetSpecialPointer(page);
            if (!P_IGNORE(opaque))
            {
                PredicateLockPage(rel,
                                  BufferGetBlockNumber(so->currPos.buf),
                                  scan->xs_snapshot);

                /* _bt_readpage clears moreLeft when the scan can stop */
                if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
                    return true;
            }
        }
    }
    else
    {
        /* follow the right-link saved when the page was read, not a fresh one */
        BlockNumber target = so->currPos.nextPage;

        /* we are leaving a page that had data, so there is data to the left */
        so->currPos.moreLeft = true;

        for (;;)
        {
            /* let go of the page we are leaving */
            _bt_relbuf(rel, so->currPos.buf);
            so->currPos.buf = InvalidBuffer;

            /* end of index, or known end of matching range */
            if (target == P_NONE || !so->currPos.moreRight)
                return false;

            /* no buffer lock is held here, so interrupts are safe */
            CHECK_FOR_INTERRUPTS();

            /* move one page to the right */
            so->currPos.buf = _bt_getbuf(rel, target, BT_READ);

            /* skip pages that are flagged to be ignored */
            page = BufferGetPage(so->currPos.buf);
            opaque = (BTPageOpaque) PageGetSpecialPointer(page);
            if (!P_IGNORE(opaque))
            {
                PredicateLockPage(rel, target, scan->xs_snapshot);

                /* _bt_readpage clears moreRight when the scan can stop */
                if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
                    return true;
            }

            /* no match here, try the next page to the right */
            target = opaque->btpo_next;
        }
    }
}
/*
 * _bt_killitems - flag as LP_DEAD the items an indexscan caller reported
 * as killed
 *
 * The scan's opaque state describes the current page and the killed items
 * on it; normally this is only worth calling when so->numKilled > 0.
 *
 * The caller must hold a pin on so->currPos.buf.  haveLock tells whether it
 * also holds the read lock; if not, the lock is taken and released here.
 * A read lock is assumed to be enough for setting LP_DEAD, which is merely
 * a hint.
 *
 * Items are re-identified by heap TID before being marked.  An item may
 * have shifted right on the page because of insertions, so the search runs
 * from the remembered offset towards the end of the page.  An item that
 * left the page through a split is simply not found, which is harmless:
 * some later indexscan is expected to mark it.  Since the pin has been held
 * ever since the items were read, VACUUM cannot have removed anything from
 * the page, hence no leftward search is needed; the same fact guarantees
 * that a TID match really is the original item even though heap TIDs can
 * be recycled.
 */
void
_bt_killitems(IndexScanDesc scan, bool haveLock)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    Page        page;
    BTPageOpaque opaque;
    OffsetNumber minoff;
    OffsetNumber maxoff;
    int         k;
    bool        markedAny = false;

    Assert(BufferIsValid(so->currPos.buf));

    if (!haveLock)
        LockBuffer(so->currPos.buf, BT_READ);

    page = BufferGetPage(so->currPos.buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    minoff = P_FIRSTDATAKEY(opaque);
    maxoff = PageGetMaxOffsetNumber(page);

    for (k = 0; k < so->numKilled; k++)
    {
        int         itemIndex = so->killedItems[k];
        BTScanPosItem *victim = &so->currPos.items[itemIndex];
        OffsetNumber probe = victim->indexOffset;

        Assert(itemIndex >= so->currPos.firstItem &&
               itemIndex <= so->currPos.lastItem);

        /* pure paranoia: a remembered offset before the first data key */
        if (probe < minoff)
            continue;

        /* look for the item at its old offset or anywhere to the right */
        for (; probe <= maxoff; probe = OffsetNumberNext(probe))
        {
            ItemId      lp = PageGetItemId(page, probe);
            IndexTuple  candidate = (IndexTuple) PageGetItem(page, lp);

            if (!ItemPointerEquals(&candidate->t_tid, &victim->heapTid))
                continue;

            /* same heap TID: this is the one */
            ItemIdMarkDead(lp);
            markedAny = true;
            break;
        }
    }

    /*
     * This work can always be redone later, so it is handled like a
     * commit-hint-bit update on a heap tuple: the buffer is flagged as
     * needing a save, but no WAL record is written.
     *
     * Marking any item LP_DEAD goes together with setting the page-level
     * BTP_HAS_GARBAGE flag, which is a hint as well.
     */
    if (markedAny)
    {
        opaque->btpo_flags |= BTP_HAS_GARBAGE;
        SetBufferCommitInfoNeedsSave(so->currPos.buf);
    }

    if (!haveLock)
        LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);

    /*
     * Clear the killed-items list unconditionally so that the same entries
     * are never looked up on a different page.
     */
    so->numKilled = 0;
}
/*
 *	_bt_endpoint() -- Start a scan at the first or last page of the index
 *		and advance to the first key that satisfies all the quals.
 *
 * _bt_first() calls this once it has established that the scan has to
 * begin at the very start of the index (forward scan) or at its very end
 * (backward scan).  The exit conditions are those of _bt_first().
 */
static bool
_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
{
    Relation    rel = scan->indexRelation;
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    bool        forward = ScanDirectionIsForward(dir);
    Buffer      buf;
    Page        page;
    BTPageOpaque opaque;
    OffsetNumber start;
    BTScanPosItem *currItem;

    /*
     * Descend to the leftmost or rightmost leaf.  This is a cut-down
     * _bt_search(); no stack is kept because none will be needed.
     */
    buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));

    if (!BufferIsValid(buf))
    {
        /*
         * The index is empty.  There is nothing finer-grained to lock, so
         * take the predicate lock on the whole relation.
         */
        PredicateLockRelation(rel, scan->xs_snapshot);
        so->currPos.buf = InvalidBuffer;
        return false;
    }

    PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    Assert(P_ISLEAF(opaque));

    if (forward)
    {
        /* dead pages may sit to the left, so P_LEFTMOST cannot be asserted */
        start = P_FIRSTDATAKEY(opaque);
    }
    else if (ScanDirectionIsBackward(dir))
    {
        Assert(P_RIGHTMOST(opaque));

        start = PageGetMaxOffsetNumber(page);
    }
    else
    {
        elog(ERROR, "invalid scan direction: %d", (int) dir);
        start = 0;              /* keep compiler quiet */
    }

    /* record the buffer we are holding a pin on */
    so->currPos.buf = buf;

    /*
     * Starting at one end of the index: more data can only lie in the
     * direction we are about to travel.
     */
    so->currPos.moreLeft = !forward;
    so->currPos.moreRight = forward;

    so->numKilled = 0;          /* just paranoia */
    so->markItemIndex = -1;     /* ditto */

    /*
     * Load the first page of the scan.  If nothing on it matches, try to
     * move on to the next page; give up if no page has a match.
     */
    if (!_bt_readpage(scan, dir, start) && !_bt_steppage(scan, dir))
        return false;

    /* release the lock on the current page but keep the pin */
    LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);

    /* currPos.itemIndex designates the item to hand back */
    currItem = &so->currPos.items[so->currPos.itemIndex];
    scan->xs_ctup.t_self = currItem->heapTid;
    if (scan->xs_want_itup)
        scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);

    return true;
}
/* * Test whether an indextuple satisfies all the scankey conditions. * * If so, copy its TID into scan->xs_ctup.t_self, and return TRUE. * If not, return FALSE (xs_ctup is not changed). * * If the tuple fails to pass the qual, we also determine whether there's * any need to continue the scan beyond this tuple, and set *continuescan * accordingly. See comments for _bt_preprocess_keys(), above, about how * this is done. * * scan: index scan descriptor (containing a search-type scankey) * page: buffer page containing index tuple * offnum: offset number of index tuple (must be a valid item!) * dir: direction we are scanning in * continuescan: output parameter (will be set correctly in all cases) */ bool _bt_checkkeys(IndexScanDesc scan, Page page, OffsetNumber offnum, ScanDirection dir, bool *continuescan) { ItemId iid = PageGetItemId(page, offnum); bool tuple_valid; IndexTuple tuple; TupleDesc tupdesc; BTScanOpaque so; int keysz; int ikey; ScanKey key; *continuescan = true; /* default assumption */ /* * If the scan specifies not to return killed tuples, then we treat a * killed tuple as not passing the qual. Most of the time, it's a win to * not bother examining the tuple's index keys, but just return * immediately with continuescan = true to proceed to the next tuple. * However, if this is the last tuple on the page, we should check the * index keys to prevent uselessly advancing to the next page. */ if (scan->ignore_killed_tuples && ItemIdIsDead(iid)) { /* return immediately if there are more tuples on the page */ if (ScanDirectionIsForward(dir)) { if (offnum < PageGetMaxOffsetNumber(page)) return false; } else { BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (offnum > P_FIRSTDATAKEY(opaque)) return false; } /* * OK, we want to check the keys, but we'll return FALSE even if the * tuple passes the key tests. 
*/ tuple_valid = false; } else tuple_valid = true; tuple = (IndexTuple) PageGetItem(page, iid); IncrIndexProcessed(); tupdesc = RelationGetDescr(scan->indexRelation); so = (BTScanOpaque) scan->opaque; keysz = so->numberOfKeys; for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++) { Datum datum; bool isNull; Datum test; /* row-comparison keys need special processing */ if (key->sk_flags & SK_ROW_HEADER) { if (_bt_check_rowcompare(key, tuple, tupdesc, dir, continuescan)) continue; return false; } datum = index_getattr(tuple, key->sk_attno, tupdesc, &isNull); if (key->sk_flags & SK_ISNULL) { /* Handle IS NULL tests */ Assert(key->sk_flags & SK_SEARCHNULL); if (isNull) continue; /* tuple satisfies this qual */ /* * Tuple fails this qual. If it's a required qual for the current * scan direction, then we can conclude no further tuples will * pass, either. */ if ((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) *continuescan = false; else if ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)) *continuescan = false; /* * In any case, this indextuple doesn't match the qual. */ return false; } if (isNull) { if (key->sk_flags & SK_BT_NULLS_FIRST) { /* * Since NULLs are sorted before non-NULLs, we know we have * reached the lower limit of the range of values for this * index attr. On a backward scan, we can stop if this qual * is one of the "must match" subset. On a forward scan, * however, we should keep going. */ if ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)) *continuescan = false; } else { /* * Since NULLs are sorted after non-NULLs, we know we have * reached the upper limit of the range of values for this * index attr. On a forward scan, we can stop if this qual is * one of the "must match" subset. On a backward scan, * however, we should keep going. */ if ((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) *continuescan = false; } /* * In any case, this indextuple doesn't match the qual. 
*/ return false; } test = FunctionCall2(&key->sk_func, datum, key->sk_argument); if (!DatumGetBool(test)) { /* * Tuple fails this qual. If it's a required qual for the current * scan direction, then we can conclude no further tuples will * pass, either. * * Note: because we stop the scan as soon as any required equality * qual fails, it is critical that equality quals be used for the * initial positioning in _bt_first() when they are available. See * comments in _bt_first(). */ if ((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) *continuescan = false; else if ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)) *continuescan = false; /* * In any case, this indextuple doesn't match the qual. */ return false; } } /* If we get here, the tuple passes all index quals. */ if (tuple_valid) scan->xs_ctup.t_self = tuple->t_tid; return tuple_valid; }
/*---------- * _bt_compare() -- Compare scankey to a particular tuple on the page. * * The passed scankey must be an insertion-type scankey (see nbtree/README), * but it can omit the rightmost column(s) of the index. * * keysz: number of key conditions to be checked (might be less than the * number of index columns!) * page/offnum: location of btree item to be compared to. * * This routine returns: * <0 if scankey < tuple at offnum; * 0 if scankey == tuple at offnum; * >0 if scankey > tuple at offnum. * NULLs in the keys are treated as sortable values. Therefore * "equality" does not necessarily mean that the item should be * returned to the caller as a matching key! * * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be * "minus infinity": this routine will always claim it is less than the * scankey. The actual key value stored (if any, which there probably isn't) * does not matter. This convention allows us to implement the Lehman and * Yao convention that the first down-link pointer is before the first key. * See backend/access/nbtree/README for details. *---------- */ int32 _bt_compare(Relation rel, int keysz, ScanKey scankey, Page page, OffsetNumber offnum) { TupleDesc itupdesc = RelationGetDescr(rel); BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); IndexTuple itup; int i; /* * Force result ">" if target item is first data item on an internal page * --- see NOTE above. */ if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque)) return 1; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); /* * The scan key is set up with the attribute number associated with each * term in the key. It is important that, if the index is multi-key, the * scan contain the first k key attributes, and that they be in order. If * you think about how multi-key ordering works, you'll understand why * this is. * * We don't test for violation of this condition here, however. 
The * initial setup for the index scan had better have gotten it right (see * _bt_first). */ for (i = 1; i <= keysz; i++) { Datum datum; bool isNull; int32 result; datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull); /* see comments about NULLs handling in btbuild */ if (scankey->sk_flags & SK_ISNULL) /* key is NULL */ { if (isNull) result = 0; /* NULL "=" NULL */ else if (scankey->sk_flags & SK_BT_NULLS_FIRST) result = -1; /* NULL "<" NOT_NULL */ else result = 1; /* NULL ">" NOT_NULL */ } else if (isNull) /* key is NOT_NULL and item is NULL */ { if (scankey->sk_flags & SK_BT_NULLS_FIRST) result = 1; /* NOT_NULL ">" NULL */ else result = -1; /* NOT_NULL "<" NULL */ } else { /* * The sk_func needs to be passed the index value as left arg and * the sk_argument as right arg (they might be of different * types). Since it is convenient for callers to think of * _bt_compare as comparing the scankey to the index item, we have * to flip the sign of the comparison result. (Unless it's a DESC * column, in which case we *don't* flip the sign.) */ result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func, scankey->sk_collation, datum, scankey->sk_argument)); if (!(scankey->sk_flags & SK_BT_DESC)) result = -result; } /* if the keys are unequal, return the difference */ if (result != 0) return result; scankey++; } /* if we get here, the keys are equal */ return 0; }
/*
 *	_bt_endpoint() -- Position the scan on the first or last key of the
 *		index.
 *
 * _bt_first() calls this once it has established that the scan has to
 * begin at the very start of the index (forward scan) or at its very end
 * (backward scan).
 */
static bool
_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
{
    Relation    rel = scan->indexRelation;
    ItemPointer current = &(scan->currentItemData);
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    Buffer      buf;
    Page        page;
    BTPageOpaque opaque;
    OffsetNumber maxoff;
    OffsetNumber start;
    BlockNumber blkno;
    BTItem      btitem;
    IndexTuple  itup;
    bool        continuescan;

    /*
     * Descend to the leftmost or rightmost leaf.  This is a cut-down
     * _bt_search(); no stack is kept because none will be needed.
     */
    buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));

    if (!BufferIsValid(buf))
    {
        /* the index is empty */
        ItemPointerSetInvalid(current);
        so->btso_curbuf = InvalidBuffer;
        return false;
    }

    blkno = BufferGetBlockNumber(buf);
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    Assert(P_ISLEAF(opaque));

    maxoff = PageGetMaxOffsetNumber(page);

    if (ScanDirectionIsForward(dir))
    {
        /* dead pages may sit to the left, so P_LEFTMOST cannot be asserted */
        start = P_FIRSTDATAKEY(opaque);
    }
    else if (ScanDirectionIsBackward(dir))
    {
        Assert(P_RIGHTMOST(opaque));

        start = PageGetMaxOffsetNumber(page);

        /* on an empty page the max offset lies before the first data key */
        if (start < P_FIRSTDATAKEY(opaque))
            start = P_FIRSTDATAKEY(opaque);
    }
    else
    {
        elog(ERROR, "invalid scan direction: %d", (int) dir);
        start = 0;              /* keep compiler quiet */
    }

    ItemPointerSet(current, blkno, start);

    /* record the buffer we are holding a pin on */
    so->btso_curbuf = buf;

    /*
     * Deletions can leave the outermost page empty.  In that case step
     * along until a page with data turns up.
     */
    if (start > maxoff)
    {
        if (!_bt_step(scan, &buf, dir))
            return false;

        /* pick up the position _bt_step left in the scan's current item */
        start = ItemPointerGetOffsetNumber(current);
        page = BufferGetPage(buf);
    }

    btitem = (BTItem) PageGetItem(page, PageGetItemId(page, start));
    itup = &(btitem->bti_itup);

    /* does the tuple we landed on satisfy the scan keys? */
    if (_bt_checkkeys(scan, itup, dir, &continuescan))
    {
        /* it does: hand it back */
        scan->xs_ctup.t_self = itup->t_tid;
        return true;
    }

    /* it does not, but a later tuple still might */
    if (continuescan)
        return _bt_next(scan, dir);

    /* nothing in the index can match this scan key */
    ItemPointerSetInvalid(current);
    so->btso_curbuf = InvalidBuffer;
    _bt_relbuf(rel, buf);

    return false;
}
/*
 * _bt_pagedel() -- Delete a page from the b-tree, if legal to do so.
 *
 * This action unlinks the page from the b-tree structure, removing all
 * pointers leading to it --- but not touching its own left and right links.
 * The page cannot be physically reclaimed right away, since other processes
 * may currently be trying to follow links leading to the page; they have to
 * be allowed to use its right-link to recover.  See nbtree/README.
 *
 * On entry, the target buffer must be pinned and locked (either read or write
 * lock is OK).  This lock and pin will be dropped before exiting.
 *
 * The "stack" argument can be a search stack leading (approximately) to the
 * target page, or NULL --- outside callers typically pass NULL since they
 * have not done such a search, but internal recursion cases pass the stack
 * to avoid duplicated search effort.
 *
 * Parameters:
 *	rel: the index relation the page belongs to.
 *	buf: pinned and locked buffer holding the deletion target.
 *	stack: search stack leading to the target, or NULL (see above).
 *	vacuum_full: when true, the deleted page is stamped with
 *		FrozenTransactionId instead of ReadNewTransactionId(), so that it
 *		can be recycled immediately.
 *
 * Returns the number of pages successfully deleted (zero if page cannot
 * be deleted now; could be more than one if parent pages were deleted too).
 *
 * NOTE: this leaks memory.  Rather than trying to clean up everything
 * carefully, it's better to run it in a temp context that can be reset
 * frequently.
 */
int
_bt_pagedel(Relation rel, Buffer buf, BTStack stack, bool vacuum_full)
{
    int         result;
    BlockNumber target,
                leftsib,
                rightsib,
                parent;
    OffsetNumber poffset,
                maxoff;
    uint32      targetlevel,
                ilevel;
    ItemId      itemid;
    IndexTuple  targetkey,
                itup;
    ScanKey     itup_scankey;
    Buffer      lbuf,
                rbuf,
                pbuf;
    bool        parent_half_dead;
    bool        parent_one_child;
    bool        rightsib_empty;
    Buffer      metabuf = InvalidBuffer;
    Page        metapg = NULL;
    BTMetaPageData *metad = NULL;
    Page        page;
    BTPageOpaque opaque;

    /*
     * NOTE(review): presumably checks that the caller already holds the
     * mirrored buffer-manager lock --- confirm against the macro definition.
     */
    MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD;

    /*
     * Fetch gp_persistent_relation_node information that will be added to
     * XLOG record.
     */
    RelationFetchGpRelationNodeForXLog(rel);

    /*
     * We can never delete rightmost pages nor root pages.  While at it, check
     * that page is not already deleted and is empty.
     */
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
        P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
    {
        /* Should never fail to delete a half-dead page */
        Assert(!P_ISHALFDEAD(opaque));

        _bt_relbuf(rel, buf);
        return 0;
    }

    /*
     * Save info about page, including a copy of its high key (it must have
     * one, being non-rightmost).
     */
    target = BufferGetBlockNumber(buf);
    targetlevel = opaque->btpo.level;
    leftsib = opaque->btpo_prev;
    itemid = PageGetItemId(page, P_HIKEY);
    targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));

    /*
     * To avoid deadlocks, we'd better drop the target page lock before going
     * further.
     */
    _bt_relbuf(rel, buf);

    /*
     * We need an approximate pointer to the page's parent page.  We use the
     * standard search mechanism to search for the page's high key; this will
     * give us a link to either the current parent or someplace to its left
     * (if there are multiple equal high keys).  In recursion cases, the
     * caller already generated a search stack and we can just re-use that
     * work.
     */
    if (stack == NULL)
    {
        if (!InRecovery)
        {
            /* we need an insertion scan key to do our search, so build one */
            itup_scankey = _bt_mkscankey(rel, targetkey);
            /* find the leftmost leaf page containing this key */
            stack = _bt_search(rel, rel->rd_rel->relnatts, itup_scankey, false,
                               &lbuf, BT_READ);
            /* don't need a pin on that either */
            _bt_relbuf(rel, lbuf);

            /*
             * If we are trying to delete an interior page, _bt_search did
             * more than we needed.  Locate the stack item pointing to our
             * parent level.
             */
            ilevel = 0;
            for (;;)
            {
                if (stack == NULL)
                    elog(ERROR, "not enough stack items");
                if (ilevel == targetlevel)
                    break;
                stack = stack->bts_parent;
                ilevel++;
            }
        }
        else
        {
            /*
             * During WAL recovery, we can't use _bt_search (for one reason,
             * it might invoke user-defined comparison functions that expect
             * facilities not available in recovery mode).  Instead, just set
             * up a dummy stack pointing to the left end of the parent tree
             * level, from which _bt_getstackbuf will walk right to the parent
             * page.  Painful, but we don't care too much about performance in
             * this scenario.
             */
            pbuf = _bt_get_endpoint(rel, targetlevel + 1, false);
            stack = (BTStack) palloc(sizeof(BTStackData));
            stack->bts_blkno = BufferGetBlockNumber(pbuf);
            stack->bts_offset = InvalidOffsetNumber;
            /* bts_btentry will be initialized below */
            stack->bts_parent = NULL;
            _bt_relbuf(rel, pbuf);
        }
    }

    /*
     * We cannot delete a page that is the rightmost child of its immediate
     * parent, unless it is the only child --- in which case the parent has to
     * be deleted too, and the same condition applies recursively to it.  We
     * have to check this condition all the way up before trying to delete.  We
     * don't need to re-test when deleting a non-leaf page, though.
     */
    if (targetlevel == 0 &&
        !_bt_parent_deletion_safe(rel, target, stack))
        return 0;

    /*
     * We have to lock the pages we need to modify in the standard order:
     * moving right, then up.  Else we will deadlock against other writers.
     *
     * So, we need to find and write-lock the current left sibling of the
     * target page.  The sibling that was current a moment ago could have
     * split, so we may have to move right.  This search could fail if either
     * the sibling or the target page was deleted by someone else meanwhile;
     * if so, give up.  (Right now, that should never happen, since page
     * deletion is only done in VACUUM and there shouldn't be multiple VACUUMs
     * concurrently on the same table.)
     */
    if (leftsib != P_NONE)
    {
        lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
        page = BufferGetPage(lbuf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        while (P_ISDELETED(opaque) || opaque->btpo_next != target)
        {
            /* step right one page */
            leftsib = opaque->btpo_next;
            _bt_relbuf(rel, lbuf);
            if (leftsib == P_NONE)
            {
                elog(LOG, "no left sibling (concurrent deletion?) in \"%s\"",
                     RelationGetRelationName(rel));
                return 0;
            }
            lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
            page = BufferGetPage(lbuf);
            opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        }
    }
    else
        lbuf = InvalidBuffer;

    /*
     * Next write-lock the target page itself.  It should be okay to take just
     * a write lock not a superexclusive lock, since no scans would stop on an
     * empty page.
     */
    buf = _bt_getbuf(rel, target, BT_WRITE);
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    /*
     * Check page is still empty etc, else abandon deletion.  The empty check
     * is necessary since someone else might have inserted into it while we
     * didn't have it locked; the others are just for paranoia's sake.
     */
    if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
        P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
    {
        _bt_relbuf(rel, buf);
        if (BufferIsValid(lbuf))
            _bt_relbuf(rel, lbuf);
        return 0;
    }
    if (opaque->btpo_prev != leftsib)
        elog(ERROR, "left link changed unexpectedly in block %u of index \"%s\"",
             target, RelationGetRelationName(rel));

    /*
     * And next write-lock the (current) right sibling.
     */
    rightsib = opaque->btpo_next;
    rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
    page = BufferGetPage(rbuf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    if (opaque->btpo_prev != target)
        elog(ERROR, "right sibling's left-link doesn't match: "
             "block %u links to %u instead of expected %u in index \"%s\"",
             rightsib, opaque->btpo_prev, target,
             RelationGetRelationName(rel));

    /*
     * Next find and write-lock the current parent of the target page.  This is
     * essentially the same as the corresponding step of splitting.
     */
    ItemPointerSet(&(stack->bts_btentry.t_tid), target, P_HIKEY);
    pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
    if (pbuf == InvalidBuffer)
        elog(ERROR, "failed to re-find parent key in index \"%s\" for deletion target page %u",
             RelationGetRelationName(rel), target);
    /* the stack entry now identifies the parent block and downlink offset */
    parent = stack->bts_blkno;
    poffset = stack->bts_offset;

    /*
     * If the target is the rightmost child of its parent, then we can't
     * delete, unless it's also the only child --- in which case the parent
     * changes to half-dead status.  The "can't delete" case should have been
     * detected by _bt_parent_deletion_safe, so complain if we see it now.
     */
    page = BufferGetPage(pbuf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    maxoff = PageGetMaxOffsetNumber(page);
    parent_half_dead = false;
    parent_one_child = false;
    if (poffset >= maxoff)
    {
        if (poffset == P_FIRSTDATAKEY(opaque))
            parent_half_dead = true;
        else
            elog(ERROR, "failed to delete rightmost child %u of block %u in index \"%s\"",
                 target, parent, RelationGetRelationName(rel));
    }
    else
    {
        /* Will there be exactly one child left in this parent? */
        if (OffsetNumberNext(P_FIRSTDATAKEY(opaque)) == maxoff)
            parent_one_child = true;
    }

    /*
     * If we are deleting the next-to-last page on the target's level, then
     * the rightsib is a candidate to become the new fast root.  (In theory, it
     * might be possible to push the fast root even further down, but the odds
     * of doing so are slim, and the locking considerations daunting.)
     *
     * We don't support handling this in the case where the parent is becoming
     * half-dead, even though it theoretically could occur.
     *
     * We can safely acquire a lock on the metapage here --- see comments for
     * _bt_newroot().
     */
    if (leftsib == P_NONE && !parent_half_dead)
    {
        page = BufferGetPage(rbuf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        Assert(opaque->btpo.level == targetlevel);
        if (P_RIGHTMOST(opaque))
        {
            /* rightsib will be the only one left on the level */
            metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
            metapg = BufferGetPage(metabuf);
            metad = BTPageGetMeta(metapg);

            /*
             * The expected case here is btm_fastlevel == targetlevel+1; if
             * the fastlevel is <= targetlevel, something is wrong, and we
             * choose to overwrite it to fix it.
             */
            if (metad->btm_fastlevel > targetlevel + 1)
            {
                /* no update wanted */
                _bt_relbuf(rel, metabuf);
                metabuf = InvalidBuffer;
            }
        }
    }

    /*
     * Check that the parent-page index items we're about to delete/overwrite
     * contain what we expect.  This can fail if the index has become
     * corrupt for some reason.  We want to throw any error before entering
     * the critical section --- otherwise it'd be a PANIC.
     *
     * The test on the target item is just an Assert because _bt_getstackbuf
     * should have guaranteed it has the expected contents.  The test on the
     * next-child downlink is known to sometimes fail in the field, though.
     */
    page = BufferGetPage(pbuf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

#ifdef USE_ASSERT_CHECKING
    itemid = PageGetItemId(page, poffset);
    itup = (IndexTuple) PageGetItem(page, itemid);
    Assert(ItemPointerGetBlockNumber(&(itup->t_tid)) == target);
#endif

    if (!parent_half_dead)
    {
        OffsetNumber nextoffset;

        /* the item after the target's downlink must point to rightsib */
        nextoffset = OffsetNumberNext(poffset);
        itemid = PageGetItemId(page, nextoffset);
        itup = (IndexTuple) PageGetItem(page, itemid);
        if (ItemPointerGetBlockNumber(&(itup->t_tid)) != rightsib)
            elog(ERROR, "right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
                 rightsib, target, ItemPointerGetBlockNumber(&(itup->t_tid)),
                 parent, RelationGetRelationName(rel));
    }

    /*
     * Here we begin doing the deletion.
     */

    /* No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /*
     * Update parent.  The normal case is a tad tricky because we want to
     * delete the target's downlink and the *following* key.  Easiest way is
     * to copy the right sibling's downlink over the target downlink, and then
     * delete the following item.
     */
    if (parent_half_dead)
    {
        PageIndexTupleDelete(page, poffset);
        opaque->btpo_flags |= BTP_HALF_DEAD;
    }
    else
    {
        OffsetNumber nextoffset;

        itemid = PageGetItemId(page, poffset);
        itup = (IndexTuple) PageGetItem(page, itemid);
        ItemPointerSet(&(itup->t_tid), rightsib, P_HIKEY);

        nextoffset = OffsetNumberNext(poffset);
        PageIndexTupleDelete(page, nextoffset);
    }

    /*
     * Update siblings' side-links.  Note the target page's side-links will
     * continue to point to the siblings.  Asserts here are just rechecking
     * things we already verified above.
     */
    if (BufferIsValid(lbuf))
    {
        page = BufferGetPage(lbuf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        Assert(opaque->btpo_next == target);
        opaque->btpo_next = rightsib;
    }
    page = BufferGetPage(rbuf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    Assert(opaque->btpo_prev == target);
    opaque->btpo_prev = leftsib;
    /* remembered for the recursion decision made after the critical section */
    rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));

    /*
     * Mark the page itself deleted.  It can be recycled when all current
     * transactions are gone; or immediately if we're doing VACUUM FULL.
     */
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    opaque->btpo_flags &= ~BTP_HALF_DEAD;
    opaque->btpo_flags |= BTP_DELETED;
    opaque->btpo.xact =
        vacuum_full ? FrozenTransactionId : ReadNewTransactionId();

    /* And update the metapage, if needed */
    if (BufferIsValid(metabuf))
    {
        metad->btm_fastroot = rightsib;
        metad->btm_fastlevel = targetlevel;
        MarkBufferDirty(metabuf);
    }

    /* Must mark buffers dirty before XLogInsert */
    MarkBufferDirty(pbuf);
    MarkBufferDirty(rbuf);
    MarkBufferDirty(buf);
    if (BufferIsValid(lbuf))
        MarkBufferDirty(lbuf);

    /* XLOG stuff */
    if (!rel->rd_istemp)
    {
        xl_btree_delete_page xlrec;
        xl_btree_metadata xlmeta;
        uint8       xlinfo;
        XLogRecPtr  recptr;
        XLogRecData rdata[5];   /* xlrec, optional meta, pbuf, rbuf, optional lbuf */
        XLogRecData *nextrdata;

        xl_btreetid_set(&(xlrec.target), rel, parent, poffset);
        xlrec.deadblk = target;
        xlrec.leftblk = leftsib;
        xlrec.rightblk = rightsib;

        /* rdata[0]: the fixed-size deletion record itself */
        rdata[0].data = (char *) &xlrec;
        rdata[0].len = SizeOfBtreeDeletePage;
        rdata[0].buffer = InvalidBuffer;
        rdata[0].next = nextrdata = &(rdata[1]);

        /* the record type depends on what else was changed */
        if (BufferIsValid(metabuf))
        {
            xlmeta.root = metad->btm_root;
            xlmeta.level = metad->btm_level;
            xlmeta.fastroot = metad->btm_fastroot;
            xlmeta.fastlevel = metad->btm_fastlevel;

            nextrdata->data = (char *) &xlmeta;
            nextrdata->len = sizeof(xl_btree_metadata);
            nextrdata->buffer = InvalidBuffer;
            nextrdata->next = nextrdata + 1;
            nextrdata++;
            xlinfo = XLOG_BTREE_DELETE_PAGE_META;
        }
        else if (parent_half_dead)
            xlinfo = XLOG_BTREE_DELETE_PAGE_HALF;
        else
            xlinfo = XLOG_BTREE_DELETE_PAGE;

        /* data-less entry referencing the parent buffer */
        nextrdata->data = NULL;
        nextrdata->len = 0;
        nextrdata->next = nextrdata + 1;
        nextrdata->buffer = pbuf;
        nextrdata->buffer_std = true;
        nextrdata++;

        /* data-less entry referencing the right sibling; ends the chain for now */
        nextrdata->data = NULL;
        nextrdata->len = 0;
        nextrdata->buffer = rbuf;
        nextrdata->buffer_std = true;
        nextrdata->next = NULL;

        /* append the left sibling, if there is one, as the new chain end */
        if (BufferIsValid(lbuf))
        {
            nextrdata->next = nextrdata + 1;
            nextrdata++;
            nextrdata->data = NULL;
            nextrdata->len = 0;
            nextrdata->buffer = lbuf;
            nextrdata->buffer_std = true;
            nextrdata->next = NULL;
        }

        recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata);

        /* stamp every modified page with the record's LSN and timeline */
        if (BufferIsValid(metabuf))
        {
            PageSetLSN(metapg, recptr);
            PageSetTLI(metapg, ThisTimeLineID);
        }
        page = BufferGetPage(pbuf);
        PageSetLSN(page, recptr);
        PageSetTLI(page, ThisTimeLineID);
        page = BufferGetPage(rbuf);
        PageSetLSN(page, recptr);
        PageSetTLI(page, ThisTimeLineID);
        page = BufferGetPage(buf);
        PageSetLSN(page, recptr);
        PageSetTLI(page, ThisTimeLineID);
        if (BufferIsValid(lbuf))
        {
            page = BufferGetPage(lbuf);
            PageSetLSN(page, recptr);
            PageSetTLI(page, ThisTimeLineID);
        }
    }

    END_CRIT_SECTION();

    /* release metapage; send out relcache inval if metapage changed */
    if (BufferIsValid(metabuf))
    {
        CacheInvalidateRelcache(rel);
        _bt_relbuf(rel, metabuf);
    }
    /* can always release leftsib immediately */
    if (BufferIsValid(lbuf))
        _bt_relbuf(rel, lbuf);

    /*
     * If parent became half dead, recurse to delete it.  Otherwise, if right
     * sibling is empty and is now the last child of the parent, recurse to
     * try to delete it.  (These cases cannot apply at the same time, though
     * the second case might itself recurse to the first.)
     *
     * When recursing to parent, we hold the lock on the target page until
     * done.  This delays any insertions into the keyspace that was just
     * effectively reassigned to the parent's right sibling.  If we allowed
     * that, and there were enough such insertions before we finish deleting
     * the parent, page splits within that keyspace could lead to inserting
     * out-of-order keys into the grandparent level.  It is thought that that
     * wouldn't have any serious consequences, but it still seems like a
     * pretty bad idea.
     *
     * In every case the "+ 1" / "1" accounts for the page deleted by this
     * invocation.
     */
    if (parent_half_dead)
    {
        /* recursive call will release pbuf */
        _bt_relbuf(rel, rbuf);
        result = _bt_pagedel(rel, pbuf, stack->bts_parent, vacuum_full) + 1;
        _bt_relbuf(rel, buf);
    }
    else if (parent_one_child && rightsib_empty)
    {
        _bt_relbuf(rel, pbuf);
        _bt_relbuf(rel, buf);
        /* recursive call will release rbuf */
        result = _bt_pagedel(rel, rbuf, stack, vacuum_full) + 1;
    }
    else
    {
        _bt_relbuf(rel, pbuf);
        _bt_relbuf(rel, buf);
        _bt_relbuf(rel, rbuf);
        result = 1;
    }

    return result;
}
/* * _bt_binsrch() -- Do a binary search for a key on a particular page. * * The scankey we get has the compare function stored in the procedure * entry of each data struct. We invoke this regproc to do the * comparison for every key in the scankey. * * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first * key >= given scankey. (NOTE: in particular, this means it is possible * to return a value 1 greater than the number of keys on the page, * if the scankey is > all keys on the page.) * * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber * of the last key < given scankey. (Since _bt_compare treats the first * data key of such a page as minus infinity, there will be at least one * key < scankey, so the result always points at one of the keys on the * page.) This key indicates the right place to descend to be sure we * find all leaf keys >= given scankey. * * This procedure is not responsible for walking right, it just examines * the given page. _bt_binsrch() has no lock or refcount side effects * on the buffer. */ OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz, ScanKey scankey) { TupleDesc itupdesc; Page page; BTPageOpaque opaque; OffsetNumber low, high; int32 result; itupdesc = RelationGetDescr(rel); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); low = P_FIRSTDATAKEY(opaque); high = PageGetMaxOffsetNumber(page); /* * If there are no keys on the page, return the first available slot. * Note this covers two cases: the page is really empty (no keys), or * it contains only a high key. The latter case is possible after * vacuuming. This can never happen on an internal page, however, * since they are never empty (an internal page must have children). */ if (high < low) return low; /* * Binary search to find the first key on the page >= scan key. Loop * invariant: all slots before 'low' are < scan key, all slots at or * after 'high' are >= scan key. We can fall out when high == low. 
*/ high++; /* establish the loop invariant for high */ while (high > low) { OffsetNumber mid = low + ((high - low) / 2); /* We have low <= mid < high, so mid points at a real slot */ result = _bt_compare(rel, keysz, scankey, page, mid); if (result > 0) low = mid + 1; else high = mid; } /* * At this point we have high == low, but be careful: they could point * past the last slot on the page. * * On a leaf page, we always return the first key >= scan key (which * could be the last slot + 1). */ if (P_ISLEAF(opaque)) return low; /* * On a non-leaf page, return the last key < scan key. There must be * one if _bt_compare() is playing by the rules. */ Assert(low > P_FIRSTDATAKEY(opaque)); return OffsetNumberPrev(low); }
/*
 * _bt_validate_vacuum -- post-vacuum consistency check of a btree index
 * against its heap.
 *
 * Walks every block of the index after the metapage.  For each item on a
 * leaf page the heap tuple addressed by the item's t_tid is fetched, and:
 *	- the fetch must succeed;
 *	- HeapTupleSatisfiesVacuum() must not classify the tuple as
 *	  HEAPTUPLE_DEAD;
 *	- for a fixed set of catalog indexes (selected by the index's relid),
 *	  and for indexes in PG_AOSEGMENT_NAMESPACE, the leading key column(s)
 *	  of the index item must equal the corresponding heap value.
 *
 * All-zero (new) pages are skipped: they are not passed to _bt_checkpage()
 * and have no btree special area to inspect.
 *
 * irel:		the index to validate
 * hrel:		the heap the index points into
 * oldest_xmin: xmin horizon handed to HeapTupleSatisfiesVacuum()
 *
 * Any inconsistency is reported with elog(ERROR).
 */
void
_bt_validate_vacuum(Relation irel, Relation hrel, TransactionId oldest_xmin)
{
	MIRROREDLOCK_BUFMGR_DECLARE;

	BlockNumber blkno;
	BlockNumber num_pages;
	Buffer		ibuf = InvalidBuffer;
	Buffer		hbuf = InvalidBuffer;
	Page		ipage;
	BTPageOpaque opaque;
	IndexTuple	itup;
	HeapTupleData htup;
	OffsetNumber maxoff,
				minoff,
				offnum;
	Oid			ioid,
				hoid;
	bool		isnull;

	blkno = BTREE_METAPAGE + 1;
	num_pages = RelationGetNumberOfBlocks(irel);

	elog(LOG, "btvalidatevacuum: index %s, heap %s",
		 RelationGetRelationName(irel), RelationGetRelationName(hrel));

	MIRROREDLOCK_BUFMGR_LOCK;
	for (; blkno < num_pages; blkno++)
	{
		ibuf = ReadBuffer(irel, blkno);
		ipage = BufferGetPage(ibuf);

		/*
		 * An all-zero page has no special area, so neither _bt_checkpage()
		 * nor the P_ISLEAF() test below may be applied to it.  It cannot
		 * hold index items either; move on.
		 */
		if (PageIsNew(ipage))
		{
			ReleaseBuffer(ibuf);
			continue;
		}

		_bt_checkpage(irel, ibuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(ipage);

		if (P_ISLEAF(opaque))
		{
			minoff = P_FIRSTDATAKEY(opaque);
			maxoff = PageGetMaxOffsetNumber(ipage);
			for (offnum = minoff;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				itup = (IndexTuple) PageGetItem(ipage,
												PageGetItemId(ipage, offnum));
				ItemPointerCopy(&itup->t_tid, &htup.t_self);

				/*
				 * TODO: construct a tid bitmap based on index tids
				 * and fetch heap tids in order afterwards.  That will
				 * also allow validating if a heap tid appears twice
				 * in a unique index.
				 */
				if (!heap_release_fetch(hrel, SnapshotAny, &htup,
										&hbuf, true, NULL))
				{
					/* block numbers are unsigned: print with %u */
					elog(ERROR, "btvalidatevacuum: tid (%u,%d) from index %s "
						 "not found in heap %s",
						 ItemPointerGetBlockNumber(&itup->t_tid),
						 ItemPointerGetOffsetNumber(&itup->t_tid),
						 RelationGetRelationName(irel),
						 RelationGetRelationName(hrel));
				}

				switch (HeapTupleSatisfiesVacuum(hrel, htup.t_data,
												 oldest_xmin, hbuf))
				{
					case HEAPTUPLE_RECENTLY_DEAD:
					case HEAPTUPLE_LIVE:
					case HEAPTUPLE_INSERT_IN_PROGRESS:
					case HEAPTUPLE_DELETE_IN_PROGRESS:
						/* these tuples are considered alive by vacuum */
						break;
					case HEAPTUPLE_DEAD:
						elog(ERROR, "btvalidatevacuum: vacuum did not remove "
							 "dead tuple (%u,%d) from heap %s and index %s",
							 ItemPointerGetBlockNumber(&itup->t_tid),
							 ItemPointerGetOffsetNumber(&itup->t_tid),
							 RelationGetRelationName(hrel),
							 RelationGetRelationName(irel));
						break;
					default:
						elog(ERROR, "btvalidatevacuum: invalid visibility");
						break;
				}

				/*
				 * Catalog-specific cross checks of key columns against the
				 * heap tuple.
				 */
				switch (RelationGetRelid(irel))
				{
					case DatabaseOidIndexId:
					case TypeOidIndexId:
					case ClassOidIndexId:
					case ConstraintOidIndexId:
						/* first index column must equal the heap tuple's OID */
						hoid = HeapTupleGetOid(&htup);
						ioid = index_getattr(itup, 1, RelationGetDescr(irel),
											 &isnull);
						if (hoid != ioid)
						{
							/* OIDs are unsigned: print with %u */
							elog(ERROR,
								 "btvalidatevacuum: index oid(%u) != heap oid(%u)"
								 " tuple (%u,%d) index %s", ioid, hoid,
								 ItemPointerGetBlockNumber(&itup->t_tid),
								 ItemPointerGetOffsetNumber(&itup->t_tid),
								 RelationGetRelationName(irel));
						}
						break;
					case GpRelationNodeOidIndexId:
						{
							int4		hsegno;
							int4		isegno;

							/* column 1 of index and heap must agree ... */
							hoid = heap_getattr(&htup, 1,
												RelationGetDescr(hrel),
												&isnull);
							ioid = index_getattr(itup, 1,
												 RelationGetDescr(irel),
												 &isnull);
							if (hoid != ioid)
							{
								elog(ERROR,
									 "btvalidatevacuum: index oid(%u) != heap oid(%u)"
									 " tuple (%u,%d) index %s", ioid, hoid,
									 ItemPointerGetBlockNumber(&itup->t_tid),
									 ItemPointerGetOffsetNumber(&itup->t_tid),
									 RelationGetRelationName(irel));
							}

							/* ... and so must column 2 (segment number) */
							hsegno = heap_getattr(&htup, 2,
												  RelationGetDescr(hrel),
												  &isnull);
							isegno = index_getattr(itup, 2,
												   RelationGetDescr(irel),
												   &isnull);
							if (isegno != hsegno)
							{
								elog(ERROR,
									 "btvalidatevacuum: index segno(%d) != heap segno(%d)"
									 " tuple (%u,%d) index %s", isegno, hsegno,
									 ItemPointerGetBlockNumber(&itup->t_tid),
									 ItemPointerGetOffsetNumber(&itup->t_tid),
									 RelationGetRelationName(irel));
							}
						}
						break;
					default:
						break;
				}

				/* indexes in the AO segment namespace: column 1 must match */
				if (RelationGetNamespace(irel) == PG_AOSEGMENT_NAMESPACE)
				{
					int4		isegno = index_getattr(itup, 1,
													   RelationGetDescr(irel),
													   &isnull);
					int4		hsegno = heap_getattr(&htup, 1,
													  RelationGetDescr(hrel),
													  &isnull);

					if (isegno != hsegno)
					{
						elog(ERROR,
							 "btvalidatevacuum: index segno(%d) != heap segno(%d)"
							 " tuple (%u,%d) index %s", isegno, hsegno,
							 ItemPointerGetBlockNumber(&itup->t_tid),
							 ItemPointerGetOffsetNumber(&itup->t_tid),
							 RelationGetRelationName(irel));
					}
				}
			}
		}

		/* done with this index page; drop the pin taken by ReadBuffer() */
		ReleaseBuffer(ibuf);
	}

	/* heap_release_fetch() was asked to keep its buffer; drop that pin now */
	if (BufferIsValid(hbuf))
		ReleaseBuffer(hbuf);
	MIRROREDLOCK_BUFMGR_UNLOCK;
}
/*---------- * _bt_compare() -- Compare scankey to a particular tuple on the page. * * keysz: number of key conditions to be checked (might be less than the * total length of the scan key!) * page/offnum: location of btree item to be compared to. * * This routine returns: * <0 if scankey < tuple at offnum; * 0 if scankey == tuple at offnum; * >0 if scankey > tuple at offnum. * NULLs in the keys are treated as sortable values. Therefore * "equality" does not necessarily mean that the item should be * returned to the caller as a matching key! * * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be * "minus infinity": this routine will always claim it is less than the * scankey. The actual key value stored (if any, which there probably isn't) * does not matter. This convention allows us to implement the Lehman and * Yao convention that the first down-link pointer is before the first key. * See backend/access/nbtree/README for details. *---------- */ int32 _bt_compare(Relation rel, int keysz, ScanKey scankey, Page page, OffsetNumber offnum) { TupleDesc itupdesc = RelationGetDescr(rel); BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); BTItem btitem; IndexTuple itup; int i; /* * Force result ">" if target item is first data item on an internal * page --- see NOTE above. */ if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque)) return 1; btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); itup = &(btitem->bti_itup); /* * The scan key is set up with the attribute number associated with * each term in the key. It is important that, if the index is * multi-key, the scan contain the first k key attributes, and that * they be in order. If you think about how multi-key ordering works, * you'll understand why this is. * * We don't test for violation of this condition here, however. The * initial setup for the index scan had better have gotten it right * (see _bt_first). 
*/ for (i = 0; i < keysz; i++) { ScanKey entry = &scankey[i]; Datum datum; bool isNull; int32 result; datum = index_getattr(itup, entry->sk_attno, itupdesc, &isNull); /* see comments about NULLs handling in btbuild */ if (entry->sk_flags & SK_ISNULL) /* key is NULL */ { if (isNull) result = 0; /* NULL "=" NULL */ else result = 1; /* NULL ">" NOT_NULL */ } else if (isNull) /* key is NOT_NULL and item is NULL */ { result = -1; /* NOT_NULL "<" NULL */ } else { result = DatumGetInt32(FunctionCall2(&entry->sk_func, entry->sk_argument, datum)); } /* if the keys are unequal, return the difference */ if (result != 0) return result; } /* if we get here, the keys are equal */ return 0; }
/*
 * btvacuumpage --- VACUUM one page
 *
 * This processes a single page for btvacuumscan().  In some cases we
 * must go back and re-examine previously-scanned pages; this routine
 * recurses when necessary to handle that case.
 *
 * vstate is the per-scan state; this routine reads its info, stats,
 * callback, callback_state, cycleid and pagedelcontext fields and updates
 * totFreePages, lastBlockLocked, lastBlockVacuumed and the counters in
 * *stats.
 *
 * blkno is the page to process.  orig_blkno is the highest block number
 * reached by the outer btvacuumscan loop (the same as blkno, unless we
 * are recursing to re-examine a previous page).
 *
 * The buffer for the page is always released before returning or before
 * looping back to "restart": either directly via _bt_relbuf(), or by
 * _bt_pagedel() when page deletion is attempted.
 */
static void
btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
{
	IndexVacuumInfo *info = vstate->info;
	IndexBulkDeleteResult *stats = vstate->stats;
	IndexBulkDeleteCallback callback = vstate->callback;
	void	   *callback_state = vstate->callback_state;
	Relation	rel = info->index;
	bool		delete_now;
	BlockNumber recurse_to;
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque = NULL;

	/* Re-entry point for the hand-coded tail recursion at the bottom. */
restart:
	delete_now = false;
	recurse_to = P_NONE;

	/* call vacuum_delay_point while not holding any buffer lock */
	vacuum_delay_point();

	/*
	 * We can't use _bt_getbuf() here because it always applies
	 * _bt_checkpage(), which will barf on an all-zero page. We want to
	 * recycle all-zero pages, not fail.  Also, we want to use a nondefault
	 * buffer access strategy.
	 */
	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
							 info->strategy);
	LockBuffer(buf, BT_READ);
	page = BufferGetPage(buf);

	/*
	 * opaque is only (re)assigned for non-new pages.
	 * NOTE(review): the tests below dereference opaque only after
	 * _bt_page_recyclable(page) has returned false; this presumably relies
	 * on _bt_page_recyclable() returning true for every new page -- confirm
	 * against its definition.
	 */
	if (!PageIsNew(page))
	{
		_bt_checkpage(rel, buf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	}

	/*
	 * If we are recursing, the only case we want to do anything with is a
	 * live leaf page having the current vacuum cycle ID.  Any other state
	 * implies we already saw the page (eg, deleted it as being empty).
	 */
	if (blkno != orig_blkno)
	{
		if (_bt_page_recyclable(page) ||
			P_IGNORE(opaque) ||
			!P_ISLEAF(opaque) ||
			opaque->btpo_cycleid != vstate->cycleid)
		{
			_bt_relbuf(rel, buf);
			return;
		}
	}

	/* Page is valid, see what to do with it */
	if (_bt_page_recyclable(page))
	{
		/* Okay to recycle this page */
		RecordFreeIndexPage(rel, blkno);
		vstate->totFreePages++;
		stats->pages_deleted++;
	}
	else if (P_ISDELETED(opaque))
	{
		/* Already deleted, but can't recycle yet */
		stats->pages_deleted++;
	}
	else if (P_ISHALFDEAD(opaque))
	{
		/* Half-dead, try to delete */
		delete_now = true;
	}
	else if (P_ISLEAF(opaque))
	{
		/* offsets of the items the callback asked us to remove */
		OffsetNumber deletable[MaxOffsetNumber];
		int			ndeletable;
		OffsetNumber offnum,
					minoff,
					maxoff;

		/*
		 * Trade in the initial read lock for a super-exclusive write lock on
		 * this page.  We must get such a lock on every leaf page over the
		 * course of the vacuum scan, whether or not it actually contains any
		 * deletable tuples --- see nbtree/README.
		 */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		LockBufferForCleanup(buf);

		/*
		 * Remember highest leaf page number we've taken cleanup lock on; see
		 * notes in btvacuumscan
		 */
		if (blkno > vstate->lastBlockLocked)
			vstate->lastBlockLocked = blkno;

		/*
		 * Check whether we need to recurse back to earlier pages.  What we
		 * are concerned about is a page split that happened since we started
		 * the vacuum scan.  If the split moved some tuples to a lower page
		 * then we might have missed 'em.  If so, set up for tail recursion.
		 * (Must do this before possibly clearing btpo_cycleid below!)
		 */
		if (vstate->cycleid != 0 &&
			opaque->btpo_cycleid == vstate->cycleid &&
			!(opaque->btpo_flags & BTP_SPLIT_END) &&
			!P_RIGHTMOST(opaque) &&
			opaque->btpo_next < orig_blkno)
			recurse_to = opaque->btpo_next;

		/*
		 * Scan over all items to see which ones need to be deleted according
		 * to the callback function.  With no callback, nothing is collected
		 * and the page's items are only counted below.
		 */
		ndeletable = 0;
		minoff = P_FIRSTDATAKEY(opaque);
		maxoff = PageGetMaxOffsetNumber(page);
		if (callback)
		{
			for (offnum = minoff;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				IndexTuple	itup;
				ItemPointer htup;

				itup = (IndexTuple) PageGetItem(page,
												PageGetItemId(page, offnum));
				/* the callback is handed the heap TID stored in the item */
				htup = &(itup->t_tid);

				/*
				 * During Hot Standby we currently assume that
				 * XLOG_BTREE_VACUUM records do not produce conflicts. That is
				 * only true as long as the callback function depends only
				 * upon whether the index tuple refers to heap tuples removed
				 * in the initial heap scan. When vacuum starts it derives a
				 * value of OldestXmin. Backends taking later snapshots could
				 * have a RecentGlobalXmin with a later xid than the vacuum's
				 * OldestXmin, so it is possible that row versions deleted
				 * after OldestXmin could be marked as killed by other
				 * backends. The callback function *could* look at the index
				 * tuple state in isolation and decide to delete the index
				 * tuple, though currently it does not. If it ever did, we
				 * would need to reconsider whether XLOG_BTREE_VACUUM records
				 * should cause conflicts. If they did cause conflicts they
				 * would be fairly harsh conflicts, since we haven't yet
				 * worked out a way to pass a useful value for
				 * latestRemovedXid on the XLOG_BTREE_VACUUM records. This
				 * applies to *any* type of index that marks index tuples as
				 * killed.
				 */
				if (callback(htup, callback_state))
					deletable[ndeletable++] = offnum;
			}
		}

		/*
		 * Apply any needed deletes.  We issue just one _bt_delitems_vacuum()
		 * call per page, so as to minimize WAL traffic.
		 */
		if (ndeletable > 0)
		{
			/*
			 * Notice that the issued XLOG_BTREE_VACUUM WAL record includes
			 * all information to the replay code to allow it to get a cleanup
			 * lock on all pages between the previous lastBlockVacuumed and
			 * this page. This ensures that WAL replay locks all leaf pages at
			 * some point, which is important should non-MVCC scans be
			 * requested. This is currently unused on standby, but we record
			 * it anyway, so that the WAL contains the required information.
			 *
			 * Since we can visit leaf pages out-of-order when recursing,
			 * replay might end up locking such pages an extra time, but it
			 * doesn't seem worth the amount of bookkeeping it'd take to avoid
			 * that.
			 */
			_bt_delitems_vacuum(rel, buf, deletable, ndeletable,
								vstate->lastBlockVacuumed);

			/*
			 * Remember highest leaf page number we've issued a
			 * XLOG_BTREE_VACUUM WAL record for.
			 */
			if (blkno > vstate->lastBlockVacuumed)
				vstate->lastBlockVacuumed = blkno;

			stats->tuples_removed += ndeletable;
			/* must recompute maxoff */
			maxoff = PageGetMaxOffsetNumber(page);
		}
		else
		{
			/*
			 * If the page has been split during this vacuum cycle, it seems
			 * worth expending a write to clear btpo_cycleid even if we don't
			 * have any deletions to do.  (If we do, _bt_delitems_vacuum takes
			 * care of this.)  This ensures we won't process the page again.
			 *
			 * We treat this like a hint-bit update because there's no need to
			 * WAL-log it.
			 */
			if (vstate->cycleid != 0 &&
				opaque->btpo_cycleid == vstate->cycleid)
			{
				opaque->btpo_cycleid = 0;
				MarkBufferDirtyHint(buf, true);
			}
		}

		/*
		 * If it's now empty, try to delete; else count the live tuples. We
		 * don't delete when recursing, though, to avoid putting entries into
		 * freePages out-of-order (doesn't seem worth any extra code to handle
		 * the case).
		 */
		if (minoff > maxoff)
			delete_now = (blkno == orig_blkno);
		else
			stats->num_index_tuples += maxoff - minoff + 1;
	}

	/*
	 * Pages that are neither recyclable, deleted, half-dead nor leaf (i.e.
	 * live internal pages) fall through all branches above untouched.
	 */

	if (delete_now)
	{
		MemoryContext oldcontext;
		int			ndel;

		/* Run pagedel in a temp context to avoid memory leakage */
		MemoryContextReset(vstate->pagedelcontext);
		oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);

		ndel = _bt_pagedel(rel, buf);

		/* count only this page, else may double-count parent */
		if (ndel)
			stats->pages_deleted++;

		MemoryContextSwitchTo(oldcontext);
		/* pagedel released buffer, so we shouldn't */
	}
	else
		_bt_relbuf(rel, buf);

	/*
	 * This is really tail recursion, but if the compiler is too stupid to
	 * optimize it as such, we'd eat an uncomfortably large amount of stack
	 * space per recursion level (due to the deletable[] array).  A failure is
	 * improbable since the number of levels isn't likely to be large ... but
	 * just in case, let's hand-optimize into a loop.
	 */
	if (recurse_to != P_NONE)
	{
		blkno = recurse_to;
		goto restart;
	}
}
/*
 * check_index_tuple_attributes -- checks the individual attributes of one
 * index tuple.
 *
 * Walks the attributes described by rel->rd_att over the raw bytes of the
 * tuple at line pointer 'offnum', verifying that every attribute's length
 * is sane and that no attribute extends past the end of the tuple
 * (lp_off + lp_len).
 *
 * rel:		index relation; its tuple descriptor drives the walk
 * header:	page header, used for the line pointer array and pd_special
 * block:	block number, used only in the messages
 * offnum:	1-based line pointer number of the tuple to check
 * buffer:	start of the page image; all offsets are relative to it
 * dlen:	NOTE(review): presumably the size of the tuple's data area as
 *			computed by the caller -- confirm.  When it is not positive the
 *			overflow check is skipped and the offset is not advanced; when it
 *			is 0 for the first data key of a non-leaf page the tuple is
 *			skipped entirely.
 *
 * Returns the number of problems found (each is also reported as a WARNING).
 */
uint32
check_index_tuple_attributes(Relation rel, PageHeader header, int block,
							 OffsetNumber offnum, char *buffer, int dlen)
{
	IndexTuple	tuple;
	uint32		nerrs = 0;
	int			j,
				off;
	int			tuple_end;
	bits8	   *bitmap;
	BTPageOpaque opaque;
	ItemId		linp;

	ereport(DEBUG2,
			(errmsg("[%d:%d] checking attributes for the tuple",
					block, offnum)));

	/* get the index tuple and info about the page */
	linp = &header->pd_linp[offnum - 1];
	tuple = (IndexTuple) (buffer + linp->lp_off);
	opaque = (BTPageOpaque) (buffer + header->pd_special);

	/* first byte past the tuple, as an offset into the page */
	tuple_end = linp->lp_off + linp->lp_len;

	/* current attribute offset - always starts at (buffer + off) */
	off = linp->lp_off + IndexInfoFindDataOffset(tuple->t_info);

	ereport(DEBUG3,
			(errmsg("[%d:%d] tuple has %d attributes",
					block, offnum, RelationGetNumberOfAttributes(rel))));

	/* XXX: MAXALIGN */
	bitmap = (bits8 *) (buffer + linp->lp_off + sizeof(IndexTupleData));

	/*
	 * TODO This is mostly copy'n'paste from check_heap_tuple_attributes, so
	 * maybe it could be refactored to share the code.
	 */

	/*
	 * For non-leaf pages, the first data tuple may or may not actually have
	 * any data.  See src/backend/access/nbtree/README, "Notes About Data
	 * Representation".
	 */
	if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque) && dlen == 0)
	{
		ereport(DEBUG3,
				(errmsg("[%d:%d] first data key tuple on non-leaf block => no data, skipping",
						block, offnum)));
		return nerrs;
	}

	/* check all the index attributes */
	for (j = 0; j < rel->rd_att->natts; j++)
	{
		/* default length of the attribute */
		int			len = rel->rd_att->attrs[j]->attlen;

		/* copy from src/backend/commands/analyze.c */
		bool		is_varlena = (!rel->rd_att->attrs[j]->attbyval && len == -1);

		/* reached only when not varlena, thus it's "len = -2" (cstring) */
		bool		is_varwidth = (!rel->rd_att->attrs[j]->attbyval && len < 0);

		/*
		 * if the attribute is marked as NULL (in the tuple header), skip to
		 * the next attribute
		 */
		if (IndexTupleHasNulls(tuple) && att_isnull(j, bitmap))
		{
			ereport(DEBUG3,
					(errmsg("[%d:%d] attribute '%s' is NULL (skipping)",
							block, offnum,
							rel->rd_att->attrs[j]->attname.data)));
			continue;
		}

		/* fix the alignment (see src/include/access/tupmacs.h) */
		off = att_align_pointer(off, rel->rd_att->attrs[j]->attalign,
								rel->rd_att->attrs[j]->attlen, buffer + off);

		if (is_varlena)
		{
			/*
			 * other interesting macros (see postgres.h) - should do
			 * something about those ...
			 *
			 * VARATT_IS_COMPRESSED(PTR)	VARATT_IS_4B_C(PTR)
			 * VARATT_IS_EXTERNAL(PTR)		VARATT_IS_1B_E(PTR)
			 * VARATT_IS_SHORT(PTR)			VARATT_IS_1B(PTR)
			 * VARATT_IS_EXTENDED(PTR)		(!VARATT_IS_4B_U(PTR))
			 */

			len = VARSIZE_ANY(buffer + off);

			if (len < 0)
			{
				ereport(WARNING,
						(errmsg("[%d:%d] attribute '%s' has negative length < 0 (%d)",
								block, offnum,
								rel->rd_att->attrs[j]->attname.data, len)));
				++nerrs;
				break;
			}

			if (VARATT_IS_COMPRESSED(buffer + off))
			{
				/*
				 * The raw (uncompressed) length must not exceed 1GB.  Held
				 * in an unsigned variable, so a value with the top bit set
				 * is rejected by the same test.
				 */
				uint32		rawsize = VARRAWSIZE_4B_C(buffer + off);

				if (rawsize > 1024 * 1024 * 1024)
				{
					ereport(WARNING,
							(errmsg("[%d:%d] attribute '%s' has invalid length %u (should be between 0 and 1G)",
									block, offnum,
									rel->rd_att->attrs[j]->attname.data,
									rawsize)));
					++nerrs;

					/*
					 * no break here, this does not break the page structure -
					 * we may check the other attributes
					 */
				}
			}

			/* FIXME Check if the varlena value may be detoasted. */
		}
		else if (is_varwidth)
		{
			/*
			 * C-string: measure it, looking no further than the end of the
			 * tuple; +1 because strnlen() does not count the terminating
			 * '\0'.  An unterminated string therefore yields "remaining
			 * space + 1" and is caught by the overflow check below.  If the
			 * attribute already starts at or past the tuple end there is
			 * nothing to scan; report a length of 1 so that the overflow
			 * check fires as well.
			 */
			int			remaining = tuple_end - off;

			if (remaining > 0)
				len = (int) strnlen(buffer + off, remaining) + 1;
			else
				len = 1;
		}

		/*
		 * Check if the length makes sense (is not negative and does not
		 * overflow the tuple end), stop validating the other attributes (we
		 * don't know where to continue anyway).
		 */
		if ((dlen > 0) && (off + len > tuple_end))
		{
			ereport(WARNING,
					(errmsg("[%d:%d] attribute '%s' (off=%d len=%d) overflows tuple end (off=%d, len=%d)",
							block, offnum,
							rel->rd_att->attrs[j]->attname.data,
							off, len, linp->lp_off, linp->lp_len)));
			++nerrs;
			break;
		}

		/* skip to the next attribute */
		off += (dlen > 0) ? len : 0;

		ereport(DEBUG3,
				(errmsg("[%d:%d] attribute '%s' len=%d",
						block, offnum,
						rel->rd_att->attrs[j]->attname.data, len)));
	}

	ereport(DEBUG3,
			(errmsg("[%d:%d] last attribute ends at %d, tuple ends at %d",
					block, offnum, off, linp->lp_off + linp->lp_len)));

	/*
	 * after the last attribute, the (aligned) offset must not be past the
	 * end of the tuple
	 */
	if (MAXALIGN(off) > linp->lp_off + linp->lp_len)
	{
		ereport(WARNING,
				(errmsg("[%d:%d] the last attribute ends at %d but the tuple ends at %d",
						block, offnum, off, linp->lp_off + linp->lp_len)));
		++nerrs;
	}

	return nerrs;
}