/*
 * Mask a btree page before performing consistency checks on it.
 *
 * Used by WAL consistency checking: fields that may legitimately differ
 * between master and replayed pages (LSN, checksum, hint bits, unused
 * space, un-logged flag bits) are masked out before comparison.
 *
 * blkno is accepted for signature compatibility with other mask
 * functions; it is not used in this routine.
 */
void
btree_mask(char *pagedata, BlockNumber blkno)
{
	Page		page = (Page) pagedata;
	BTPageOpaque maskopaq;

	mask_page_lsn_and_checksum(page);

	mask_page_hint_bits(page);
	mask_unused_space(page);

	maskopaq = (BTPageOpaque) PageGetSpecialPointer(page);

	if (P_ISDELETED(maskopaq))
	{
		/*
		 * Mask page content on a DELETED page since it will be re-initialized
		 * during replay. See btree_xlog_unlink_page() for details.
		 */
		mask_page_content(page);
	}
	else if (P_ISLEAF(maskopaq))
	{
		/*
		 * In btree leaf pages, it is possible to modify the LP_FLAGS without
		 * emitting any WAL record. Hence, mask the line pointer flags. See
		 * _bt_killitems(), _bt_check_unique() for details.
		 */
		mask_lp_flags(page);
	}

	/*
	 * BTP_HAS_GARBAGE is just an un-logged hint bit. So, mask it. See
	 * _bt_killitems(), _bt_check_unique() for details.
	 */
	maskopaq->btpo_flags &= ~BTP_HAS_GARBAGE;

	/*
	 * During replay of a btree page split, we don't set the BTP_SPLIT_END
	 * flag of the right sibling and initialize the cycle_id to 0 for the same
	 * page. See btree_xlog_split() for details.
	 */
	maskopaq->btpo_flags &= ~BTP_SPLIT_END;
	maskopaq->btpo_cycleid = 0;
}
/*
 * Add an item to a page being built.
 *
 * The main difference between this routine and a bare PageAddItem call
 * is that this code knows that the leftmost data item on a non-leaf
 * btree page doesn't need to have a key.  Therefore, it strips such
 * items down to just the item header.
 *
 * This is almost like nbtinsert.c's _bt_pgaddtup(), but we can't use
 * that because it assumes that P_RIGHTMOST() will return the correct
 * answer for the page.  Here, we don't know yet if the page will be
 * rightmost.  Offset P_FIRSTKEY is always the first data key.
 */
static void
_bt_sortaddtup(Page page,
			   Size itemsize,
			   IndexTuple itup,
			   OffsetNumber itup_off)
{
	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	IndexTupleData keyless;

	/*
	 * On a non-leaf page, the first data item carries no key: replace the
	 * caller's tuple with a bare item header before insertion.
	 */
	if (itup_off == P_FIRSTKEY && !P_ISLEAF(opaque))
	{
		keyless = *itup;
		keyless.t_info = sizeof(IndexTupleData);
		itemsize = sizeof(IndexTupleData);
		itup = &keyless;
	}

	if (PageAddItem(page, (Item) itup, itemsize, itup_off,
					false, false) == InvalidOffsetNumber)
		elog(ERROR, "failed to add item to the index page");
}
/*
 * pgstat_btree_page -- check tuples in a btree page
 *
 * Counts an all-zero or recyclable (deleted/half-dead) page as entirely
 * free space; scans the items of leaf pages via pgstat_index_page();
 * internal (root/node) pages contribute nothing to the tuple stats.
 */
static void
pgstat_btree_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno,
				  BufferAccessStrategy bstrategy)
{
	Buffer		buf;
	Page		page;

	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy);
	LockBuffer(buf, BT_READ);
	page = BufferGetPage(buf);

	/* Page is valid, see what to do with it */
	if (PageIsNew(page))
	{
		/* fully empty page -- must check before touching the special area */
		stat->free_space += BLCKSZ;
	}
	else
	{
		BTPageOpaque opaque;

		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		if (opaque->btpo_flags & (BTP_DELETED | BTP_HALF_DEAD))
		{
			/* recyclable page */
			stat->free_space += BLCKSZ;
		}
		else if (P_ISLEAF(opaque))
		{
			pgstat_index_page(stat, page, P_FIRSTDATAKEY(opaque),
							  PageGetMaxOffsetNumber(page));
		}
		else
		{
			/* root or node -- intentionally not counted */
		}
	}

	/* releases both the content lock and the pin */
	_bt_relbuf(rel, buf);
}
/*
 *	_bt_search() -- Search the tree for a particular scankey,
 *		or more precisely for the first leaf page it could be on.
 *
 * Return value is a stack of parent-page pointers.  *bufP is set to the
 * address of the leaf-page buffer, which is read-locked and pinned.
 * No locks are held on the parent pages, however!
 *
 * NOTE that the returned buffer is read-locked regardless of the access
 * parameter.  However, access = BT_WRITE will allow an empty root page
 * to be created and returned.  When access = BT_READ, an empty index
 * will result in *bufP being set to InvalidBuffer.
 */
BTStack
_bt_search(Relation rel, int keysz, ScanKey scankey, Buffer *bufP, int access)
{
	BTStack		stack_in = NULL;

	/* Get the root page to start with */
	*bufP = _bt_getroot(rel, access);

	/* If index is empty and access = BT_READ, no root page is created. */
	if (!BufferIsValid(*bufP))
		return (BTStack) NULL;

	/* Loop iterates once per level descended in the tree */
	for (;;)
	{
		Page		page;
		BTPageOpaque opaque;
		OffsetNumber offnum;
		ItemId		itemid;
		BTItem		btitem;
		IndexTuple	itup;
		BlockNumber blkno;
		BlockNumber par_blkno;
		BTStack		new_stack;

		/*
		 * Race -- the page we just grabbed may have split since we read
		 * its pointer in the parent (or metapage).  If it has, we may
		 * need to move right to its new sibling.  Do that.
		 */
		*bufP = _bt_moveright(rel, *bufP, keysz, scankey, BT_READ);

		/* if this is a leaf page, we're done */
		page = BufferGetPage(*bufP);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		if (P_ISLEAF(opaque))
			break;

		/*
		 * Find the appropriate item on the internal page, and get the
		 * child page that it points to.
		 */
		offnum = _bt_binsrch(rel, *bufP, keysz, scankey);
		itemid = PageGetItemId(page, offnum);
		btitem = (BTItem) PageGetItem(page, itemid);
		itup = &(btitem->bti_itup);
		blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
		par_blkno = BufferGetBlockNumber(*bufP);

		/*
		 * We need to save the location of the index entry we chose in the
		 * parent page on a stack.  In case we split the tree, we'll use
		 * the stack to work back up to the parent page.  We also save the
		 * actual downlink (TID) to uniquely identify the index entry, in
		 * case it moves right while we're working lower in the tree.  See
		 * the paper by Lehman and Yao for how this is detected and
		 * handled.  (We use the child link to disambiguate duplicate keys
		 * in the index -- Lehman and Yao disallow duplicate keys.)
		 */
		new_stack = (BTStack) palloc(sizeof(BTStackData));
		new_stack->bts_blkno = par_blkno;
		new_stack->bts_offset = offnum;
		memcpy(&new_stack->bts_btitem, btitem, sizeof(BTItemData));
		new_stack->bts_parent = stack_in;

		/*
		 * drop the read lock on the parent page, acquire one on the child.
		 * (Releasing before acquiring is what makes this Lehman-Yao style:
		 * no lock coupling; the moveright at loop top repairs any race.)
		 */
		_bt_relbuf(rel, *bufP);
		*bufP = _bt_getbuf(rel, blkno, BT_READ);

		/* okay, all set to move down a level */
		stack_in = new_stack;
	}

	return stack_in;
}
/*----------
 *	_bt_compare() -- Compare scankey to a particular tuple on the page.
 *
 *	keysz: number of key conditions to be checked (might be less than the
 *	total length of the scan key!)
 *	page/offnum: location of btree item to be compared to.
 *
 *	This routine returns:
 *		<0 if scankey < tuple at offnum;
 *		 0 if scankey == tuple at offnum;
 *		>0 if scankey > tuple at offnum.
 *	NULLs in the keys are treated as sortable values.  Therefore
 *	"equality" does not necessarily mean that the item should be
 *	returned to the caller as a matching key!
 *
 * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
 * "minus infinity": this routine will always claim it is less than the
 * scankey.  The actual key value stored (if any, which there probably isn't)
 * does not matter.  This convention allows us to implement the Lehman and
 * Yao convention that the first down-link pointer is before the first key.
 * See backend/access/nbtree/README for details.
 *----------
 */
int32
_bt_compare(Relation rel,
			int keysz,
			ScanKey scankey,
			Page page,
			OffsetNumber offnum)
{
	TupleDesc	itupdesc = RelationGetDescr(rel);
	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	BTItem		btitem;
	IndexTuple	itup;
	int			i;

	/*
	 * Force result ">" if target item is first data item on an internal
	 * page --- see NOTE above.
	 */
	if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
		return 1;

	btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
	itup = &(btitem->bti_itup);

	/*
	 * The scan key is set up with the attribute number associated with
	 * each term in the key.  It is important that, if the index is
	 * multi-key, the scan contain the first k key attributes, and that
	 * they be in order.  If you think about how multi-key ordering works,
	 * you'll understand why this is.
	 *
	 * We don't test for violation of this condition here, however.  The
	 * initial setup for the index scan had better have gotten it right
	 * (see _bt_first).
	 */
	for (i = 0; i < keysz; i++)
	{
		ScanKey		entry = &scankey[i];
		Datum		datum;
		bool		isNull;
		int32		result;

		datum = index_getattr(itup, entry->sk_attno, itupdesc, &isNull);

		/* see comments about NULLs handling in btbuild */
		if (entry->sk_flags & SK_ISNULL)	/* key is NULL */
		{
			if (isNull)
				result = 0;		/* NULL "=" NULL */
			else
				result = 1;		/* NULL ">" NOT_NULL */
		}
		else if (isNull)		/* key is NOT_NULL and item is NULL */
		{
			result = -1;		/* NOT_NULL "<" NULL */
		}
		else
		{
			/*
			 * sk_argument is passed as the left argument so the 3-way
			 * comparison proc directly yields scankey-vs-tuple ordering,
			 * matching the return contract documented above.
			 */
			result = DatumGetInt32(FunctionCall2(&entry->sk_func,
												 entry->sk_argument,
												 datum));
		}

		/* if the keys are unequal, return the difference */
		if (result != 0)
			return result;
	}

	/* if we get here, the keys are equal */
	return 0;
}
/*
 * check_index_page
 *		Verify one page of a btree index, returning the number of
 *		problems found (0 means the page passed all checks).
 *
 * Block BTREE_METAPAGE is validated as a metapage (magic/version);
 * other blocks are regular index pages with a BTPageOpaque in the
 * special space.
 *
 * Fix over the original: when pd_special fails the special-space sanity
 * check, we now return immediately instead of dereferencing a
 * BTPageOpaque pointer computed from the corrupt pd_special (which could
 * read outside the page buffer).
 */
uint32
check_index_page(Relation rel, PageHeader header, char *buffer, int block)
{
	uint32		nerrs = 0;
	BTPageOpaque opaque = NULL;

	/* check basic page header */
	nerrs += check_page_header(header, block);

	/* (block==0) means it's a meta-page, otherwise it's a regular index-page */
	if (block == BTREE_METAPAGE)
	{
		BTMetaPageData *mpdata = BTPageGetMeta(buffer);

		ereport(DEBUG2,
				(errmsg("[%d] is a meta-page [magic=%d, version=%d]",
						block, mpdata->btm_magic, mpdata->btm_version)));

		if (mpdata->btm_magic != BTREE_MAGIC)
		{
			ereport(WARNING,
					(errmsg("[%d] metapage contains invalid magic number %d (should be %d)",
							block, mpdata->btm_magic, BTREE_MAGIC)));
			nerrs++;
		}

		if (mpdata->btm_version != BTREE_VERSION)
		{
			ereport(WARNING,
					(errmsg("[%d] metapage contains invalid version %d (should be %d)",
							block, mpdata->btm_version, BTREE_VERSION)));
			nerrs++;
		}

		/* FIXME Check that the btm_root/btm_fastroot is between 1 and number of index blocks */
		/* FIXME Check that the btm_level/btm_fastlevel is equal to the level fo the root block */
	}
	else
	{
		/* check there's enough space for index-relevant data */
		if (header->pd_special > BLCKSZ - sizeof(BTPageOpaque))
		{
			ereport(WARNING,
					(errmsg("[%d] there's not enough special space for index data (%d > %d)",
							block, (int) sizeof(BTPageOpaque),
							BLCKSZ - header->pd_special)));
			nerrs++;

			/*
			 * pd_special is bogus, so a BTPageOpaque built from it would lie
			 * (partly) outside the page.  Don't dereference it; report what
			 * we found so far.
			 */
			return nerrs;
		}

		opaque = (BTPageOpaque) (buffer + header->pd_special);

		/*
		 * if the page is a leaf page, then level needs to be 0. Otherwise,
		 * it should be > 0. Deleted pages don't have a level, the level
		 * field is interleaved with an xid.
		 */
		if (!P_ISDELETED(opaque))
		{
			if (P_ISLEAF(opaque))
			{
				if (opaque->btpo.level != 0)
				{
					ereport(WARNING,
							(errmsg("[%d] is leaf page, but level %d is not zero",
									block, opaque->btpo.level)));
					nerrs++;
				}
			}
			else
			{
				if (opaque->btpo.level == 0)
				{
					ereport(WARNING,
							(errmsg("[%d] is a non-leaf page, but level is zero",
									block)));
					nerrs++;
				}
			}
		}
	}

	return nerrs;
}
/*
 * For a newly inserted heap tid, check if an entry with this tid
 * already exists in a unique index. If it does, abort the inserting
 * transaction.
 *
 * Scans every leaf page of the index sequentially (starting just past
 * the metapage) and errors out on the first matching TID.
 *
 * Fixes over the original:
 *	- all-zero (PageIsNew) pages are now skipped entirely; previously
 *	  P_ISLEAF() was applied to their (meaningless) special pointer even
 *	  though _bt_checkpage() was skipped for them.
 *	- an OIDOID key attribute is extracted with DatumGetObjectId rather
 *	  than DatumGetInt32.
 *
 * NOTE(review): pages are read with only a buffer pin, no content lock;
 * presumably callers guarantee no concurrent writers -- verify.
 */
static void
_bt_validate_tid(Relation irel, ItemPointer h_tid)
{
	MIRROREDLOCK_BUFMGR_DECLARE;

	BlockNumber blkno;
	BlockNumber num_pages;
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque;
	IndexTuple	itup;
	OffsetNumber maxoff,
				minoff,
				offnum;

	elog(DEBUG1, "validating tid (%d,%d) for index (%s)",
		 ItemPointerGetBlockNumber(h_tid),
		 ItemPointerGetOffsetNumber(h_tid),
		 RelationGetRelationName(irel));

	blkno = BTREE_METAPAGE + 1;
	num_pages = RelationGetNumberOfBlocks(irel);

	MIRROREDLOCK_BUFMGR_LOCK;

	for (; blkno < num_pages; blkno++)
	{
		buf = ReadBuffer(irel, blkno);
		page = BufferGetPage(buf);

		/*
		 * A newly extended, never-initialized page has no valid special
		 * area, so the BTPageOpaque flags must not be consulted on it.
		 */
		if (PageIsNew(page))
		{
			ReleaseBuffer(buf);
			continue;
		}

		_bt_checkpage(irel, buf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);

		if (P_ISLEAF(opaque))
		{
			minoff = P_FIRSTDATAKEY(opaque);
			maxoff = PageGetMaxOffsetNumber(page);

			for (offnum = minoff;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				itup = (IndexTuple) PageGetItem(page,
												PageGetItemId(page, offnum));

				if (ItemPointerEquals(&itup->t_tid, h_tid))
				{
					Form_pg_attribute key_att = RelationGetDescr(irel)->attrs[0];
					Oid			key = InvalidOid;
					bool		isnull;

					if (key_att->atttypid == OIDOID)
					{
						/* Oid attribute: use the Oid conversion macro */
						key = DatumGetObjectId(
							index_getattr(itup, 1, RelationGetDescr(irel),
										  &isnull));
						elog(ERROR, "found tid (%d,%d), %s (%d) already in index (%s)",
							 ItemPointerGetBlockNumber(h_tid),
							 ItemPointerGetOffsetNumber(h_tid),
							 NameStr(key_att->attname), key,
							 RelationGetRelationName(irel));
					}
					else
					{
						elog(ERROR, "found tid (%d,%d) already in index (%s)",
							 ItemPointerGetBlockNumber(h_tid),
							 ItemPointerGetOffsetNumber(h_tid),
							 RelationGetRelationName(irel));
					}
				}
			}
		}
		ReleaseBuffer(buf);
	}

	MIRROREDLOCK_BUFMGR_UNLOCK;
}
/*
 * btvacuumpage --- VACUUM one page
 *
 * This processes a single page for btvacuumscan().  In some cases we
 * must go back and re-examine previously-scanned pages; this routine
 * recurses when necessary to handle that case.
 *
 * blkno is the page to process.  orig_blkno is the highest block number
 * reached by the outer btvacuumscan loop (the same as blkno, unless we
 * are recursing to re-examine a previous page).
 */
static void
btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
{
	MIRROREDLOCK_BUFMGR_DECLARE;

	IndexVacuumInfo *info = vstate->info;
	IndexBulkDeleteResult *stats = vstate->stats;
	IndexBulkDeleteCallback callback = vstate->callback;
	void	   *callback_state = vstate->callback_state;
	Relation	rel = info->index;
	bool		delete_now;
	BlockNumber recurse_to;
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque;

restart:
	delete_now = false;
	recurse_to = P_NONE;

	/* call vacuum_delay_point while not holding any buffer lock */
	vacuum_delay_point();

	/*
	 * We can't use _bt_getbuf() here because it always applies
	 * _bt_checkpage(), which will barf on an all-zero page. We want to
	 * recycle all-zero pages, not fail.  Also, we want to use a nondefault
	 * buffer access strategy.
	 */
	// -------- MirroredLock ----------
	MIRROREDLOCK_BUFMGR_LOCK;

	buf = ReadBufferWithStrategy(rel, blkno, info->strategy);
	LockBuffer(buf, BT_READ);
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	if (!PageIsNew(page))
		_bt_checkpage(rel, buf);

	/*
	 * If we are recursing, the only case we want to do anything with is a
	 * live leaf page having the current vacuum cycle ID.  Any other state
	 * implies we already saw the page (eg, deleted it as being empty). In
	 * particular, we don't want to risk adding it to freePages twice.
	 */
	if (blkno != orig_blkno)
	{
		if (_bt_page_recyclable(page) ||
			P_IGNORE(opaque) ||
			!P_ISLEAF(opaque) ||
			opaque->btpo_cycleid != vstate->cycleid)
		{
			_bt_relbuf(rel, buf);

			MIRROREDLOCK_BUFMGR_UNLOCK;
			// -------- MirroredLock ----------

			return;
		}
	}

	/* Page is valid, see what to do with it */
	if (_bt_page_recyclable(page))
	{
		/* Okay to recycle this page */
		if (vstate->nFreePages < vstate->maxFreePages)
			vstate->freePages[vstate->nFreePages++] = blkno;
		vstate->totFreePages++;
		stats->pages_deleted++;
	}
	else if (P_ISDELETED(opaque))
	{
		/* Already deleted, but can't recycle yet */
		stats->pages_deleted++;
	}
	else if (P_ISHALFDEAD(opaque))
	{
		/* Half-dead, try to delete */
		delete_now = true;
	}
	else if (P_ISLEAF(opaque))
	{
		OffsetNumber deletable[MaxOffsetNumber];
		int			ndeletable;
		OffsetNumber offnum,
					minoff,
					maxoff;

		/*
		 * Trade in the initial read lock for a super-exclusive write lock on
		 * this page.  We must get such a lock on every leaf page over the
		 * course of the vacuum scan, whether or not it actually contains any
		 * deletable tuples --- see nbtree/README.
		 */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		LockBufferForCleanup(buf);

		/*
		 * Check whether we need to recurse back to earlier pages.  What we
		 * are concerned about is a page split that happened since we started
		 * the vacuum scan.  If the split moved some tuples to a lower page
		 * then we might have missed 'em.  If so, set up for tail recursion.
		 * (Must do this before possibly clearing btpo_cycleid below!)
		 */
		if (vstate->cycleid != 0 &&
			opaque->btpo_cycleid == vstate->cycleid &&
			!(opaque->btpo_flags & BTP_SPLIT_END) &&
			!P_RIGHTMOST(opaque) &&
			opaque->btpo_next < orig_blkno)
			recurse_to = opaque->btpo_next;

		/*
		 * Scan over all items to see which ones need deleted according to the
		 * callback function.
		 */
		ndeletable = 0;
		minoff = P_FIRSTDATAKEY(opaque);
		maxoff = PageGetMaxOffsetNumber(page);
		if (callback)
		{
			for (offnum = minoff;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				IndexTuple	itup;
				ItemPointer htup;

				itup = (IndexTuple) PageGetItem(page,
												PageGetItemId(page, offnum));
				htup = &(itup->t_tid);
				if (callback(htup, callback_state))
					deletable[ndeletable++] = offnum;
			}
		}

		/*
		 * Apply any needed deletes.  We issue just one _bt_delitems() call
		 * per page, so as to minimize WAL traffic.
		 */
		if (ndeletable > 0)
		{
			_bt_delitems(rel, buf, deletable, ndeletable, true);
			stats->tuples_removed += ndeletable;
			/* must recompute maxoff */
			maxoff = PageGetMaxOffsetNumber(page);
		}
		else
		{
			/*
			 * If the page has been split during this vacuum cycle, it seems
			 * worth expending a write to clear btpo_cycleid even if we don't
			 * have any deletions to do.  (If we do, _bt_delitems takes care
			 * of this.)  This ensures we won't process the page again.
			 *
			 * We treat this like a hint-bit update because there's no need to
			 * WAL-log it.
			 */
			if (vstate->cycleid != 0 &&
				opaque->btpo_cycleid == vstate->cycleid)
			{
				opaque->btpo_cycleid = 0;
				SetBufferCommitInfoNeedsSave(buf);
			}
		}

		/*
		 * If it's now empty, try to delete; else count the live tuples. We
		 * don't delete when recursing, though, to avoid putting entries into
		 * freePages out-of-order (doesn't seem worth any extra code to handle
		 * the case).
		 */
		if (minoff > maxoff)
			delete_now = (blkno == orig_blkno);
		else
			stats->num_index_tuples += maxoff - minoff + 1;
	}

	if (delete_now)
	{
		MemoryContext oldcontext;
		int			ndel;

		/* Run pagedel in a temp context to avoid memory leakage */
		MemoryContextReset(vstate->pagedelcontext);
		oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);

		ndel = _bt_pagedel(rel, buf, NULL, info->vacuum_full);

		/* count only this page, else may double-count parent */
		if (ndel)
			stats->pages_deleted++;

		/*
		 * During VACUUM FULL it's okay to recycle deleted pages immediately,
		 * since there can be no other transactions scanning the index.  Note
		 * that we will only recycle the current page and not any parent pages
		 * that _bt_pagedel might have recursed to; this seems reasonable in
		 * the name of simplicity.  (Trying to do otherwise would mean we'd
		 * have to sort the list of recyclable pages we're building.)
		 */
		if (ndel && info->vacuum_full)
		{
			if (vstate->nFreePages < vstate->maxFreePages)
				vstate->freePages[vstate->nFreePages++] = blkno;
			vstate->totFreePages++;
		}

		MemoryContextSwitchTo(oldcontext);
		/* pagedel released buffer, so we shouldn't */
	}
	else
		_bt_relbuf(rel, buf);

	MIRROREDLOCK_BUFMGR_UNLOCK;
	// -------- MirroredLock ----------

	/*
	 * This is really tail recursion, but if the compiler is too stupid to
	 * optimize it as such, we'd eat an uncomfortably large amount of stack
	 * space per recursion level (due to the deletable[] array).  A failure is
	 * improbable since the number of levels isn't likely to be large ... but
	 * just in case, let's hand-optimize into a loop.
	 */
	if (recurse_to != P_NONE)
	{
		blkno = recurse_to;
		goto restart;
	}
}
/*----------
 *	_bt_compare() -- Compare scankey to a particular tuple on the page.
 *
 * The passed scankey must be an insertion-type scankey (see nbtree/README),
 * but it can omit the rightmost column(s) of the index.
 *
 *	keysz: number of key conditions to be checked (might be less than the
 *		number of index columns!)
 *	page/offnum: location of btree item to be compared to.
 *
 *	This routine returns:
 *		<0 if scankey < tuple at offnum;
 *		 0 if scankey == tuple at offnum;
 *		>0 if scankey > tuple at offnum.
 *	NULLs in the keys are treated as sortable values.  Therefore
 *	"equality" does not necessarily mean that the item should be
 *	returned to the caller as a matching key!
 *
 * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
 * "minus infinity": this routine will always claim it is less than the
 * scankey.  The actual key value stored (if any, which there probably isn't)
 * does not matter.  This convention allows us to implement the Lehman and
 * Yao convention that the first down-link pointer is before the first key.
 * See backend/access/nbtree/README for details.
 *----------
 */
int32
_bt_compare(Relation rel,
			int keysz,
			ScanKey scankey,
			Page page,
			OffsetNumber offnum)
{
	TupleDesc	itupdesc = RelationGetDescr(rel);
	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	IndexTuple	itup;
	int			i;

	/*
	 * Force result ">" if target item is first data item on an internal page
	 * --- see NOTE above.
	 */
	if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
		return 1;

	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));

	/*
	 * The scan key is set up with the attribute number associated with each
	 * term in the key.  It is important that, if the index is multi-key, the
	 * scan contain the first k key attributes, and that they be in order.  If
	 * you think about how multi-key ordering works, you'll understand why
	 * this is.
	 *
	 * We don't test for violation of this condition here, however.  The
	 * initial setup for the index scan had better have gotten it right (see
	 * _bt_first).
	 */
	for (i = 1; i <= keysz; i++)
	{
		Datum		datum;
		bool		isNull;
		int32		result;

		datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);

		/* see comments about NULLs handling in btbuild */
		if (scankey->sk_flags & SK_ISNULL)	/* key is NULL */
		{
			if (isNull)
				result = 0;		/* NULL "=" NULL */
			else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
				result = -1;	/* NULL "<" NOT_NULL */
			else
				result = 1;		/* NULL ">" NOT_NULL */
		}
		else if (isNull)		/* key is NOT_NULL and item is NULL */
		{
			if (scankey->sk_flags & SK_BT_NULLS_FIRST)
				result = 1;		/* NOT_NULL ">" NULL */
			else
				result = -1;	/* NOT_NULL "<" NULL */
		}
		else
		{
			/*
			 * The sk_func needs to be passed the index value as left arg and
			 * the sk_argument as right arg (they might be of different
			 * types).  Since it is convenient for callers to think of
			 * _bt_compare as comparing the scankey to the index item, we have
			 * to flip the sign of the comparison result.  (Unless it's a DESC
			 * column, in which case we *don't* flip the sign.)
			 */
			result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
													 scankey->sk_collation,
													 datum,
													 scankey->sk_argument));

			if (!(scankey->sk_flags & SK_BT_DESC))
				result = -result;
		}

		/* if the keys are unequal, return the difference */
		if (result != 0)
			return result;

		/* keys are ordered; advance to the next scankey entry */
		scankey++;
	}

	/* if we get here, the keys are equal */
	return 0;
}
/* -------------------------------------------------
 * GetBTPageStatistics()
 *
 * Collect statistics of single b-tree page
 *
 * Fills *stat with page type, opaque-area fields, live/dead item
 * counts, free space, and average item size.  For a deleted page only
 * blkno/type/xact are filled before returning.
 * -------------------------------------------------
 */
static void
GetBTPageStatistics(block_t blkno, buf_id_t buffer, BTPageStat *stat)
{
	page_p		page = BUF_PAGE(buffer);
	struct page_hdr *phdr = (struct page_hdr *) page;
	item_id_t	maxoff = PAGE_MAX_ITEM_ID(page);
	struct bt_page_opaque *opaque = (struct bt_page_opaque *) PAGE_SPECIAL_PTR(page);
	int			item_size = 0;
	int			off;

	stat->blkno = blkno;

	/* usable space = block minus header and special area */
	stat->max_avail = BLK_SZ - (BLK_SZ - phdr->pd_special + PAGE_HDR_SZ);

	stat->dead_items = stat->live_items = 0;

	stat->page_size = PAGE_SZ(page);

	/* page type (flags) */
	if (P_ISDELETED(opaque))
	{
		/* deleted pages store an xact id where the level would be */
		stat->type = 'd';
		stat->btpo.xact = opaque->btpo.xact;
		return;
	}
	else if (P_IGNORE(opaque))
		stat->type = 'e';
	else if (P_ISLEAF(opaque))
		stat->type = 'l';
	else if (P_ISROOT(opaque))
		stat->type = 'r';
	else
		stat->type = 'i';

	/* btpage opaque data */
	stat->btpo_prev = opaque->btpo_prev;
	stat->btpo_next = opaque->btpo_next;
	stat->btpo.level = opaque->btpo.level;
	stat->btpo_flags = opaque->btpo_flags;
	stat->btpo_cycleid = opaque->btpo_cycleid;

	/* count live and dead tuples, and free space */
	for (off = FIRST_ITEM_ID; off <= maxoff; off++)
	{
		struct index_tuple *itup;

		struct item_id *id = PAGE_ITEM_ID(page, off);

		itup = (struct index_tuple *) PAGE_GET_ITEM(page, id);

		item_size += INDEX_TUPLE_SZ(itup);

		if (!ITEMID_DEAD(id))
			stat->live_items++;
		else
			stat->dead_items++;
	}
	stat->free_size = page_free_space(page);

	if ((stat->live_items + stat->dead_items) > 0)
		stat->avg_item_size = item_size / (stat->live_items + stat->dead_items);
	else
		stat->avg_item_size = 0;
}
/* -------------------------------------------------
 * GetBTPageStatistics()
 *
 * Collect statistics of single b-tree page
 *
 * Populates *stat with the page classification, opaque-area fields,
 * live/dead item tallies, free space, and average item size.  A
 * deleted page gets only blkno/type/xact filled in.
 * -------------------------------------------------
 */
static void
GetBTPageStatistics(BlockNumber blkno, Buffer buffer, BTPageStat *stat)
{
	Page		page = BufferGetPage(buffer);
	PageHeader	header = (PageHeader) page;
	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	OffsetNumber lastoff = PageGetMaxOffsetNumber(page);
	int			total_item_size = 0;
	int			nitems;
	int			off;

	stat->blkno = blkno;
	stat->max_avail = BLCKSZ - (BLCKSZ - header->pd_special + SizeOfPageHeaderData);
	stat->live_items = 0;
	stat->dead_items = 0;
	stat->page_size = PageGetPageSize(page);

	/* classify the page by its opaque flag bits */
	if (P_ISDELETED(opaque))
	{
		/* a deleted page stores an xact id in place of the level */
		stat->type = 'd';
		stat->btpo.xact = opaque->btpo.xact;
		return;
	}
	else if (P_IGNORE(opaque))
		stat->type = 'e';
	else if (P_ISLEAF(opaque))
		stat->type = 'l';
	else if (P_ISROOT(opaque))
		stat->type = 'r';
	else
		stat->type = 'i';

	/* copy the remaining opaque-area fields */
	stat->btpo_prev = opaque->btpo_prev;
	stat->btpo_next = opaque->btpo_next;
	stat->btpo.level = opaque->btpo.level;
	stat->btpo_flags = opaque->btpo_flags;
	stat->btpo_cycleid = opaque->btpo_cycleid;

	/* walk the line pointers, tallying live/dead items and their sizes */
	for (off = FirstOffsetNumber; off <= lastoff; off++)
	{
		ItemId		lp = PageGetItemId(page, off);
		IndexTuple	tuple = (IndexTuple) PageGetItem(page, lp);

		total_item_size += IndexTupleSize(tuple);

		if (ItemIdIsDead(lp))
			stat->dead_items++;
		else
			stat->live_items++;
	}

	stat->free_size = PageGetFreeSpace(page);

	nitems = stat->live_items + stat->dead_items;
	stat->avg_item_size = (nitems > 0) ? total_item_size / nitems : 0;
}
/*
 * readindex -- set-returning function that emits one row per leaf-page
 * index entry of the given btree index (skipping the metapage).
 *
 * Each page is copied into backend-local memory so no buffer pin is
 * held across calls; relations are re-opened (NoLock) on every call and
 * closed before returning each row, keeping only the first-call lock
 * until completion.
 */
Datum
readindex(PG_FUNCTION_ARGS)
{
	FuncCallContext *funcctx;
	readindexinfo *info;
	Relation	irel = NULL;
	Relation	hrel = NULL;

	MIRROREDLOCK_BUFMGR_DECLARE;

	if (SRF_IS_FIRSTCALL())
	{
		Oid			irelid = PG_GETARG_OID(0);
		TupleDesc	tupdesc;
		MemoryContext oldcontext;
		AttrNumber	outattnum;
		TupleDesc	itupdesc;
		int			i;
		AttrNumber	attno;

		irel = index_open(irelid, AccessShareLock);
		itupdesc = RelationGetDescr(irel);

		/* fixed bookkeeping columns plus one output column per key column */
		outattnum = FIXED_COLUMN + itupdesc->natts;

		funcctx = SRF_FIRSTCALL_INIT();
		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);

		tupdesc = CreateTemplateTupleDesc(outattnum, false);
		attno = 1;
		TupleDescInitEntry(tupdesc, attno++, "ictid", TIDOID, -1, 0);
		TupleDescInitEntry(tupdesc, attno++, "hctid", TIDOID, -1, 0);
		TupleDescInitEntry(tupdesc, attno++, "aotid", TEXTOID, -1, 0);
		TupleDescInitEntry(tupdesc, attno++, "istatus", TEXTOID, -1, 0);
		TupleDescInitEntry(tupdesc, attno++, "hstatus", TEXTOID, -1, 0);

		for (i = 0; i < itupdesc->natts; i++)
		{
			Form_pg_attribute attr = itupdesc->attrs[i];

			TupleDescInitEntry(tupdesc, attno++, NameStr(attr->attname),
							   attr->atttypid, attr->atttypmod, 0);
		}

		funcctx->tuple_desc = BlessTupleDesc(tupdesc);

		info = (readindexinfo *) palloc(sizeof(readindexinfo));
		funcctx->user_fctx = (void *) info;

		info->outattnum = outattnum;
		info->ireloid = irelid;

		hrel = relation_open(irel->rd_index->indrelid, AccessShareLock);

		/*
		 * Append-only ('a') and column-oriented ('c') tables cannot be
		 * fetched like heap tables; remember that by leaving hreloid
		 * invalid.
		 */
		if (hrel->rd_rel != NULL &&
			(hrel->rd_rel->relstorage == 'a' ||
			 hrel->rd_rel->relstorage == 'c'))
		{
			relation_close(hrel, AccessShareLock);
			hrel = NULL;
			info->hreloid = InvalidOid;
		}
		else
			info->hreloid = irel->rd_index->indrelid;

		info->num_pages = RelationGetNumberOfBlocks(irel);
		info->blkno = BTREE_METAPAGE + 1;
		info->page = NULL;

		MemoryContextSwitchTo(oldcontext);
	}

	funcctx = SRF_PERCALL_SETUP();
	info = (readindexinfo *) funcctx->user_fctx;

	/*
	 * Open the relations (on first call, we did that above already).
	 * We unfortunately have to look up the relcache entry on every call,
	 * because if we store it in the cross-call context, we won't get a
	 * chance to release it if the function isn't run to completion,
	 * e.g. because of a LIMIT clause. We only lock the relation on the
	 * first call, and keep the lock until completion, however.
	 */
	if (!irel)
		irel = index_open(info->ireloid, NoLock);
	if (!hrel && info->hreloid != InvalidOid)
		hrel = heap_open(info->hreloid, NoLock);

	while (info->blkno < info->num_pages)
	{
		Datum		values[255];
		bool		nulls[255];
		ItemPointerData itid;
		HeapTuple	tuple;
		Datum		result;

		if (info->page == NULL)
		{
			Buffer		buf;

			/*
			 * Make copy of the page, because we cannot hold a buffer pin
			 * across calls (we wouldn't have a chance to release it, if the
			 * function isn't run to completion.)
			 */
			info->page = palloc(BLCKSZ);

			MIRROREDLOCK_BUFMGR_LOCK;
			buf = ReadBuffer(irel, info->blkno);
			memcpy(info->page, BufferGetPage(buf), BLCKSZ);
			ReleaseBuffer(buf);
			MIRROREDLOCK_BUFMGR_UNLOCK;

			info->opaque = (BTPageOpaque) PageGetSpecialPointer(info->page);
			info->minoff = P_FIRSTDATAKEY(info->opaque);
			info->maxoff = PageGetMaxOffsetNumber(info->page);
			info->offnum = info->minoff;
		}

		/* non-leaf page, or this page exhausted: advance to next block */
		if (!P_ISLEAF(info->opaque) || info->offnum > info->maxoff)
		{
			pfree(info->page);
			info->page = NULL;
			info->blkno++;
			continue;
		}

		MemSet(nulls, false, info->outattnum * sizeof(bool));

		ItemPointerSet(&itid, info->blkno, info->offnum);
		values[0] = ItemPointerGetDatum(&itid);

		readindextuple(info, irel, hrel, values, nulls);

		info->offnum = OffsetNumberNext(info->offnum);

		tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
		result = HeapTupleGetDatum(tuple);

		/* close without releasing the first-call locks (see comment above) */
		if (hrel != NULL)
			heap_close(hrel, NoLock);
		index_close(irel, NoLock);

		SRF_RETURN_NEXT(funcctx, result);
	}

	/* done: release the locks taken on the first call */
	if (hrel != NULL)
		heap_close(hrel, AccessShareLock);
	index_close(irel, AccessShareLock);
	SRF_RETURN_DONE(funcctx);
}
/* ------------------------------------------------------
 * pgstatindex()
 *
 * Usage: SELECT * FROM pgstatindex('t1_pkey');
 *
 * Scans every block of a btree index (except the metapage), classifies
 * each page, and returns a single row of summary statistics including
 * leaf density and fragmentation.
 * ------------------------------------------------------
 */
Datum
pgstatindex(PG_FUNCTION_ARGS)
{
	text	   *relname = PG_GETARG_TEXT_P(0);
	Relation	rel;
	RangeVar   *relrv;
	Datum		result;
	BlockNumber nblocks;
	BlockNumber blkno;
	BTIndexStat indexStat;

	if (!superuser())
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
				 (errmsg("must be superuser to use pgstattuple functions"))));

	relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
	rel = relation_openrv(relrv, AccessShareLock);

	if (!IS_INDEX(rel) || !IS_BTREE(rel))
		elog(ERROR, "relation \"%s\" is not a btree index",
			 RelationGetRelationName(rel));

	/*
	 * Reject attempts to read non-local temporary relations; we would be
	 * likely to get wrong data since we have no visibility into the owning
	 * session's local buffers.
	 */
	if (RELATION_IS_OTHER_TEMP(rel))
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("cannot access temporary tables of other sessions")));

	/*
	 * Read metapage
	 *
	 * NOTE(review): the metapage is read while holding only a pin, not a
	 * content lock -- presumably acceptable for a statistics snapshot;
	 * verify against the buffer-access rules.
	 */
	{
		Buffer		buffer = ReadBuffer(rel, 0);
		Page		page = BufferGetPage(buffer);
		BTMetaPageData *metad = BTPageGetMeta(page);

		indexStat.version = metad->btm_version;
		indexStat.level = metad->btm_level;
		indexStat.root_blkno = metad->btm_root;

		ReleaseBuffer(buffer);
	}

	/* -- init counters -- */
	indexStat.root_pages = 0;
	indexStat.internal_pages = 0;
	indexStat.leaf_pages = 0;
	indexStat.empty_pages = 0;
	indexStat.deleted_pages = 0;

	indexStat.max_avail = 0;
	indexStat.free_space = 0;

	indexStat.fragments = 0;

	/*
	 * Scan all blocks except the metapage
	 */
	nblocks = RelationGetNumberOfBlocks(rel);

	for (blkno = 1; blkno < nblocks; blkno++)
	{
		Buffer		buffer;
		Page		page;
		BTPageOpaque opaque;

		/* Read and lock buffer */
		buffer = ReadBuffer(rel, blkno);
		LockBuffer(buffer, BUFFER_LOCK_SHARE);

		page = BufferGetPage(buffer);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);

		/* Determine page type, and update totals */
		if (P_ISLEAF(opaque))
		{
			int			max_avail;

			max_avail = BLCKSZ - (BLCKSZ - ((PageHeader) page)->pd_special + SizeOfPageHeaderData);
			indexStat.max_avail += max_avail;
			indexStat.free_space += PageGetFreeSpace(page);

			indexStat.leaf_pages++;

			/*
			 * If the next leaf is on an earlier block, it means a
			 * fragmentation.
			 */
			if (opaque->btpo_next != P_NONE && opaque->btpo_next < blkno)
				indexStat.fragments++;
		}
		else if (P_ISDELETED(opaque))
			indexStat.deleted_pages++;
		else if (P_IGNORE(opaque))
			indexStat.empty_pages++;
		else if (P_ISROOT(opaque))
			indexStat.root_pages++;
		else
			indexStat.internal_pages++;

		/* Unlock and release buffer */
		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
		ReleaseBuffer(buffer);
	}

	relation_close(rel, AccessShareLock);

	/*----------------------------
	 * Build a result tuple
	 *----------------------------
	 */
	{
		TupleDesc	tupleDesc;
		int			j;
		char	   *values[10];
		HeapTuple	tuple;

		/* Build a tuple descriptor for our result type */
		if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
			elog(ERROR, "return type must be a row type");

		j = 0;
		values[j] = palloc(32);
		snprintf(values[j++], 32, "%d", indexStat.version);
		values[j] = palloc(32);
		snprintf(values[j++], 32, "%d", indexStat.level);
		values[j] = palloc(32);
		snprintf(values[j++], 32, INT64_FORMAT,
				 (indexStat.root_pages +
				  indexStat.leaf_pages +
				  indexStat.internal_pages +
				  indexStat.deleted_pages +
				  indexStat.empty_pages) * BLCKSZ);
		values[j] = palloc(32);
		snprintf(values[j++], 32, "%u", indexStat.root_blkno);
		values[j] = palloc(32);
		snprintf(values[j++], 32, INT64_FORMAT, indexStat.internal_pages);
		values[j] = palloc(32);
		snprintf(values[j++], 32, INT64_FORMAT, indexStat.leaf_pages);
		values[j] = palloc(32);
		snprintf(values[j++], 32, INT64_FORMAT, indexStat.empty_pages);
		values[j] = palloc(32);
		snprintf(values[j++], 32, INT64_FORMAT, indexStat.deleted_pages);
		values[j] = palloc(32);
		/* average leaf density: 100 minus free-space percentage */
		if (indexStat.max_avail > 0)
			snprintf(values[j++], 32, "%.2f",
					 100.0 - (double) indexStat.free_space / (double) indexStat.max_avail * 100.0);
		else
			snprintf(values[j++], 32, "NaN");
		values[j] = palloc(32);
		/* leaf fragmentation: percentage of leaves whose successor is earlier */
		if (indexStat.leaf_pages > 0)
			snprintf(values[j++], 32, "%.2f",
					 (double) indexStat.fragments / (double) indexStat.leaf_pages * 100.0);
		else
			snprintf(values[j++], 32, "NaN");

		tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
									   values);

		result = HeapTupleGetDatum(tuple);
	}

	PG_RETURN_DATUM(result);
}
/*
 * readindex - set-returning function that emits one row per leaf-page
 * index entry of the given btree index.
 *
 * Each output row carries the index entry's TID ("ictid"), plus columns
 * filled in by readindextuple() (heap TID, statuses, and the key columns
 * copied from the index tuple descriptor).  Non-leaf pages and the
 * metapage are skipped.
 *
 * NOTE(review): this is Greenplum-specific code — MIRROREDLOCK_BUFMGR_*
 * macros guard buffer-manager access; their exact semantics are not
 * visible here.
 */
Datum
readindex(PG_FUNCTION_ARGS)
{
	FuncCallContext *funcctx;
	readindexinfo *info;

	MIRROREDLOCK_BUFMGR_DECLARE;

	if (SRF_IS_FIRSTCALL())
	{
		Oid			irelid = PG_GETARG_OID(0);
		TupleDesc	tupdesc;
		MemoryContext oldcontext;
		AttrNumber	outattnum;
		Relation	irel;
		TupleDesc	itupdesc;
		int			i;
		AttrNumber	attno;

		irel = index_open(irelid, AccessShareLock);
		itupdesc = RelationGetDescr(irel);
		/* fixed bookkeeping columns + one column per index key attribute */
		outattnum = FIXED_COLUMN + itupdesc->natts;

		funcctx = SRF_FIRSTCALL_INIT();
		/* per-query state must live across calls */
		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
		tupdesc = CreateTemplateTupleDesc(outattnum, false);
		attno = 1;
		TupleDescInitEntry(tupdesc, attno++, "ictid", TIDOID, -1, 0);
		TupleDescInitEntry(tupdesc, attno++, "hctid", TIDOID, -1, 0);
		TupleDescInitEntry(tupdesc, attno++, "aotid", TEXTOID, -1, 0);
		TupleDescInitEntry(tupdesc, attno++, "istatus", TEXTOID, -1, 0);
		TupleDescInitEntry(tupdesc, attno++, "hstatus", TEXTOID, -1, 0);

		/* append the index's own key columns to the output row type */
		for (i = 0; i < itupdesc->natts; i++)
		{
			Form_pg_attribute attr = itupdesc->attrs[i];

			TupleDescInitEntry(tupdesc, attno++, NameStr(attr->attname),
							   attr->atttypid, attr->atttypmod, 0);
		}

		funcctx->tuple_desc = BlessTupleDesc(tupdesc);

		info = (readindexinfo *) palloc(sizeof(readindexinfo));
		funcctx->user_fctx = (void *) info;
		info->outattnum = outattnum;
		info->irel = irel;
		info->hrel = relation_open(irel->rd_index->indrelid, AccessShareLock);

		/*
		 * Skip heap fetches for append-only storage: 'a'/'c' presumably
		 * denote AO row- and column-oriented tables -- TODO confirm
		 * against the relstorage definitions.  hrel == NULL is the
		 * "no heap lookups" signal for readindextuple().
		 */
		if (info->hrel->rd_rel != NULL &&
			(info->hrel->rd_rel->relstorage == 'a' ||
			 info->hrel->rd_rel->relstorage == 'c'))
		{
			relation_close(info->hrel, AccessShareLock);
			info->hrel = NULL;
		}
		info->num_pages = RelationGetNumberOfBlocks(irel);
		/* start just past the btree metapage */
		info->blkno = BTREE_METAPAGE + 1;
		info->page = NULL;

		MemoryContextSwitchTo(oldcontext);
	}

	funcctx = SRF_PERCALL_SETUP();
	info = (readindexinfo *) funcctx->user_fctx;

	while (info->blkno < info->num_pages)
	{
		Datum		values[255];
		bool		nulls[255];
		ItemPointerData itid;
		HeapTuple	tuple;
		Datum		result;

		/* page == NULL means we need to read the next block */
		if (info->page == NULL)
		{
			MIRROREDLOCK_BUFMGR_LOCK;
			info->buf = ReadBuffer(info->irel, info->blkno);
			info->page = BufferGetPage(info->buf);
			info->opaque = (BTPageOpaque) PageGetSpecialPointer(info->page);
			info->minoff = P_FIRSTDATAKEY(info->opaque);
			info->maxoff = PageGetMaxOffsetNumber(info->page);
			info->offnum = info->minoff;
			MIRROREDLOCK_BUFMGR_UNLOCK;
		}
		/* skip non-leaf pages; advance when the current page is exhausted */
		if (!P_ISLEAF(info->opaque) || info->offnum > info->maxoff)
		{
			ReleaseBuffer(info->buf);
			info->page = NULL;
			info->blkno++;
			continue;
		}

		MemSet(nulls, false, info->outattnum * sizeof(bool));

		/* column 1: the index entry's own location */
		ItemPointerSet(&itid, info->blkno, info->offnum);
		values[0] = ItemPointerGetDatum(&itid);
		/* fill in remaining columns from the index (and maybe heap) tuple */
		readindextuple(info, values, nulls);

		info->offnum = OffsetNumberNext(info->offnum);

		tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
		result = HeapTupleGetDatum(tuple);
		SRF_RETURN_NEXT(funcctx, result);
	}

	if (info->hrel != NULL)
		relation_close(info->hrel, AccessShareLock);
	index_close(info->irel, AccessShareLock);
	SRF_RETURN_DONE(funcctx);
}
/*
 *	_bt_endpoint() -- Find the first or last page in the index, and scan
 * from there to the first key satisfying all the quals.
 *
 * This is used by _bt_first() to set up a scan when we've determined
 * that the scan must start at the beginning or end of the index (for
 * a forward or backward scan respectively).  Exit conditions are the
 * same as for _bt_first().
 *
 * Returns true if a matching tuple was loaded (its TID is stored into
 * scan->xs_ctup.t_self, and the page holding it remains pinned but
 * unlocked in so->currPos.buf); returns false if the index is empty or
 * contains no matching entries.
 */
static bool
_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber start;
	BTScanPosItem *currItem;

	/*
	 * Scan down to the leftmost or rightmost leaf page.  This is a simplified
	 * version of _bt_search().  We don't maintain a stack since we know we
	 * won't need it.
	 */
	buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));

	if (!BufferIsValid(buf))
	{
		/*
		 * Empty index. Lock the whole relation, as nothing finer to lock
		 * exists.
		 */
		PredicateLockRelation(rel, scan->xs_snapshot);
		so->currPos.buf = InvalidBuffer;
		return false;
	}

	PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	Assert(P_ISLEAF(opaque));

	if (ScanDirectionIsForward(dir))
	{
		/* There could be dead pages to the left, so not this: */
		/* Assert(P_LEFTMOST(opaque)); */

		start = P_FIRSTDATAKEY(opaque);
	}
	else if (ScanDirectionIsBackward(dir))
	{
		Assert(P_RIGHTMOST(opaque));

		start = PageGetMaxOffsetNumber(page);
	}
	else
	{
		elog(ERROR, "invalid scan direction: %d", (int) dir);
		start = 0;				/* keep compiler quiet */
	}

	/* remember which buffer we have pinned */
	so->currPos.buf = buf;

	/* initialize moreLeft/moreRight appropriately for scan direction */
	if (ScanDirectionIsForward(dir))
	{
		so->currPos.moreLeft = false;
		so->currPos.moreRight = true;
	}
	else
	{
		so->currPos.moreLeft = true;
		so->currPos.moreRight = false;
	}
	so->numKilled = 0;			/* just paranoia */
	so->markItemIndex = -1;		/* ditto */

	/*
	 * Now load data from the first page of the scan.
	 */
	if (!_bt_readpage(scan, dir, start))
	{
		/*
		 * There's no actually-matching data on this page.  Try to advance to
		 * the next page.  Return false if there's no matching data at all.
		 */
		if (!_bt_steppage(scan, dir))
			return false;
	}

	/* Drop the lock, but not pin, on the current page */
	LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);

	/* OK, itemIndex says what to return */
	currItem = &so->currPos.items[so->currPos.itemIndex];
	scan->xs_ctup.t_self = currItem->heapTid;
	if (scan->xs_want_itup)
		scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);

	return true;
}
/**
 * @brief Read the left-most leaf page by walking down on index tree structure
 * from root node.
 *
 * Process flow
 * -# Open index file and read meta page
 * -# Get block number of root page
 * -# Read "fast root" page
 * -# Read left child page until reaching left-most leaf page
 *
 * After calling this function, the members of BTReader are the following:
 * - smgr : Smgr relation of the existing index file.
 * - blkno : block number of left-most leaf page. If there is no leaf page,
 *		   InvalidBlockNumber is set.
 * - offnum : InvalidOffsetNumber is set.
 * - page : Left-most leaf page, or undefined if no leaf page.
 *
 * @param reader [in/out] B-Tree index reader
 * @return true iff there are some tuples
 */
static bool
BTReaderInit(BTReader *reader, Relation rel)
{
	BTPageOpaque metaopaque;
	BTMetaPageData *metad;
	BTPageOpaque opaque;
	BlockNumber blkno;

	/*
	 * HACK: We cannot use smgropen because smgrs returned from it
	 * will be closed automatically when we assign a new file node.
	 *
	 * XXX: It might be better to open the previous relfilenode with
	 * smgropen *after* RelationSetNewRelfilenode.
	 */
	memset(&reader->smgr, 0, sizeof(reader->smgr));
#if PG_VERSION_NUM >= 90100
	reader->smgr.smgr_rnode.node = rel->rd_node;
	reader->smgr.smgr_rnode.backend =
		rel->rd_backend == MyBackendId ? MyBackendId : InvalidBackendId;
#else
	reader->smgr.smgr_rnode = rel->rd_node;
#endif
	reader->smgr.smgr_which = 0;	/* md.c */

	reader->blkno = InvalidBlockNumber;
	reader->offnum = InvalidOffsetNumber;
	reader->page = palloc(BLCKSZ);

	/*
	 * Read meta page and check sanity of it.
	 *
	 * XXX: It might be better to do REINDEX against corrupted indexes
	 * instead of raising errors because we've spent long time for data
	 * loading...
	 */
	BTReaderReadPage(reader, BTREE_METAPAGE);
	metaopaque = (BTPageOpaque) PageGetSpecialPointer(reader->page);
	metad = BTPageGetMeta(reader->page);

	/*
	 * Bad meta flag or magic means this file isn't a btree at all.
	 * (Previously the message read "is not a reader", which was a typo;
	 * use the wording nbtree itself uses for this check.)
	 */
	if (!(metaopaque->btpo_flags & BTP_META) ||
		metad->btm_magic != BTREE_MAGIC)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("index \"%s\" is not a btree",
						RelationGetRelationName(rel))));

	if (metad->btm_version != BTREE_VERSION)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("version mismatch in index \"%s\": file version %d,"
						" code version %d",
						RelationGetRelationName(rel),
						metad->btm_version, BTREE_VERSION)));

	if (metad->btm_root == P_NONE)
	{
		/* No root page; We ignore the index in the subsequent build. */
		reader->blkno = InvalidBlockNumber;
		return false;
	}

	/* Go to the fast root page. */
	blkno = metad->btm_fastroot;
	BTReaderReadPage(reader, blkno);
	opaque = (BTPageOpaque) PageGetSpecialPointer(reader->page);

	/* Walk down to the left-most leaf page */
	while (!P_ISLEAF(opaque))
	{
		ItemId		firstid;
		IndexTuple	itup;

		/* Get the block number of the left child */
		firstid = PageGetItemId(reader->page, P_FIRSTDATAKEY(opaque));
		itup = (IndexTuple) PageGetItem(reader->page, firstid);
		blkno = ItemPointerGetBlockNumber(&(itup->t_tid));

		/* Go down to children */
		for (;;)
		{
			BTReaderReadPage(reader, blkno);
			opaque = (BTPageOpaque) PageGetSpecialPointer(reader->page);

			if (!P_IGNORE(opaque))
				break;

			if (P_RIGHTMOST(opaque))
			{
				/* We reach end of the index without any valid leaves. */
				reader->blkno = InvalidBlockNumber;
				return false;
			}
			blkno = opaque->btpo_next;
		}
	}

	return true;
}
/* * _bt_binsrch() -- Do a binary search for a key on a particular page. * * The passed scankey must be an insertion-type scankey (see nbtree/README), * but it can omit the rightmost column(s) of the index. * * When nextkey is false (the usual case), we are looking for the first * item >= scankey. When nextkey is true, we are looking for the first * item strictly greater than scankey. * * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first * key >= given scankey, or > scankey if nextkey is true. (NOTE: in * particular, this means it is possible to return a value 1 greater than the * number of keys on the page, if the scankey is > all keys on the page.) * * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber * of the last key < given scankey, or last key <= given scankey if nextkey * is true. (Since _bt_compare treats the first data key of such a page as * minus infinity, there will be at least one key < scankey, so the result * always points at one of the keys on the page.) This key indicates the * right place to descend to be sure we find all leaf keys >= given scankey * (or leaf keys > given scankey when nextkey is true). * * This procedure is not responsible for walking right, it just examines * the given page. _bt_binsrch() has no lock or refcount side effects * on the buffer. */ OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz, ScanKey scankey, bool nextkey) { Page page; BTPageOpaque opaque; OffsetNumber low, high; int32 result, cmpval; page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); low = P_FIRSTDATAKEY(opaque); high = PageGetMaxOffsetNumber(page); /* * If there are no keys on the page, return the first available slot. Note * this covers two cases: the page is really empty (no keys), or it * contains only a high key. The latter case is possible after vacuuming. * This can never happen on an internal page, however, since they are * never empty (an internal page must have children). 
*/ if (high < low) return low; /* * Binary search to find the first key on the page >= scan key, or first * key > scankey when nextkey is true. * * For nextkey=false (cmpval=1), the loop invariant is: all slots before * 'low' are < scan key, all slots at or after 'high' are >= scan key. * * For nextkey=true (cmpval=0), the loop invariant is: all slots before * 'low' are <= scan key, all slots at or after 'high' are > scan key. * * We can fall out when high == low. */ high++; /* establish the loop invariant for high */ cmpval = nextkey ? 0 : 1; /* select comparison value */ while (high > low) { OffsetNumber mid = low + ((high - low) / 2); /* We have low <= mid < high, so mid points at a real slot */ result = _bt_compare(rel, keysz, scankey, page, mid); if (result >= cmpval) low = mid + 1; else high = mid; } /* * At this point we have high == low, but be careful: they could point * past the last slot on the page. * * On a leaf page, we always return the first key >= scan key (resp. > * scan key), which could be the last slot + 1. */ if (P_ISLEAF(opaque)) return low; /* * On a non-leaf page, return the last key < scan key (resp. <= scan key). * There must be one if _bt_compare() is playing by the rules. */ Assert(low > P_FIRSTDATAKEY(opaque)); return OffsetNumberPrev(low); }
/*
 * pgstatindex_impl -- shared worker for the pgstatindex() SQL variants.
 *
 * Scans every block of the btree index 'rel' (except the metapage) with a
 * bulk-read buffer strategy, classifying pages and accumulating free-space
 * and fragmentation statistics, then builds the 10-column result tuple
 * whose row type is taken from the calling function's declared result.
 * Closes 'rel' (releasing AccessShareLock) before returning.
 */
static Datum
pgstatindex_impl(Relation rel, FunctionCallInfo fcinfo)
{
	Datum		result;
	BlockNumber nblocks;
	BlockNumber blkno;
	BTIndexStat indexStat;
	BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);

	if (!IS_INDEX(rel) || !IS_BTREE(rel))
		elog(ERROR, "relation \"%s\" is not a btree index",
			 RelationGetRelationName(rel));

	/*
	 * Reject attempts to read non-local temporary relations; we would be
	 * likely to get wrong data since we have no visibility into the owning
	 * session's local buffers.
	 */
	if (RELATION_IS_OTHER_TEMP(rel))
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("cannot access temporary tables of other sessions")));

	/*
	 * Read metapage
	 */
	{
		Buffer		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, 0, RBM_NORMAL, bstrategy);
		Page		page = BufferGetPage(buffer);
		BTMetaPageData *metad = BTPageGetMeta(page);

		indexStat.version = metad->btm_version;
		indexStat.level = metad->btm_level;
		indexStat.root_blkno = metad->btm_root;

		ReleaseBuffer(buffer);
	}

	/* -- init counters -- */
	indexStat.internal_pages = 0;
	indexStat.leaf_pages = 0;
	indexStat.empty_pages = 0;
	indexStat.deleted_pages = 0;

	indexStat.max_avail = 0;
	indexStat.free_space = 0;

	indexStat.fragments = 0;

	/*
	 * Scan all blocks except the metapage
	 */
	nblocks = RelationGetNumberOfBlocks(rel);

	for (blkno = 1; blkno < nblocks; blkno++)
	{
		Buffer		buffer;
		Page		page;
		BTPageOpaque opaque;

		CHECK_FOR_INTERRUPTS();

		/* Read and lock buffer */
		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy);
		LockBuffer(buffer, BUFFER_LOCK_SHARE);

		page = BufferGetPage(buffer);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);

		/*
		 * Determine page type, and update totals.  Deleted/half-dead pages
		 * are tested before P_ISLEAF, since such pages may still carry the
		 * leaf flag bit.
		 */
		if (P_ISDELETED(opaque))
			indexStat.deleted_pages++;
		else if (P_IGNORE(opaque))
			indexStat.empty_pages++;	/* this is the "half dead" state */
		else if (P_ISLEAF(opaque))
		{
			int			max_avail;

			/* usable space: everything between page header and special space */
			max_avail = BLCKSZ - (BLCKSZ - ((PageHeader) page)->pd_special + SizeOfPageHeaderData);
			indexStat.max_avail += max_avail;
			indexStat.free_space += PageGetFreeSpace(page);

			indexStat.leaf_pages++;

			/*
			 * If the next leaf is on an earlier block, it means a
			 * fragmentation.
			 */
			if (opaque->btpo_next != P_NONE && opaque->btpo_next < blkno)
				indexStat.fragments++;
		}
		else
			indexStat.internal_pages++;

		/* Unlock and release buffer */
		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
		ReleaseBuffer(buffer);
	}

	relation_close(rel, AccessShareLock);

	/*----------------------------
	 * Build a result tuple
	 *----------------------------
	 */
	{
		TupleDesc	tupleDesc;
		int			j;
		char	   *values[10];
		HeapTuple	tuple;

		/* Build a tuple descriptor for our result type */
		if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
			elog(ERROR, "return type must be a row type");

		j = 0;
		values[j++] = psprintf("%d", indexStat.version);
		values[j++] = psprintf("%d", indexStat.level);
		values[j++] = psprintf(INT64_FORMAT,
							   (1 +		/* include the metapage in index_size */
								indexStat.leaf_pages +
								indexStat.internal_pages +
								indexStat.deleted_pages +
								indexStat.empty_pages) * BLCKSZ);
		values[j++] = psprintf("%u", indexStat.root_blkno);
		values[j++] = psprintf(INT64_FORMAT, indexStat.internal_pages);
		values[j++] = psprintf(INT64_FORMAT, indexStat.leaf_pages);
		values[j++] = psprintf(INT64_FORMAT, indexStat.empty_pages);
		values[j++] = psprintf(INT64_FORMAT, indexStat.deleted_pages);
		/* avg_leaf_density: percentage of usable leaf space actually used */
		if (indexStat.max_avail > 0)
			values[j++] = psprintf("%.2f",
								   100.0 - (double) indexStat.free_space / (double) indexStat.max_avail * 100.0);
		else
			values[j++] = pstrdup("NaN");
		/* leaf_fragmentation: fraction of leaves whose successor is earlier */
		if (indexStat.leaf_pages > 0)
			values[j++] = psprintf("%.2f",
								   (double) indexStat.fragments / (double) indexStat.leaf_pages * 100.0);
		else
			values[j++] = pstrdup("NaN");

		tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
									   values);

		result = HeapTupleGetDatum(tuple);
	}

	return result;
}
/*
 *	_bt_search() -- Search the tree for a particular scankey,
 *		or more precisely for the first leaf page it could be on.
 *
 * The passed scankey must be an insertion-type scankey (see nbtree/README),
 * but it can omit the rightmost column(s) of the index.
 *
 * When nextkey is false (the usual case), we are looking for the first
 * item >= scankey.  When nextkey is true, we are looking for the first
 * item strictly greater than scankey.
 *
 * Return value is a stack of parent-page pointers.  *bufP is set to the
 * address of the leaf-page buffer, which is read-locked and pinned.
 * No locks are held on the parent pages, however!
 *
 * NOTE that the returned buffer is read-locked regardless of the access
 * parameter.  However, access = BT_WRITE will allow an empty root page
 * to be created and returned.  When access = BT_READ, an empty index
 * will result in *bufP being set to InvalidBuffer.  Also, in BT_WRITE mode,
 * any incomplete splits encountered during the search will be finished.
 */
BTStack
_bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
		   Buffer *bufP, int access)
{
	BTStack		stack_in = NULL;

	/* Get the root page to start with */
	*bufP = _bt_getroot(rel, access);

	/* If index is empty and access = BT_READ, no root page is created. */
	if (!BufferIsValid(*bufP))
		return (BTStack) NULL;

	/* Loop iterates once per level descended in the tree */
	for (;;)
	{
		Page		page;
		BTPageOpaque opaque;
		OffsetNumber offnum;
		ItemId		itemid;
		IndexTuple	itup;
		BlockNumber blkno;
		BlockNumber par_blkno;
		BTStack		new_stack;

		/*
		 * Race -- the page we just grabbed may have split since we read its
		 * pointer in the parent (or metapage).  If it has, we may need to
		 * move right to its new sibling.  Do that.
		 *
		 * In write-mode, allow _bt_moveright to finish any incomplete splits
		 * along the way.  Strictly speaking, we'd only need to finish an
		 * incomplete split on the leaf page we're about to insert to, not on
		 * any of the upper levels (they are taken care of in _bt_getstackbuf,
		 * if the leaf page is split and we insert to the parent page).  But
		 * this is a good opportunity to finish splits of internal pages too.
		 */
		*bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey,
							  (access == BT_WRITE), stack_in,
							  BT_READ);

		/* if this is a leaf page, we're done */
		page = BufferGetPage(*bufP);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		if (P_ISLEAF(opaque))
			break;

		/*
		 * Find the appropriate item on the internal page, and get the child
		 * page that it points to.
		 */
		offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey);
		itemid = PageGetItemId(page, offnum);
		itup = (IndexTuple) PageGetItem(page, itemid);
		blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
		par_blkno = BufferGetBlockNumber(*bufP);

		/*
		 * We need to save the location of the index entry we chose in the
		 * parent page on a stack.  In case we split the tree, we'll use the
		 * stack to work back up to the parent page.  We also save the actual
		 * downlink (TID) to uniquely identify the index entry, in case it
		 * moves right while we're working lower in the tree.  See the paper
		 * by Lehman and Yao for how this is detected and handled.  (We use
		 * the child link to disambiguate duplicate keys in the index -- Lehman
		 * and Yao disallow duplicate keys.)
		 */
		new_stack = (BTStack) palloc(sizeof(BTStackData));
		new_stack->bts_blkno = par_blkno;
		new_stack->bts_offset = offnum;
		memcpy(&new_stack->bts_btentry, itup, sizeof(IndexTupleData));
		new_stack->bts_parent = stack_in;

		/* drop the read lock on the parent page, acquire one on the child */
		*bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ);

		/* okay, all set to move down a level */
		stack_in = new_stack;
	}

	return stack_in;
}
/*
 *	_bt_endpoint() -- Find the first or last key in the index.
 *
 * This is used by _bt_first() to set up a scan when we've determined
 * that the scan must start at the beginning or end of the index (for
 * a forward or backward scan respectively).
 *
 * Older-style implementation: positions scan->currentItemData directly
 * and keeps the pinned buffer in so->btso_curbuf.  Returns true if a
 * tuple matching the scan keys was found (its heap TID is copied into
 * scan->xs_ctup.t_self), false otherwise.
 */
static bool
_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
{
	Relation	rel;
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque;
	ItemPointer current;
	OffsetNumber maxoff;
	OffsetNumber start;
	BlockNumber blkno;
	BTItem		btitem;
	IndexTuple	itup;
	BTScanOpaque so;
	bool		res;
	bool		continuescan;

	rel = scan->indexRelation;
	current = &(scan->currentItemData);
	so = (BTScanOpaque) scan->opaque;

	/*
	 * Scan down to the leftmost or rightmost leaf page.  This is a
	 * simplified version of _bt_search().  We don't maintain a stack
	 * since we know we won't need it.
	 */
	buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));

	if (!BufferIsValid(buf))
	{
		/* empty index... */
		ItemPointerSetInvalid(current);
		so->btso_curbuf = InvalidBuffer;
		return false;
	}

	blkno = BufferGetBlockNumber(buf);
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	Assert(P_ISLEAF(opaque));
	maxoff = PageGetMaxOffsetNumber(page);

	if (ScanDirectionIsForward(dir))
	{
		/* There could be dead pages to the left, so not this: */
		/* Assert(P_LEFTMOST(opaque)); */

		start = P_FIRSTDATAKEY(opaque);
	}
	else if (ScanDirectionIsBackward(dir))
	{
		Assert(P_RIGHTMOST(opaque));

		start = PageGetMaxOffsetNumber(page);
		if (start < P_FIRSTDATAKEY(opaque))		/* watch out for empty
												 * page */
			start = P_FIRSTDATAKEY(opaque);
	}
	else
	{
		elog(ERROR, "invalid scan direction: %d", (int) dir);
		start = 0;				/* keep compiler quiet */
	}

	ItemPointerSet(current, blkno, start);

	/* remember which buffer we have pinned */
	so->btso_curbuf = buf;

	/*
	 * Left/rightmost page could be empty due to deletions, if so step
	 * till we find a nonempty page.
	 */
	if (start > maxoff)
	{
		if (!_bt_step(scan, &buf, dir))
			return false;
		start = ItemPointerGetOffsetNumber(current);
		page = BufferGetPage(buf);
	}

	btitem = (BTItem) PageGetItem(page, PageGetItemId(page, start));
	itup = &(btitem->bti_itup);

	/* see if we picked a winner */
	if (_bt_checkkeys(scan, itup, dir, &continuescan))
	{
		/* yes, return it */
		scan->xs_ctup.t_self = itup->t_tid;
		res = true;
	}
	else if (continuescan)
	{
		/* no, but there might be another one that is */
		res = _bt_next(scan, dir);
	}
	else
	{
		/* no tuples in the index match this scan key */
		ItemPointerSetInvalid(current);
		so->btso_curbuf = InvalidBuffer;
		_bt_relbuf(rel, buf);
		res = false;
	}

	return res;
}
/*
 * Post vacuum, iterate over all entries in index, check if the h_tid
 * of each entry exists and is not dead.  For specific system tables,
 * also ensure that the key in index entry matches the corresponding
 * attribute in the heap tuple.
 *
 * Raises ERROR on the first inconsistency found; returns normally if
 * the index and heap agree.  NOTE(review): Greenplum-specific code --
 * the MIRROREDLOCK_BUFMGR_* macros wrap the entire scan; their exact
 * semantics are not visible here.
 */
void
_bt_validate_vacuum(Relation irel, Relation hrel, TransactionId oldest_xmin)
{
	MIRROREDLOCK_BUFMGR_DECLARE;

	BlockNumber blkno;
	BlockNumber num_pages;
	Buffer		ibuf = InvalidBuffer;
	Buffer		hbuf = InvalidBuffer;
	Page		ipage;
	BTPageOpaque opaque;
	IndexTuple	itup;
	HeapTupleData htup;
	OffsetNumber maxoff,
				minoff,
				offnum;
	Oid			ioid,
				hoid;
	bool		isnull;

	/* start just past the btree metapage */
	blkno = BTREE_METAPAGE + 1;
	num_pages = RelationGetNumberOfBlocks(irel);

	elog(LOG, "btvalidatevacuum: index %s, heap %s",
		 RelationGetRelationName(irel), RelationGetRelationName(hrel));

	MIRROREDLOCK_BUFMGR_LOCK;
	for (; blkno < num_pages; blkno++)
	{
		ibuf = ReadBuffer(irel, blkno);
		ipage = BufferGetPage(ibuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(ipage);
		if (!PageIsNew(ipage))
			_bt_checkpage(irel, ibuf);
		/* only leaf pages carry entries pointing at heap tuples */
		if (P_ISLEAF(opaque))
		{
			minoff = P_FIRSTDATAKEY(opaque);
			maxoff = PageGetMaxOffsetNumber(ipage);
			for (offnum = minoff;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				itup = (IndexTuple) PageGetItem(ipage,
												PageGetItemId(ipage, offnum));
				ItemPointerCopy(&itup->t_tid, &htup.t_self);
				/*
				 * TODO: construct a tid bitmap based on index tids
				 * and fetch heap tids in order afterwards. That will
				 * also allow validating if a heap tid appears twice
				 * in a unique index.
				 */
				if (!heap_release_fetch(hrel, SnapshotAny, &htup,
										&hbuf, true, NULL))
				{
					elog(ERROR, "btvalidatevacuum: tid (%d,%d) from index %s "
						 "not found in heap %s",
						 ItemPointerGetBlockNumber(&itup->t_tid),
						 ItemPointerGetOffsetNumber(&itup->t_tid),
						 RelationGetRelationName(irel),
						 RelationGetRelationName(hrel));
				}
				/* the index must never point at a vacuum-removable tuple */
				switch (HeapTupleSatisfiesVacuum(hrel, htup.t_data, oldest_xmin, hbuf))
				{
					case HEAPTUPLE_RECENTLY_DEAD:
					case HEAPTUPLE_LIVE:
					case HEAPTUPLE_INSERT_IN_PROGRESS:
					case HEAPTUPLE_DELETE_IN_PROGRESS:
						/* these tuples are considered alive by vacuum */
						break;
					case HEAPTUPLE_DEAD:
						elog(ERROR, "btvalidatevacuum: vacuum did not remove "
							 "dead tuple (%d,%d) from heap %s and index %s",
							 ItemPointerGetBlockNumber(&itup->t_tid),
							 ItemPointerGetOffsetNumber(&itup->t_tid),
							 RelationGetRelationName(hrel),
							 RelationGetRelationName(irel));
						break;
					default:
						elog(ERROR, "btvalidatevacuum: invalid visibility");
						break;
				}
				/*
				 * For selected system-catalog OID indexes, cross-check the
				 * indexed key value against the heap tuple's value.
				 */
				switch (RelationGetRelid(irel))
				{
					case DatabaseOidIndexId:
					case TypeOidIndexId:
					case ClassOidIndexId:
					case ConstraintOidIndexId:
						hoid = HeapTupleGetOid(&htup);
						ioid = index_getattr(itup, 1, RelationGetDescr(irel), &isnull);
						if (hoid != ioid)
						{
							elog(ERROR,
								 "btvalidatevacuum: index oid(%d) != heap oid(%d)"
								 " tuple (%d,%d) index %s", ioid, hoid,
								 ItemPointerGetBlockNumber(&itup->t_tid),
								 ItemPointerGetOffsetNumber(&itup->t_tid),
								 RelationGetRelationName(irel));
						}
						break;
					case GpRelationNodeOidIndexId:
						/* key is (relfilenode oid, segment number) */
						hoid = heap_getattr(&htup, 1, RelationGetDescr(hrel), &isnull);
						ioid = index_getattr(itup, 1, RelationGetDescr(irel), &isnull);
						if (hoid != ioid)
						{
							elog(ERROR,
								 "btvalidatevacuum: index oid(%d) != heap oid(%d)"
								 " tuple (%d,%d) index %s", ioid, hoid,
								 ItemPointerGetBlockNumber(&itup->t_tid),
								 ItemPointerGetOffsetNumber(&itup->t_tid),
								 RelationGetRelationName(irel));
						}
						int4 hsegno = heap_getattr(&htup, 2, RelationGetDescr(hrel), &isnull);
						int4 isegno = index_getattr(itup, 2, RelationGetDescr(irel), &isnull);
						if (isegno != hsegno)
						{
							elog(ERROR,
								 "btvalidatevacuum: index segno(%d) != heap segno(%d)"
								 " tuple (%d,%d) index %s", isegno, hsegno,
								 ItemPointerGetBlockNumber(&itup->t_tid),
								 ItemPointerGetOffsetNumber(&itup->t_tid),
								 RelationGetRelationName(irel));
						}
						break;
					default:
						break;
				}
				/* AO segment catalog tables: first key column is the segno */
				if (RelationGetNamespace(irel) == PG_AOSEGMENT_NAMESPACE)
				{
					int4 isegno = index_getattr(itup, 1, RelationGetDescr(irel), &isnull);
					int4 hsegno = heap_getattr(&htup, 1, RelationGetDescr(hrel), &isnull);
					if (isegno != hsegno)
					{
						elog(ERROR,
							 "btvalidatevacuum: index segno(%d) != heap segno(%d)"
							 " tuple (%d,%d) index %s", isegno, hsegno,
							 ItemPointerGetBlockNumber(&itup->t_tid),
							 ItemPointerGetOffsetNumber(&itup->t_tid),
							 RelationGetRelationName(irel));
					}
				}
			}
		}
		if (BufferIsValid(ibuf))
			ReleaseBuffer(ibuf);
	}
	/* hbuf is kept pinned across heap_release_fetch calls; drop it last */
	if (BufferIsValid(hbuf))
		ReleaseBuffer(hbuf);
	MIRROREDLOCK_BUFMGR_UNLOCK;
}
/* * _bt_binsrch() -- Do a binary search for a key on a particular page. * * The scankey we get has the compare function stored in the procedure * entry of each data struct. We invoke this regproc to do the * comparison for every key in the scankey. * * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first * key >= given scankey. (NOTE: in particular, this means it is possible * to return a value 1 greater than the number of keys on the page, * if the scankey is > all keys on the page.) * * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber * of the last key < given scankey. (Since _bt_compare treats the first * data key of such a page as minus infinity, there will be at least one * key < scankey, so the result always points at one of the keys on the * page.) This key indicates the right place to descend to be sure we * find all leaf keys >= given scankey. * * This procedure is not responsible for walking right, it just examines * the given page. _bt_binsrch() has no lock or refcount side effects * on the buffer. */ OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz, ScanKey scankey) { TupleDesc itupdesc; Page page; BTPageOpaque opaque; OffsetNumber low, high; int32 result; itupdesc = RelationGetDescr(rel); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); low = P_FIRSTDATAKEY(opaque); high = PageGetMaxOffsetNumber(page); /* * If there are no keys on the page, return the first available slot. * Note this covers two cases: the page is really empty (no keys), or * it contains only a high key. The latter case is possible after * vacuuming. This can never happen on an internal page, however, * since they are never empty (an internal page must have children). */ if (high < low) return low; /* * Binary search to find the first key on the page >= scan key. Loop * invariant: all slots before 'low' are < scan key, all slots at or * after 'high' are >= scan key. We can fall out when high == low. 
*/ high++; /* establish the loop invariant for high */ while (high > low) { OffsetNumber mid = low + ((high - low) / 2); /* We have low <= mid < high, so mid points at a real slot */ result = _bt_compare(rel, keysz, scankey, page, mid); if (result > 0) low = mid + 1; else high = mid; } /* * At this point we have high == low, but be careful: they could point * past the last slot on the page. * * On a leaf page, we always return the first key >= scan key (which * could be the last slot + 1). */ if (P_ISLEAF(opaque)) return low; /* * On a non-leaf page, return the last key < scan key. There must be * one if _bt_compare() is playing by the rules. */ Assert(low > P_FIRSTDATAKEY(opaque)); return OffsetNumberPrev(low); }
/*
 * btvacuumpage --- VACUUM one page
 *
 * This processes a single page for btvacuumscan().  In some cases we
 * must go back and re-examine previously-scanned pages; this routine
 * recurses when necessary to handle that case.
 *
 * blkno is the page to process.  orig_blkno is the highest block number
 * reached by the outer btvacuumscan loop (the same as blkno, unless we
 * are recursing to re-examine a previous page).
 */
static void
btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
{
	IndexVacuumInfo *info = vstate->info;
	IndexBulkDeleteResult *stats = vstate->stats;
	IndexBulkDeleteCallback callback = vstate->callback;
	void	   *callback_state = vstate->callback_state;
	Relation	rel = info->index;
	bool		delete_now;
	BlockNumber recurse_to;
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque = NULL;

restart:
	delete_now = false;
	recurse_to = P_NONE;

	/* call vacuum_delay_point while not holding any buffer lock */
	vacuum_delay_point();

	/*
	 * We can't use _bt_getbuf() here because it always applies
	 * _bt_checkpage(), which will barf on an all-zero page. We want to
	 * recycle all-zero pages, not fail.  Also, we want to use a nondefault
	 * buffer access strategy.
	 */
	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
							 info->strategy);
	LockBuffer(buf, BT_READ);
	page = BufferGetPage(buf);
	if (!PageIsNew(page))
	{
		/* only non-new pages have a valid special area to inspect */
		_bt_checkpage(rel, buf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	}

	/*
	 * If we are recursing, the only case we want to do anything with is a
	 * live leaf page having the current vacuum cycle ID.  Any other state
	 * implies we already saw the page (eg, deleted it as being empty).
	 */
	if (blkno != orig_blkno)
	{
		if (_bt_page_recyclable(page) ||
			P_IGNORE(opaque) ||
			!P_ISLEAF(opaque) ||
			opaque->btpo_cycleid != vstate->cycleid)
		{
			_bt_relbuf(rel, buf);
			return;
		}
	}

	/* Page is valid, see what to do with it */
	if (_bt_page_recyclable(page))
	{
		/* Okay to recycle this page */
		RecordFreeIndexPage(rel, blkno);
		vstate->totFreePages++;
		stats->pages_deleted++;
	}
	else if (P_ISDELETED(opaque))
	{
		/* Already deleted, but can't recycle yet */
		stats->pages_deleted++;
	}
	else if (P_ISHALFDEAD(opaque))
	{
		/* Half-dead, try to delete */
		delete_now = true;
	}
	else if (P_ISLEAF(opaque))
	{
		OffsetNumber deletable[MaxOffsetNumber];
		int			ndeletable;
		OffsetNumber offnum,
					minoff,
					maxoff;

		/*
		 * Trade in the initial read lock for a super-exclusive write lock on
		 * this page.  We must get such a lock on every leaf page over the
		 * course of the vacuum scan, whether or not it actually contains any
		 * deletable tuples --- see nbtree/README.
		 */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		LockBufferForCleanup(buf);

		/*
		 * Remember highest leaf page number we've taken cleanup lock on; see
		 * notes in btvacuumscan
		 */
		if (blkno > vstate->lastBlockLocked)
			vstate->lastBlockLocked = blkno;

		/*
		 * Check whether we need to recurse back to earlier pages.  What we
		 * are concerned about is a page split that happened since we started
		 * the vacuum scan.  If the split moved some tuples to a lower page
		 * then we might have missed 'em.  If so, set up for tail recursion.
		 * (Must do this before possibly clearing btpo_cycleid below!)
		 */
		if (vstate->cycleid != 0 &&
			opaque->btpo_cycleid == vstate->cycleid &&
			!(opaque->btpo_flags & BTP_SPLIT_END) &&
			!P_RIGHTMOST(opaque) &&
			opaque->btpo_next < orig_blkno)
			recurse_to = opaque->btpo_next;

		/*
		 * Scan over all items to see which ones need deleted according to the
		 * callback function.
		 */
		ndeletable = 0;
		minoff = P_FIRSTDATAKEY(opaque);
		maxoff = PageGetMaxOffsetNumber(page);
		if (callback)
		{
			for (offnum = minoff;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				IndexTuple	itup;
				ItemPointer htup;

				itup = (IndexTuple) PageGetItem(page,
												PageGetItemId(page, offnum));
				htup = &(itup->t_tid);

				/*
				 * During Hot Standby we currently assume that
				 * XLOG_BTREE_VACUUM records do not produce conflicts. That is
				 * only true as long as the callback function depends only
				 * upon whether the index tuple refers to heap tuples removed
				 * in the initial heap scan. When vacuum starts it derives a
				 * value of OldestXmin. Backends taking later snapshots could
				 * have a RecentGlobalXmin with a later xid than the vacuum's
				 * OldestXmin, so it is possible that row versions deleted
				 * after OldestXmin could be marked as killed by other
				 * backends. The callback function *could* look at the index
				 * tuple state in isolation and decide to delete the index
				 * tuple, though currently it does not. If it ever did, we
				 * would need to reconsider whether XLOG_BTREE_VACUUM records
				 * should cause conflicts. If they did cause conflicts they
				 * would be fairly harsh conflicts, since we haven't yet
				 * worked out a way to pass a useful value for
				 * latestRemovedXid on the XLOG_BTREE_VACUUM records. This
				 * applies to *any* type of index that marks index tuples as
				 * killed.
				 */
				if (callback(htup, callback_state))
					deletable[ndeletable++] = offnum;
			}
		}

		/*
		 * Apply any needed deletes.  We issue just one _bt_delitems_vacuum()
		 * call per page, so as to minimize WAL traffic.
		 */
		if (ndeletable > 0)
		{
			/*
			 * Notice that the issued XLOG_BTREE_VACUUM WAL record includes
			 * all information to the replay code to allow it to get a cleanup
			 * lock on all pages between the previous lastBlockVacuumed and
			 * this page. This ensures that WAL replay locks all leaf pages at
			 * some point, which is important should non-MVCC scans be
			 * requested. This is currently unused on standby, but we record
			 * it anyway, so that the WAL contains the required information.
			 *
			 * Since we can visit leaf pages out-of-order when recursing,
			 * replay might end up locking such pages an extra time, but it
			 * doesn't seem worth the amount of bookkeeping it'd take to avoid
			 * that.
			 */
			_bt_delitems_vacuum(rel, buf, deletable, ndeletable,
								vstate->lastBlockVacuumed);

			/*
			 * Remember highest leaf page number we've issued a
			 * XLOG_BTREE_VACUUM WAL record for.
			 */
			if (blkno > vstate->lastBlockVacuumed)
				vstate->lastBlockVacuumed = blkno;

			stats->tuples_removed += ndeletable;
			/* must recompute maxoff */
			maxoff = PageGetMaxOffsetNumber(page);
		}
		else
		{
			/*
			 * If the page has been split during this vacuum cycle, it seems
			 * worth expending a write to clear btpo_cycleid even if we don't
			 * have any deletions to do.  (If we do, _bt_delitems_vacuum takes
			 * care of this.)  This ensures we won't process the page again.
			 *
			 * We treat this like a hint-bit update because there's no need to
			 * WAL-log it.
			 */
			if (vstate->cycleid != 0 &&
				opaque->btpo_cycleid == vstate->cycleid)
			{
				opaque->btpo_cycleid = 0;
				MarkBufferDirtyHint(buf, true);
			}
		}

		/*
		 * If it's now empty, try to delete; else count the live tuples. We
		 * don't delete when recursing, though, to avoid putting entries into
		 * freePages out-of-order (doesn't seem worth any extra code to handle
		 * the case).
		 */
		if (minoff > maxoff)
			delete_now = (blkno == orig_blkno);
		else
			stats->num_index_tuples += maxoff - minoff + 1;
	}

	if (delete_now)
	{
		MemoryContext oldcontext;
		int			ndel;

		/* Run pagedel in a temp context to avoid memory leakage */
		MemoryContextReset(vstate->pagedelcontext);
		oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);

		ndel = _bt_pagedel(rel, buf);

		/* count only this page, else may double-count parent */
		if (ndel)
			stats->pages_deleted++;

		MemoryContextSwitchTo(oldcontext);
		/* pagedel released buffer, so we shouldn't */
	}
	else
		_bt_relbuf(rel, buf);

	/*
	 * This is really tail recursion, but if the compiler is too stupid to
	 * optimize it as such, we'd eat an uncomfortably large amount of stack
	 * space per recursion level (due to the deletable[] array). A failure is
	 * improbable since the number of levels isn't likely to be large ... but
	 * just in case, let's hand-optimize into a loop.
	 */
	if (recurse_to != P_NONE)
	{
		blkno = recurse_to;
		goto restart;
	}
}
/*
 * check_index_tuple_attributes
 *		Check the individual attributes of one index tuple.
 *
 * Walks all attributes of the index tuple at line pointer 'i' on block
 * 'block' (raw page bytes in 'buffer', page header in 'header'), verifying
 * that attribute lengths are sane and that the attributes exactly fill the
 * tuple (up to MAXALIGN padding).
 *
 * Returns the number of errors found (0 means the tuple looks OK).
 */
uint32
check_index_tuple_attributes(Relation rel, PageHeader header, int block,
							 int i, char *buffer)
{
	IndexTuple	tuple;
	uint32		nerrs = 0;
	int			j,
				off;
	bits8	   *bitmap;
	BTPageOpaque opaque;

	/* items are reported 1-based ("%d:%d" = block:item) in all messages */
	ereport(DEBUG2,
			(errmsg("[%d:%d] checking attributes for the tuple",
					block, (i + 1))));

	/* get the index tuple and info about the page */
	tuple = (IndexTuple) (buffer + header->pd_linp[i].lp_off);
	opaque = (BTPageOpaque) (buffer + header->pd_special);

	/* current attribute offset - always starts at (buffer + off) */
	off = header->pd_linp[i].lp_off + IndexInfoFindDataOffset(tuple->t_info);

	ereport(DEBUG3,
			(errmsg("[%d:%d] tuple has %d attributes",
					block, (i + 1),
					RelationGetNumberOfAttributes(rel))));

	/* null bitmap (if any) immediately follows the fixed tuple header */
	bitmap = (bits8 *) (buffer + header->pd_linp[i].lp_off + sizeof(IndexTupleData));

	/*
	 * TODO This is mostly copy'n'paste from check_heap_tuple_attributes, so
	 * maybe it could be refactored to share the code.
	 */

	/*
	 * For the left-most tuple on non-leaf pages, there is no data at all
	 * (see src/backend/access/nbtree/README, last paragraph in section
	 * "Notes About Data Representation").  Use P_LEFTMOST/P_ISLEAF to
	 * identify such cases (for the leftmost item only) and skip the checks.
	 */
	if (P_LEFTMOST(opaque) && (!P_ISLEAF(opaque)) && (i == 0))
	{
		ereport(DEBUG3,
				(errmsg("[%d:%d] leftmost tuple on non-leaf block => no data, skipping",
						block, (i + 1))));
		return nerrs;
	}

	/* check all the index attributes */
	for (j = 0; j < rel->rd_att->natts; j++)
	{
		/* default length of the attribute */
		int			len = rel->rd_att->attrs[j]->attlen;

		/* copy from src/backend/commands/analyze.c */
		bool		is_varlena = (!rel->rd_att->attrs[j]->attbyval && len == -1);
		bool		is_varwidth = (!rel->rd_att->attrs[j]->attbyval && len < 0);	/* thus it's "len = -2" */

		/*
		 * If the attribute is marked as NULL (in the tuple's null bitmap),
		 * it occupies no space - skip to the next attribute.
		 */
		if (IndexTupleHasNulls(tuple) && att_isnull(j, bitmap))
		{
			ereport(DEBUG3,
					(errmsg("[%d:%d] attribute '%s' is NULL (skipping)",
							block, (i + 1),
							rel->rd_att->attrs[j]->attname.data)));
			continue;
		}

		/* fix the alignment (see src/include/access/tupmacs.h) */
		off = att_align_pointer(off, rel->rd_att->attrs[j]->attalign,
								rel->rd_att->attrs[j]->attlen, buffer + off);

		if (is_varlena)
		{
			/*
			 * Other interesting macros (see postgres.h) - should do
			 * something about those ...
			 *
			 * VARATT_IS_COMPRESSED(PTR)  VARATT_IS_4B_C(PTR)
			 * VARATT_IS_EXTERNAL(PTR)    VARATT_IS_1B_E(PTR)
			 * VARATT_IS_SHORT(PTR)       VARATT_IS_1B(PTR)
			 * VARATT_IS_EXTENDED(PTR)    (!VARATT_IS_4B_U(PTR))
			 */
			len = VARSIZE_ANY(buffer + off);

			if (len < 0)
			{
				ereport(WARNING,
						(errmsg("[%d:%d] attribute '%s' has negative length < 0 (%d)",
								block, (i + 1),
								rel->rd_att->attrs[j]->attname.data, len)));
				++nerrs;
				break;
			}

			if (VARATT_IS_COMPRESSED(buffer + off))
			{
				/*
				 * The raw (uncompressed) size must be positive and below the
				 * 1GB varlena limit.  (The previous code compared against
				 * 1024*1024, i.e. 1MB, contradicting its own message.)
				 */
				if ((VARRAWSIZE_4B_C(buffer + off) < 0) ||
					(VARRAWSIZE_4B_C(buffer + off) > 1024 * 1024 * 1024))
				{
					ereport(WARNING,
							(errmsg("[%d:%d] attribute '%s' has invalid length %d (should be between 0 and 1G)",
									block, (i + 1),
									rel->rd_att->attrs[j]->attname.data,
									VARRAWSIZE_4B_C(buffer + off))));
					++nerrs;

					/*
					 * No break here - this does not break the page structure,
					 * so we may check the other attributes.
					 */
				}
			}

			/* FIXME Check if the varlena value may be detoasted. */
		}
		else if (is_varwidth)
		{
			/*
			 * Get the C-string length (at most to the end of tuple), +1 as
			 * it does not include '\0' at the end.  If the string is not
			 * properly terminated, this returns 'remaining space + 1' so
			 * it's detected by the overflow check below.
			 *
			 * NOTE(review): the bound includes 'len' (which is attlen == -2
			 * here), so it is 2 bytes short of the tuple end - presumably
			 * intentional to account for something, but verify against
			 * check_heap_tuple_attributes.
			 */
			len = strnlen(buffer + off,
						  header->pd_linp[i].lp_off + len +
						  header->pd_linp[i].lp_len - off) + 1;
		}

		/*
		 * Check if the length makes sense (is not negative and does not
		 * overflow the tuple end); if it does, stop validating the other
		 * attributes (we don't know where to continue anyway).
		 */
		if (off + len > (header->pd_linp[i].lp_off + header->pd_linp[i].lp_len))
		{
			ereport(WARNING,
					(errmsg("[%d:%d] attribute '%s' (off=%d len=%d) overflows tuple end (off=%d, len=%d)",
							block, (i + 1),
							rel->rd_att->attrs[j]->attname.data,
							off, len,
							header->pd_linp[i].lp_off,
							header->pd_linp[i].lp_len)));
			++nerrs;
			break;
		}

		/* skip to the next attribute */
		off += len;

		ereport(DEBUG3,
				(errmsg("[%d:%d] attribute '%s' len=%d",
						block, (i + 1),
						rel->rd_att->attrs[j]->attname.data, len)));
	}

	ereport(DEBUG3,
			(errmsg("[%d:%d] last attribute ends at %d, tuple ends at %d",
					block, (i + 1), off,
					header->pd_linp[i].lp_off + header->pd_linp[i].lp_len)));

	/*
	 * After the last attribute, the (aligned) offset should be exactly the
	 * same as the end of the tuple.
	 */
	if (MAXALIGN(off) != header->pd_linp[i].lp_off + header->pd_linp[i].lp_len)
	{
		ereport(WARNING,
				(errmsg("[%d:%d] the last attribute ends at %d but the tuple ends at %d",
						block, (i + 1), off,
						header->pd_linp[i].lp_off + header->pd_linp[i].lp_len)));
		++nerrs;
	}

	return nerrs;
}