/* -------------------------------------------------
 * GetHashPageStatistics()
 *
 * Collect statistics of single hash page
 * -------------------------------------------------
 */
static void
GetHashPageStatistics(Page page, HashPageStat *stat)
{
    OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
    HashPageOpaque opaque = (HashPageOpaque) PageGetSpecialPointer(page);
    int         off;

    stat->dead_items = stat->live_items = 0;
    stat->page_size = PageGetPageSize(page);

    /* hash page opaque data */
    stat->hasho_prevblkno = opaque->hasho_prevblkno;
    stat->hasho_nextblkno = opaque->hasho_nextblkno;
    stat->hasho_bucket = opaque->hasho_bucket;
    stat->hasho_flag = opaque->hasho_flag;
    stat->hasho_page_id = opaque->hasho_page_id;

    /* count live and dead tuples, and free space */
    for (off = FirstOffsetNumber; off <= maxoff; off++)
    {
        ItemId      id = PageGetItemId(page, off);

        if (!ItemIdIsDead(id))
            stat->live_items++;
        else
            stat->dead_items++;
    }
    stat->free_size = PageGetFreeSpace(page);
}
static bool
entryIsEnoughSpace(GinBtree btree, Buffer buf, OffsetNumber off,
                   GinBtreeEntryInsertData *insertData)
{
    Size        releasedsz = 0;
    Size        addedsz;
    Page        page = BufferGetPage(buf);

    Assert(insertData->entry);
    Assert(!GinPageIsData(page));

    if (insertData->isDelete)
    {
        IndexTuple  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));

        releasedsz = MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
    }

    addedsz = MAXALIGN(IndexTupleSize(insertData->entry)) + sizeof(ItemIdData);

    if (PageGetFreeSpace(page) + releasedsz >= addedsz)
        return true;

    return false;
}
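/*
 * Aside (not part of the original source): the space test above boils down
 * to "MAXALIGN'd tuple body plus one line pointer must fit, crediting back
 * any tuple being replaced".  A minimal standalone sketch; the helper name
 * is hypothetical:
 */
static bool
fits_after_replace(Page page, IndexTuple oldtup, IndexTuple newtup)
{
    Size        released = 0;
    Size        added = MAXALIGN(IndexTupleSize(newtup)) + sizeof(ItemIdData);

    if (oldtup != NULL)
        released = MAXALIGN(IndexTupleSize(oldtup)) + sizeof(ItemIdData);

    return PageGetFreeSpace(page) + released >= added;
}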
/*
 * Release resources associated with a BrinBuildState.
 */
static void
terminate_brin_buildstate(BrinBuildState *state)
{
    /*
     * Release the last index buffer used.  We might as well ensure that
     * whatever free space remains in that page is available in FSM, too.
     */
    if (!BufferIsInvalid(state->bs_currentInsertBuf))
    {
        Page        page;
        Size        freespace;
        BlockNumber blk;

        page = BufferGetPage(state->bs_currentInsertBuf);
        freespace = PageGetFreeSpace(page);
        blk = BufferGetBlockNumber(state->bs_currentInsertBuf);
        ReleaseBuffer(state->bs_currentInsertBuf);
        RecordPageWithFreeSpace(state->bs_irel, blk, freespace);
        FreeSpaceMapVacuumRange(state->bs_irel, blk, blk + 1);
    }

    brin_free_desc(state->bs_bdesc);
    pfree(state->bs_dtuple);
    pfree(state);
}
/*
 * Return the amount of free space on a regular BRIN index page.
 *
 * If the page is not a regular page, or has been marked with the
 * BRIN_EVACUATE_PAGE flag, returns 0.
 */
static Size
br_page_get_freespace(Page page)
{
    if (!BRIN_IS_REGULAR_PAGE(page) ||
        (BrinPageFlags(page) & BRIN_EVACUATE_PAGE) != 0)
        return 0;
    else
        return PageGetFreeSpace(page);
}
/*
 * Check space for itup vector on page
 */
static int
gistnospace(Page page, IndexTuple *itvec, int len)
{
    unsigned int size = 0;
    int         i;

    for (i = 0; i < len; i++)
        size += IndexTupleSize(itvec[i]) + sizeof(ItemIdData);

    return (PageGetFreeSpace(page) < size);
}
/*
 * Return the amount of free space on a regular BRIN index page.
 *
 * If the page is not a regular page, or has been marked with the
 * BRIN_EVACUATE_PAGE flag, returns 0.
 */
static Size
br_page_get_freespace(Page page)
{
    BrinSpecialSpace *special;

    special = (BrinSpecialSpace *) PageGetSpecialPointer(page);
    if (!BRIN_IS_REGULAR_PAGE(page) ||
        (special->flags & BRIN_EVACUATE_PAGE) != 0)
        return 0;
    else
        return PageGetFreeSpace(page);
}
/*
 * PageGetHeapFreeSpace
 *		Returns the size of the free (allocatable) space on a page,
 *		reduced by the space needed for a new line pointer.
 *
 * The difference between this and PageGetFreeSpace is that this will return
 * zero if there are already MaxHeapTuplesPerPage line pointers in the page
 * and none are free.  We use this to enforce that no more than
 * MaxHeapTuplesPerPage line pointers are created on a heap page.  (Although
 * no more tuples than that could fit anyway, in the presence of redirected
 * or dead line pointers it'd be possible to have too many line pointers.
 * To avoid breaking code that assumes MaxHeapTuplesPerPage is a hard limit
 * on the number of line pointers, we make this extra check.)
 */
Size
PageGetHeapFreeSpace(Page page)
{
    Size        space;

    space = PageGetFreeSpace(page);
    if (space > 0)
    {
        OffsetNumber offnum,
                    nline;

        /*
         * Are there already MaxHeapTuplesPerPage line pointers in the page?
         */
        nline = PageGetMaxOffsetNumber(page);
        if (nline >= MaxHeapTuplesPerPage)
        {
            if (PageHasFreeLinePointers((PageHeader) page))
            {
                /*
                 * Since this is just a hint, we must confirm that there is
                 * indeed a free line pointer
                 */
                for (offnum = FirstOffsetNumber; offnum <= nline;
                     offnum = OffsetNumberNext(offnum))
                {
                    ItemId      lp = PageGetItemId(page, offnum);

                    if (!ItemIdIsUsed(lp))
                        break;
                }

                if (offnum > nline)
                {
                    /*
                     * The hint is wrong, but we can't clear it here since we
                     * don't have the ability to mark the page dirty.
                     */
                    space = 0;
                }
            }
            else
            {
                /*
                 * Although the hint might be wrong, PageAddItem will believe
                 * it anyway, so we must believe it too.
                 */
                space = 0;
            }
        }
    }
    return space;
}
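/*
 * Usage sketch (hypothetical caller, not part of the original source):
 * heap code should size-test against PageGetHeapFreeSpace() rather than
 * PageGetFreeSpace(), so the MaxHeapTuplesPerPage line-pointer limit is
 * honored even when raw byte space remains on the page.
 */
static bool
heap_tuple_fits(Page page, Size tuplen)
{
    return MAXALIGN(tuplen) <= PageGetHeapFreeSpace(page);
}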
/*
 *	lazy_vacuum_heap() -- second pass over the heap
 *
 *		This routine marks dead tuples as unused and compacts out free
 *		space on their pages.  Pages not having dead tuples recorded from
 *		lazy_scan_heap are not visited at all.
 *
 * Note: the reason for doing this as a second pass is we cannot remove
 * the tuples until we've removed their index entries, and we want to
 * process index entry removal in batches as large as possible.
 */
static void
lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
{
    MIRROREDLOCK_BUFMGR_DECLARE;

    int         tupindex;
    int         npages;
    PGRUsage    ru0;

    pg_rusage_init(&ru0);
    npages = 0;

    tupindex = 0;

    /* Fetch gp_persistent_relation_node information that will be added to XLOG record. */
    RelationFetchGpRelationNodeForXLog(onerel);

    while (tupindex < vacrelstats->num_dead_tuples)
    {
        BlockNumber tblk;
        Buffer      buf;
        Page        page;

        vacuum_delay_point();

        tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);

        /* -------- MirroredLock ---------- */
        MIRROREDLOCK_BUFMGR_LOCK;

        buf = ReadBuffer(onerel, tblk);
        LockBufferForCleanup(buf);
        tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats);

        /* Now that we've compacted the page, record its available space */
        page = BufferGetPage(buf);
        lazy_record_free_space(vacrelstats, tblk, PageGetFreeSpace(page));
        UnlockReleaseBuffer(buf);

        MIRROREDLOCK_BUFMGR_UNLOCK;
        /* -------- MirroredLock ---------- */

        npages++;
    }

    ereport(elevel,
            (errmsg("\"%s\": removed %d row versions in %d pages",
                    RelationGetRelationName(onerel),
                    tupindex, npages),
             errdetail("%s.",
                       pg_rusage_show(&ru0))));
}
/*
 * Release resources associated with a BrinBuildState.
 */
static void
terminate_brin_buildstate(BrinBuildState *state)
{
    /* release the last index buffer used */
    if (!BufferIsInvalid(state->bs_currentInsertBuf))
    {
        Page        page;

        page = BufferGetPage(state->bs_currentInsertBuf);
        RecordPageWithFreeSpace(state->bs_irel,
                                BufferGetBlockNumber(state->bs_currentInsertBuf),
                                PageGetFreeSpace(page));
        ReleaseBuffer(state->bs_currentInsertBuf);
    }

    brin_free_desc(state->bs_bdesc);
    pfree(state->bs_dtuple);
    pfree(state);
}
/*
 * Check space for itup vector on page
 */
bool
gistnospace(Page page, IndexTuple *itvec, int len, OffsetNumber todelete,
            Size freespace)
{
    unsigned int size = freespace,
                deleted = 0;
    int         i;

    for (i = 0; i < len; i++)
        size += IndexTupleSize(itvec[i]) + sizeof(ItemIdData);

    if (todelete != InvalidOffsetNumber)
    {
        IndexTuple  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, todelete));

        deleted = IndexTupleSize(itup) + sizeof(ItemIdData);
    }

    return (PageGetFreeSpace(page) + deleted < size);
}
static void
gist_dumptree(Relation r, int level, BlockNumber blk, OffsetNumber coff)
{
    Buffer      buffer;
    Page        page;
    GISTPageOpaque opaque;
    IndexTuple  which;
    ItemId      iid;
    OffsetNumber i,
                maxoff;
    BlockNumber cblk;
    char       *pred;

    pred = (char *) palloc(sizeof(char) * level + 1);
    MemSet(pred, '\t', level);
    pred[level] = '\0';

    buffer = ReadBuffer(r, blk);
    page = (Page) BufferGetPage(buffer);
    opaque = (GISTPageOpaque) PageGetSpecialPointer(page);

    maxoff = PageGetMaxOffsetNumber(page);

    /* cast Size values explicitly so they match the %d format */
    elog(DEBUG4, "%sPage: %d %s blk: %d maxoff: %d free: %d", pred,
         coff, (opaque->flags & F_LEAF) ? "LEAF" : "INTE", (int) blk,
         (int) maxoff, (int) PageGetFreeSpace(page));

    for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
    {
        iid = PageGetItemId(page, i);
        which = (IndexTuple) PageGetItem(page, iid);
        cblk = ItemPointerGetBlockNumber(&(which->t_tid));
#ifdef PRINTTUPLE
        elog(DEBUG4, "%s Tuple. blk: %d size: %d", pred, (int) cblk,
             (int) IndexTupleSize(which));
#endif

        if (!(opaque->flags & F_LEAF))
            gist_dumptree(r, level + 1, cblk, i);
    }
    ReleaseBuffer(buffer);
    pfree(pred);
}
static bool
entryIsEnoughSpace(GinBtree btree, Buffer buf, OffsetNumber off)
{
    Size        itupsz = 0;
    Page        page = BufferGetPage(buf);

    Assert(btree->entry);
    Assert(!GinPageIsData(page));

    if (btree->isDelete)
    {
        IndexTuple  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));

        itupsz = MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
    }

    if (PageGetFreeSpace(page) + itupsz >=
        MAXALIGN(IndexTupleSize(btree->entry)) + sizeof(ItemIdData))
        return true;

    return false;
}
/*
 * pgstat_index_page -- for generic index page
 */
static void
pgstat_index_page(pgstattuple_type *stat, Page page,
                  OffsetNumber minoff, OffsetNumber maxoff)
{
    OffsetNumber i;

    stat->free_space += PageGetFreeSpace(page);

    for (i = minoff; i <= maxoff; i = OffsetNumberNext(i))
    {
        ItemId      itemid = PageGetItemId(page, i);

        if (ItemIdIsDead(itemid))
        {
            stat->dead_tuple_count++;
            stat->dead_tuple_len += ItemIdGetLength(itemid);
        }
        else
        {
            stat->tuple_count++;
            stat->tuple_len += ItemIdGetLength(itemid);
        }
    }
}
/*
 * pgstattuple_real
 *
 * The real work occurs here
 */
static Datum
pgstattuple_real(Relation rel, FunctionCallInfo fcinfo)
{
    HeapScanDesc scan;
    HeapTuple   tuple;
    BlockNumber nblocks;
    BlockNumber block = 0;      /* next block to count free space in */
    BlockNumber tupblock;
    Buffer      buffer;
    uint64      table_len;
    uint64      tuple_len = 0;
    uint64      dead_tuple_len = 0;
    uint64      tuple_count = 0;
    uint64      dead_tuple_count = 0;
    double      tuple_percent;
    double      dead_tuple_percent;
    uint64      free_space = 0; /* free/reusable space in bytes */
    double      free_percent;   /* free/reusable space in % */
    TupleDesc   tupdesc;
    AttInMetadata *attinmeta;
    char      **values;
    int         i;
    Datum       result;

    /* Build a tuple descriptor for our result type */
    if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
        elog(ERROR, "return type must be a row type");

    /* make sure we have a persistent copy of the tupdesc */
    tupdesc = CreateTupleDescCopy(tupdesc);

    /*
     * Generate attribute metadata needed later to produce tuples from raw C
     * strings
     */
    attinmeta = TupleDescGetAttInMetadata(tupdesc);

    scan = heap_beginscan(rel, SnapshotAny, 0, NULL);

    nblocks = scan->rs_nblocks; /* # blocks to be scanned */

    /* scan the relation */
    while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
    {
        /* must hold a buffer lock to call HeapTupleSatisfiesNow */
        LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);

        if (HeapTupleSatisfiesNow(tuple->t_data, scan->rs_cbuf))
        {
            tuple_len += tuple->t_len;
            tuple_count++;
        }
        else
        {
            dead_tuple_len += tuple->t_len;
            dead_tuple_count++;
        }

        LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);

        /*
         * To avoid physically reading the table twice, try to do the
         * free-space scan in parallel with the heap scan.  However,
         * heap_getnext may find no tuples on a given page, so we cannot
         * simply examine the pages returned by the heap scan.
         */
        tupblock = BlockIdGetBlockNumber(&tuple->t_self.ip_blkid);

        while (block <= tupblock)
        {
            buffer = ReadBuffer(rel, block);
            LockBuffer(buffer, BUFFER_LOCK_SHARE);
            free_space += PageGetFreeSpace((Page) BufferGetPage(buffer));
            LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
            ReleaseBuffer(buffer);
            block++;
        }
    }
    heap_endscan(scan);

    while (block < nblocks)
    {
        buffer = ReadBuffer(rel, block);
        free_space += PageGetFreeSpace((Page) BufferGetPage(buffer));
        ReleaseBuffer(buffer);
        block++;
    }

    heap_close(rel, AccessShareLock);

    table_len = (uint64) nblocks * BLCKSZ;

    if (nblocks == 0)
    {
        tuple_percent = 0.0;
        dead_tuple_percent = 0.0;
        free_percent = 0.0;
    }
    else
    {
        tuple_percent = (double) tuple_len * 100.0 / table_len;
        dead_tuple_percent = (double) dead_tuple_len * 100.0 / table_len;
        free_percent = (double) free_space * 100.0 / table_len;
    }

    /*
     * Prepare a values array for constructing the tuple.  This should be an
     * array of C strings which will be processed later by the appropriate
     * "in" functions.
     */
    values = (char **) palloc(NCOLUMNS * sizeof(char *));
    for (i = 0; i < NCOLUMNS; i++)
        values[i] = (char *) palloc(NCHARS * sizeof(char));
    i = 0;
    snprintf(values[i++], NCHARS, INT64_FORMAT, table_len);
    snprintf(values[i++], NCHARS, INT64_FORMAT, tuple_count);
    snprintf(values[i++], NCHARS, INT64_FORMAT, tuple_len);
    snprintf(values[i++], NCHARS, "%.2f", tuple_percent);
    snprintf(values[i++], NCHARS, INT64_FORMAT, dead_tuple_count);
    snprintf(values[i++], NCHARS, INT64_FORMAT, dead_tuple_len);
    snprintf(values[i++], NCHARS, "%.2f", dead_tuple_percent);
    snprintf(values[i++], NCHARS, INT64_FORMAT, free_space);
    snprintf(values[i++], NCHARS, "%.2f", free_percent);

    /* build a tuple */
    tuple = BuildTupleFromCStrings(attinmeta, values);

    /* make the tuple into a datum */
    result = HeapTupleGetDatum(tuple);

    /* Clean up */
    for (i = 0; i < NCOLUMNS; i++)
        pfree(values[i]);
    pfree(values);

    return (result);
}
/* -------------------------------------------------
 * GetBTPageStatistics()
 *
 * Collect statistics of single b-tree page
 * -------------------------------------------------
 */
static void
GetBTPageStatistics(BlockNumber blkno, Buffer buffer, BTPageStat *stat)
{
    Page        page = BufferGetPage(buffer);
    PageHeader  phdr = (PageHeader) page;
    OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
    BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    int         item_size = 0;
    int         off;

    stat->blkno = blkno;

    stat->max_avail = BLCKSZ - (BLCKSZ - phdr->pd_special + SizeOfPageHeaderData);

    stat->dead_items = stat->live_items = 0;

    stat->page_size = PageGetPageSize(page);

    /* page type (flags) */
    if (P_ISDELETED(opaque))
    {
        stat->type = 'd';
        stat->btpo.xact = opaque->btpo.xact;
        return;
    }
    else if (P_IGNORE(opaque))
        stat->type = 'e';
    else if (P_ISLEAF(opaque))
        stat->type = 'l';
    else if (P_ISROOT(opaque))
        stat->type = 'r';
    else
        stat->type = 'i';

    /* btpage opaque data */
    stat->btpo_prev = opaque->btpo_prev;
    stat->btpo_next = opaque->btpo_next;
    stat->btpo.level = opaque->btpo.level;
    stat->btpo_flags = opaque->btpo_flags;
    stat->btpo_cycleid = opaque->btpo_cycleid;

    /* count live and dead tuples, and free space */
    for (off = FirstOffsetNumber; off <= maxoff; off++)
    {
        IndexTuple  itup;

        ItemId      id = PageGetItemId(page, off);

        itup = (IndexTuple) PageGetItem(page, id);

        item_size += IndexTupleSize(itup);

        if (!ItemIdIsDead(id))
            stat->live_items++;
        else
            stat->dead_items++;
    }
    stat->free_size = PageGetFreeSpace(page);

    if ((stat->live_items + stat->dead_items) > 0)
        stat->avg_item_size = item_size / (stat->live_items + stat->dead_items);
    else
        stat->avg_item_size = 0;
}
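/*
 * Aside on the max_avail arithmetic above (illustrative, not part of the
 * original source): BLCKSZ - (BLCKSZ - pd_special + SizeOfPageHeaderData)
 * simplifies to pd_special - SizeOfPageHeaderData, i.e. the bytes between
 * the page header and the special space.  A hypothetical helper:
 */
static Size
usable_page_area(Page page)
{
    PageHeader  phdr = (PageHeader) page;

    return phdr->pd_special - SizeOfPageHeaderData;
}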
/*
 *	_hash_doinsert() -- Handle insertion of a single HashItem in the table.
 *
 *		This routine is called by the public interface routines, hashbuild
 *		and hashinsert.  By here, hashitem is completely filled in.
 *		The datum to be used as a "key" is in the hashitem.
 */
InsertIndexResult
_hash_doinsert(Relation rel, HashItem hitem)
{
    Buffer      buf;
    Buffer      metabuf;
    HashMetaPage metap;
    IndexTuple  itup;
    BlockNumber itup_blkno;
    OffsetNumber itup_off;
    InsertIndexResult res;
    BlockNumber blkno;
    Page        page;
    HashPageOpaque pageopaque;
    Size        itemsz;
    bool        do_expand;
    uint32      hashkey;
    Bucket      bucket;
    Datum       datum;
    bool        isnull;

    /*
     * Compute the hash key for the item.  We do this first so as not to
     * need to hold any locks while running the hash function.
     */
    itup = &(hitem->hash_itup);
    if (rel->rd_rel->relnatts != 1)
        elog(ERROR, "hash indexes support only one index key");
    datum = index_getattr(itup, 1, RelationGetDescr(rel), &isnull);
    Assert(!isnull);
    hashkey = _hash_datum2hashkey(rel, datum);

    /* compute item size too */
    itemsz = IndexTupleDSize(hitem->hash_itup)
        + (sizeof(HashItemData) - sizeof(IndexTupleData));

    itemsz = MAXALIGN(itemsz);  /* be safe, PageAddItem will do this but we
                                 * need to be consistent */

    /*
     * Acquire shared split lock so we can compute the target bucket safely
     * (see README).
     */
    _hash_getlock(rel, 0, HASH_SHARE);

    /* Read the metapage */
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
    metap = (HashMetaPage) BufferGetPage(metabuf);
    _hash_checkpage(rel, (Page) metap, LH_META_PAGE);

    /*
     * Check whether the item can fit on a hash page at all. (Eventually, we
     * ought to try to apply TOAST methods if not.)  Note that at this point,
     * itemsz doesn't include the ItemId.
     */
    if (itemsz > HashMaxItemSize((Page) metap))
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("index row size %lu exceeds hash maximum %lu",
                        (unsigned long) itemsz,
                        (unsigned long) HashMaxItemSize((Page) metap))));

    /*
     * Compute the target bucket number, and convert to block number.
     */
    bucket = _hash_hashkey2bucket(hashkey,
                                  metap->hashm_maxbucket,
                                  metap->hashm_highmask,
                                  metap->hashm_lowmask);

    blkno = BUCKET_TO_BLKNO(metap, bucket);

    /* release lock on metapage, but keep pin since we'll need it again */
    _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

    /*
     * Acquire share lock on target bucket; then we can release split lock.
     */
    _hash_getlock(rel, blkno, HASH_SHARE);

    _hash_droplock(rel, 0, HASH_SHARE);

    /* Fetch the primary bucket page for the bucket */
    buf = _hash_getbuf(rel, blkno, HASH_WRITE);
    page = BufferGetPage(buf);
    _hash_checkpage(rel, page, LH_BUCKET_PAGE);
    pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
    Assert(pageopaque->hasho_bucket == bucket);

    /* Do the insertion */
    while (PageGetFreeSpace(page) < itemsz)
    {
        /*
         * no space on this page; check for an overflow page
         */
        BlockNumber nextblkno = pageopaque->hasho_nextblkno;

        if (BlockNumberIsValid(nextblkno))
        {
            /*
             * ovfl page exists; go get it.  if it doesn't have room, we'll
             * find out next pass through the loop test above.
             */
            _hash_relbuf(rel, buf);
            buf = _hash_getbuf(rel, nextblkno, HASH_WRITE);
            page = BufferGetPage(buf);
        }
        else
        {
            /*
             * we're at the end of the bucket chain and we haven't found a
             * page with enough room.  allocate a new overflow page.
             */

            /* release our write lock without modifying buffer */
            _hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);

            /* chain to a new overflow page */
            buf = _hash_addovflpage(rel, metabuf, buf);
            page = BufferGetPage(buf);

            /* should fit now, given test above */
            Assert(PageGetFreeSpace(page) >= itemsz);
        }
        _hash_checkpage(rel, page, LH_OVERFLOW_PAGE);
        pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
        Assert(pageopaque->hasho_bucket == bucket);
    }

    /* found page with enough space, so add the item here */
    itup_off = _hash_pgaddtup(rel, buf, itemsz, hitem);
    itup_blkno = BufferGetBlockNumber(buf);

    /* write and release the modified page */
    _hash_wrtbuf(rel, buf);

    /* We can drop the bucket lock now */
    _hash_droplock(rel, blkno, HASH_SHARE);

    /*
     * Write-lock the metapage so we can increment the tuple count. After
     * incrementing it, check to see if it's time for a split.
     */
    _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

    metap->hashm_ntuples += 1;

    /* Make sure this stays in sync with _hash_expandtable() */
    do_expand = metap->hashm_ntuples >
        (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1);

    /* Write out the metapage and drop lock, but keep pin */
    _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);

    /* Attempt to split if a split is needed */
    if (do_expand)
        _hash_expandtable(rel, metabuf);

    /* Finally drop our pin on the metapage */
    _hash_dropbuf(rel, metabuf);

    /* Create the return data structure */
    res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));

    ItemPointerSet(&(res->pointerData), itup_blkno, itup_off);

    return res;
}
/*
 * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
 *
 * We are splitting a bucket that consists of a base bucket page and zero
 * or more overflow (bucket chain) pages.  We must relocate tuples that
 * belong in the new bucket, and compress out any free space in the old
 * bucket.
 *
 * The caller must hold exclusive locks on both buckets to ensure that
 * no one else is trying to access them (see README).
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * The buffer is returned in the same state.  (The metapage is only
 * touched if it becomes necessary to add or remove overflow pages.)
 */
static void
_hash_splitbucket(Relation rel,
                  Buffer metabuf,
                  Bucket obucket,
                  Bucket nbucket,
                  BlockNumber start_oblkno,
                  BlockNumber start_nblkno,
                  uint32 maxbucket,
                  uint32 highmask,
                  uint32 lowmask)
{
    Bucket      bucket;
    Buffer      obuf;
    Buffer      nbuf;
    BlockNumber oblkno;
    BlockNumber nblkno;
    bool        null;
    Datum       datum;
    HashItem    hitem;
    HashPageOpaque oopaque;
    HashPageOpaque nopaque;
    IndexTuple  itup;
    Size        itemsz;
    OffsetNumber ooffnum;
    OffsetNumber noffnum;
    OffsetNumber omaxoffnum;
    Page        opage;
    Page        npage;
    TupleDesc   itupdesc = RelationGetDescr(rel);

    /*
     * It should be okay to simultaneously write-lock pages from each bucket,
     * since no one else can be trying to acquire buffer lock on pages of
     * either bucket.
     */
    oblkno = start_oblkno;
    nblkno = start_nblkno;
    obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
    nbuf = _hash_getbuf(rel, nblkno, HASH_WRITE);
    opage = BufferGetPage(obuf);
    npage = BufferGetPage(nbuf);

    _hash_checkpage(rel, opage, LH_BUCKET_PAGE);
    oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);

    /* initialize the new bucket's primary page */
    _hash_pageinit(npage, BufferGetPageSize(nbuf));
    nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
    nopaque->hasho_prevblkno = InvalidBlockNumber;
    nopaque->hasho_nextblkno = InvalidBlockNumber;
    nopaque->hasho_bucket = nbucket;
    nopaque->hasho_flag = LH_BUCKET_PAGE;
    nopaque->hasho_filler = HASHO_FILL;

    /*
     * Partition the tuples in the old bucket between the old bucket and the
     * new bucket, advancing along the old bucket's overflow bucket chain and
     * adding overflow pages to the new bucket as needed.
     */
    ooffnum = FirstOffsetNumber;
    omaxoffnum = PageGetMaxOffsetNumber(opage);
    for (;;)
    {
        /*
         * at each iteration through this loop, each of these variables
         * should be up-to-date: obuf opage oopaque ooffnum omaxoffnum
         */

        /* check if we're at the end of the page */
        if (ooffnum > omaxoffnum)
        {
            /* at end of page, but check for an(other) overflow page */
            oblkno = oopaque->hasho_nextblkno;
            if (!BlockNumberIsValid(oblkno))
                break;

            /*
             * we ran out of tuples on this particular page, but we have more
             * overflow pages; advance to next page.
             */
            _hash_wrtbuf(rel, obuf);

            obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
            opage = BufferGetPage(obuf);
            _hash_checkpage(rel, opage, LH_OVERFLOW_PAGE);
            oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
            ooffnum = FirstOffsetNumber;
            omaxoffnum = PageGetMaxOffsetNumber(opage);
            continue;
        }

        /*
         * Re-hash the tuple to determine which bucket it now belongs in.
         *
         * It is annoying to call the hash function while holding locks, but
         * releasing and relocking the page for each tuple is unappealing
         * too.
         */
        hitem = (HashItem) PageGetItem(opage, PageGetItemId(opage, ooffnum));
        itup = &(hitem->hash_itup);
        datum = index_getattr(itup, 1, itupdesc, &null);
        Assert(!null);

        bucket = _hash_hashkey2bucket(_hash_datum2hashkey(rel, datum),
                                      maxbucket, highmask, lowmask);

        if (bucket == nbucket)
        {
            /*
             * insert the tuple into the new bucket.  if it doesn't fit on
             * the current page in the new bucket, we must allocate a new
             * overflow page and place the tuple on that page instead.
             */
            itemsz = IndexTupleDSize(hitem->hash_itup)
                + (sizeof(HashItemData) - sizeof(IndexTupleData));

            itemsz = MAXALIGN(itemsz);

            if (PageGetFreeSpace(npage) < itemsz)
            {
                /* write out nbuf and drop lock, but keep pin */
                _hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK);
                /* chain to a new overflow page */
                nbuf = _hash_addovflpage(rel, metabuf, nbuf);
                npage = BufferGetPage(nbuf);
                _hash_checkpage(rel, npage, LH_OVERFLOW_PAGE);
                /* we don't need nopaque within the loop */
            }

            noffnum = OffsetNumberNext(PageGetMaxOffsetNumber(npage));
            if (PageAddItem(npage, (Item) hitem, itemsz, noffnum, LP_USED)
                == InvalidOffsetNumber)
                elog(ERROR, "failed to add index item to \"%s\"",
                     RelationGetRelationName(rel));

            /*
             * now delete the tuple from the old bucket.  after this section
             * of code, 'ooffnum' will actually point to the ItemId to which
             * we would point if we had advanced it before the deletion
             * (PageIndexTupleDelete repacks the ItemId array).  this also
             * means that 'omaxoffnum' is exactly one less than it used to
             * be, so we really can just decrement it instead of calling
             * PageGetMaxOffsetNumber.
             */
            PageIndexTupleDelete(opage, ooffnum);
            omaxoffnum = OffsetNumberPrev(omaxoffnum);
        }
        else
        {
            /*
             * the tuple stays on this page.  we didn't move anything, so we
             * didn't delete anything and therefore we don't have to change
             * 'omaxoffnum'.
             */
            Assert(bucket == obucket);
            ooffnum = OffsetNumberNext(ooffnum);
        }
    }

    /*
     * We're at the end of the old bucket chain, so we're done partitioning
     * the tuples.  Before quitting, call _hash_squeezebucket to ensure the
     * tuples remaining in the old bucket (including the overflow pages) are
     * packed as tightly as possible.  The new bucket is already tight.
     */
    _hash_wrtbuf(rel, obuf);
    _hash_wrtbuf(rel, nbuf);

    _hash_squeezebucket(rel, obucket, start_oblkno);
}
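/*
 * Sketch (not part of the original source) of the delete-while-scanning
 * pattern relied on above: PageIndexTupleDelete() repacks the line-pointer
 * array, so after a deletion the current offset already names the next
 * item and must not be advanced.  'should_move' is a hypothetical
 * predicate standing in for the bucket test.
 */
static void
prune_page(Page page)
{
    OffsetNumber off = FirstOffsetNumber;

    while (off <= PageGetMaxOffsetNumber(page))
    {
        if (should_move(page, off))
            PageIndexTupleDelete(page, off);    /* off now names next item */
        else
            off = OffsetNumberNext(off);
    }
}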
/*
 * RelationGetBufferForTuple
 *
 *	Returns pinned and exclusive-locked buffer of a page in given relation
 *	with free space >= given len.
 *
 *	If otherBuffer is not InvalidBuffer, then it references a previously
 *	pinned buffer of another page in the same relation; on return, this
 *	buffer will also be exclusive-locked.  (This case is used by heap_update;
 *	the otherBuffer contains the tuple being updated.)
 *
 *	The reason for passing otherBuffer is that if two backends are doing
 *	concurrent heap_update operations, a deadlock could occur if they try
 *	to lock the same two buffers in opposite orders.  To ensure that this
 *	can't happen, we impose the rule that buffers of a relation must be
 *	locked in increasing page number order.  This is most conveniently done
 *	by having RelationGetBufferForTuple lock them both, with suitable care
 *	for ordering.
 *
 *	NOTE: it is unlikely, but not quite impossible, for otherBuffer to be the
 *	same buffer we select for insertion of the new tuple (this could only
 *	happen if space is freed in that page after heap_update finds there's not
 *	enough there).  In that case, the page will be pinned and locked only once.
 *
 *	If use_fsm is true (the normal case), we use FSM to help us find free
 *	space.  If use_fsm is false, we always append a new empty page to the
 *	end of the relation if the tuple won't fit on the current target page.
 *	This can save some cycles when we know the relation is new and doesn't
 *	contain useful amounts of free space.
 *
 *	The use_fsm = false case is also useful for non-WAL-logged additions to a
 *	relation, if the caller holds exclusive lock and is careful to invalidate
 *	relation->rd_targblock before the first insertion --- that ensures that
 *	all insertions will occur into newly added pages and not be intermixed
 *	with tuples from other transactions.  That way, a crash can't risk losing
 *	any committed data of other transactions.  (See heap_insert's comments
 *	for additional constraints needed for safe usage of this behavior.)
 *
 *	We always try to avoid filling existing pages further than the fillfactor.
 *	This is OK since this routine is not consulted when updating a tuple and
 *	keeping it on the same page, which is the scenario fillfactor is meant
 *	to reserve space for.
 *
 *	ereport(ERROR) is allowed here, so this routine *must* be called
 *	before any (unlogged) changes are made in buffer pool.
 */
Buffer
RelationGetBufferForTuple(Relation relation, Size len,
                          Buffer otherBuffer, bool use_fsm)
{
    Buffer      buffer = InvalidBuffer;
    Page        pageHeader;
    Size        pageFreeSpace,
                saveFreeSpace;
    BlockNumber targetBlock,
                otherBlock;
    bool        needLock;

    MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD;

    len = MAXALIGN(len);        /* be conservative */

    /*
     * If we're gonna fail for oversize tuple, do it right away
     */
    if (len > MaxHeapTupleSize)
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("row is too big: size %lu, maximum size %lu",
                        (unsigned long) len,
                        (unsigned long) MaxHeapTupleSize)));

    /* Compute desired extra freespace due to fillfactor option */
    saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
                                                   HEAP_DEFAULT_FILLFACTOR);

    if (otherBuffer != InvalidBuffer)
        otherBlock = BufferGetBlockNumber(otherBuffer);
    else
        otherBlock = InvalidBlockNumber;    /* just to keep compiler quiet */

    /*
     * We first try to put the tuple on the same page we last inserted a
     * tuple on, as cached in the relcache entry.  If that doesn't work, we
     * ask the shared Free Space Map to locate a suitable page.  Since the
     * FSM's info might be out of date, we have to be prepared to loop around
     * and retry multiple times.  (To ensure this isn't an infinite loop, we
     * must update the FSM with the correct amount of free space on each page
     * that proves not to be suitable.)  If the FSM has no record of a page
     * with enough free space, we give up and extend the relation.
     *
     * When use_fsm is false, we either put the tuple onto the existing
     * target page or extend the relation.
     */
    if (len + saveFreeSpace <= MaxHeapTupleSize)
        targetBlock = relation->rd_targblock;
    else
    {
        /* can't fit, don't screw up FSM request tracking by trying */
        targetBlock = InvalidBlockNumber;
        use_fsm = false;
    }

    if (targetBlock == InvalidBlockNumber && use_fsm)
    {
        /*
         * We have no cached target page, so ask the FSM for an initial
         * target.
         */
        targetBlock = GetPageWithFreeSpace(&relation->rd_node,
                                           len + saveFreeSpace);

        /*
         * If the FSM knows nothing of the rel, try the last page before we
         * give up and extend.  This avoids one-tuple-per-page syndrome
         * during bootstrapping or in a recently-started system.
         */
        if (targetBlock == InvalidBlockNumber)
        {
            BlockNumber nblocks = RelationGetNumberOfBlocks(relation);

            if (nblocks > 0)
                targetBlock = nblocks - 1;
        }
    }

    while (targetBlock != InvalidBlockNumber)
    {
        /*
         * Read and exclusive-lock the target block, as well as the other
         * block if one was given, taking suitable care with lock ordering
         * and the possibility they are the same block.
         */
        if (otherBuffer == InvalidBuffer)
        {
            /* easy case */
            buffer = ReadBuffer(relation, targetBlock);
            LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
        }
        else if (otherBlock == targetBlock)
        {
            /* also easy case */
            buffer = otherBuffer;
            LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
        }
        else if (otherBlock < targetBlock)
        {
            /* lock other buffer first */
            buffer = ReadBuffer(relation, targetBlock);
            LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
            LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
        }
        else
        {
            /* lock target buffer first */
            buffer = ReadBuffer(relation, targetBlock);
            LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
            LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
        }

        /*
         * Now we can check to see if there's enough free space here. If so,
         * we're done.
         */
        pageHeader = (Page) BufferGetPage(buffer);
        pageFreeSpace = PageGetFreeSpace(pageHeader);
        if (len + saveFreeSpace <= pageFreeSpace)
        {
            /* use this page as future insert target, too */
            relation->rd_targblock = targetBlock;
            return buffer;
        }

        /*
         * Not enough space, so we must give up our page locks and pin (if
         * any) and prepare to look elsewhere.  We don't care which order we
         * unlock the two buffers in, so this can be slightly simpler than
         * the code above.
         */
        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
        if (otherBuffer == InvalidBuffer)
            ReleaseBuffer(buffer);
        else if (otherBlock != targetBlock)
        {
            LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK);
            ReleaseBuffer(buffer);
        }

        /* Without FSM, always fall out of the loop and extend */
        if (!use_fsm)
            break;

        /*
         * Update FSM as to condition of this page, and ask for another page
         * to try.
         */
        targetBlock = RecordAndGetPageWithFreeSpace(&relation->rd_node,
                                                    targetBlock,
                                                    pageFreeSpace,
                                                    len + saveFreeSpace);
    }

    /*
     * Have to extend the relation.
     *
     * We have to use a lock to ensure no one else is extending the rel at
     * the same time, else we will both try to initialize the same new page.
     * We can skip locking for new or temp relations, however, since no one
     * else could be accessing them.
     */
    needLock = !RELATION_IS_LOCAL(relation);

    if (needLock)
        LockRelationForExtension(relation, ExclusiveLock);

    /*
     * XXX This does an lseek - rather expensive - but at the moment it is
     * the only way to accurately determine how many blocks are in a
     * relation.  Is it worth keeping an accurate file length in shared
     * memory someplace, rather than relying on the kernel to do it for us?
     */
    buffer = ReadBuffer(relation, P_NEW);

    /*
     * We can be certain that locking the otherBuffer first is OK, since it
     * must have a lower page number.
     */
    if (otherBuffer != InvalidBuffer)
        LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);

    /*
     * Now acquire lock on the new page.
     */
    LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);

    /*
     * Release the file-extension lock; it's now OK for someone else to
     * extend the relation some more.  Note that we cannot release this lock
     * before we have buffer lock on the new page, or we risk a race
     * condition against vacuumlazy.c --- see comments therein.
     */
    if (needLock)
        UnlockRelationForExtension(relation, ExclusiveLock);

    /*
     * We need to initialize the empty new page.  Double-check that it really
     * is empty (this should never happen, but if it does we don't want to
     * risk wiping out valid data).
     */
    pageHeader = (Page) BufferGetPage(buffer);

    if (!PageIsNew((PageHeader) pageHeader))
        elog(ERROR, "page %u of relation \"%s\" should be empty but is not",
             BufferGetBlockNumber(buffer),
             RelationGetRelationName(relation));

    PageInit(pageHeader, BufferGetPageSize(buffer), 0);

    if (len > PageGetFreeSpace(pageHeader))
    {
        /* We should not get here given the test at the top */
        elog(PANIC, "tuple is too big: size %lu", (unsigned long) len);
    }

    /*
     * Remember the new page as our target for future insertions.
     *
     * XXX should we enter the new page into the free space map immediately,
     * or just keep it for this backend's exclusive use in the short run
     * (until VACUUM sees it)?  Seems to depend on whether you expect the
     * current backend to make more insertions or not, which is probably a
     * good bet most of the time.  So for now, don't add it to FSM yet.
     */
    relation->rd_targblock = BufferGetBlockNumber(buffer);

    return buffer;
}
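/*
 * Caller sketch (hypothetical, not part of the original source): a plain
 * insertion asks for a buffer with room for the new tuple, consulting the
 * FSM and passing no second buffer.  The routine MAXALIGNs len itself and
 * returns the page pinned and exclusive-locked.
 */
static Buffer
get_insert_buffer(Relation rel, Size len)
{
    return RelationGetBufferForTuple(rel, len, InvalidBuffer, true);
}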
/*
 *	_hash_squeezebucket(rel, bucket)
 *
 *	Try to squeeze the tuples onto pages occurring earlier in the
 *	bucket chain in an attempt to free overflow pages. When we start
 *	the "squeezing", the page from which we start taking tuples (the
 *	"read" page) is the last bucket in the bucket chain and the page
 *	onto which we start squeezing tuples (the "write" page) is the
 *	first page in the bucket chain.  The read page works backward and
 *	the write page works forward; the procedure terminates when the
 *	read page and write page are the same page.
 *
 *	At completion of this procedure, it is guaranteed that all pages in
 *	the bucket are nonempty, unless the bucket is totally empty (in
 *	which case all overflow pages will be freed).  The original
 *	implementation required that to be true on entry as well, but it's a
 *	lot easier for callers to leave empty overflow pages and let this guy
 *	clean it up.
 *
 *	Caller must hold exclusive lock on the target bucket.  This allows
 *	us to safely lock multiple pages in the bucket.
 */
void
_hash_squeezebucket(Relation rel,
                    Bucket bucket,
                    BlockNumber bucket_blkno)
{
    Buffer      wbuf;
    Buffer      rbuf = 0;
    BlockNumber wblkno;
    BlockNumber rblkno;
    Page        wpage;
    Page        rpage;
    HashPageOpaque wopaque;
    HashPageOpaque ropaque;
    OffsetNumber woffnum;
    OffsetNumber roffnum;
    IndexTuple  itup;
    Size        itemsz;

    /*
     * start squeezing into the base bucket page.
     */
    wblkno = bucket_blkno;
    wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE);
    _hash_checkpage(rel, wbuf, LH_BUCKET_PAGE);
    wpage = BufferGetPage(wbuf);
    wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);

    /*
     * if there aren't any overflow pages, there's nothing to squeeze.
     */
    if (!BlockNumberIsValid(wopaque->hasho_nextblkno))
    {
        _hash_relbuf(rel, wbuf);
        return;
    }

    /*
     * find the last page in the bucket chain by starting at the base bucket
     * page and working forward.
     */
    ropaque = wopaque;
    do
    {
        rblkno = ropaque->hasho_nextblkno;
        if (ropaque != wopaque)
            _hash_relbuf(rel, rbuf);
        rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE);
        _hash_checkpage(rel, rbuf, LH_OVERFLOW_PAGE);
        rpage = BufferGetPage(rbuf);
        ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
        Assert(ropaque->hasho_bucket == bucket);
    } while (BlockNumberIsValid(ropaque->hasho_nextblkno));

    /*
     * squeeze the tuples.
     */
    roffnum = FirstOffsetNumber;
    for (;;)
    {
        /* this test is needed in case page is empty on entry */
        if (roffnum <= PageGetMaxOffsetNumber(rpage))
        {
            itup = (IndexTuple) PageGetItem(rpage,
                                            PageGetItemId(rpage, roffnum));
            itemsz = IndexTupleDSize(*itup);
            itemsz = MAXALIGN(itemsz);

            /*
             * Walk up the bucket chain, looking for a page big enough for
             * this item.  Exit if we reach the read page.
             */
            while (PageGetFreeSpace(wpage) < itemsz)
            {
                Assert(!PageIsEmpty(wpage));

                wblkno = wopaque->hasho_nextblkno;
                Assert(BlockNumberIsValid(wblkno));

                _hash_wrtbuf(rel, wbuf);

                if (rblkno == wblkno)
                {
                    /* wbuf is already released */
                    _hash_wrtbuf(rel, rbuf);
                    return;
                }

                wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE);
                _hash_checkpage(rel, wbuf, LH_OVERFLOW_PAGE);
                wpage = BufferGetPage(wbuf);
                wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
                Assert(wopaque->hasho_bucket == bucket);
            }

            /*
             * we have found room so insert on the "write" page.
             */
            woffnum = OffsetNumberNext(PageGetMaxOffsetNumber(wpage));
            if (PageAddItem(wpage, (Item) itup, itemsz, woffnum, LP_USED)
                == InvalidOffsetNumber)
                elog(ERROR, "failed to add index item to \"%s\"",
                     RelationGetRelationName(rel));

            /*
             * delete the tuple from the "read" page. PageIndexTupleDelete
             * repacks the ItemId array, so 'roffnum' will be "advanced" to
             * the "next" ItemId.
             */
            PageIndexTupleDelete(rpage, roffnum);
        }

        /*
         * if the "read" page is now empty because of the deletion (or
         * because it was empty when we got to it), free it.
         *
         * Tricky point here: if our read and write pages are adjacent in the
         * bucket chain, our write lock on wbuf will conflict with
         * _hash_freeovflpage's attempt to update the sibling links of the
         * removed page.  However, in that case we are done anyway, so we can
         * simply drop the write lock before calling _hash_freeovflpage.
         */
        if (PageIsEmpty(rpage))
        {
            rblkno = ropaque->hasho_prevblkno;
            Assert(BlockNumberIsValid(rblkno));

            /* are we freeing the page adjacent to wbuf? */
            if (rblkno == wblkno)
            {
                /* yes, so release wbuf lock first */
                _hash_wrtbuf(rel, wbuf);
                /* free this overflow page (releases rbuf) */
                _hash_freeovflpage(rel, rbuf);
                /* done */
                return;
            }

            /* free this overflow page, then get the previous one */
            _hash_freeovflpage(rel, rbuf);

            rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE);
            _hash_checkpage(rel, rbuf, LH_OVERFLOW_PAGE);
            rpage = BufferGetPage(rbuf);
            ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
            Assert(ropaque->hasho_bucket == bucket);

            roffnum = FirstOffsetNumber;
        }
    }

    /* NOTREACHED */
}
static Datum
pgstatindex_impl(Relation rel, FunctionCallInfo fcinfo)
{
    Datum       result;
    BlockNumber nblocks;
    BlockNumber blkno;
    BTIndexStat indexStat;
    BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);

    if (!IS_INDEX(rel) || !IS_BTREE(rel))
        elog(ERROR, "relation \"%s\" is not a btree index",
             RelationGetRelationName(rel));

    /*
     * Reject attempts to read non-local temporary relations; we would be
     * likely to get wrong data since we have no visibility into the owning
     * session's local buffers.
     */
    if (RELATION_IS_OTHER_TEMP(rel))
        ereport(ERROR,
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                 errmsg("cannot access temporary tables of other sessions")));

    /*
     * Read metapage
     */
    {
        Buffer      buffer = ReadBufferExtended(rel, MAIN_FORKNUM, 0,
                                                RBM_NORMAL, bstrategy);
        Page        page = BufferGetPage(buffer);
        BTMetaPageData *metad = BTPageGetMeta(page);

        indexStat.version = metad->btm_version;
        indexStat.level = metad->btm_level;
        indexStat.root_blkno = metad->btm_root;

        ReleaseBuffer(buffer);
    }

    /* -- init counters -- */
    indexStat.internal_pages = 0;
    indexStat.leaf_pages = 0;
    indexStat.empty_pages = 0;
    indexStat.deleted_pages = 0;

    indexStat.max_avail = 0;
    indexStat.free_space = 0;

    indexStat.fragments = 0;

    /*
     * Scan all blocks except the metapage
     */
    nblocks = RelationGetNumberOfBlocks(rel);

    for (blkno = 1; blkno < nblocks; blkno++)
    {
        Buffer      buffer;
        Page        page;
        BTPageOpaque opaque;

        CHECK_FOR_INTERRUPTS();

        /* Read and lock buffer */
        buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
                                    RBM_NORMAL, bstrategy);
        LockBuffer(buffer, BUFFER_LOCK_SHARE);

        page = BufferGetPage(buffer);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);

        /* Determine page type, and update totals */

        if (P_ISDELETED(opaque))
            indexStat.deleted_pages++;
        else if (P_IGNORE(opaque))
            indexStat.empty_pages++;    /* this is the "half dead" state */
        else if (P_ISLEAF(opaque))
        {
            int         max_avail;

            max_avail = BLCKSZ - (BLCKSZ - ((PageHeader) page)->pd_special +
                                  SizeOfPageHeaderData);
            indexStat.max_avail += max_avail;
            indexStat.free_space += PageGetFreeSpace(page);

            indexStat.leaf_pages++;

            /*
             * If the next leaf is on an earlier block, it means
             * fragmentation.
             */
            if (opaque->btpo_next != P_NONE && opaque->btpo_next < blkno)
                indexStat.fragments++;
        }
        else
            indexStat.internal_pages++;

        /* Unlock and release buffer */
        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
        ReleaseBuffer(buffer);
    }

    relation_close(rel, AccessShareLock);

    /*----------------------------
     * Build a result tuple
     *----------------------------
     */
    {
        TupleDesc   tupleDesc;
        int         j;
        char       *values[10];
        HeapTuple   tuple;

        /* Build a tuple descriptor for our result type */
        if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
            elog(ERROR, "return type must be a row type");

        j = 0;
        values[j++] = psprintf("%d", indexStat.version);
        values[j++] = psprintf("%d", indexStat.level);
        values[j++] = psprintf(INT64_FORMAT,
                               (1 +     /* include the metapage in index_size */
                                indexStat.leaf_pages +
                                indexStat.internal_pages +
                                indexStat.deleted_pages +
                                indexStat.empty_pages) * BLCKSZ);
        values[j++] = psprintf("%u", indexStat.root_blkno);
        values[j++] = psprintf(INT64_FORMAT, indexStat.internal_pages);
        values[j++] = psprintf(INT64_FORMAT, indexStat.leaf_pages);
        values[j++] = psprintf(INT64_FORMAT, indexStat.empty_pages);
        values[j++] = psprintf(INT64_FORMAT, indexStat.deleted_pages);
        if (indexStat.max_avail > 0)
            values[j++] = psprintf("%.2f",
                                   100.0 - (double) indexStat.free_space /
                                   (double) indexStat.max_avail * 100.0);
        else
            values[j++] = pstrdup("NaN");
        if (indexStat.leaf_pages > 0)
            values[j++] = psprintf("%.2f",
                                   (double) indexStat.fragments /
                                   (double) indexStat.leaf_pages * 100.0);
        else
            values[j++] = pstrdup("NaN");

        tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
                                       values);

        result = HeapTupleGetDatum(tuple);
    }

    return result;
}
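/*
 * The two derived columns above, factored out as standalone arithmetic
 * (hypothetical helpers, not part of the original source):
 */
static double
leaf_density_pct(uint64 free_space, uint64 max_avail)
{
    /* avg_leaf_density: percentage of usable leaf space actually used */
    return 100.0 - (double) free_space / (double) max_avail * 100.0;
}

static double
leaf_fragmentation_pct(uint64 fragments, uint64 leaf_pages)
{
    /* share of leaf pages whose successor lies on an earlier block */
    return (double) fragments / (double) leaf_pages * 100.0;
}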
/*
 * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
 *
 * We are splitting a bucket that consists of a base bucket page and zero
 * or more overflow (bucket chain) pages.  We must relocate tuples that
 * belong in the new bucket, and compress out any free space in the old
 * bucket.
 *
 * The caller must hold exclusive locks on both buckets to ensure that
 * no one else is trying to access them (see README).
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * The buffer is returned in the same state.  (The metapage is only
 * touched if it becomes necessary to add or remove overflow pages.)
 */
static void
_hash_splitbucket(Relation rel,
                  Buffer metabuf,
                  Bucket obucket,
                  Bucket nbucket,
                  BlockNumber start_oblkno,
                  BlockNumber start_nblkno,
                  uint32 maxbucket,
                  uint32 highmask,
                  uint32 lowmask)
{
    BlockNumber oblkno;
    BlockNumber nblkno;
    Buffer      obuf;
    Buffer      nbuf;
    Page        opage;
    Page        npage;
    HashPageOpaque oopaque;
    HashPageOpaque nopaque;

    /*
     * It should be okay to simultaneously write-lock pages from each bucket,
     * since no one else can be trying to acquire buffer lock on pages of
     * either bucket.
     */
    oblkno = start_oblkno;
    obuf = _hash_getbuf(rel, oblkno, HASH_WRITE, LH_BUCKET_PAGE);
    opage = BufferGetPage(obuf);
    oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);

    nblkno = start_nblkno;
    nbuf = _hash_getnewbuf(rel, nblkno, MAIN_FORKNUM);
    npage = BufferGetPage(nbuf);

    /* initialize the new bucket's primary page */
    nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
    nopaque->hasho_prevblkno = InvalidBlockNumber;
    nopaque->hasho_nextblkno = InvalidBlockNumber;
    nopaque->hasho_bucket = nbucket;
    nopaque->hasho_flag = LH_BUCKET_PAGE;
    nopaque->hasho_page_id = HASHO_PAGE_ID;

    /*
     * Partition the tuples in the old bucket between the old bucket and the
     * new bucket, advancing along the old bucket's overflow bucket chain and
     * adding overflow pages to the new bucket as needed.  Outer loop
     * iterates once per page in old bucket.
     */
    for (;;)
    {
        OffsetNumber ooffnum;
        OffsetNumber omaxoffnum;
        OffsetNumber deletable[MaxOffsetNumber];
        int         ndeletable = 0;

        /* Scan each tuple in old page */
        omaxoffnum = PageGetMaxOffsetNumber(opage);
        for (ooffnum = FirstOffsetNumber;
             ooffnum <= omaxoffnum;
             ooffnum = OffsetNumberNext(ooffnum))
        {
            IndexTuple  itup;
            Size        itemsz;
            Bucket      bucket;

            /*
             * Fetch the item's hash key (conveniently stored in the item)
             * and determine which bucket it now belongs in.
             */
            itup = (IndexTuple) PageGetItem(opage,
                                            PageGetItemId(opage, ooffnum));
            bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
                                          maxbucket, highmask, lowmask);

            if (bucket == nbucket)
            {
                /*
                 * insert the tuple into the new bucket.  if it doesn't fit
                 * on the current page in the new bucket, we must allocate a
                 * new overflow page and place the tuple on that page
                 * instead.
                 */
                itemsz = IndexTupleDSize(*itup);
                itemsz = MAXALIGN(itemsz);

                if (PageGetFreeSpace(npage) < itemsz)
                {
                    /* write out nbuf and drop lock, but keep pin */
                    _hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK);
                    /* chain to a new overflow page */
                    nbuf = _hash_addovflpage(rel, metabuf, nbuf);
                    npage = BufferGetPage(nbuf);
                    /* we don't need nblkno or nopaque within the loop */
                }

                /*
                 * Insert tuple on new page, using _hash_pgaddtup to ensure
                 * correct ordering by hashkey.  This is a tad inefficient
                 * since we may have to shuffle itempointers repeatedly.
                 * Possible future improvement: accumulate all the items for
                 * the new page and qsort them before insertion.
                 */
                (void) _hash_pgaddtup(rel, nbuf, itemsz, itup);

                /*
                 * Mark tuple for deletion from old page.
                 */
                deletable[ndeletable++] = ooffnum;
            }
            else
            {
                /*
                 * the tuple stays on this page, so nothing to do.
                 */
                Assert(bucket == obucket);
            }
        }

        oblkno = oopaque->hasho_nextblkno;

        /*
         * Done scanning this old page.  If we moved any tuples, delete them
         * from the old page.
         */
        if (ndeletable > 0)
        {
            PageIndexMultiDelete(opage, deletable, ndeletable);
            _hash_wrtbuf(rel, obuf);
        }
        else
            _hash_relbuf(rel, obuf);

        /* Exit loop if no more overflow pages in old bucket */
        if (!BlockNumberIsValid(oblkno))
            break;

        /* Else, advance to next old page */
        obuf = _hash_getbuf(rel, oblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
        opage = BufferGetPage(obuf);
        oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
    }

    /*
     * We're at the end of the old bucket chain, so we're done partitioning
     * the tuples.  Before quitting, call _hash_squeezebucket to ensure the
     * tuples remaining in the old bucket (including the overflow pages) are
     * packed as tightly as possible.  The new bucket is already tight.
     */
    _hash_wrtbuf(rel, nbuf);

    _hash_squeezebucket(rel, obucket, start_oblkno, NULL);
}
/* ------------------------------------------------------
 * pgstatindex()
 *
 * Usage: SELECT * FROM pgstatindex('t1_pkey');
 * ------------------------------------------------------
 */
Datum
pgstatindex(PG_FUNCTION_ARGS)
{
    text       *relname = PG_GETARG_TEXT_P(0);
    Relation    rel;
    RangeVar   *relrv;
    Datum       result;
    BlockNumber nblocks;
    BlockNumber blkno;
    BTIndexStat indexStat;

    if (!superuser())
        ereport(ERROR,
                (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
                 (errmsg("must be superuser to use pgstattuple functions"))));

    relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
    rel = relation_openrv(relrv, AccessShareLock);

    if (!IS_INDEX(rel) || !IS_BTREE(rel))
        elog(ERROR, "relation \"%s\" is not a btree index",
             RelationGetRelationName(rel));

    /*
     * Reject attempts to read non-local temporary relations; we would be
     * likely to get wrong data since we have no visibility into the owning
     * session's local buffers.
     */
    if (RELATION_IS_OTHER_TEMP(rel))
        ereport(ERROR,
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                 errmsg("cannot access temporary tables of other sessions")));

    /*
     * Read metapage
     */
    {
        Buffer      buffer = ReadBuffer(rel, 0);
        Page        page = BufferGetPage(buffer);
        BTMetaPageData *metad = BTPageGetMeta(page);

        indexStat.version = metad->btm_version;
        indexStat.level = metad->btm_level;
        indexStat.root_blkno = metad->btm_root;

        ReleaseBuffer(buffer);
    }

    /* -- init counters -- */
    indexStat.root_pages = 0;
    indexStat.internal_pages = 0;
    indexStat.leaf_pages = 0;
    indexStat.empty_pages = 0;
    indexStat.deleted_pages = 0;

    indexStat.max_avail = 0;
    indexStat.free_space = 0;

    indexStat.fragments = 0;

    /*
     * Scan all blocks except the metapage
     */
    nblocks = RelationGetNumberOfBlocks(rel);

    for (blkno = 1; blkno < nblocks; blkno++)
    {
        Buffer      buffer;
        Page        page;
        BTPageOpaque opaque;

        /* Read and lock buffer */
        buffer = ReadBuffer(rel, blkno);
        LockBuffer(buffer, BUFFER_LOCK_SHARE);

        page = BufferGetPage(buffer);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);

        /* Determine page type, and update totals */

        if (P_ISLEAF(opaque))
        {
            int         max_avail;

            max_avail = BLCKSZ - (BLCKSZ - ((PageHeader) page)->pd_special +
                                  SizeOfPageHeaderData);
            indexStat.max_avail += max_avail;
            indexStat.free_space += PageGetFreeSpace(page);

            indexStat.leaf_pages++;

            /*
             * If the next leaf is on an earlier block, it means
             * fragmentation.
             */
            if (opaque->btpo_next != P_NONE && opaque->btpo_next < blkno)
                indexStat.fragments++;
        }
        else if (P_ISDELETED(opaque))
            indexStat.deleted_pages++;
        else if (P_IGNORE(opaque))
            indexStat.empty_pages++;
        else if (P_ISROOT(opaque))
            indexStat.root_pages++;
        else
            indexStat.internal_pages++;

        /* Unlock and release buffer */
        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
        ReleaseBuffer(buffer);
    }

    relation_close(rel, AccessShareLock);

    /*----------------------------
     * Build a result tuple
     *----------------------------
     */
    {
        TupleDesc   tupleDesc;
        int         j;
        char       *values[10];
        HeapTuple   tuple;

        /* Build a tuple descriptor for our result type */
        if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
            elog(ERROR, "return type must be a row type");

        j = 0;
        values[j] = palloc(32);
        snprintf(values[j++], 32, "%d", indexStat.version);
        values[j] = palloc(32);
        snprintf(values[j++], 32, "%d", indexStat.level);
        values[j] = palloc(32);
        snprintf(values[j++], 32, INT64_FORMAT,
                 (indexStat.root_pages +
                  indexStat.leaf_pages +
                  indexStat.internal_pages +
                  indexStat.deleted_pages +
                  indexStat.empty_pages) * BLCKSZ);
        values[j] = palloc(32);
        snprintf(values[j++], 32, "%u", indexStat.root_blkno);
        values[j] = palloc(32);
        snprintf(values[j++], 32, INT64_FORMAT, indexStat.internal_pages);
        values[j] = palloc(32);
        snprintf(values[j++], 32, INT64_FORMAT, indexStat.leaf_pages);
        values[j] = palloc(32);
        snprintf(values[j++], 32, INT64_FORMAT, indexStat.empty_pages);
        values[j] = palloc(32);
        snprintf(values[j++], 32, INT64_FORMAT, indexStat.deleted_pages);
        values[j] = palloc(32);
        if (indexStat.max_avail > 0)
            snprintf(values[j++], 32, "%.2f",
                     100.0 - (double) indexStat.free_space /
                     (double) indexStat.max_avail * 100.0);
        else
            snprintf(values[j++], 32, "NaN");
        values[j] = palloc(32);
        if (indexStat.leaf_pages > 0)
            snprintf(values[j++], 32, "%.2f",
                     (double) indexStat.fragments /
                     (double) indexStat.leaf_pages * 100.0);
        else
            snprintf(values[j++], 32, "NaN");

        tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
                                       values);

        result = HeapTupleGetDatum(tuple);
    }

    PG_RETURN_DATUM(result);
}
/*
 * pgstattuple_real
 *
 * The real work occurs here
 */
static Datum
pgstattuple_real(Relation rel)
{
    HeapScanDesc scan;
    HeapTuple   tuple;
    BlockNumber nblocks;
    BlockNumber block = 0;      /* next block to count free space in */
    BlockNumber tupblock;
    Buffer      buffer;
    uint64      table_len;
    uint64      tuple_len = 0;
    uint64      dead_tuple_len = 0;
    uint64      tuple_count = 0;
    uint64      dead_tuple_count = 0;
    double      tuple_percent;
    double      dead_tuple_percent;
    uint64      free_space = 0; /* free/reusable space in bytes */
    double      free_percent;   /* free/reusable space in % */
    TupleDesc   tupdesc;
    TupleTableSlot *slot;
    AttInMetadata *attinmeta;
    char      **values;
    int         i;
    Datum       result;

    /*
     * Build a tuple description for a pgstattuple_type tuple
     */
    tupdesc = RelationNameGetTupleDesc(DUMMY_TUPLE);

    /* allocate a slot for a tuple with this tupdesc */
    slot = TupleDescGetSlot(tupdesc);

    /*
     * Generate attribute metadata needed later to produce tuples from raw C
     * strings
     */
    attinmeta = TupleDescGetAttInMetadata(tupdesc);

    nblocks = RelationGetNumberOfBlocks(rel);
    scan = heap_beginscan(rel, SnapshotAny, 0, NULL);

    /* scan the relation */
    while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
    {
        uint16      sv_infomask;

        sv_infomask = tuple->t_data->t_infomask;
        if (HeapTupleSatisfiesNow(tuple->t_data))
        {
            tuple_len += tuple->t_len;
            tuple_count++;
        }
        else
        {
            dead_tuple_len += tuple->t_len;
            dead_tuple_count++;
        }
        if (sv_infomask != tuple->t_data->t_infomask)
            SetBufferCommitInfoNeedsSave(scan->rs_cbuf);

        /*
         * To avoid physically reading the table twice, try to do the
         * free-space scan in parallel with the heap scan.  However,
         * heap_getnext may find no tuples on a given page, so we cannot
         * simply examine the pages returned by the heap scan.
         */
        tupblock = BlockIdGetBlockNumber(&tuple->t_self.ip_blkid);

        while (block <= tupblock)
        {
            buffer = ReadBuffer(rel, block);
            free_space += PageGetFreeSpace((Page) BufferGetPage(buffer));
            ReleaseBuffer(buffer);
            block++;
        }
    }
    heap_endscan(scan);

    while (block < nblocks)
    {
        buffer = ReadBuffer(rel, block);
        free_space += PageGetFreeSpace((Page) BufferGetPage(buffer));
        ReleaseBuffer(buffer);
        block++;
    }

    heap_close(rel, AccessShareLock);

    table_len = (uint64) nblocks * BLCKSZ;

    if (nblocks == 0)
    {
        tuple_percent = 0.0;
        dead_tuple_percent = 0.0;
        free_percent = 0.0;
    }
    else
    {
        tuple_percent = (double) tuple_len * 100.0 / table_len;
        dead_tuple_percent = (double) dead_tuple_len * 100.0 / table_len;
        free_percent = (double) free_space * 100.0 / table_len;
    }

    /*
     * Prepare a values array for storage in our slot.  This should be an
     * array of C strings which will be processed later by the appropriate
     * "in" functions.
     */
    values = (char **) palloc(NCOLUMNS * sizeof(char *));
    for (i = 0; i < NCOLUMNS; i++)
        values[i] = (char *) palloc(NCHARS * sizeof(char));
    i = 0;
    snprintf(values[i++], NCHARS, INT64_FORMAT, table_len);
    snprintf(values[i++], NCHARS, INT64_FORMAT, tuple_count);
    snprintf(values[i++], NCHARS, INT64_FORMAT, tuple_len);
    snprintf(values[i++], NCHARS, "%.2f", tuple_percent);
    snprintf(values[i++], NCHARS, INT64_FORMAT, dead_tuple_count);
    snprintf(values[i++], NCHARS, INT64_FORMAT, dead_tuple_len);
    snprintf(values[i++], NCHARS, "%.2f", dead_tuple_percent);
    snprintf(values[i++], NCHARS, INT64_FORMAT, free_space);
    snprintf(values[i++], NCHARS, "%.2f", free_percent);

    /* build a tuple */
    tuple = BuildTupleFromCStrings(attinmeta, values);

    /* make the tuple into a datum */
    result = TupleGetDatum(slot, tuple);

    /* Clean up */
    for (i = 0; i < NCOLUMNS; i++)
        pfree(values[i]);
    pfree(values);

    return (result);
}
/*
 * _hash_doinsert() -- Handle insertion of a single index tuple.
 *
 *      This routine is called by the public interface routines, hashbuild
 *      and hashinsert.  By here, itup is completely filled in.
 */
void
_hash_doinsert(Relation rel, IndexTuple itup)
{
    Buffer      buf;
    Buffer      metabuf;
    HashMetaPage metap;
    BlockNumber blkno;
    Page        page;
    HashPageOpaque pageopaque;
    Size        itemsz;
    bool        do_expand;
    uint32      hashkey;
    Bucket      bucket;

    /*
     * Get the hash key for the item (it's stored in the index tuple itself).
     */
    hashkey = _hash_get_indextuple_hashkey(itup);

    /* compute item size too */
    itemsz = IndexTupleDSize(*itup);
    itemsz = MAXALIGN(itemsz);  /* be safe, PageAddItem will do this but we
                                 * need to be consistent */

    /*
     * Acquire shared split lock so we can compute the target bucket safely
     * (see README).
     */
    _hash_getlock(rel, 0, HASH_SHARE);

    /* Read the metapage */
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));

    /*
     * Check whether the item can fit on a hash page at all. (Eventually, we
     * ought to try to apply TOAST methods if not.)  Note that at this point,
     * itemsz doesn't include the ItemId.
     *
     * XXX this is useless code if we are only storing hash keys.
     */
    if (itemsz > HashMaxItemSize((Page) metap))
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("index row size %lu exceeds hash maximum %lu",
                        (unsigned long) itemsz,
                        (unsigned long) HashMaxItemSize((Page) metap)),
                 errhint("Values larger than a buffer page cannot be indexed.")));

    /*
     * Compute the target bucket number, and convert to block number.
     */
    bucket = _hash_hashkey2bucket(hashkey,
                                  metap->hashm_maxbucket,
                                  metap->hashm_highmask,
                                  metap->hashm_lowmask);

    blkno = BUCKET_TO_BLKNO(metap, bucket);

    /* release lock on metapage, but keep pin since we'll need it again */
    _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

    /*
     * Acquire share lock on target bucket; then we can release split lock.
     */
    _hash_getlock(rel, blkno, HASH_SHARE);

    _hash_droplock(rel, 0, HASH_SHARE);

    /* Fetch the primary bucket page for the bucket */
    buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE);
    page = BufferGetPage(buf);
    pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
    Assert(pageopaque->hasho_bucket == bucket);

    /* Do the insertion */
    while (PageGetFreeSpace(page) < itemsz)
    {
        /*
         * no space on this page; check for an overflow page
         */
        BlockNumber nextblkno = pageopaque->hasho_nextblkno;

        if (BlockNumberIsValid(nextblkno))
        {
            /*
             * ovfl page exists; go get it.  if it doesn't have room, we'll
             * find out next pass through the loop test above.
             */
            _hash_relbuf(rel, buf);
            buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
            page = BufferGetPage(buf);
        }
        else
        {
            /*
             * we're at the end of the bucket chain and we haven't found a
             * page with enough room.  allocate a new overflow page.
             */

            /* release our write lock without modifying buffer */
            _hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);

            /* chain to a new overflow page */
            buf = _hash_addovflpage(rel, metabuf, buf);
            page = BufferGetPage(buf);

            /* should fit now, given test above */
            Assert(PageGetFreeSpace(page) >= itemsz);
        }
        pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
        Assert(pageopaque->hasho_flag == LH_OVERFLOW_PAGE);
        Assert(pageopaque->hasho_bucket == bucket);
    }

    /* found page with enough space, so add the item here */
    (void) _hash_pgaddtup(rel, buf, itemsz, itup);

    /* write and release the modified page */
    _hash_wrtbuf(rel, buf);

    /* We can drop the bucket lock now */
    _hash_droplock(rel, blkno, HASH_SHARE);

    /*
     * Write-lock the metapage so we can increment the tuple count. After
     * incrementing it, check to see if it's time for a split.
     */
    _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

    metap->hashm_ntuples += 1;

    /* Make sure this stays in sync with _hash_expandtable() */
    do_expand = metap->hashm_ntuples >
        (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1);

    /* Write out the metapage and drop lock, but keep pin */
    _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);

    /* Attempt to split if a split is needed */
    if (do_expand)
        _hash_expandtable(rel, metabuf);

    /* Finally drop our pin on the metapage */
    _hash_dropbuf(rel, metabuf);
}
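/*
 * For reference (a sketch, not part of this file): the bucket computation
 * that _hash_doinsert() delegates to _hash_hashkey2bucket() reduces to two
 * mask operations.  Worked example: with maxbucket = 10, highmask = 15,
 * lowmask = 7, a key whose low bits are 13 first maps to bucket 13; that
 * exceeds maxbucket (the bucket hasn't been split into existence yet), so
 * it folds down to 13 & 7 = 5.  hashkey_to_bucket() is a hypothetical name
 * for this illustration.
 */
static Bucket
hashkey_to_bucket(uint32 hashkey, uint32 maxbucket,
                  uint32 highmask, uint32 lowmask)
{
    Bucket      bucket;

    bucket = hashkey & highmask;    /* try the wider mask first */
    if (bucket > maxbucket)
        bucket = bucket & lowmask;  /* not yet split; fold to lower half */

    return bucket;
}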
Datum
spgstat(PG_FUNCTION_ARGS)
{
    text       *name = PG_GETARG_TEXT_P(0);
    char       *relname = text_to_cstring(name);
    RangeVar   *relvar;
    Relation    index;
    List       *relname_list;
    Oid         relOid;
    BlockNumber blkno;
    BlockNumber totalPages = 0,
                innerPages = 0,
                emptyPages = 0;
    double      usedSpace = 0.0;
    char        res[1024];
    int         bufferSize = -1;
    int64       innerTuples = 0,
                leafTuples = 0;

    relname_list = stringToQualifiedNameList(relname);
    relvar = makeRangeVarFromNameList(relname_list);
    relOid = RangeVarGetRelid(relvar, false);

    index = index_open(relOid, AccessExclusiveLock);

    if (index->rd_am == NULL)
        elog(ERROR, "relation \"%s.%s\" is not an index",
             get_namespace_name(RelationGetNamespace(index)),
             RelationGetRelationName(index));

    totalPages = RelationGetNumberOfBlocks(index);

    for (blkno = SPGIST_HEAD_BLKNO; blkno < totalPages; blkno++)
    {
        Buffer      buffer;
        Page        page;

        buffer = ReadBuffer(index, blkno);
        LockBuffer(buffer, BUFFER_LOCK_SHARE);

        page = BufferGetPage(buffer);

        if (SpGistPageIsLeaf(page))
        {
            leafTuples += SpGistPageGetMaxOffset(page);
        }
        else
        {
            innerPages++;
            innerTuples += SpGistPageGetMaxOffset(page);
        }

        if (bufferSize < 0)
            bufferSize = BufferGetPageSize(buffer)
                - MAXALIGN(sizeof(SpGistPageOpaqueData))
                - SizeOfPageHeaderData;

        usedSpace += bufferSize - (PageGetFreeSpace(page) + sizeof(ItemIdData));

        if (PageGetFreeSpace(page) + sizeof(ItemIdData) == bufferSize)
            emptyPages++;

        UnlockReleaseBuffer(buffer);
    }

    index_close(index, AccessExclusiveLock);

    totalPages--;               /* discount the metapage */

    /* guard the fill-ratio division below against an empty index */
    if (totalPages == 0)
        elog(ERROR, "index \"%s\" contains no data pages", relname);

    snprintf(res, sizeof(res),
             "totalPages: %u\n"
             "innerPages: %u\n"
             "leafPages: %u\n"
             "emptyPages: %u\n"
             "usedSpace: %.2f kbytes\n"
             "freeSpace: %.2f kbytes\n"
             "fillRatio: %.2f%%\n"
             "leafTuples: " INT64_FORMAT "\n"
             "innerTuples: " INT64_FORMAT,
             totalPages, innerPages, totalPages - innerPages, emptyPages,
             usedSpace / 1024.0,
             ((double) bufferSize * (double) totalPages - usedSpace) / 1024.0,
             100.0 * (usedSpace / ((double) bufferSize * (double) totalPages)),
             leafTuples, innerTuples);

    PG_RETURN_TEXT_P(CStringGetTextDatum(res));
}
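/*
 * Illustration (hypothetical helper, not part of the original): the
 * "empty page" test above reads more clearly when factored out.  A page
 * counts as empty when its reported free space, plus the one ItemId that
 * PageGetFreeSpace() deducts for a prospective insertion, adds back up to
 * the full usable area that spgstat() computed for bufferSize.
 */
static bool
spgist_page_is_empty(Page page, int usableSize)
{
    return PageGetFreeSpace(page) + sizeof(ItemIdData) == (Size) usableSize;
}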
/*
 * lazy_scan_heap() -- scan an open heap relation
 *
 *      This routine sets commit status bits, builds lists of dead tuples
 *      and pages with free space, and calculates statistics on the number
 *      of live tuples in the heap.  When done, or when we run low on space
 *      for dead-tuple TIDs, invoke vacuuming of indexes and heap.
 *
 *      If there are no indexes then we just vacuum each dirty page as we
 *      process it, since there's no point in gathering many tuples.
 */
static void
lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
               Relation *Irel, int nindexes, List *updated_stats,
               List *all_extra_oids)
{
    MIRROREDLOCK_BUFMGR_DECLARE;

    BlockNumber nblocks,
                blkno;
    HeapTupleData tuple;
    char       *relname;
    BlockNumber empty_pages,
                vacuumed_pages;
    double      num_tuples,
                tups_vacuumed,
                nkeep,
                nunused;
    IndexBulkDeleteResult **indstats;
    int         i;
    int         reindex_count = 1;
    PGRUsage    ru0;

    /* Fetch gp_persistent_relation_node information that will be added to XLOG record. */
    RelationFetchGpRelationNodeForXLog(onerel);

    pg_rusage_init(&ru0);

    relname = RelationGetRelationName(onerel);
    ereport(elevel,
            (errmsg("vacuuming \"%s.%s\"",
                    get_namespace_name(RelationGetNamespace(onerel)),
                    relname)));

    empty_pages = vacuumed_pages = 0;
    num_tuples = tups_vacuumed = nkeep = nunused = 0;

    indstats = (IndexBulkDeleteResult **)
        palloc0(nindexes * sizeof(IndexBulkDeleteResult *));

    nblocks = RelationGetNumberOfBlocks(onerel);
    vacrelstats->rel_pages = nblocks;
    vacrelstats->nonempty_pages = 0;

    lazy_space_alloc(vacrelstats, nblocks);

    for (blkno = 0; blkno < nblocks; blkno++)
    {
        Buffer      buf;
        Page        page;
        OffsetNumber offnum,
                    maxoff;
        bool        tupgone,
                    hastup;
        int         prev_dead_count;
        OffsetNumber frozen[MaxOffsetNumber];
        int         nfrozen;

        vacuum_delay_point();

        /*
         * If we are close to overrunning the available space for dead-tuple
         * TIDs, pause and do a cycle of vacuuming before we tackle this page.
         */
        if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage &&
            vacrelstats->num_dead_tuples > 0)
        {
            /* Remove index entries */
            for (i = 0; i < nindexes; i++)
            {
                List       *extra_oids =
                    get_oids_for_bitmap(all_extra_oids, Irel[i],
                                        onerel, reindex_count);

                lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats,
                                  extra_oids);
                list_free(extra_oids);
            }
            reindex_count++;

            /* Remove tuples from heap */
            lazy_vacuum_heap(onerel, vacrelstats);

            /* Forget the now-vacuumed tuples, and press on */
            vacrelstats->num_dead_tuples = 0;
        }

        /* -------- MirroredLock ---------- */
        MIRROREDLOCK_BUFMGR_LOCK;

        buf = ReadBuffer(onerel, blkno);

        /* Initially, we only need shared access to the buffer */
        LockBuffer(buf, BUFFER_LOCK_SHARE);

        page = BufferGetPage(buf);

        if (PageIsNew(page))
        {
            /*
             * An all-zeroes page could be left over if a backend extends the
             * relation but crashes before initializing the page. Reclaim such
             * pages for use.
             *
             * We have to be careful here because we could be looking at a
             * page that someone has just added to the relation and not yet
             * been able to initialize (see RelationGetBufferForTuple).  To
             * protect against that, release the buffer lock, grab the
             * relation extension lock momentarily, and re-lock the buffer. If
             * the page is still uninitialized by then, it must be left over
             * from a crashed backend, and we can initialize it.
             *
             * We don't really need the relation lock when this is a new or
             * temp relation, but it's probably not worth the code space to
             * check that, since this surely isn't a critical path.
             *
             * Note: the comparable code in vacuum.c need not worry because
             * it's got exclusive lock on the whole relation.
             */
            LockBuffer(buf, BUFFER_LOCK_UNLOCK);

            MIRROREDLOCK_BUFMGR_UNLOCK;
            /* -------- MirroredLock ---------- */

            LockRelationForExtension(onerel, ExclusiveLock);
            UnlockRelationForExtension(onerel, ExclusiveLock);

            /* -------- MirroredLock ---------- */
            MIRROREDLOCK_BUFMGR_LOCK;

            LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);     /* LockBufferForCleanup(buf)? */
            if (PageIsNew(page))
            {
                ereport(WARNING,
                        (errmsg("relation \"%s\" page %u is uninitialized --- fixing",
                                relname, blkno)));
                PageInit(page, BufferGetPageSize(buf), 0);

                /* must record in xlog so that changetracking will know about this change */
                log_heap_newpage(onerel, page, blkno);

                empty_pages++;
                lazy_record_free_space(vacrelstats, blkno,
                                       PageGetFreeSpace(page));
            }
            MarkBufferDirty(buf);
            UnlockReleaseBuffer(buf);

            MIRROREDLOCK_BUFMGR_UNLOCK;
            /* -------- MirroredLock ---------- */

            continue;
        }

        if (PageIsEmpty(page))
        {
            empty_pages++;
            lazy_record_free_space(vacrelstats, blkno,
                                   PageGetFreeSpace(page));
            UnlockReleaseBuffer(buf);

            MIRROREDLOCK_BUFMGR_UNLOCK;
            /* -------- MirroredLock ---------- */

            continue;
        }

        nfrozen = 0;
        hastup = false;
        prev_dead_count = vacrelstats->num_dead_tuples;
        maxoff = PageGetMaxOffsetNumber(page);
        for (offnum = FirstOffsetNumber;
             offnum <= maxoff;
             offnum = OffsetNumberNext(offnum))
        {
            ItemId      itemid;

            itemid = PageGetItemId(page, offnum);

            if (!ItemIdIsUsed(itemid))
            {
                nunused += 1;
                continue;
            }

            tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
            tuple.t_len = ItemIdGetLength(itemid);
            ItemPointerSet(&(tuple.t_self), blkno, offnum);

            tupgone = false;

            switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf, false))
            {
                case HEAPTUPLE_DEAD:
                    tupgone = true;     /* we can delete the tuple */
                    break;
                case HEAPTUPLE_LIVE:
                    /* Tuple is good --- but let's do some validity checks */
                    if (onerel->rd_rel->relhasoids &&
                        !OidIsValid(HeapTupleGetOid(&tuple)))
                        elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid",
                             relname, blkno, offnum);
                    break;
                case HEAPTUPLE_RECENTLY_DEAD:

                    /*
                     * If tuple is recently deleted then we must not remove it
                     * from relation.
                     */
                    nkeep += 1;
                    break;
                case HEAPTUPLE_INSERT_IN_PROGRESS:
                    /* This is an expected case during concurrent vacuum */
                    break;
                case HEAPTUPLE_DELETE_IN_PROGRESS:
                    /* This is an expected case during concurrent vacuum */
                    break;
                default:
                    elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
                    break;
            }

            if (tupgone)
            {
                lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
                tups_vacuumed += 1;
            }
            else
            {
                num_tuples += 1;
                hastup = true;

                /*
                 * Each non-removable tuple must be checked to see if it needs
                 * freezing.  If we already froze anything, then we've already
                 * switched the buffer lock to exclusive.
                 */
                if (heap_freeze_tuple(tuple.t_data, FreezeLimit,
                                      (nfrozen > 0) ? InvalidBuffer : buf))
                    frozen[nfrozen++] = offnum;
            }
        }                       /* scan along page */

        /*
         * If we froze any tuples, mark the buffer dirty, and write a WAL
         * record recording the changes.  We must log the changes to be
         * crash-safe against future truncation of CLOG.
         */
        if (nfrozen > 0)
        {
            MarkBufferDirty(buf);
            /* no XLOG for temp tables, though */
            if (!onerel->rd_istemp)
            {
                XLogRecPtr  recptr;

                recptr = log_heap_freeze(onerel, buf, FreezeLimit,
                                         frozen, nfrozen);
                PageSetLSN(page, recptr);
                PageSetTLI(page, ThisTimeLineID);
            }
        }

        /*
         * If there are no indexes then we can vacuum the page right now
         * instead of doing a second scan.
         */
        if (nindexes == 0 &&
            vacrelstats->num_dead_tuples > 0)
        {
            /* Trade in buffer share lock for super-exclusive lock */
            LockBuffer(buf, BUFFER_LOCK_UNLOCK);
            LockBufferForCleanup(buf);

            /* Remove tuples from heap */
            lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats);

            /* Forget the now-vacuumed tuples, and press on */
            vacrelstats->num_dead_tuples = 0;
            vacuumed_pages++;
        }

        /*
         * If we remembered any tuples for deletion, then the page will be
         * visited again by lazy_vacuum_heap, which will compute and record
         * its post-compaction free space.  If not, then we're done with this
         * page, so remember its free space as-is.  (This path will always be
         * taken if there are no indexes.)
         */
        if (vacrelstats->num_dead_tuples == prev_dead_count)
        {
            lazy_record_free_space(vacrelstats, blkno,
                                   PageGetFreeSpace(page));
        }

        /* Remember the location of the last page with nonremovable tuples */
        if (hastup)
            vacrelstats->nonempty_pages = blkno + 1;

        UnlockReleaseBuffer(buf);

        MIRROREDLOCK_BUFMGR_UNLOCK;
        /* -------- MirroredLock ---------- */
    }

    /* save stats for use later */
    vacrelstats->rel_tuples = num_tuples;
    vacrelstats->tuples_deleted = tups_vacuumed;

    /* If any tuples need to be deleted, perform final vacuum cycle */
    /* XXX put a threshold on min number of tuples here? */
    if (vacrelstats->num_dead_tuples > 0)
    {
        /* Remove index entries */
        for (i = 0; i < nindexes; i++)
        {
            List       *extra_oids =
                get_oids_for_bitmap(all_extra_oids, Irel[i],
                                    onerel, reindex_count);

            lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats, extra_oids);
            list_free(extra_oids);
        }
        reindex_count++;

        /* Remove tuples from heap */
        lazy_vacuum_heap(onerel, vacrelstats);
    }

    /* Do post-vacuum cleanup and statistics update for each index */
    for (i = 0; i < nindexes; i++)
        lazy_cleanup_index(Irel[i], indstats[i], vacrelstats, updated_stats);

    /* If no indexes, make log report that lazy_vacuum_heap would've made */
    if (vacuumed_pages)
        ereport(elevel,
                (errmsg("\"%s\": removed %.0f row versions in %u pages",
                        RelationGetRelationName(onerel),
                        tups_vacuumed, vacuumed_pages)));

    ereport(elevel,
            (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
                    RelationGetRelationName(onerel),
                    tups_vacuumed, num_tuples, nblocks),
             errdetail("%.0f dead row versions cannot be removed yet.\n"
                       "There were %.0f unused item pointers.\n"
                       "%u pages contain useful free space.\n"
                       "%u pages are entirely empty.\n"
                       "%s.",
                       nkeep, nunused,
                       vacrelstats->tot_free_pages,
                       empty_pages,
                       pg_rusage_show(&ru0))));

    if (vacrelstats->tot_free_pages > MaxFSMPages)
        ereport(WARNING,
                (errmsg("relation \"%s.%s\" contains more than \"max_fsm_pages\" pages with useful free space",
                        get_namespace_name(RelationGetNamespace(onerel)),
                        relname),
                 errhint("Consider using VACUUM FULL on this relation or increasing the configuration parameter \"max_fsm_pages\".")));
}
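/*
 * For context (a sketch under the LVRelStats layout this file assumes, not
 * an authoritative copy): the lazy_record_dead_tuple() calls above simply
 * append a TID to a preallocated array.  The "close to overrunning" check
 * at the top of the block loop is what guarantees this append can never
 * run past max_dead_tuples.
 */
static void
lazy_record_dead_tuple_sketch(LVRelStats *vacrelstats, ItemPointer itemptr)
{
    /* caller has ensured there is room for at least one more TID */
    if (vacrelstats->num_dead_tuples < vacrelstats->max_dead_tuples)
    {
        vacrelstats->dead_tuples[vacrelstats->num_dead_tuples] = *itemptr;
        vacrelstats->num_dead_tuples++;
    }
}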
/*----------
 * Add an item to a disk page from the sort output.
 *
 * We must be careful to observe the page layout conventions of nbtsearch.c:
 * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
 * - on non-leaf pages, the key portion of the first item need not be
 *   stored, we should store only the link.
 *
 * A leaf page being built looks like:
 *
 * +----------------+---------------------------------+
 * | PageHeaderData | linp0 linp1 linp2 ...           |
 * +-----------+----+---------------------------------+
 * | ... linpN |                                      |
 * +-----------+--------------------------------------+
 * |     ^ last                                       |
 * |                                                  |
 * +-------------+------------------------------------+
 * |             | itemN ...                          |
 * +-------------+------------------+-----------------+
 * |          ... item3 item2 item1 | "special space" |
 * +--------------------------------+-----------------+
 *
 * Contrast this with the diagram in bufpage.h; note the mismatch
 * between linps and items.  This is because we reserve linp0 as a
 * placeholder for the pointer to the "high key" item; when we have
 * filled up the page, we will set linp0 to point to itemN and clear
 * linpN.  On the other hand, if we find this is the last (rightmost)
 * page, we leave the items alone and slide the linp array over.
 *
 * 'last' pointer indicates the last offset added to the page.
 *----------
 */
static void
_bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
{
    Page        npage;
    BlockNumber nblkno;
    OffsetNumber last_off;
    Size        pgspc;
    Size        itupsz;

    /*
     * This is a handy place to check for cancel interrupts during the btree
     * load phase of index creation.
     */
    CHECK_FOR_INTERRUPTS();

    npage = state->btps_page;
    nblkno = state->btps_blkno;
    last_off = state->btps_lastoff;

    pgspc = PageGetFreeSpace(npage);
    itupsz = IndexTupleDSize(*itup);
    itupsz = MAXALIGN(itupsz);

    /*
     * Check whether the item can fit on a btree page at all. (Eventually, we
     * ought to try to apply TOAST methods if not.)  We actually need to be
     * able to fit three items on every page, so restrict any one item to 1/3
     * the per-page available space.  Note that at this point, itupsz doesn't
     * include the ItemId.
     *
     * NOTE: similar code appears in _bt_insertonpg() to defend against
     * oversize items being inserted into an already-existing index.  But
     * during creation of an index, we don't go through there.
     */
    if (itupsz > BTMaxItemSize(npage))
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
                        itupsz, BTMaxItemSize(npage),
                        RelationGetRelationName(wstate->index)),
                 errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
                         "Consider a function index of an MD5 hash of the value, "
                         "or use full text indexing."),
                 errtableconstraint(wstate->heap,
                                    RelationGetRelationName(wstate->index))));

    /*
     * Check to see if page is "full".  It's definitely full if the item won't
     * fit.  Otherwise, compare to the target freespace derived from the
     * fillfactor.  However, we must put at least two items on each page, so
     * disregard fillfactor if we don't have that many.
     */
    if (pgspc < itupsz || (pgspc < state->btps_full && last_off > P_FIRSTKEY))
    {
        /*
         * Finish off the page and write it out.
         */
        Page        opage = npage;
        BlockNumber oblkno = nblkno;
        ItemId      ii;
        ItemId      hii;
        IndexTuple  oitup;

        /* Create new page of same level */
        npage = _bt_blnewpage(state->btps_level);

        /* and assign it a page position */
        nblkno = wstate->btws_pages_alloced++;

        /*
         * We copy the last item on the page into the new page, and then
         * rearrange the old page so that the 'last item' becomes its high key
         * rather than a true data item.  There had better be at least two
         * items on the page already, else the page would be empty of useful
         * data.
         */
        Assert(last_off > P_FIRSTKEY);
        ii = PageGetItemId(opage, last_off);
        oitup = (IndexTuple) PageGetItem(opage, ii);
        _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY);

        /*
         * Move 'last' into the high key position on opage
         */
        hii = PageGetItemId(opage, P_HIKEY);
        *hii = *ii;
        ItemIdSetUnused(ii);    /* redundant */
        ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);

        /*
         * Link the old page into its parent, using its minimum key.  If we
         * don't have a parent, we have to create one; this adds a new btree
         * level.
         */
        if (state->btps_next == NULL)
            state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);

        Assert(state->btps_minkey != NULL);
        ItemPointerSet(&(state->btps_minkey->t_tid), oblkno, P_HIKEY);
        _bt_buildadd(wstate, state->btps_next, state->btps_minkey);
        pfree(state->btps_minkey);

        /*
         * Save a copy of the minimum key for the new page.  We have to copy
         * it off the old page, not the new one, in case we are not at leaf
         * level.
         */
        state->btps_minkey = CopyIndexTuple(oitup);

        /*
         * Set the sibling links for both pages.
         */
        {
            BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
            BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage);

            oopaque->btpo_next = nblkno;
            nopaque->btpo_prev = oblkno;
            nopaque->btpo_next = P_NONE;        /* redundant */
        }

        /*
         * Write out the old page.  We never need to touch it again, so we can
         * free the opage workspace too.
         */
        _bt_blwritepage(wstate, opage, oblkno);

        /*
         * Reset last_off to point to new page
         */
        last_off = P_FIRSTKEY;
    }

    /*
     * If the new item is the first for its page, stash a copy for later.
     * Note this will only happen for the first item on a level; on later
     * pages, the first item for a page is copied from the prior page in the
     * code above.
     */
    if (last_off == P_HIKEY)
    {
        Assert(state->btps_minkey == NULL);
        state->btps_minkey = CopyIndexTuple(itup);
    }

    /*
     * Add the new item into the current page.
     */
    last_off = OffsetNumberNext(last_off);
    _bt_sortaddtup(npage, itupsz, itup, last_off);

    state->btps_page = npage;
    state->btps_blkno = nblkno;
    state->btps_lastoff = last_off;
}
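/*
 * For context (a sketch, assuming the BTPageState fields used above): the
 * btps_full threshold that drives the "page full" test is derived from the
 * fillfactor when a level's page state is created.  Leaf pages honor the
 * index's fillfactor; upper levels use a fixed default so they stay denser.
 * btps_full_for_level() is an illustrative name, not a function in this file.
 */
static Size
btps_full_for_level(Relation index, uint32 level)
{
    if (level > 0)
        return BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100;

    /* leaf level: leave (100 - fillfactor)% of each page free */
    return RelationGetTargetPageFreeSpace(index, BTREE_DEFAULT_FILLFACTOR);
}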
/*
 * Check whether an index tuple fits on the page.  Note that
 * PageGetFreeSpace() already deducts space for one new line pointer, so
 * the tuple size alone is the right thing to compare against here.
 */
static int
nospace(Page p, IndexTuple it)
{
    return PageGetFreeSpace(p) < IndexTupleSize(it);
}
/*
 *  _hash_squeezebucket(rel, bucket)
 *
 *  Try to squeeze the tuples onto pages occurring earlier in the
 *  bucket chain in an attempt to free overflow pages.  When we start
 *  the "squeezing", the page from which we start taking tuples (the
 *  "read" page) is the last page in the bucket chain and the page
 *  onto which we start squeezing tuples (the "write" page) is the
 *  first page in the bucket chain.  The read page works backward and
 *  the write page works forward; the procedure terminates when the
 *  read page and write page are the same page.
 *
 *  At completion of this procedure, it is guaranteed that all pages in
 *  the bucket are nonempty, unless the bucket is totally empty (in
 *  which case all overflow pages will be freed).  The original
 *  implementation required that to be true on entry as well, but it's a
 *  lot easier for callers to leave empty overflow pages and let this
 *  routine clean them up.
 *
 *  Caller must hold exclusive lock on the target bucket.  This allows
 *  us to safely lock multiple pages in the bucket.
 *
 *  Since this function is invoked in VACUUM, we provide an access strategy
 *  parameter that controls fetches of the bucket pages.
 */
void
_hash_squeezebucket(Relation rel,
                    Bucket bucket,
                    BlockNumber bucket_blkno,
                    BufferAccessStrategy bstrategy)
{
    BlockNumber wblkno;
    BlockNumber rblkno;
    Buffer      wbuf;
    Buffer      rbuf;
    Page        wpage;
    Page        rpage;
    HashPageOpaque wopaque;
    HashPageOpaque ropaque;
    bool        wbuf_dirty;

    /*
     * start squeezing into the base bucket page.
     */
    wblkno = bucket_blkno;
    wbuf = _hash_getbuf_with_strategy(rel, wblkno, HASH_WRITE,
                                      LH_BUCKET_PAGE, bstrategy);
    wpage = BufferGetPage(wbuf);
    wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);

    /*
     * if there aren't any overflow pages, there's nothing to squeeze.
     */
    if (!BlockNumberIsValid(wopaque->hasho_nextblkno))
    {
        _hash_relbuf(rel, wbuf);
        return;
    }

    /*
     * Find the last page in the bucket chain by starting at the base bucket
     * page and working forward.  Note: we assume that a hash bucket chain is
     * usually smaller than the buffer ring being used by VACUUM, else using
     * the access strategy here would be counterproductive.
     */
    rbuf = InvalidBuffer;
    ropaque = wopaque;
    do
    {
        rblkno = ropaque->hasho_nextblkno;
        if (rbuf != InvalidBuffer)
            _hash_relbuf(rel, rbuf);
        rbuf = _hash_getbuf_with_strategy(rel,
                                          rblkno,
                                          HASH_WRITE,
                                          LH_OVERFLOW_PAGE,
                                          bstrategy);
        rpage = BufferGetPage(rbuf);
        ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
        Assert(ropaque->hasho_bucket == bucket);
    } while (BlockNumberIsValid(ropaque->hasho_nextblkno));

    /*
     * squeeze the tuples.
     */
    wbuf_dirty = false;
    for (;;)
    {
        OffsetNumber roffnum;
        OffsetNumber maxroffnum;
        OffsetNumber deletable[MaxOffsetNumber];
        int         ndeletable = 0;

        /* Scan each tuple in "read" page */
        maxroffnum = PageGetMaxOffsetNumber(rpage);
        for (roffnum = FirstOffsetNumber;
             roffnum <= maxroffnum;
             roffnum = OffsetNumberNext(roffnum))
        {
            IndexTuple  itup;
            Size        itemsz;

            itup = (IndexTuple) PageGetItem(rpage,
                                            PageGetItemId(rpage, roffnum));
            itemsz = IndexTupleDSize(*itup);
            itemsz = MAXALIGN(itemsz);

            /*
             * Walk up the bucket chain, looking for a page big enough for
             * this item.  Exit if we reach the read page.
             */
            while (PageGetFreeSpace(wpage) < itemsz)
            {
                Assert(!PageIsEmpty(wpage));

                wblkno = wopaque->hasho_nextblkno;
                Assert(BlockNumberIsValid(wblkno));

                if (wbuf_dirty)
                    _hash_wrtbuf(rel, wbuf);
                else
                    _hash_relbuf(rel, wbuf);

                /* nothing more to do if we reached the read page */
                if (rblkno == wblkno)
                {
                    if (ndeletable > 0)
                    {
                        /* Delete tuples we already moved off read page */
                        PageIndexMultiDelete(rpage, deletable, ndeletable);
                        _hash_wrtbuf(rel, rbuf);
                    }
                    else
                        _hash_relbuf(rel, rbuf);
                    return;
                }

                wbuf = _hash_getbuf_with_strategy(rel,
                                                  wblkno,
                                                  HASH_WRITE,
                                                  LH_OVERFLOW_PAGE,
                                                  bstrategy);
                wpage = BufferGetPage(wbuf);
                wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
                Assert(wopaque->hasho_bucket == bucket);
                wbuf_dirty = false;
            }

            /*
             * we have found room so insert on the "write" page, being careful
             * to preserve hashkey ordering.  (If we insert many tuples into
             * the same "write" page it would be worth qsort'ing instead of
             * doing repeated _hash_pgaddtup.)
             */
            (void) _hash_pgaddtup(rel, wbuf, itemsz, itup);
            wbuf_dirty = true;

            /* remember tuple for deletion from "read" page */
            deletable[ndeletable++] = roffnum;
        }

        /*
         * If we reach here, there are no live tuples on the "read" page ---
         * it was empty when we got to it, or we moved them all.  So we can
         * just free the page without bothering with deleting tuples
         * individually.  Then advance to the previous "read" page.
         *
         * Tricky point here: if our read and write pages are adjacent in the
         * bucket chain, our write lock on wbuf will conflict with
         * _hash_freeovflpage's attempt to update the sibling links of the
         * removed page.  However, in that case we are done anyway, so we can
         * simply drop the write lock before calling _hash_freeovflpage.
         */
        rblkno = ropaque->hasho_prevblkno;
        Assert(BlockNumberIsValid(rblkno));

        /* are we freeing the page adjacent to wbuf? */
        if (rblkno == wblkno)
        {
            /* yes, so release wbuf lock first */
            if (wbuf_dirty)
                _hash_wrtbuf(rel, wbuf);
            else
                _hash_relbuf(rel, wbuf);
            /* free this overflow page (releases rbuf) */
            _hash_freeovflpage(rel, rbuf, bstrategy);
            /* done */
            return;
        }

        /* free this overflow page, then get the previous one */
        _hash_freeovflpage(rel, rbuf, bstrategy);

        rbuf = _hash_getbuf_with_strategy(rel,
                                          rblkno,
                                          HASH_WRITE,
                                          LH_OVERFLOW_PAGE,
                                          bstrategy);
        rpage = BufferGetPage(rbuf);
        ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
        Assert(ropaque->hasho_bucket == bucket);
    }

    /* NOTREACHED */
}
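/*
 * For context (a sketch, not an authoritative copy): the _hash_pgaddtup()
 * used above preserves the per-page hashkey ordering by binary-searching
 * for the insert position before calling PageAddItem().  This assumes a
 * _hash_binsearch()-style helper that returns the first offset whose hash
 * key is >= the new key, and the 8.3-era six-argument PageAddItem()
 * signature; hash_pgaddtup_sketch() is an illustrative name.
 */
static OffsetNumber
hash_pgaddtup_sketch(Relation rel, Buffer buf, Size itemsize, IndexTuple itup)
{
    Page        page = BufferGetPage(buf);
    uint32      hashkey = _hash_get_indextuple_hashkey(itup);
    OffsetNumber itup_off = _hash_binsearch(page, hashkey);

    if (PageAddItem(page, (Item) itup, itemsize, itup_off, false, false)
        == InvalidOffsetNumber)
        elog(ERROR, "failed to add index item to \"%s\"",
             RelationGetRelationName(rel));

    return itup_off;
}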
static void
bitmap_xlog_insert_lovitem(bool redo, XLogRecPtr lsn, XLogRecord *record)
{
    xl_bm_lovitem *xlrec = (xl_bm_lovitem *) XLogRecGetData(record);
    Relation    reln;

    reln = XLogOpenRelation(xlrec->bm_node);
    if (!RelationIsValid(reln))
        return;

    if (redo)
    {
        Buffer      lovBuffer;
        Page        lovPage;

#ifdef BM_DEBUG
        ereport(LOG,
                (errmsg("call bitmap_xlog_insert_lovitem: redo=%d, blkno=%d",
                        redo, xlrec->bm_lov_blkno)));
#endif

        lovBuffer = XLogReadBuffer(false, reln, xlrec->bm_lov_blkno);
        if (!BufferIsValid(lovBuffer))
            elog(PANIC, "bm_insert_redo: block %d not found",
                 xlrec->bm_lov_blkno);

        lovPage = BufferGetPage(lovBuffer);

        if (XLByteLT(PageGetLSN(lovPage), lsn))
        {
            if (xlrec->bm_isNewItem)
            {
                OffsetNumber newOffset;
                Size        itemSize;

                newOffset = OffsetNumberNext(PageGetMaxOffsetNumber(lovPage));
                if (newOffset != xlrec->bm_lov_offset)
                    elog(PANIC,
                         "bm_insert_redo: LOV item insert position %d does not match requested position %d",
                         newOffset, xlrec->bm_lov_offset);

                itemSize = sizeof(BMLOVItemData);
                if (itemSize > PageGetFreeSpace(lovPage))
                    elog(PANIC,
                         "bm_insert_redo: not enough space in LOV page %d",
                         xlrec->bm_lov_blkno);

                if (PageAddItem(lovPage, (Item) &(xlrec->bm_lovItem), itemSize,
                                newOffset, LP_USED) == InvalidOffsetNumber)
                    ereport(ERROR,
                            (errcode(ERRCODE_INTERNAL_ERROR),
                             errmsg("failed to add LOV item to \"%s\"",
                                    RelationGetRelationName(reln))));
            }
            else
            {
                BMLOVItem   oldLovItem;

                oldLovItem = (BMLOVItem)
                    PageGetItem(lovPage,
                                PageGetItemId(lovPage, xlrec->bm_lov_offset));
                memcpy(oldLovItem, &(xlrec->bm_lovItem),
                       sizeof(BMLOVItemData));
            }

            PageSetLSN(lovPage, lsn);
            PageSetTLI(lovPage, ThisTimeLineID);
            _bitmap_wrtbuf(lovBuffer);
        }
        else
            _bitmap_relbuf(lovBuffer);
    }
    else
        elog(PANIC, "bm_insert_undo: not implemented.");
}
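/*
 * The redo path above follows the standard idempotent-replay idiom: a
 * change is applied only when the page LSN shows the page has not yet seen
 * this WAL record, and the LSN/TLI are bumped afterward so that a second
 * replay becomes a no-op.  Schematic sketch (the apply_change callback is a
 * placeholder for the record-specific page edit; function name is
 * illustrative):
 */
static void
redo_page_change_sketch(Buffer buffer, XLogRecPtr lsn,
                        void (*apply_change) (Page))
{
    Page        page = BufferGetPage(buffer);

    if (XLByteLT(PageGetLSN(page), lsn))
    {
        apply_change(page);         /* record-specific work goes here */
        PageSetLSN(page, lsn);
        PageSetTLI(page, ThisTimeLineID);
        _bitmap_wrtbuf(buffer);     /* write and release, as above */
    }
    else
        _bitmap_relbuf(buffer);     /* already applied; just release */
}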