/*
 * pgstat_index -- returns live/dead tuples info in a generic index
 */
static Datum
pgstat_index(Relation rel, BlockNumber start, pgstat_page pagefn,
             FunctionCallInfo fcinfo)
{
    BlockNumber nblocks;
    BlockNumber blkno;
    BufferAccessStrategy bstrategy;
    pgstattuple_type stat = {0};

    /* prepare access strategy for this index */
    bstrategy = GetAccessStrategy(BAS_BULKREAD);

    blkno = start;
    for (;;)
    {
        /* Get the current relation length */
        LockRelationForExtension(rel, ExclusiveLock);
        nblocks = RelationGetNumberOfBlocks(rel);
        UnlockRelationForExtension(rel, ExclusiveLock);

        /* Quit if we've scanned the whole relation */
        if (blkno >= nblocks)
        {
            stat.table_len = (uint64) nblocks * BLCKSZ;
            break;
        }

        for (; blkno < nblocks; blkno++)
        {
            CHECK_FOR_INTERRUPTS();

            pagefn(&stat, rel, blkno, bstrategy);
        }
    }

    relation_close(rel, AccessShareLock);

    return build_pgstattuple_type(&stat, fcinfo);
}
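The pagefn argument is a per-page callback of type pgstat_page; the real callbacks know each index AM's page layout and classify live and dead index tuples. The following is only a minimal, hypothetical sketch of the shape such a callback takes, counting nothing but free space, and assuming the usual buffer-manager headers are available:

/*
 * Hypothetical example only: a trivial pgstat_page callback that counts
 * free space and nothing else.  A real per-AM callback would also walk
 * the page items and update the live/dead tuple counters in *stat.
 */
static void
pgstat_dummy_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno,
                  BufferAccessStrategy bstrategy)
{
    Buffer      buf;
    Page        page;

    buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy);
    LockBuffer(buf, BUFFER_LOCK_SHARE);
    page = BufferGetPage(buf);

    /* Treat uninitialized pages as entirely free. */
    if (PageIsNew(page))
        stat->free_space += BLCKSZ;
    else
        stat->free_space += PageGetFreeSpace(page);

    UnlockReleaseBuffer(buf);
}

Such a callback would be handed to pgstat_index() the same way the real ones are, with start set to the first non-metapage block of the index in question.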
/*
 * Build a new bloom index.
 */
IndexBuildResult *
blbuild(Relation heap, Relation index, IndexInfo *indexInfo)
{
    IndexBuildResult *result;
    double      reltuples;
    BloomBuildState buildstate;

    if (RelationGetNumberOfBlocks(index) != 0)
        elog(ERROR, "index \"%s\" already contains data",
             RelationGetRelationName(index));

    /* Initialize the meta page */
    BloomInitMetapage(index);

    /* Initialize the bloom build state */
    memset(&buildstate, 0, sizeof(buildstate));
    initBloomState(&buildstate.blstate, index);
    buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext,
                                              "Bloom build temporary context",
                                              ALLOCSET_DEFAULT_SIZES);
    initCachedPage(&buildstate);

    /* Do the heap scan */
    reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
                                   bloomBuildCallback, (void *) &buildstate);

    /*
     * There could be some items left in the cached page.  Flush this page
     * if needed.
     */
    if (buildstate.count > 0)
        flushCachedPage(index, &buildstate);

    MemoryContextDelete(buildstate.tmpCtx);

    result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
    result->heap_tuples = result->index_tuples = reltuples;

    return result;
}
Datum
pg_relpagesbyid(PG_FUNCTION_ARGS)
{
    Oid         relid = PG_GETARG_OID(0);
    int64       relpages;
    Relation    rel;

    if (!superuser())
        ereport(ERROR,
                (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
                 (errmsg("must be superuser to use pgstattuple functions"))));

    rel = relation_open(relid, AccessShareLock);

    /* note: this will work OK on non-local temp tables */

    relpages = RelationGetNumberOfBlocks(rel);

    relation_close(rel, AccessShareLock);

    PG_RETURN_INT64(relpages);
}
/* No need for superuser checks in v1.5, see above */
Datum
pg_relpages_v1_5(PG_FUNCTION_ARGS)
{
    text       *relname = PG_GETARG_TEXT_PP(0);
    int64       relpages;
    Relation    rel;
    RangeVar   *relrv;

    relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
    rel = relation_openrv(relrv, AccessShareLock);

    /* only some relkinds have storage */
    check_relation_relkind(rel);

    /* note: this will work OK on non-local temp tables */

    relpages = RelationGetNumberOfBlocks(rel);

    relation_close(rel, AccessShareLock);

    PG_RETURN_INT64(relpages);
}
/* * hashbuild() -- build a new hash index. */ Datum hashbuild(PG_FUNCTION_ARGS) { Relation heap = (Relation) PG_GETARG_POINTER(0); Relation index = (Relation) PG_GETARG_POINTER(1); IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2); IndexBuildResult *result; double reltuples; HashBuildState buildstate; /* * We expect to be called exactly once for any index relation. If that's * not the case, big trouble's what we have. */ if (RelationGetNumberOfBlocks(index) != 0) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); /* initialize the hash index metadata page */ _hash_metapinit(index); /* build the index */ buildstate.indtuples = 0; /* do the heap scan */ reltuples = IndexBuildHeapScan(heap, index, indexInfo, hashbuildCallback, (void *) &buildstate); /* * Return statistics */ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; PG_RETURN_POINTER(result); }
/* --------------------------------------------------------
 * pg_relpages()
 *
 * Get the number of pages of the table/index.
 *
 * Usage: SELECT pg_relpages('t1');
 *        SELECT pg_relpages('t1_pkey');
 * --------------------------------------------------------
 */
Datum
pg_relpages(PG_FUNCTION_ARGS)
{
    text       *relname = PG_GETARG_TEXT_P(0);
    Relation    rel;
    RangeVar   *relrv;
    int4        relpages;

    if (!superuser())
        ereport(ERROR,
                (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
                 (errmsg("must be superuser to use pgstattuple functions"))));

    relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
    rel = relation_openrv(relrv, AccessShareLock);

    relpages = RelationGetNumberOfBlocks(rel);

    relation_close(rel, AccessShareLock);

    PG_RETURN_INT32(relpages);
}
/*
 * GetPageWithFreeSpace - try to find a page in the given relation with
 *      at least the specified amount of free space.
 *
 * If successful, return the block number; if not, return InvalidBlockNumber.
 *
 * The caller must be prepared for the possibility that the returned page
 * will turn out to have too little space available by the time the caller
 * gets a lock on it.  In that case, the caller should report the actual
 * amount of free space available on that page and then try again (see
 * RecordAndGetPageWithFreeSpace).  If InvalidBlockNumber is returned,
 * extend the relation.
 *
 * For very small heap relations that don't have a FSM, we try every other
 * page before extending the relation.  To keep track of which pages have
 * been tried, initialize a local in-memory map of pages.
 */
BlockNumber
GetPageWithFreeSpace(Relation rel, Size spaceNeeded, bool check_fsm_only)
{
    uint8       min_cat = fsm_space_needed_to_cat(spaceNeeded);
    BlockNumber target_block,
                nblocks;

    /* First try the FSM, if it exists. */
    target_block = fsm_search(rel, min_cat);

    if (target_block == InvalidBlockNumber &&
        (rel->rd_rel->relkind == RELKIND_RELATION ||
         rel->rd_rel->relkind == RELKIND_TOASTVALUE) &&
        !check_fsm_only)
    {
        nblocks = RelationGetNumberOfBlocks(rel);

        if (nblocks > HEAP_FSM_CREATION_THRESHOLD)
        {
            /*
             * If the FSM knows nothing of the rel, try the last page before
             * we give up and extend.  This avoids one-tuple-per-page
             * syndrome during bootstrapping or in a recently-started system.
             */
            target_block = nblocks - 1;
        }
        else if (nblocks > 0)
        {
            /* Create or update local map and get first candidate block. */
            fsm_local_set(rel, nblocks);
            target_block = fsm_local_search();
        }
    }

    return target_block;
}
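The retry protocol described in the header comment — lock the candidate page, re-check its free space, feed the observed value back, and extend the relation when InvalidBlockNumber comes back — is the caller's responsibility. Below is a minimal, hypothetical sketch of that loop, assuming the three-argument signature shown above and the standard buffer-manager calls; the real heap code (RelationGetBufferForTuple) does considerably more work around locking order, relation extension, and the visibility map.

/*
 * Hypothetical caller sketch: find a page with enough room, verifying the
 * free space once the page is actually locked and reporting stale FSM
 * entries back via RecordAndGetPageWithFreeSpace().
 */
static Buffer
get_page_for_tuple(Relation rel, Size needed)
{
    BlockNumber blkno = GetPageWithFreeSpace(rel, needed, false);

    while (blkno != InvalidBlockNumber)
    {
        Buffer      buf = ReadBuffer(rel, blkno);
        Size        avail;

        LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
        avail = PageGetHeapFreeSpace(BufferGetPage(buf));
        if (avail >= needed)
            return buf;         /* caller inserts while holding the lock */

        /* The page filled up since the FSM was consulted; record and retry. */
        UnlockReleaseBuffer(buf);
        blkno = RecordAndGetPageWithFreeSpace(rel, blkno, avail, needed);
    }

    return InvalidBuffer;       /* no suitable page: caller must extend */
}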
/*
 * Post-VACUUM cleanup.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
Datum
bingo_vacuumcleanup(PG_FUNCTION_ARGS)
{
    IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
    IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
    Relation    rel = info->index;
    BlockNumber num_pages = 0;

    elog(NOTICE, "start test vacuum");

    /*
     * If bulkdelete wasn't called, return NULL signifying no change.
     * Note: this covers the analyze_only case too.
     */
    if (stats == NULL)
    {
        PG_RETURN_POINTER(NULL);
    }

    /* update statistics */
    num_pages = RelationGetNumberOfBlocks(rel);
    stats->num_pages = num_pages;
    stats->num_index_tuples = 1;
    stats->estimated_count = false;

    PG_RETURN_POINTER(stats);
}
/* * Main entry point to GiST index build. Initially calls insert over and over, * but switches to more efficient buffering build algorithm after a certain * number of tuples (unless buffering mode is disabled). */ Datum gistbuild(PG_FUNCTION_ARGS) { Relation heap = (Relation) PG_GETARG_POINTER(0); Relation index = (Relation) PG_GETARG_POINTER(1); IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2); IndexBuildResult *result; double reltuples; GISTBuildState buildstate; Buffer buffer; Page page; MemoryContext oldcxt = CurrentMemoryContext; int fillfactor; buildstate.indexrel = index; if (index->rd_options) { /* Get buffering mode from the options string */ GiSTOptions *options = (GiSTOptions *) index->rd_options; char *bufferingMode = (char *) options + options->bufferingModeOffset; if (strcmp(bufferingMode, "on") == 0) buildstate.bufferingMode = GIST_BUFFERING_STATS; else if (strcmp(bufferingMode, "off") == 0) buildstate.bufferingMode = GIST_BUFFERING_DISABLED; else buildstate.bufferingMode = GIST_BUFFERING_AUTO; fillfactor = options->fillfactor; } else { /* * By default, switch to buffering mode when the index grows too large * to fit in cache. */ buildstate.bufferingMode = GIST_BUFFERING_AUTO; fillfactor = GIST_DEFAULT_FILLFACTOR; } /* Calculate target amount of free space to leave on pages */ buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100; /* * We expect to be called exactly once for any index relation. If that's * not the case, big trouble's what we have. */ if (RelationGetNumberOfBlocks(index) != 0) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); /* no locking is needed */ buildstate.giststate = initGISTstate(index); /* * Create a temporary memory context that is reset once for each tuple * processed. (Note: we don't bother to make this a child of the * giststate's scanCxt, so we have to delete it separately at the end.) */ buildstate.giststate->tempCxt = createTempGistContext(); /* initialize the root page */ buffer = gistNewBuffer(index); Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO); page = BufferGetPage(buffer); START_CRIT_SECTION(); GISTInitBuffer(buffer, F_LEAF); MarkBufferDirty(buffer); if (RelationNeedsWAL(index)) { XLogRecPtr recptr; XLogRecData rdata; rdata.data = (char *) &(index->rd_node); rdata.len = sizeof(RelFileNode); rdata.buffer = InvalidBuffer; rdata.next = NULL; recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_CREATE_INDEX, &rdata); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } else PageSetLSN(page, gistGetFakeLSN(heap)); UnlockReleaseBuffer(buffer); END_CRIT_SECTION(); /* build the index */ buildstate.indtuples = 0; buildstate.indtuplesSize = 0; /* * Do the heap scan. */ reltuples = IndexBuildHeapScan(heap, index, indexInfo, true, gistBuildCallback, (void *) &buildstate); /* * If buffering was used, flush out all the tuples that are still in the * buffers. */ if (buildstate.bufferingMode == GIST_BUFFERING_ACTIVE) { elog(DEBUG1, "all tuples processed, emptying buffers"); gistEmptyAllBuffers(&buildstate); gistFreeBuildBuffers(buildstate.gfbb); } /* okay, all heap tuples are indexed */ MemoryContextSwitchTo(oldcxt); MemoryContextDelete(buildstate.giststate->tempCxt); freeGISTstate(buildstate.giststate); /* * Return statistics */ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); result->heap_tuples = reltuples; result->index_tuples = (double) buildstate.indtuples; PG_RETURN_POINTER(result); }
/* * hashbuild() -- build a new hash index. */ IndexBuildResult * hashbuild(Relation heap, Relation index, IndexInfo *indexInfo) { IndexBuildResult *result; BlockNumber relpages; double reltuples; double allvisfrac; uint32 num_buckets; long sort_threshold; HashBuildState buildstate; /* * We expect to be called exactly once for any index relation. If that's * not the case, big trouble's what we have. */ if (RelationGetNumberOfBlocks(index) != 0) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); /* Estimate the number of rows currently present in the table */ estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac); /* Initialize the hash index metadata page and initial buckets */ num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM); /* * If we just insert the tuples into the index in scan order, then * (assuming their hash codes are pretty random) there will be no locality * of access to the index, and if the index is bigger than available RAM * then we'll thrash horribly. To prevent that scenario, we can sort the * tuples by (expected) bucket number. However, such a sort is useless * overhead when the index does fit in RAM. We choose to sort if the * initial index size exceeds maintenance_work_mem, or the number of * buffers usable for the index, whichever is less. (Limiting by the * number of buffers should reduce thrashing between PG buffers and kernel * buffers, which seems useful even if no physical I/O results. Limiting * by maintenance_work_mem is useful to allow easy testing of the sort * code path, and may be useful to DBAs as an additional control knob.) * * NOTE: this test will need adjustment if a bucket is ever different from * one page. Also, "initial index size" accounting does not include the * metapage, nor the first bitmap page. */ sort_threshold = (maintenance_work_mem * 1024L) / BLCKSZ; if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP) sort_threshold = Min(sort_threshold, NBuffers); else sort_threshold = Min(sort_threshold, NLocBuffer); if (num_buckets >= (uint32) sort_threshold) buildstate.spool = _h_spoolinit(heap, index, num_buckets); else buildstate.spool = NULL; /* prepare to build the index */ buildstate.indtuples = 0; buildstate.heapRel = heap; /* do the heap scan */ reltuples = IndexBuildHeapScan(heap, index, indexInfo, true, hashbuildCallback, (void *) &buildstate); if (buildstate.spool) { /* sort the tuples and insert them into the index */ _h_indexbuild(buildstate.spool, buildstate.heapRel); _h_spooldestroy(buildstate.spool); } /* * Return statistics */ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; return result; }
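To put concrete numbers on the sort_threshold computation in hashbuild() above (illustrative settings, not taken from the source): maintenance_work_mem is measured in kB, so with a 64MB setting (65536 kB) and the default 8 kB BLCKSZ, (65536 * 1024) / 8192 = 8192 pages; with shared_buffers = 128MB, NBuffers is 16384, so Min(8192, 16384) leaves the threshold at 8192, and the spool-and-sort path is taken only once the initial bucket count reaches 8192 — roughly a 64MB initial index.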
static Datum pgstatindex_impl(Relation rel, FunctionCallInfo fcinfo) { Datum result; BlockNumber nblocks; BlockNumber blkno; BTIndexStat indexStat; BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD); if (!IS_INDEX(rel) || !IS_BTREE(rel)) elog(ERROR, "relation \"%s\" is not a btree index", RelationGetRelationName(rel)); /* * Reject attempts to read non-local temporary relations; we would be * likely to get wrong data since we have no visibility into the owning * session's local buffers. */ if (RELATION_IS_OTHER_TEMP(rel)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot access temporary tables of other sessions"))); /* * Read metapage */ { Buffer buffer = ReadBufferExtended(rel, MAIN_FORKNUM, 0, RBM_NORMAL, bstrategy); Page page = BufferGetPage(buffer); BTMetaPageData *metad = BTPageGetMeta(page); indexStat.version = metad->btm_version; indexStat.level = metad->btm_level; indexStat.root_blkno = metad->btm_root; ReleaseBuffer(buffer); } /* -- init counters -- */ indexStat.internal_pages = 0; indexStat.leaf_pages = 0; indexStat.empty_pages = 0; indexStat.deleted_pages = 0; indexStat.max_avail = 0; indexStat.free_space = 0; indexStat.fragments = 0; /* * Scan all blocks except the metapage */ nblocks = RelationGetNumberOfBlocks(rel); for (blkno = 1; blkno < nblocks; blkno++) { Buffer buffer; Page page; BTPageOpaque opaque; CHECK_FOR_INTERRUPTS(); /* Read and lock buffer */ buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy); LockBuffer(buffer, BUFFER_LOCK_SHARE); page = BufferGetPage(buffer); opaque = (BTPageOpaque) PageGetSpecialPointer(page); /* Determine page type, and update totals */ if (P_ISDELETED(opaque)) indexStat.deleted_pages++; else if (P_IGNORE(opaque)) indexStat.empty_pages++; /* this is the "half dead" state */ else if (P_ISLEAF(opaque)) { int max_avail; max_avail = BLCKSZ - (BLCKSZ - ((PageHeader) page)->pd_special + SizeOfPageHeaderData); indexStat.max_avail += max_avail; indexStat.free_space += PageGetFreeSpace(page); indexStat.leaf_pages++; /* * If the next leaf is on an earlier block, it means a * fragmentation. 
*/ if (opaque->btpo_next != P_NONE && opaque->btpo_next < blkno) indexStat.fragments++; } else indexStat.internal_pages++; /* Unlock and release buffer */ LockBuffer(buffer, BUFFER_LOCK_UNLOCK); ReleaseBuffer(buffer); } relation_close(rel, AccessShareLock); /*---------------------------- * Build a result tuple *---------------------------- */ { TupleDesc tupleDesc; int j; char *values[10]; HeapTuple tuple; /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); j = 0; values[j++] = psprintf("%d", indexStat.version); values[j++] = psprintf("%d", indexStat.level); values[j++] = psprintf(INT64_FORMAT, (1 + /* include the metapage in index_size */ indexStat.leaf_pages + indexStat.internal_pages + indexStat.deleted_pages + indexStat.empty_pages) * BLCKSZ); values[j++] = psprintf("%u", indexStat.root_blkno); values[j++] = psprintf(INT64_FORMAT, indexStat.internal_pages); values[j++] = psprintf(INT64_FORMAT, indexStat.leaf_pages); values[j++] = psprintf(INT64_FORMAT, indexStat.empty_pages); values[j++] = psprintf(INT64_FORMAT, indexStat.deleted_pages); if (indexStat.max_avail > 0) values[j++] = psprintf("%.2f", 100.0 - (double) indexStat.free_space / (double) indexStat.max_avail * 100.0); else values[j++] = pstrdup("NaN"); if (indexStat.leaf_pages > 0) values[j++] = psprintf("%.2f", (double) indexStat.fragments / (double) indexStat.leaf_pages * 100.0); else values[j++] = pstrdup("NaN"); tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc), values); result = HeapTupleGetDatum(tuple); } return result; }
/* * get_relation_info - * Retrieves catalog information for a given relation. * * Given the Oid of the relation, return the following info into fields * of the RelOptInfo struct: * * min_attr lowest valid AttrNumber * max_attr highest valid AttrNumber * indexlist list of IndexOptInfos for relation's indexes * pages number of pages * tuples number of tuples * * Also, initialize the attr_needed[] and attr_widths[] arrays. In most * cases these are left as zeroes, but sometimes we need to compute attr * widths here, and we may as well cache the results for costsize.c. * * If inhparent is true, all we need to do is set up the attr arrays: * the RelOptInfo actually represents the appendrel formed by an inheritance * tree, and so the parent rel's physical size and index information isn't * important for it. */ void get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, RelOptInfo *rel) { Index varno = rel->relid; Relation relation; bool hasindex; List *indexinfos = NIL; /* * We need not lock the relation since it was already locked, either by * the rewriter or when expand_inherited_rtentry() added it to the query's * rangetable. */ relation = heap_open(relationObjectId, NoLock); rel->min_attr = FirstLowInvalidHeapAttributeNumber + 1; rel->max_attr = RelationGetNumberOfAttributes(relation); rel->reltablespace = RelationGetForm(relation)->reltablespace; Assert(rel->max_attr >= rel->min_attr); rel->attr_needed = (Relids *) palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(Relids)); rel->attr_widths = (int32 *) palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(int32)); /* * Estimate relation size --- unless it's an inheritance parent, in which * case the size will be computed later in set_append_rel_pathlist, and we * must leave it zero for now to avoid bollixing the total_table_pages * calculation. */ if (!inhparent) estimate_rel_size(relation, rel->attr_widths - rel->min_attr, &rel->pages, &rel->tuples); /* * Make list of indexes. Ignore indexes on system catalogs if told to. * Don't bother with indexes for an inheritance parent, either. */ if (inhparent || (IgnoreSystemIndexes && IsSystemClass(relation->rd_rel))) hasindex = false; else hasindex = relation->rd_rel->relhasindex; if (hasindex) { List *indexoidlist; ListCell *l; LOCKMODE lmode; indexoidlist = RelationGetIndexList(relation); /* * For each index, we get the same type of lock that the executor will * need, and do not release it. This saves a couple of trips to the * shared lock manager while not creating any real loss of * concurrency, because no schema changes could be happening on the * index while we hold lock on the parent rel, and neither lock type * blocks any other kind of index operation. */ if (rel->relid == root->parse->resultRelation) lmode = RowExclusiveLock; else lmode = AccessShareLock; foreach(l, indexoidlist) { Oid indexoid = lfirst_oid(l); Relation indexRelation; Form_pg_index index; IndexOptInfo *info; int ncolumns; int i; /* * Extract info from the relation descriptor for the index. */ indexRelation = index_open(indexoid, lmode); index = indexRelation->rd_index; /* * Ignore invalid indexes, since they can't safely be used for * queries. Note that this is OK because the data structure we * are constructing is only used by the planner --- the executor * still needs to insert into "invalid" indexes! */ if (!index->indisvalid) { index_close(indexRelation, NoLock); continue; } /* * If the index is valid, but cannot yet be used, ignore it; but * mark the plan we are generating as transient. 
See * src/backend/access/heap/README.HOT for discussion. */ if (index->indcheckxmin && !TransactionIdPrecedes(HeapTupleHeaderGetXmin(indexRelation->rd_indextuple->t_data), TransactionXmin)) { root->glob->transientPlan = true; index_close(indexRelation, NoLock); continue; } info = makeNode(IndexOptInfo); info->indexoid = index->indexrelid; info->reltablespace = RelationGetForm(indexRelation)->reltablespace; info->rel = rel; info->ncolumns = ncolumns = index->indnatts; /* * Allocate per-column info arrays. To save a few palloc cycles * we allocate all the Oid-type arrays in one request. Note that * the opfamily array needs an extra, terminating zero at the end. * We pre-zero the ordering info in case the index is unordered. */ info->indexkeys = (int *) palloc(sizeof(int) * ncolumns); info->opfamily = (Oid *) palloc0(sizeof(Oid) * (4 * ncolumns + 1)); info->opcintype = info->opfamily + (ncolumns + 1); info->fwdsortop = info->opcintype + ncolumns; info->revsortop = info->fwdsortop + ncolumns; info->nulls_first = (bool *) palloc0(sizeof(bool) * ncolumns); for (i = 0; i < ncolumns; i++) { info->indexkeys[i] = index->indkey.values[i]; info->opfamily[i] = indexRelation->rd_opfamily[i]; info->opcintype[i] = indexRelation->rd_opcintype[i]; } info->relam = indexRelation->rd_rel->relam; info->amcostestimate = indexRelation->rd_am->amcostestimate; info->amoptionalkey = indexRelation->rd_am->amoptionalkey; info->amsearchnulls = indexRelation->rd_am->amsearchnulls; info->amhasgettuple = OidIsValid(indexRelation->rd_am->amgettuple); info->amhasgetbitmap = OidIsValid(indexRelation->rd_am->amgetbitmap); /* * Fetch the ordering operators associated with the index, if any. * We expect that all ordering-capable indexes use btree's * strategy numbers for the ordering operators. */ if (indexRelation->rd_am->amcanorder) { int nstrat = indexRelation->rd_am->amstrategies; for (i = 0; i < ncolumns; i++) { int16 opt = indexRelation->rd_indoption[i]; int fwdstrat; int revstrat; if (opt & INDOPTION_DESC) { fwdstrat = BTGreaterStrategyNumber; revstrat = BTLessStrategyNumber; } else { fwdstrat = BTLessStrategyNumber; revstrat = BTGreaterStrategyNumber; } /* * Index AM must have a fixed set of strategies for it to * make sense to specify amcanorder, so we need not allow * the case amstrategies == 0. */ if (fwdstrat > 0) { Assert(fwdstrat <= nstrat); info->fwdsortop[i] = indexRelation->rd_operator[i * nstrat + fwdstrat - 1]; } if (revstrat > 0) { Assert(revstrat <= nstrat); info->revsortop[i] = indexRelation->rd_operator[i * nstrat + revstrat - 1]; } info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0; } } /* * Fetch the index expressions and predicate, if any. We must * modify the copies we obtain from the relcache to have the * correct varno for the parent relation, so that they match up * correctly against qual clauses. */ info->indexprs = RelationGetIndexExpressions(indexRelation); info->indpred = RelationGetIndexPredicate(indexRelation); if (info->indexprs && varno != 1) ChangeVarNodes((Node *) info->indexprs, 1, varno, 0); if (info->indpred && varno != 1) ChangeVarNodes((Node *) info->indpred, 1, varno, 0); info->predOK = false; /* set later in indxpath.c */ info->unique = index->indisunique; /* * Estimate the index size. If it's not a partial index, we lock * the number-of-tuples estimate to equal the parent table; if it * is partial then we have to use the same methods as we would for * a table, except we can be sure that the index is not larger * than the table. 
*/ if (info->indpred == NIL) { info->pages = RelationGetNumberOfBlocks(indexRelation); info->tuples = rel->tuples; } else { estimate_rel_size(indexRelation, NULL, &info->pages, &info->tuples); if (info->tuples > rel->tuples) info->tuples = rel->tuples; } index_close(indexRelation, NoLock); indexinfos = lcons(info, indexinfos); } list_free(indexoidlist); }
/* * lazy_truncate_heap - try to truncate off any empty pages at the end */ static void lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats) { BlockNumber old_rel_pages = vacrelstats->rel_pages; BlockNumber new_rel_pages; PGRUsage ru0; pg_rusage_init(&ru0); /* * We need full exclusive lock on the relation in order to do truncation. * If we can't get it, give up rather than waiting --- we don't want to * block other backends, and we don't want to deadlock (which is quite * possible considering we already hold a lower-grade lock). */ if (!ConditionalLockRelation(onerel, AccessExclusiveLock)) return; /* * Now that we have exclusive lock, look to see if the rel has grown * whilst we were vacuuming with non-exclusive lock. If so, give up; the * newly added pages presumably contain non-deletable tuples. */ new_rel_pages = RelationGetNumberOfBlocks(onerel); if (new_rel_pages != old_rel_pages) { /* * Note: we intentionally don't update vacrelstats->rel_pages with the * new rel size here. If we did, it would amount to assuming that the * new pages are empty, which is unlikely. Leaving the numbers alone * amounts to assuming that the new pages have the same tuple density * as existing ones, which is less unlikely. */ UnlockRelation(onerel, AccessExclusiveLock); return; } /* * Scan backwards from the end to verify that the end pages actually * contain no tuples. This is *necessary*, not optional, because other * backends could have added tuples to these pages whilst we were * vacuuming. */ new_rel_pages = count_nondeletable_pages(onerel, vacrelstats); if (new_rel_pages >= old_rel_pages) { /* can't do anything after all */ UnlockRelation(onerel, AccessExclusiveLock); return; } /* * Okay to truncate. */ RelationTruncate(onerel, new_rel_pages); /* * We can release the exclusive lock as soon as we have truncated. Other * backends can't safely access the relation until they have processed the * smgr invalidation that smgrtruncate sent out ... but that should happen * as part of standard invalidation processing once they acquire lock on * the relation. */ UnlockRelation(onerel, AccessExclusiveLock); /* * Update statistics. Here, it *is* correct to adjust rel_pages without * also touching reltuples, since the tuple count wasn't changed by the * truncation. */ vacrelstats->rel_pages = new_rel_pages; vacrelstats->pages_removed = old_rel_pages - new_rel_pages; ereport(elevel, (errmsg("\"%s\": truncated %u to %u pages", RelationGetRelationName(onerel), old_rel_pages, new_rel_pages), errdetail("%s.", pg_rusage_show(&ru0)))); }
/* * btvacuumscan --- scan the index for VACUUMing purposes * * This combines the functions of looking for leaf tuples that are deletable * according to the vacuum callback, looking for empty pages that can be * deleted, and looking for old deleted pages that can be recycled. Both * btbulkdelete and btvacuumcleanup invoke this (the latter only if no * btbulkdelete call occurred). * * The caller is responsible for initially allocating/zeroing a stats struct * and for obtaining a vacuum cycle ID if necessary. */ static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, BTCycleId cycleid) { MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_DECLARE; Relation rel = info->index; BTVacState vstate; BlockNumber num_pages; BlockNumber blkno; bool needLock; MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_ENTER; /* * Reset counts that will be incremented during the scan; needed in case * of multiple scans during a single VACUUM command */ stats->num_index_tuples = 0; stats->pages_deleted = 0; /* Set up info to pass down to btvacuumpage */ vstate.info = info; vstate.stats = stats; vstate.callback = callback; vstate.callback_state = callback_state; vstate.cycleid = cycleid; vstate.freePages = NULL; /* temporarily */ vstate.nFreePages = 0; vstate.maxFreePages = 0; vstate.totFreePages = 0; /* Create a temporary memory context to run _bt_pagedel in */ vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext, "_bt_pagedel", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); /* * The outer loop iterates over all index pages except the metapage, in * physical order (we hope the kernel will cooperate in providing * read-ahead for speed). It is critical that we visit all leaf pages, * including ones added after we start the scan, else we might fail to * delete some deletable tuples. Hence, we must repeatedly check the * relation length. We must acquire the relation-extension lock while * doing so to avoid a race condition: if someone else is extending the * relation, there is a window where bufmgr/smgr have created a new * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If * we manage to scan such a page here, we'll improperly assume it can be * recycled. Taking the lock synchronizes things enough to prevent a * problem: either num_pages won't include the new page, or _bt_getbuf * already has write lock on the buffer and it will be fully initialized * before we can examine it. (See also vacuumlazy.c, which has the same * issue.) Also, we need not worry if a page is added immediately after * we look; the page splitting code already has write-lock on the left * page before it adds a right page, so we must already have processed any * tuples due to be moved into such a page. * * We can skip locking for new or temp relations, however, since no one * else could be accessing them. 
*/ needLock = !RELATION_IS_LOCAL(rel); blkno = BTREE_METAPAGE + 1; for (;;) { /* Get the current relation length */ if (needLock) LockRelationForExtension(rel, ExclusiveLock); num_pages = RelationGetNumberOfBlocks(rel); if (needLock) UnlockRelationForExtension(rel, ExclusiveLock); /* Allocate freePages after we read num_pages the first time */ if (vstate.freePages == NULL) { /* No point in remembering more than MaxFSMPages pages */ vstate.maxFreePages = MaxFSMPages; if ((BlockNumber) vstate.maxFreePages > num_pages) vstate.maxFreePages = (int) num_pages; vstate.freePages = (BlockNumber *) palloc(vstate.maxFreePages * sizeof(BlockNumber)); } /* Quit if we've scanned the whole relation */ if (blkno >= num_pages) break; /* Iterate over pages, then loop back to recheck length */ for (; blkno < num_pages; blkno++) { btvacuumpage(&vstate, blkno, blkno); } } /* * During VACUUM FULL, we truncate off any recyclable pages at the end of * the index. In a normal vacuum it'd be unsafe to do this except by * acquiring exclusive lock on the index and then rechecking all the * pages; doesn't seem worth it. */ if (info->vacuum_full && vstate.nFreePages > 0) { BlockNumber new_pages = num_pages; while (vstate.nFreePages > 0 && vstate.freePages[vstate.nFreePages - 1] == new_pages - 1) { new_pages--; stats->pages_deleted--; vstate.nFreePages--; vstate.totFreePages = vstate.nFreePages; /* can't be more */ } if (new_pages != num_pages) { /* * Okay to truncate. */ RelationTruncate(rel, new_pages, /* markPersistentAsPhysicallyTruncated */ true); /* update statistics */ stats->pages_removed += num_pages - new_pages; num_pages = new_pages; } } /* * Update the shared Free Space Map with the info we now have about free * pages in the index, discarding any old info the map may have. We do not * need to sort the page numbers; they're in order already. */ RecordIndexFreeSpace(&rel->rd_node, vstate.totFreePages, vstate.nFreePages, vstate.freePages); pfree(vstate.freePages); MemoryContextDelete(vstate.pagedelcontext); /* update statistics */ stats->num_pages = num_pages; stats->pages_free = vstate.totFreePages; MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_EXIT; }
/* * btbuild() -- build a new btree index. */ IndexBuildResult * btbuild(Relation heap, Relation index, IndexInfo *indexInfo) { IndexBuildResult *result; double reltuples; BTBuildState buildstate; buildstate.isUnique = indexInfo->ii_Unique; buildstate.haveDead = false; buildstate.heapRel = heap; buildstate.spool = NULL; buildstate.spool2 = NULL; buildstate.indtuples = 0; #ifdef BTREE_BUILD_STATS if (log_btree_build_stats) ResetUsage(); #endif /* BTREE_BUILD_STATS */ /* * We expect to be called exactly once for any index relation. If that's * not the case, big trouble's what we have. */ if (RelationGetNumberOfBlocks(index) != 0) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); buildstate.spool = _bt_spoolinit(heap, index, indexInfo->ii_Unique, false); /* * If building a unique index, put dead tuples in a second spool to keep * them out of the uniqueness check. */ if (indexInfo->ii_Unique) buildstate.spool2 = _bt_spoolinit(heap, index, false, true); /* do the heap scan */ reltuples = IndexBuildHeapScan(heap, index, indexInfo, true, btbuildCallback, (void *) &buildstate); /* okay, all heap tuples are indexed */ if (buildstate.spool2 && !buildstate.haveDead) { /* spool2 turns out to be unnecessary */ _bt_spooldestroy(buildstate.spool2); buildstate.spool2 = NULL; } /* * Finish the build by (1) completing the sort of the spool file, (2) * inserting the sorted tuples into btree pages and (3) building the upper * levels. */ _bt_leafbuild(buildstate.spool, buildstate.spool2); _bt_spooldestroy(buildstate.spool); if (buildstate.spool2) _bt_spooldestroy(buildstate.spool2); #ifdef BTREE_BUILD_STATS if (log_btree_build_stats) { ShowUsage("BTREE BUILD STATS"); ResetUsage(); } #endif /* BTREE_BUILD_STATS */ /* * Return statistics */ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; return result; }
/* * For a newly inserted heap tid, check if an entry with this tid * already exists in a unique index. If it does, abort the inserting * transaction. */ static void _bt_validate_tid(Relation irel, ItemPointer h_tid) { MIRROREDLOCK_BUFMGR_DECLARE; BlockNumber blkno; BlockNumber num_pages; Buffer buf; Page page; BTPageOpaque opaque; IndexTuple itup; OffsetNumber maxoff, minoff, offnum; elog(DEBUG1, "validating tid (%d,%d) for index (%s)", ItemPointerGetBlockNumber(h_tid), ItemPointerGetOffsetNumber(h_tid), RelationGetRelationName(irel)); blkno = BTREE_METAPAGE + 1; num_pages = RelationGetNumberOfBlocks(irel); MIRROREDLOCK_BUFMGR_LOCK; for (; blkno < num_pages; blkno++) { buf = ReadBuffer(irel, blkno); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (!PageIsNew(page)) _bt_checkpage(irel, buf); if (P_ISLEAF(opaque)) { minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); for (offnum = minoff; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); if (ItemPointerEquals(&itup->t_tid, h_tid)) { Form_pg_attribute key_att = RelationGetDescr(irel)->attrs[0]; Oid key = InvalidOid; bool isnull; if (key_att->atttypid == OIDOID) { key = DatumGetInt32( index_getattr(itup, 1, RelationGetDescr(irel), &isnull)); elog(ERROR, "found tid (%d,%d), %s (%d) already in index (%s)", ItemPointerGetBlockNumber(h_tid), ItemPointerGetOffsetNumber(h_tid), NameStr(key_att->attname), key, RelationGetRelationName(irel)); } else { elog(ERROR, "found tid (%d,%d) already in index (%s)", ItemPointerGetBlockNumber(h_tid), ItemPointerGetOffsetNumber(h_tid), RelationGetRelationName(irel)); } } } } ReleaseBuffer(buf); } MIRROREDLOCK_BUFMGR_UNLOCK; }
/* * hashbuild() -- build a new hash index. */ IndexBuildResult * hashbuild(Relation heap, Relation index, IndexInfo *indexInfo) { IndexBuildResult *result; BlockNumber relpages; double reltuples; double allvisfrac; uint32 num_buckets; HashBuildState buildstate; /* * We expect to be called exactly once for any index relation. If that's * not the case, big trouble's what we have. */ if (RelationGetNumberOfBlocks(index) != 0) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); /* Estimate the number of rows currently present in the table */ estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac); /* Initialize the hash index metadata page and initial buckets */ num_buckets = _hash_metapinit(index, reltuples, MAIN_FORKNUM); /* * If we just insert the tuples into the index in scan order, then * (assuming their hash codes are pretty random) there will be no locality * of access to the index, and if the index is bigger than available RAM * then we'll thrash horribly. To prevent that scenario, we can sort the * tuples by (expected) bucket number. However, such a sort is useless * overhead when the index does fit in RAM. We choose to sort if the * initial index size exceeds NBuffers. * * NOTE: this test will need adjustment if a bucket is ever different from * one page. */ if (num_buckets >= (uint32) NBuffers) buildstate.spool = _h_spoolinit(heap, index, num_buckets); else buildstate.spool = NULL; /* prepare to build the index */ buildstate.indtuples = 0; /* do the heap scan */ reltuples = IndexBuildHeapScan(heap, index, indexInfo, true, hashbuildCallback, (void *) &buildstate); if (buildstate.spool) { /* sort the tuples and insert them into the index */ _h_indexbuild(buildstate.spool); _h_spooldestroy(buildstate.spool); } /* * Return statistics */ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; return result; }
/* * This function takes an already open relation and scans its pages, * skipping those that have the corresponding visibility map bit set. * For pages we skip, we find the free space from the free space map * and approximate tuple_len on that basis. For the others, we count * the exact number of dead tuples etc. * * This scan is loosely based on vacuumlazy.c:lazy_scan_heap(), but * we do not try to avoid skipping single pages. */ static void statapprox_heap(Relation rel, output_type *stat) { BlockNumber scanned, nblocks, blkno; Buffer vmbuffer = InvalidBuffer; BufferAccessStrategy bstrategy; TransactionId OldestXmin; uint64 misc_count = 0; OldestXmin = GetOldestXmin(rel, PROCARRAY_FLAGS_VACUUM); bstrategy = GetAccessStrategy(BAS_BULKREAD); nblocks = RelationGetNumberOfBlocks(rel); scanned = 0; for (blkno = 0; blkno < nblocks; blkno++) { Buffer buf; Page page; OffsetNumber offnum, maxoff; Size freespace; CHECK_FOR_INTERRUPTS(); /* * If the page has only visible tuples, then we can find out the free * space from the FSM and move on. */ if (VM_ALL_VISIBLE(rel, blkno, &vmbuffer)) { freespace = GetRecordedFreeSpace(rel, blkno); stat->tuple_len += BLCKSZ - freespace; stat->free_space += freespace; continue; } buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy); LockBuffer(buf, BUFFER_LOCK_SHARE); page = BufferGetPage(buf); /* * It's not safe to call PageGetHeapFreeSpace() on new pages, so we * treat them as being free space for our purposes. */ if (!PageIsNew(page)) stat->free_space += PageGetHeapFreeSpace(page); else stat->free_space += BLCKSZ - SizeOfPageHeaderData; if (PageIsNew(page) || PageIsEmpty(page)) { UnlockReleaseBuffer(buf); continue; } scanned++; /* * Look at each tuple on the page and decide whether it's live or * dead, then count it and its size. Unlike lazy_scan_heap, we can * afford to ignore problems and special cases. */ maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId itemid; HeapTupleData tuple; itemid = PageGetItemId(page, offnum); if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid) || ItemIdIsDead(itemid)) { continue; } Assert(ItemIdIsNormal(itemid)); ItemPointerSet(&(tuple.t_self), blkno, offnum); tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); tuple.t_tableOid = RelationGetRelid(rel); /* * We count live and dead tuples, but we also need to add up * others in order to feed vac_estimate_reltuples. */ switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf)) { case HEAPTUPLE_RECENTLY_DEAD: misc_count++; /* Fall through */ case HEAPTUPLE_DEAD: stat->dead_tuple_len += tuple.t_len; stat->dead_tuple_count++; break; case HEAPTUPLE_LIVE: stat->tuple_len += tuple.t_len; stat->tuple_count++; break; case HEAPTUPLE_INSERT_IN_PROGRESS: case HEAPTUPLE_DELETE_IN_PROGRESS: misc_count++; break; default: elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); break; } } UnlockReleaseBuffer(buf); } stat->table_len = (uint64) nblocks *BLCKSZ; stat->tuple_count = vac_estimate_reltuples(rel, false, nblocks, scanned, stat->tuple_count + misc_count); /* * Calculate percentages if the relation has one or more pages. 
*/ if (nblocks != 0) { stat->scanned_percent = 100 * scanned / nblocks; stat->tuple_percent = 100.0 * stat->tuple_len / stat->table_len; stat->dead_tuple_percent = 100.0 * stat->dead_tuple_len / stat->table_len; stat->free_percent = 100.0 * stat->free_space / stat->table_len; } if (BufferIsValid(vmbuffer)) { ReleaseBuffer(vmbuffer); vmbuffer = InvalidBuffer; } }
/*
 * Insert all matching tuples into a bitmap.
 */
int64
blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
{
    int64       ntids = 0;
    BlockNumber blkno = BLOOM_HEAD_BLKNO,
                npages;
    int         i;
    BufferAccessStrategy bas;
    BloomScanOpaque so = (BloomScanOpaque) scan->opaque;

    if (so->sign == NULL)
    {
        /* New search: have to calculate search signature */
        ScanKey     skey = scan->keyData;

        so->sign = palloc0(sizeof(SignType) * so->state.opts.bloomLength);

        for (i = 0; i < scan->numberOfKeys; i++)
        {
            /*
             * Assume bloom-indexable operators to be strict, so nothing
             * could be found for a NULL key.
             */
            if (skey->sk_flags & SK_ISNULL)
            {
                pfree(so->sign);
                so->sign = NULL;
                return 0;
            }

            /* Add next value to the signature */
            signValue(&so->state, so->sign, skey->sk_argument,
                      skey->sk_attno - 1);

            skey++;
        }
    }

    /*
     * We're going to read the whole index, so use a buffer access strategy
     * appropriate for that.
     */
    bas = GetAccessStrategy(BAS_BULKREAD);
    npages = RelationGetNumberOfBlocks(scan->indexRelation);
    for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++)
    {
        Buffer      buffer;
        Page        page;

        buffer = ReadBufferExtended(scan->indexRelation, MAIN_FORKNUM,
                                    blkno, RBM_NORMAL, bas);

        LockBuffer(buffer, BUFFER_LOCK_SHARE);
        page = BufferGetPage(buffer);
        TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page);

        if (!BloomPageIsDeleted(page))
        {
            OffsetNumber offset,
                        maxOffset = BloomPageGetMaxOffset(page);

            for (offset = 1; offset <= maxOffset; offset++)
            {
                BloomTuple *itup = BloomPageGetTuple(&so->state, page, offset);
                bool        res = true;

                /* Check index signature with scan signature */
                for (i = 0; i < so->state.opts.bloomLength; i++)
                {
                    if ((itup->sign[i] & so->sign[i]) != so->sign[i])
                    {
                        res = false;
                        break;
                    }
                }

                /* Add matching tuples to bitmap */
                if (res)
                {
                    tbm_add_tuples(tbm, &itup->heapPtr, 1, true);
                    ntids++;
                }
            }
        }

        UnlockReleaseBuffer(buffer);
        CHECK_FOR_INTERRUPTS();
    }
    FreeAccessStrategy(bas);

    return ntids;
}
/* * VACUUM cleanup: update FSM */ IndexBulkDeleteResult * gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) { Relation rel = info->index; BlockNumber npages, blkno; BlockNumber totFreePages; bool needLock; /* No-op in ANALYZE ONLY mode */ if (info->analyze_only) return stats; /* Set up all-zero stats if gistbulkdelete wasn't called */ if (stats == NULL) { stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); /* use heap's tuple count */ stats->num_index_tuples = info->num_heap_tuples; stats->estimated_count = info->estimated_count; /* * XXX the above is wrong if index is partial. Would it be OK to just * return NULL, or is there work we must do below? */ } /* * Need lock unless it's local to this backend. */ needLock = !RELATION_IS_LOCAL(rel); /* try to find deleted pages */ if (needLock) LockRelationForExtension(rel, ExclusiveLock); npages = RelationGetNumberOfBlocks(rel); if (needLock) UnlockRelationForExtension(rel, ExclusiveLock); totFreePages = 0; for (blkno = GIST_ROOT_BLKNO + 1; blkno < npages; blkno++) { Buffer buffer; Page page; vacuum_delay_point(); buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); LockBuffer(buffer, GIST_SHARE); page = (Page) BufferGetPage(buffer); if (PageIsNew(page) || GistPageIsDeleted(page)) { totFreePages++; RecordFreeIndexPage(rel, blkno); } UnlockReleaseBuffer(buffer); } /* Finally, vacuum the FSM */ IndexFreeSpaceMapVacuum(info->index); /* return statistics */ stats->pages_free = totFreePages; if (needLock) LockRelationForExtension(rel, ExclusiveLock); stats->num_pages = RelationGetNumberOfBlocks(rel); if (needLock) UnlockRelationForExtension(rel, ExclusiveLock); return stats; }
/* * Post vacuum, iterate over all entries in index, check if the h_tid * of each entry exists and is not dead. For specific system tables, * also ensure that the key in index entry matches the corresponding * attribute in the heap tuple. */ void _bt_validate_vacuum(Relation irel, Relation hrel, TransactionId oldest_xmin) { MIRROREDLOCK_BUFMGR_DECLARE; BlockNumber blkno; BlockNumber num_pages; Buffer ibuf = InvalidBuffer; Buffer hbuf = InvalidBuffer; Page ipage; BTPageOpaque opaque; IndexTuple itup; HeapTupleData htup; OffsetNumber maxoff, minoff, offnum; Oid ioid, hoid; bool isnull; blkno = BTREE_METAPAGE + 1; num_pages = RelationGetNumberOfBlocks(irel); elog(LOG, "btvalidatevacuum: index %s, heap %s", RelationGetRelationName(irel), RelationGetRelationName(hrel)); MIRROREDLOCK_BUFMGR_LOCK; for (; blkno < num_pages; blkno++) { ibuf = ReadBuffer(irel, blkno); ipage = BufferGetPage(ibuf); opaque = (BTPageOpaque) PageGetSpecialPointer(ipage); if (!PageIsNew(ipage)) _bt_checkpage(irel, ibuf); if (P_ISLEAF(opaque)) { minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(ipage); for (offnum = minoff; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { itup = (IndexTuple) PageGetItem(ipage, PageGetItemId(ipage, offnum)); ItemPointerCopy(&itup->t_tid, &htup.t_self); /* * TODO: construct a tid bitmap based on index tids * and fetch heap tids in order afterwards. That will * also allow validating if a heap tid appears twice * in a unique index. */ if (!heap_release_fetch(hrel, SnapshotAny, &htup, &hbuf, true, NULL)) { elog(ERROR, "btvalidatevacuum: tid (%d,%d) from index %s " "not found in heap %s", ItemPointerGetBlockNumber(&itup->t_tid), ItemPointerGetOffsetNumber(&itup->t_tid), RelationGetRelationName(irel), RelationGetRelationName(hrel)); } switch (HeapTupleSatisfiesVacuum(hrel, htup.t_data, oldest_xmin, hbuf)) { case HEAPTUPLE_RECENTLY_DEAD: case HEAPTUPLE_LIVE: case HEAPTUPLE_INSERT_IN_PROGRESS: case HEAPTUPLE_DELETE_IN_PROGRESS: /* these tuples are considered alive by vacuum */ break; case HEAPTUPLE_DEAD: elog(ERROR, "btvalidatevacuum: vacuum did not remove " "dead tuple (%d,%d) from heap %s and index %s", ItemPointerGetBlockNumber(&itup->t_tid), ItemPointerGetOffsetNumber(&itup->t_tid), RelationGetRelationName(hrel), RelationGetRelationName(irel)); break; default: elog(ERROR, "btvalidatevacuum: invalid visibility"); break; } switch(RelationGetRelid(irel)) { case DatabaseOidIndexId: case TypeOidIndexId: case ClassOidIndexId: case ConstraintOidIndexId: hoid = HeapTupleGetOid(&htup); ioid = index_getattr(itup, 1, RelationGetDescr(irel), &isnull); if (hoid != ioid) { elog(ERROR, "btvalidatevacuum: index oid(%d) != heap oid(%d)" " tuple (%d,%d) index %s", ioid, hoid, ItemPointerGetBlockNumber(&itup->t_tid), ItemPointerGetOffsetNumber(&itup->t_tid), RelationGetRelationName(irel)); } break; case GpRelationNodeOidIndexId: hoid = heap_getattr(&htup, 1, RelationGetDescr(hrel), &isnull); ioid = index_getattr(itup, 1, RelationGetDescr(irel), &isnull); if (hoid != ioid) { elog(ERROR, "btvalidatevacuum: index oid(%d) != heap oid(%d)" " tuple (%d,%d) index %s", ioid, hoid, ItemPointerGetBlockNumber(&itup->t_tid), ItemPointerGetOffsetNumber(&itup->t_tid), RelationGetRelationName(irel)); } int4 hsegno = heap_getattr(&htup, 2, RelationGetDescr(hrel), &isnull); int4 isegno = index_getattr(itup, 2, RelationGetDescr(irel), &isnull); if (isegno != hsegno) { elog(ERROR, "btvalidatevacuum: index segno(%d) != heap segno(%d)" " tuple (%d,%d) index %s", isegno, hsegno, 
ItemPointerGetBlockNumber(&itup->t_tid), ItemPointerGetOffsetNumber(&itup->t_tid), RelationGetRelationName(irel)); } break; default: break; } if (RelationGetNamespace(irel) == PG_AOSEGMENT_NAMESPACE) { int4 isegno = index_getattr(itup, 1, RelationGetDescr(irel), &isnull); int4 hsegno = heap_getattr(&htup, 1, RelationGetDescr(hrel), &isnull); if (isegno != hsegno) { elog(ERROR, "btvalidatevacuum: index segno(%d) != heap segno(%d)" " tuple (%d,%d) index %s", isegno, hsegno, ItemPointerGetBlockNumber(&itup->t_tid), ItemPointerGetOffsetNumber(&itup->t_tid), RelationGetRelationName(irel)); } } } } if (BufferIsValid(ibuf)) ReleaseBuffer(ibuf); } if (BufferIsValid(hbuf)) ReleaseBuffer(hbuf); MIRROREDLOCK_BUFMGR_UNLOCK; }
/* ------------------------------------------------------ * pgstathashindex() * * Usage: SELECT * FROM pgstathashindex('hashindex'); * ------------------------------------------------------ */ Datum pgstathashindex(PG_FUNCTION_ARGS) { Oid relid = PG_GETARG_OID(0); BlockNumber nblocks; BlockNumber blkno; Relation rel; HashIndexStat stats; BufferAccessStrategy bstrategy; HeapTuple tuple; TupleDesc tupleDesc; Datum values[8]; bool nulls[8]; Buffer metabuf; HashMetaPage metap; float8 free_percent; uint64 total_space; rel = index_open(relid, AccessShareLock); /* index_open() checks that it's an index */ if (!IS_HASH(rel)) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("relation \"%s\" is not a HASH index", RelationGetRelationName(rel)))); /* * Reject attempts to read non-local temporary relations; we would be * likely to get wrong data since we have no visibility into the owning * session's local buffers. */ if (RELATION_IS_OTHER_TEMP(rel)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot access temporary indexes of other sessions"))); /* Get the information we need from the metapage. */ memset(&stats, 0, sizeof(stats)); metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); stats.version = metap->hashm_version; stats.space_per_page = metap->hashm_bsize; _hash_relbuf(rel, metabuf); /* Get the current relation length */ nblocks = RelationGetNumberOfBlocks(rel); /* prepare access strategy for this index */ bstrategy = GetAccessStrategy(BAS_BULKREAD); /* Start from blkno 1 as 0th block is metapage */ for (blkno = 1; blkno < nblocks; blkno++) { Buffer buf; Page page; CHECK_FOR_INTERRUPTS(); buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy); LockBuffer(buf, BUFFER_LOCK_SHARE); page = (Page) BufferGetPage(buf); if (PageIsNew(page)) stats.unused_pages++; else if (PageGetSpecialSize(page) != MAXALIGN(sizeof(HashPageOpaqueData))) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("index \"%s\" contains corrupted page at block %u", RelationGetRelationName(rel), BufferGetBlockNumber(buf)))); else { HashPageOpaque opaque; int pagetype; opaque = (HashPageOpaque) PageGetSpecialPointer(page); pagetype = opaque->hasho_flag & LH_PAGE_TYPE; if (pagetype == LH_BUCKET_PAGE) { stats.bucket_pages++; GetHashPageStats(page, &stats); } else if (pagetype == LH_OVERFLOW_PAGE) { stats.overflow_pages++; GetHashPageStats(page, &stats); } else if (pagetype == LH_BITMAP_PAGE) stats.bitmap_pages++; else if (pagetype == LH_UNUSED_PAGE) stats.unused_pages++; else ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("unexpected page type 0x%04X in HASH index \"%s\" block %u", opaque->hasho_flag, RelationGetRelationName(rel), BufferGetBlockNumber(buf)))); } UnlockReleaseBuffer(buf); } /* Done accessing the index */ index_close(rel, AccessShareLock); /* Count unused pages as free space. */ stats.free_space += stats.unused_pages * stats.space_per_page; /* * Total space available for tuples excludes the metapage and the bitmap * pages. 
*/ total_space = (nblocks - (stats.bitmap_pages + 1)) * stats.space_per_page; if (total_space == 0) free_percent = 0.0; else free_percent = 100.0 * stats.free_space / total_space; /* * Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); tupleDesc = BlessTupleDesc(tupleDesc); /* * Build and return the tuple */ MemSet(nulls, 0, sizeof(nulls)); values[0] = Int32GetDatum(stats.version); values[1] = Int64GetDatum((int64) stats.bucket_pages); values[2] = Int64GetDatum((int64) stats.overflow_pages); values[3] = Int64GetDatum((int64) stats.bitmap_pages); values[4] = Int64GetDatum((int64) stats.unused_pages); values[5] = Int64GetDatum(stats.live_items); values[6] = Int64GetDatum(stats.dead_items); values[7] = Float8GetDatum(free_percent); tuple = heap_form_tuple(tupleDesc, values, nulls); PG_RETURN_DATUM(HeapTupleGetDatum(tuple)); }
/* * btbuild() -- build a new btree index. */ Datum btbuild(PG_FUNCTION_ARGS) { MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_DECLARE; Relation heap = (Relation) PG_GETARG_POINTER(0); Relation index = (Relation) PG_GETARG_POINTER(1); IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2); IndexBuildResult *result; double reltuples; BTBuildState buildstate; MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_ENTER; buildstate.isUnique = indexInfo->ii_Unique; buildstate.haveDead = false; buildstate.heapRel = heap; buildstate.spool = NULL; buildstate.spool2 = NULL; buildstate.indtuples = 0; #ifdef BTREE_BUILD_STATS if (log_btree_build_stats) ResetUsage(); #endif /* BTREE_BUILD_STATS */ /* * We expect to be called exactly once for any index relation. If that's * not the case, big trouble's what we have. */ if (RelationGetNumberOfBlocks(index) != 0) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); PG_TRY(); { buildstate.spool = _bt_spoolinit(index, indexInfo->ii_Unique, false); /* * If building a unique index, put dead tuples in a second spool to keep * them out of the uniqueness check. */ if (indexInfo->ii_Unique) buildstate.spool2 = _bt_spoolinit(index, false, true); /* do the heap scan */ reltuples = IndexBuildScan(heap, index, indexInfo, false, btbuildCallback, (void *) &buildstate); /* okay, all heap tuples are indexed */ if (buildstate.spool2 && !buildstate.haveDead) { /* spool2 turns out to be unnecessary */ _bt_spooldestroy(buildstate.spool2); buildstate.spool2 = NULL; } /* * Finish the build by (1) completing the sort of the spool file, (2) * inserting the sorted tuples into btree pages and (3) building the upper * levels. */ _bt_leafbuild(buildstate.spool, buildstate.spool2); _bt_spooldestroy(buildstate.spool); buildstate.spool = NULL; if (buildstate.spool2) { _bt_spooldestroy(buildstate.spool2); buildstate.spool2 = NULL; } } PG_CATCH(); { /* Clean up the sort state on error */ if (buildstate.spool) { _bt_spooldestroy(buildstate.spool); buildstate.spool = NULL; } if (buildstate.spool2) { _bt_spooldestroy(buildstate.spool2); buildstate.spool2 = NULL; } PG_RE_THROW(); } PG_END_TRY(); #ifdef BTREE_BUILD_STATS if (log_btree_build_stats) { ShowUsage("BTREE BUILD STATS"); ResetUsage(); } #endif /* BTREE_BUILD_STATS */ /* * If we are reindexing a pre-existing index, it is critical to send out a * relcache invalidation SI message to ensure all backends re-read the * index metapage. We expect that the caller will ensure that happens * (typically as a side effect of updating index stats, but it must happen * even if the stats don't change!) */ /* * Return statistics */ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_EXIT; PG_RETURN_POINTER(result); }
Datum ginbuild(PG_FUNCTION_ARGS) { Relation heap = (Relation) PG_GETARG_POINTER(0); Relation index = (Relation) PG_GETARG_POINTER(1); IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2); IndexBuildResult *result; double reltuples; GinBuildState buildstate; Buffer RootBuffer, MetaBuffer; ItemPointerData *list; Datum key; GinNullCategory category; uint32 nlist; MemoryContext oldCtx; OffsetNumber attnum; if (RelationGetNumberOfBlocks(index) != 0) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); initGinState(&buildstate.ginstate, index); buildstate.indtuples = 0; memset(&buildstate.buildStats, 0, sizeof(GinStatsData)); /* initialize the meta page */ MetaBuffer = GinNewBuffer(index); /* initialize the root page */ RootBuffer = GinNewBuffer(index); START_CRIT_SECTION(); GinInitMetabuffer(MetaBuffer); MarkBufferDirty(MetaBuffer); GinInitBuffer(RootBuffer, GIN_LEAF); MarkBufferDirty(RootBuffer); if (RelationNeedsWAL(index)) { XLogRecPtr recptr; XLogRecData rdata; Page page; rdata.buffer = InvalidBuffer; rdata.data = (char *) &(index->rd_node); rdata.len = sizeof(RelFileNode); rdata.next = NULL; recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_INDEX, &rdata); page = BufferGetPage(RootBuffer); PageSetLSN(page, recptr); page = BufferGetPage(MetaBuffer); PageSetLSN(page, recptr); } UnlockReleaseBuffer(MetaBuffer); UnlockReleaseBuffer(RootBuffer); END_CRIT_SECTION(); /* count the root as first entry page */ buildstate.buildStats.nEntryPages++; /* * create a temporary memory context that is reset once for each tuple * inserted into the index */ buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext, "Gin build temporary context", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); buildstate.funcCtx = AllocSetContextCreate(buildstate.tmpCtx, "Gin build temporary context for user-defined function", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); buildstate.accum.ginstate = &buildstate.ginstate; ginInitBA(&buildstate.accum); /* * Do the heap scan. We disallow sync scan here because dataPlaceToPage * prefers to receive tuples in TID order. */ reltuples = IndexBuildHeapScan(heap, index, indexInfo, false, ginBuildCallback, (void *) &buildstate); /* dump remaining entries to the index */ oldCtx = MemoryContextSwitchTo(buildstate.tmpCtx); ginBeginBAScan(&buildstate.accum); while ((list = ginGetBAEntry(&buildstate.accum, &attnum, &key, &category, &nlist)) != NULL) { /* there could be many entries, so be willing to abort here */ CHECK_FOR_INTERRUPTS(); ginEntryInsert(&buildstate.ginstate, attnum, key, category, list, nlist, &buildstate.buildStats); } MemoryContextSwitchTo(oldCtx); MemoryContextDelete(buildstate.tmpCtx); /* * Update metapage stats */ buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index); ginUpdateStats(index, &buildstate.buildStats); /* * Return statistics */ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; PG_RETURN_POINTER(result); }
IndexBuildResult * ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) { IndexBuildResult *result; double reltuples; GinBuildState buildstate; Buffer RootBuffer, MetaBuffer; ItemPointerData *list; Datum key; GinNullCategory category; uint32 nlist; MemoryContext oldCtx; OffsetNumber attnum; if (RelationGetNumberOfBlocks(index) != 0) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); initGinState(&buildstate.ginstate, index); buildstate.indtuples = 0; memset(&buildstate.buildStats, 0, sizeof(GinStatsData)); /* initialize the meta page */ MetaBuffer = GinNewBuffer(index); /* initialize the root page */ RootBuffer = GinNewBuffer(index); START_CRIT_SECTION(); GinInitMetabuffer(MetaBuffer); MarkBufferDirty(MetaBuffer); GinInitBuffer(RootBuffer, GIN_LEAF); MarkBufferDirty(RootBuffer); if (RelationNeedsWAL(index)) { XLogRecPtr recptr; Page page; XLogBeginInsert(); XLogRegisterBuffer(0, MetaBuffer, REGBUF_WILL_INIT | REGBUF_STANDARD); XLogRegisterBuffer(1, RootBuffer, REGBUF_WILL_INIT); recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_INDEX); page = BufferGetPage(RootBuffer); PageSetLSN(page, recptr); page = BufferGetPage(MetaBuffer); PageSetLSN(page, recptr); } UnlockReleaseBuffer(MetaBuffer); UnlockReleaseBuffer(RootBuffer); END_CRIT_SECTION(); /* count the root as first entry page */ buildstate.buildStats.nEntryPages++; /* * create a temporary memory context that is used to hold data not yet * dumped out to the index */ buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext, "Gin build temporary context", ALLOCSET_DEFAULT_SIZES); /* * create a temporary memory context that is used for calling * ginExtractEntries(), and can be reset after each tuple */ buildstate.funcCtx = AllocSetContextCreate(CurrentMemoryContext, "Gin build temporary context for user-defined function", ALLOCSET_DEFAULT_SIZES); buildstate.accum.ginstate = &buildstate.ginstate; ginInitBA(&buildstate.accum); /* * Do the heap scan. We disallow sync scan here because dataPlaceToPage * prefers to receive tuples in TID order. */ reltuples = IndexBuildHeapScan(heap, index, indexInfo, false, ginBuildCallback, (void *) &buildstate, NULL); /* dump remaining entries to the index */ oldCtx = MemoryContextSwitchTo(buildstate.tmpCtx); ginBeginBAScan(&buildstate.accum); while ((list = ginGetBAEntry(&buildstate.accum, &attnum, &key, &category, &nlist)) != NULL) { /* there could be many entries, so be willing to abort here */ CHECK_FOR_INTERRUPTS(); ginEntryInsert(&buildstate.ginstate, attnum, key, category, list, nlist, &buildstate.buildStats); } MemoryContextSwitchTo(oldCtx); MemoryContextDelete(buildstate.funcCtx); MemoryContextDelete(buildstate.tmpCtx); /* * Update metapage stats */ buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index); ginUpdateStats(index, &buildstate.buildStats); /* * Return statistics */ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; return result; }
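/*
 * Distilled sketch of the WAL-logging step in the ginbuild() above, using the
 * 9.5-and-later registration API.  The helper name is ours; the record
 * identifiers (RM_GIN_ID, XLOG_GIN_CREATE_INDEX) are the ones used above, but
 * which GIN header declares them varies across releases, so treat the include
 * below as an assumption.  REGBUF_WILL_INIT tells redo to reinitialize the
 * registered pages, so no page image or extra payload is needed.  As in the
 * function above, this runs inside a critical section with both buffers
 * exclusively locked and already marked dirty.
 */
#include "postgres.h"
#include "access/gin_private.h"		/* assumed location of XLOG_GIN_CREATE_INDEX */
#include "access/xloginsert.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"

static void
log_index_creation(Buffer metabuffer, Buffer rootbuffer)
{
	XLogRecPtr	recptr;

	XLogBeginInsert();
	XLogRegisterBuffer(0, metabuffer, REGBUF_WILL_INIT | REGBUF_STANDARD);
	XLogRegisterBuffer(1, rootbuffer, REGBUF_WILL_INIT);

	recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_INDEX);

	PageSetLSN(BufferGetPage(rootbuffer), recptr);
	PageSetLSN(BufferGetPage(metabuffer), recptr);
}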
Datum ginvacuumcleanup(PG_FUNCTION_ARGS) { IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1); Relation index = info->index; bool needLock; BlockNumber npages, blkno; BlockNumber totFreePages; GinState ginstate; GinStatsData idxStat; /* * In an autovacuum analyze, we want to clean up pending insertions. * Otherwise, an ANALYZE-only call is a no-op. */ if (info->analyze_only) { if (IsAutoVacuumWorkerProcess()) { initGinState(&ginstate, index); ginInsertCleanup(&ginstate, true, stats); } PG_RETURN_POINTER(stats); } /* * Set up all-zero stats and cleanup pending inserts if ginbulkdelete * wasn't called */ if (stats == NULL) { stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); initGinState(&ginstate, index); ginInsertCleanup(&ginstate, true, stats); } memset(&idxStat, 0, sizeof(idxStat)); /* * XXX we always report the heap tuple count as the number of index * entries. This is bogus if the index is partial, but it's real hard to * tell how many distinct heap entries are referenced by a GIN index. */ stats->num_index_tuples = info->num_heap_tuples; stats->estimated_count = info->estimated_count; /* * Need lock unless it's local to this backend. */ needLock = !RELATION_IS_LOCAL(index); if (needLock) LockRelationForExtension(index, ExclusiveLock); npages = RelationGetNumberOfBlocks(index); if (needLock) UnlockRelationForExtension(index, ExclusiveLock); totFreePages = 0; for (blkno = GIN_ROOT_BLKNO; blkno < npages; blkno++) { Buffer buffer; Page page; vacuum_delay_point(); buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); LockBuffer(buffer, GIN_SHARE); page = (Page) BufferGetPage(buffer); if (GinPageIsDeleted(page)) { Assert(blkno != GIN_ROOT_BLKNO); RecordFreeIndexPage(index, blkno); totFreePages++; } else if (GinPageIsData(page)) { idxStat.nDataPages++; } else if (!GinPageIsList(page)) { idxStat.nEntryPages++; if (GinPageIsLeaf(page)) idxStat.nEntries += PageGetMaxOffsetNumber(page); } UnlockReleaseBuffer(buffer); } /* Update the metapage with accurate page and entry counts */ idxStat.nTotalPages = npages; ginUpdateStats(info->index, &idxStat); /* Finally, vacuum the FSM */ IndexFreeSpaceMapVacuum(info->index); stats->pages_free = totFreePages; if (needLock) LockRelationForExtension(index, ExclusiveLock); stats->num_pages = RelationGetNumberOfBlocks(index); if (needLock) UnlockRelationForExtension(index, ExclusiveLock); PG_RETURN_POINTER(stats); }
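/*
 * Illustration (not part of ginvacuumcleanup) of where the statistics
 * gathered above end up: ginUpdateStats() writes them into the GIN metapage,
 * and they can be read back later with ginGetStats(), which is how the
 * planner's cost estimation obtains page and entry counts without scanning
 * the index.  The helper name below is ours.
 */
#include "postgres.h"
#include "access/gin.h"
#include "utils/rel.h"

static void
report_gin_stats(Relation index)
{
	GinStatsData stats;

	ginGetStats(index, &stats);
	elog(DEBUG1,
		 "gin index \"%s\": %u total pages, %u entry pages, %u data pages, "
		 "%u pending pages, " INT64_FORMAT " entries",
		 RelationGetRelationName(index),
		 stats.nTotalPages, stats.nEntryPages, stats.nDataPages,
		 stats.nPendingPages, stats.nEntries);
}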
/* * btvacuumscan --- scan the index for VACUUMing purposes * * This combines the functions of looking for leaf tuples that are deletable * according to the vacuum callback, looking for empty pages that can be * deleted, and looking for old deleted pages that can be recycled. Both * btbulkdelete and btvacuumcleanup invoke this (the latter only if no * btbulkdelete call occurred). * * The caller is responsible for initially allocating/zeroing a stats struct * and for obtaining a vacuum cycle ID if necessary. */ static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, BTCycleId cycleid) { Relation rel = info->index; BTVacState vstate; BlockNumber num_pages; BlockNumber blkno; bool needLock; /* * Reset counts that will be incremented during the scan; needed in case * of multiple scans during a single VACUUM command */ stats->estimated_count = false; stats->num_index_tuples = 0; stats->pages_deleted = 0; /* Set up info to pass down to btvacuumpage */ vstate.info = info; vstate.stats = stats; vstate.callback = callback; vstate.callback_state = callback_state; vstate.cycleid = cycleid; vstate.lastBlockVacuumed = BTREE_METAPAGE; /* Initialise at first block */ vstate.lastBlockLocked = BTREE_METAPAGE; vstate.totFreePages = 0; /* Create a temporary memory context to run _bt_pagedel in */ vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext, "_bt_pagedel", ALLOCSET_DEFAULT_SIZES); /* * The outer loop iterates over all index pages except the metapage, in * physical order (we hope the kernel will cooperate in providing * read-ahead for speed). It is critical that we visit all leaf pages, * including ones added after we start the scan, else we might fail to * delete some deletable tuples. Hence, we must repeatedly check the * relation length. We must acquire the relation-extension lock while * doing so to avoid a race condition: if someone else is extending the * relation, there is a window where bufmgr/smgr have created a new * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If * we manage to scan such a page here, we'll improperly assume it can be * recycled. Taking the lock synchronizes things enough to prevent a * problem: either num_pages won't include the new page, or _bt_getbuf * already has write lock on the buffer and it will be fully initialized * before we can examine it. (See also vacuumlazy.c, which has the same * issue.) Also, we need not worry if a page is added immediately after * we look; the page splitting code already has write-lock on the left * page before it adds a right page, so we must already have processed any * tuples due to be moved into such a page. * * We can skip locking for new or temp relations, however, since no one * else could be accessing them. */ needLock = !RELATION_IS_LOCAL(rel); blkno = BTREE_METAPAGE + 1; for (;;) { /* Get the current relation length */ if (needLock) LockRelationForExtension(rel, ExclusiveLock); num_pages = RelationGetNumberOfBlocks(rel); if (needLock) UnlockRelationForExtension(rel, ExclusiveLock); /* Quit if we've scanned the whole relation */ if (blkno >= num_pages) break; /* Iterate over pages, then loop back to recheck length */ for (; blkno < num_pages; blkno++) { btvacuumpage(&vstate, blkno, blkno); } } /* * Check to see if we need to issue one final WAL record for this index, * which may be needed for correctness on a hot standby node when non-MVCC * index scans could take place. 
* * If the WAL is replayed in hot standby, the replay process needs to get * cleanup locks on all index leaf pages, just as we've been doing here. * However, we won't issue any WAL records about pages that have no items * to be deleted. For pages between pages we've vacuumed, the replay code * will take locks under the direction of the lastBlockVacuumed fields in * the XLOG_BTREE_VACUUM WAL records. To cover pages after the last one * we vacuum, we need to issue a dummy XLOG_BTREE_VACUUM WAL record * against the last leaf page in the index, if that one wasn't vacuumed. */ if (XLogStandbyInfoActive() && vstate.lastBlockVacuumed < vstate.lastBlockLocked) { Buffer buf; /* * The page should be valid, but we can't use _bt_getbuf() because we * want to use a nondefault buffer access strategy. Since we aren't * going to delete any items, getting cleanup lock again is probably * overkill, but for consistency do that anyway. */ buf = ReadBufferExtended(rel, MAIN_FORKNUM, vstate.lastBlockLocked, RBM_NORMAL, info->strategy); LockBufferForCleanup(buf); _bt_checkpage(rel, buf); _bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed); _bt_relbuf(rel, buf); } MemoryContextDelete(vstate.pagedelcontext); /* update statistics */ stats->num_pages = num_pages; stats->pages_free = vstate.totFreePages; }
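/*
 * Distilled sketch of the length-recheck loop used above (assumptions: a
 * generic per-page callback "process_page" and helper name "scan_all_blocks";
 * this is not a drop-in replacement for btvacuumscan).  The relation-extension
 * lock is taken only long enough to read a stable length; pages added after we
 * look are either covered by the next iteration of the outer loop or were
 * already handled via the page-split ordering argument in the comment above.
 */
#include "postgres.h"
#include "storage/bufmgr.h"
#include "storage/lmgr.h"
#include "utils/rel.h"

extern void process_page(Relation rel, BlockNumber blkno);	/* hypothetical */

static void
scan_all_blocks(Relation rel, BlockNumber first)
{
	bool		needLock = !RELATION_IS_LOCAL(rel);
	BlockNumber blkno = first;

	for (;;)
	{
		BlockNumber num_pages;

		/* Re-read the current relation length under the extension lock */
		if (needLock)
			LockRelationForExtension(rel, ExclusiveLock);
		num_pages = RelationGetNumberOfBlocks(rel);
		if (needLock)
			UnlockRelationForExtension(rel, ExclusiveLock);

		if (blkno >= num_pages)
			break;				/* every page has been visited at least once */

		for (; blkno < num_pages; blkno++)
			process_page(rel, blkno);
	}
}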
Datum spgstat(PG_FUNCTION_ARGS) { text *name = PG_GETARG_TEXT_P(0); char *relname = text_to_cstring(name); RangeVar *relvar; Relation index; List *relname_list; Oid relOid; BlockNumber blkno = SPGIST_HEAD_BLKNO; BlockNumber totalPages = 0, innerPages = 0, emptyPages = 0; double usedSpace = 0.0; char res[1024]; int bufferSize = -1; int64 innerTuples = 0, leafTuples = 0; relname_list = stringToQualifiedNameList(relname); relvar = makeRangeVarFromNameList(relname_list); relOid = RangeVarGetRelid(relvar, false); index = index_open(relOid, AccessExclusiveLock); if (index->rd_am == NULL) elog(ERROR, "Relation %s.%s is not an index", get_namespace_name(RelationGetNamespace(index)), RelationGetRelationName(index)); totalPages = RelationGetNumberOfBlocks(index); for (blkno = SPGIST_HEAD_BLKNO; blkno < totalPages; blkno++) { Buffer buffer; Page page; buffer = ReadBuffer(index, blkno); LockBuffer(buffer, BUFFER_LOCK_SHARE); page = BufferGetPage(buffer); if (SpGistPageIsLeaf(page)) { leafTuples += SpGistPageGetMaxOffset(page); } else { innerPages++; innerTuples += SpGistPageGetMaxOffset(page); } if (bufferSize < 0) bufferSize = BufferGetPageSize(buffer) - MAXALIGN(sizeof(SpGistPageOpaqueData)) - SizeOfPageHeaderData; usedSpace += bufferSize - (PageGetFreeSpace(page) + sizeof(ItemIdData)); if (PageGetFreeSpace(page) + sizeof(ItemIdData) == bufferSize) emptyPages++; UnlockReleaseBuffer(buffer); } index_close(index, AccessExclusiveLock); totalPages--; /* metapage */ snprintf(res, sizeof(res), "totalPages: %u\n" "innerPages: %u\n" "leafPages: %u\n" "emptyPages: %u\n" "usedSpace: %.2f kbytes\n" "freeSpace: %.2f kbytes\n" "fillRatio: %.2f%c\n" "leafTuples: %lld\n" "innerTuples: %lld", totalPages, innerPages, totalPages - innerPages, emptyPages, usedSpace / 1024.0, (((double) bufferSize) * ((double) totalPages) - usedSpace) / 1024, 100.0 * (usedSpace / (((double) bufferSize) * ((double) totalPages))), '%', leafTuples, innerTuples); PG_RETURN_TEXT_P(CStringGetTextDatum(res)); }
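/*
 * Side note on the arithmetic above (illustration only; the helpers and the
 * "specialSize" parameter are ours, standing in for
 * sizeof(SpGistPageOpaqueData)).  The per-page capacity spgstat() works with
 * is the block size minus the page header and the MAXALIGN'd special space;
 * the fill ratio is the used space divided by that capacity summed over the
 * scanned pages.
 */
#include "postgres.h"
#include "storage/bufpage.h"

static int
page_capacity(Size specialSize)
{
	return BLCKSZ - SizeOfPageHeaderData - (int) MAXALIGN(specialSize);
}

static double
fill_ratio_percent(double usedSpace, int capacity, BlockNumber npages)
{
	double		total = (double) capacity * (double) npages;

	return (total > 0) ? 100.0 * (usedSpace / total) : 0.0;
}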
/* * lazy_scan_heap() -- scan an open heap relation * * This routine sets commit status bits, builds lists of dead tuples * and pages with free space, and calculates statistics on the number * of live tuples in the heap. When done, or when we run low on space * for dead-tuple TIDs, invoke vacuuming of indexes and heap. * * If there are no indexes then we just vacuum each dirty page as we * process it, since there's no point in gathering many tuples. */ static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, Relation *Irel, int nindexes, bool scan_all) { BlockNumber nblocks, blkno; HeapTupleData tuple; char *relname; BlockNumber empty_pages, vacuumed_pages; double num_tuples, tups_vacuumed, nkeep, nunused; IndexBulkDeleteResult **indstats; int i; PGRUsage ru0; Buffer vmbuffer = InvalidBuffer; BlockNumber next_not_all_visible_block; bool skipping_all_visible_blocks; pg_rusage_init(&ru0); relname = RelationGetRelationName(onerel); ereport(elevel, (errmsg("vacuuming \"%s.%s\"", get_namespace_name(RelationGetNamespace(onerel)), relname))); empty_pages = vacuumed_pages = 0; num_tuples = tups_vacuumed = nkeep = nunused = 0; indstats = (IndexBulkDeleteResult **) palloc0(nindexes * sizeof(IndexBulkDeleteResult *)); nblocks = RelationGetNumberOfBlocks(onerel); vacrelstats->rel_pages = nblocks; vacrelstats->scanned_pages = 0; vacrelstats->nonempty_pages = 0; vacrelstats->latestRemovedXid = InvalidTransactionId; lazy_space_alloc(vacrelstats, nblocks); /* * We want to skip pages that don't require vacuuming according to the * visibility map, but only when we can skip at least SKIP_PAGES_THRESHOLD * consecutive pages. Since we're reading sequentially, the OS should be * doing readahead for us, so there's no gain in skipping a page now and * then; that's likely to disable readahead and so be counterproductive. * Also, skipping even a single page means that we can't update * relfrozenxid, so we only want to do it if we can skip a goodly number * of pages. * * Before entering the main loop, establish the invariant that * next_not_all_visible_block is the next block number >= blkno that's not * all-visible according to the visibility map, or nblocks if there's no * such block. Also, we set up the skipping_all_visible_blocks flag, * which is needed because we need hysteresis in the decision: once we've * started skipping blocks, we may as well skip everything up to the next * not-all-visible block. * * Note: if scan_all is true, we won't actually skip any pages; but we * maintain next_not_all_visible_block anyway, so as to set up the * all_visible_according_to_vm flag correctly for each page. 
*/ for (next_not_all_visible_block = 0; next_not_all_visible_block < nblocks; next_not_all_visible_block++) { if (!visibilitymap_test(onerel, next_not_all_visible_block, &vmbuffer)) break; vacuum_delay_point(); } if (next_not_all_visible_block >= SKIP_PAGES_THRESHOLD) skipping_all_visible_blocks = true; else skipping_all_visible_blocks = false; for (blkno = 0; blkno < nblocks; blkno++) { Buffer buf; Page page; OffsetNumber offnum, maxoff; bool tupgone, hastup; int prev_dead_count; OffsetNumber frozen[MaxOffsetNumber]; int nfrozen; Size freespace; bool all_visible_according_to_vm; bool all_visible; bool has_dead_tuples; if (blkno == next_not_all_visible_block) { /* Time to advance next_not_all_visible_block */ for (next_not_all_visible_block++; next_not_all_visible_block < nblocks; next_not_all_visible_block++) { if (!visibilitymap_test(onerel, next_not_all_visible_block, &vmbuffer)) break; vacuum_delay_point(); } /* * We know we can't skip the current block. But set up * skipping_all_visible_blocks to do the right thing at the * following blocks. */ if (next_not_all_visible_block - blkno > SKIP_PAGES_THRESHOLD) skipping_all_visible_blocks = true; else skipping_all_visible_blocks = false; all_visible_according_to_vm = false; } else { /* Current block is all-visible */ if (skipping_all_visible_blocks && !scan_all) continue; all_visible_according_to_vm = true; } vacuum_delay_point(); vacrelstats->scanned_pages++; /* * If we are close to overrunning the available space for dead-tuple * TIDs, pause and do a cycle of vacuuming before we tackle this page. */ if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage && vacrelstats->num_dead_tuples > 0) { /* Log cleanup info before we touch indexes */ vacuum_log_cleanup_info(onerel, vacrelstats); /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats); /* Remove tuples from heap */ lazy_vacuum_heap(onerel, vacrelstats); /* * Forget the now-vacuumed tuples, and press on, but be careful * not to reset latestRemovedXid since we want that value to be * valid. */ vacrelstats->num_dead_tuples = 0; vacrelstats->num_index_scans++; } buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno, RBM_NORMAL, vac_strategy); /* We need buffer cleanup lock so that we can prune HOT chains. */ LockBufferForCleanup(buf); page = BufferGetPage(buf); if (PageIsNew(page)) { /* * An all-zeroes page could be left over if a backend extends the * relation but crashes before initializing the page. Reclaim such * pages for use. * * We have to be careful here because we could be looking at a * page that someone has just added to the relation and not yet * been able to initialize (see RelationGetBufferForTuple). To * protect against that, release the buffer lock, grab the * relation extension lock momentarily, and re-lock the buffer. If * the page is still uninitialized by then, it must be left over * from a crashed backend, and we can initialize it. * * We don't really need the relation lock when this is a new or * temp relation, but it's probably not worth the code space to * check that, since this surely isn't a critical path. * * Note: the comparable code in vacuum.c need not worry because * it's got exclusive lock on the whole relation. 
*/ LockBuffer(buf, BUFFER_LOCK_UNLOCK); LockRelationForExtension(onerel, ExclusiveLock); UnlockRelationForExtension(onerel, ExclusiveLock); LockBufferForCleanup(buf); if (PageIsNew(page)) { ereport(WARNING, (errmsg("relation \"%s\" page %u is uninitialized --- fixing", relname, blkno))); PageInit(page, BufferGetPageSize(buf), 0); empty_pages++; } freespace = PageGetHeapFreeSpace(page); MarkBufferDirty(buf); UnlockReleaseBuffer(buf); RecordPageWithFreeSpace(onerel, blkno, freespace); continue; } if (PageIsEmpty(page)) { empty_pages++; freespace = PageGetHeapFreeSpace(page); if (!PageIsAllVisible(page)) { PageSetAllVisible(page); SetBufferCommitInfoNeedsSave(buf); } LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* Update the visibility map */ if (!all_visible_according_to_vm) { visibilitymap_pin(onerel, blkno, &vmbuffer); LockBuffer(buf, BUFFER_LOCK_SHARE); if (PageIsAllVisible(page)) visibilitymap_set(onerel, blkno, PageGetLSN(page), &vmbuffer); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } ReleaseBuffer(buf); RecordPageWithFreeSpace(onerel, blkno, freespace); continue; } /* * Prune all HOT-update chains in this page. * * We count tuples removed by the pruning step as removed by VACUUM. */ tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false, &vacrelstats->latestRemovedXid); /* * Now scan the page to collect vacuumable items and check for tuples * requiring freezing. */ all_visible = true; has_dead_tuples = false; nfrozen = 0; hastup = false; prev_dead_count = vacrelstats->num_dead_tuples; maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId itemid; itemid = PageGetItemId(page, offnum); /* Unused items require no processing, but we count 'em */ if (!ItemIdIsUsed(itemid)) { nunused += 1; continue; } /* Redirect items mustn't be touched */ if (ItemIdIsRedirected(itemid)) { hastup = true; /* this page won't be truncatable */ continue; } ItemPointerSet(&(tuple.t_self), blkno, offnum); /* * DEAD item pointers are to be vacuumed normally; but we don't * count them in tups_vacuumed, else we'd be double-counting (at * least in the common case where heap_page_prune() just freed up * a non-HOT tuple). */ if (ItemIdIsDead(itemid)) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); all_visible = false; continue; } Assert(ItemIdIsNormal(itemid)); tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); tupgone = false; switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf)) { case HEAPTUPLE_DEAD: /* * Ordinarily, DEAD tuples would have been removed by * heap_page_prune(), but it's possible that the tuple * state changed since heap_page_prune() looked. In * particular an INSERT_IN_PROGRESS tuple could have * changed to DEAD if the inserter aborted. So this * cannot be considered an error condition. * * If the tuple is HOT-updated then it must only be * removed by a prune operation; so we keep it just as if * it were RECENTLY_DEAD. Also, if it's a heap-only * tuple, we choose to keep it, because it'll be a lot * cheaper to get rid of it in the next pruning pass than * to treat it like an indexed tuple. 
*/ if (HeapTupleIsHotUpdated(&tuple) || HeapTupleIsHeapOnly(&tuple)) nkeep += 1; else tupgone = true; /* we can delete the tuple */ all_visible = false; break; case HEAPTUPLE_LIVE: /* Tuple is good --- but let's do some validity checks */ if (onerel->rd_rel->relhasoids && !OidIsValid(HeapTupleGetOid(&tuple))) elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid", relname, blkno, offnum); /* * Is the tuple definitely visible to all transactions? * * NB: Like with per-tuple hint bits, we can't set the * PD_ALL_VISIBLE flag if the inserter committed * asynchronously. See SetHintBits for more info. Check * that the HEAP_XMIN_COMMITTED hint bit is set because of * that. */ if (all_visible) { TransactionId xmin; if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED)) { all_visible = false; break; } /* * The inserter definitely committed. But is it old * enough that everyone sees it as committed? */ xmin = HeapTupleHeaderGetXmin(tuple.t_data); if (!TransactionIdPrecedes(xmin, OldestXmin)) { all_visible = false; break; } } break; case HEAPTUPLE_RECENTLY_DEAD: /* * If tuple is recently deleted then we must not remove it * from relation. */ nkeep += 1; all_visible = false; break; case HEAPTUPLE_INSERT_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ all_visible = false; break; case HEAPTUPLE_DELETE_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ all_visible = false; break; default: elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); break; } if (tupgone) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data, &vacrelstats->latestRemovedXid); tups_vacuumed += 1; has_dead_tuples = true; } else { num_tuples += 1; hastup = true; /* * Each non-removable tuple must be checked to see if it needs * freezing. Note we already have exclusive buffer lock. */ if (heap_freeze_tuple(tuple.t_data, FreezeLimit, InvalidBuffer)) frozen[nfrozen++] = offnum; } } /* scan along page */ /* * If we froze any tuples, mark the buffer dirty, and write a WAL * record recording the changes. We must log the changes to be * crash-safe against future truncation of CLOG. */ if (nfrozen > 0) { MarkBufferDirty(buf); if (RelationNeedsWAL(onerel)) { XLogRecPtr recptr; recptr = log_heap_freeze(onerel, buf, FreezeLimit, frozen, nfrozen); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } } /* * If there are no indexes then we can vacuum the page right now * instead of doing a second scan. */ if (nindexes == 0 && vacrelstats->num_dead_tuples > 0) { /* Remove tuples from heap */ lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats); /* * Forget the now-vacuumed tuples, and press on, but be careful * not to reset latestRemovedXid since we want that value to be * valid. */ vacrelstats->num_dead_tuples = 0; vacuumed_pages++; } freespace = PageGetHeapFreeSpace(page); /* Update the all-visible flag on the page */ if (!PageIsAllVisible(page) && all_visible) { PageSetAllVisible(page); SetBufferCommitInfoNeedsSave(buf); } /* * It's possible for the value returned by GetOldestXmin() to move * backwards, so it's not wrong for us to see tuples that appear to * not be visible to everyone yet, while PD_ALL_VISIBLE is already * set. The real safe xmin value never moves backwards, but * GetOldestXmin() is conservative and sometimes returns a value * that's unnecessarily small, so if we see that contradiction it just * means that the tuples that we think are not visible to everyone yet * actually are, and the PD_ALL_VISIBLE flag is correct. 
* * There should never be dead tuples on a page with PD_ALL_VISIBLE * set, however. */ else if (PageIsAllVisible(page) && has_dead_tuples) { elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u", relname, blkno); PageClearAllVisible(page); SetBufferCommitInfoNeedsSave(buf); /* * Normally, we would drop the lock on the heap page before * updating the visibility map, but since this case shouldn't * happen anyway, don't worry about that. */ visibilitymap_clear(onerel, blkno); } LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* Update the visibility map */ if (!all_visible_according_to_vm && all_visible) { visibilitymap_pin(onerel, blkno, &vmbuffer); LockBuffer(buf, BUFFER_LOCK_SHARE); if (PageIsAllVisible(page)) visibilitymap_set(onerel, blkno, PageGetLSN(page), &vmbuffer); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } ReleaseBuffer(buf); /* Remember the location of the last page with nonremovable tuples */ if (hastup) vacrelstats->nonempty_pages = blkno + 1; /* * If we remembered any tuples for deletion, then the page will be * visited again by lazy_vacuum_heap, which will compute and record * its post-compaction free space. If not, then we're done with this * page, so remember its free space as-is. (This path will always be * taken if there are no indexes.) */ if (vacrelstats->num_dead_tuples == prev_dead_count) RecordPageWithFreeSpace(onerel, blkno, freespace); } /* save stats for use later */ vacrelstats->scanned_tuples = num_tuples; vacrelstats->tuples_deleted = tups_vacuumed; /* now we can compute the new value for pg_class.reltuples */ vacrelstats->new_rel_tuples = vac_estimate_reltuples(onerel, false, nblocks, vacrelstats->scanned_pages, num_tuples); /* If any tuples need to be deleted, perform final vacuum cycle */ /* XXX put a threshold on min number of tuples here? */ if (vacrelstats->num_dead_tuples > 0) { /* Log cleanup info before we touch indexes */ vacuum_log_cleanup_info(onerel, vacrelstats); /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats); /* Remove tuples from heap */ lazy_vacuum_heap(onerel, vacrelstats); vacrelstats->num_index_scans++; } /* Release the pin on the visibility map page */ if (BufferIsValid(vmbuffer)) { ReleaseBuffer(vmbuffer); vmbuffer = InvalidBuffer; } /* Do post-vacuum cleanup and statistics update for each index */ for (i = 0; i < nindexes; i++) lazy_cleanup_index(Irel[i], indstats[i], vacrelstats); /* If no indexes, make log report that lazy_vacuum_heap would've made */ if (vacuumed_pages) ereport(elevel, (errmsg("\"%s\": removed %.0f row versions in %u pages", RelationGetRelationName(onerel), tups_vacuumed, vacuumed_pages))); ereport(elevel, (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages", RelationGetRelationName(onerel), tups_vacuumed, num_tuples, vacrelstats->scanned_pages, nblocks), errdetail("%.0f dead row versions cannot be removed yet.\n" "There were %.0f unused item pointers.\n" "%u pages are entirely empty.\n" "%s.", nkeep, nunused, empty_pages, pg_rusage_show(&ru0)))); }
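/*
 * Distilled sketch of the visibility-map skipping decision described in the
 * comments above (illustration only: SKIP_THRESHOLD stands in for
 * SKIP_PAGES_THRESHOLD, the helper names are ours, and worth_skipping() is a
 * simplification of the in-loop test).  visibilitymap_test() is used with the
 * same signature as in the code above; it was renamed in later releases.
 * Skipping is only worthwhile for runs of at least SKIP_THRESHOLD consecutive
 * all-visible blocks: shorter skips defeat OS readahead, and any skip at all
 * prevents advancing relfrozenxid, so the hysteresis keeps the scan from
 * flip-flopping between skipping and reading.
 */
#include "postgres.h"
#include "access/visibilitymap.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

#define SKIP_THRESHOLD	32		/* stand-in for SKIP_PAGES_THRESHOLD */

/* first block >= start that is not all-visible, or nblocks if there is none */
static BlockNumber
next_not_all_visible(Relation rel, BlockNumber start, BlockNumber nblocks,
					 Buffer *vmbuffer)
{
	BlockNumber blk;

	for (blk = start; blk < nblocks; blk++)
	{
		if (!visibilitymap_test(rel, blk, vmbuffer))
			break;
	}
	return blk;
}

/* is the run of all-visible blocks ahead long enough to be worth skipping? */
static bool
worth_skipping(BlockNumber current, BlockNumber next_not_all_visible_block)
{
	return (next_not_all_visible_block - current) > SKIP_THRESHOLD;
}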
/* * get_relation_info - * Retrieves catalog information for a given relation. * * Given the Oid of the relation, return the following info into fields * of the RelOptInfo struct: * * min_attr lowest valid AttrNumber * max_attr highest valid AttrNumber * indexlist list of IndexOptInfos for relation's indexes * pages number of pages * tuples number of tuples * * Also, initialize the attr_needed[] and attr_widths[] arrays. In most * cases these are left as zeroes, but sometimes we need to compute attr * widths here, and we may as well cache the results for costsize.c. * * If inhparent is true, all we need to do is set up the attr arrays: * the RelOptInfo actually represents the appendrel formed by an inheritance * tree, and so the parent rel's physical size and index information isn't * important for it. */ void get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, RelOptInfo *rel) { Index varno = rel->relid; Relation relation; bool hasindex; List *indexinfos = NIL; /* * We need not lock the relation since it was already locked, either by * the rewriter or when expand_inherited_rtentry() added it to the query's * rangetable. */ relation = heap_open(relationObjectId, NoLock); /* Temporary and unlogged relations are inaccessible during recovery. */ if (!RelationNeedsWAL(relation) && RecoveryInProgress()) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot access temporary or unlogged relations during recovery"))); rel->min_attr = FirstLowInvalidHeapAttributeNumber + 1; rel->max_attr = RelationGetNumberOfAttributes(relation); rel->reltablespace = RelationGetForm(relation)->reltablespace; Assert(rel->max_attr >= rel->min_attr); rel->attr_needed = (Relids *) palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(Relids)); rel->attr_widths = (int32 *) palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(int32)); /* * Estimate relation size --- unless it's an inheritance parent, in which * case the size will be computed later in set_append_rel_pathlist, and we * must leave it zero for now to avoid bollixing the total_table_pages * calculation. */ if (!inhparent) estimate_rel_size(relation, rel->attr_widths - rel->min_attr, &rel->pages, &rel->tuples, &rel->allvisfrac); /* * Make list of indexes. Ignore indexes on system catalogs if told to. * Don't bother with indexes for an inheritance parent, either. */ if (inhparent || (IgnoreSystemIndexes && IsSystemClass(relation->rd_rel))) hasindex = false; else hasindex = relation->rd_rel->relhasindex; if (hasindex) { List *indexoidlist; ListCell *l; LOCKMODE lmode; indexoidlist = RelationGetIndexList(relation); /* * For each index, we get the same type of lock that the executor will * need, and do not release it. This saves a couple of trips to the * shared lock manager while not creating any real loss of * concurrency, because no schema changes could be happening on the * index while we hold lock on the parent rel, and neither lock type * blocks any other kind of index operation. */ if (rel->relid == root->parse->resultRelation) lmode = RowExclusiveLock; else lmode = AccessShareLock; foreach(l, indexoidlist) { Oid indexoid = lfirst_oid(l); Relation indexRelation; Form_pg_index index; IndexOptInfo *info; int ncolumns; int i; /* * Extract info from the relation descriptor for the index. */ indexRelation = index_open(indexoid, lmode); index = indexRelation->rd_index; /* * Ignore invalid indexes, since they can't safely be used for * queries. 
Note that this is OK because the data structure we * are constructing is only used by the planner --- the executor * still needs to insert into "invalid" indexes! */ if (!index->indisvalid) { index_close(indexRelation, NoLock); continue; } /* * If the index is valid, but cannot yet be used, ignore it; but * mark the plan we are generating as transient. See * src/backend/access/heap/README.HOT for discussion. */ if (index->indcheckxmin && !TransactionIdPrecedes(HeapTupleHeaderGetXmin(indexRelation->rd_indextuple->t_data), TransactionXmin)) { root->glob->transientPlan = true; index_close(indexRelation, NoLock); continue; } info = makeNode(IndexOptInfo); info->indexoid = index->indexrelid; info->reltablespace = RelationGetForm(indexRelation)->reltablespace; info->rel = rel; info->ncolumns = ncolumns = index->indnatts; info->indexkeys = (int *) palloc(sizeof(int) * ncolumns); info->indexcollations = (Oid *) palloc(sizeof(Oid) * ncolumns); info->opfamily = (Oid *) palloc(sizeof(Oid) * ncolumns); info->opcintype = (Oid *) palloc(sizeof(Oid) * ncolumns); for (i = 0; i < ncolumns; i++) { info->indexkeys[i] = index->indkey.values[i]; info->indexcollations[i] = indexRelation->rd_indcollation[i]; info->opfamily[i] = indexRelation->rd_opfamily[i]; info->opcintype[i] = indexRelation->rd_opcintype[i]; } info->relam = indexRelation->rd_rel->relam; info->amcostestimate = indexRelation->rd_am->amcostestimate; info->canreturn = index_can_return(indexRelation); info->amcanorderbyop = indexRelation->rd_am->amcanorderbyop; info->amoptionalkey = indexRelation->rd_am->amoptionalkey; info->amsearcharray = indexRelation->rd_am->amsearcharray; info->amsearchnulls = indexRelation->rd_am->amsearchnulls; info->amhasgettuple = OidIsValid(indexRelation->rd_am->amgettuple); info->amhasgetbitmap = OidIsValid(indexRelation->rd_am->amgetbitmap); /* * Fetch the ordering information for the index, if any. */ if (info->relam == BTREE_AM_OID) { /* * If it's a btree index, we can use its opfamily OIDs * directly as the sort ordering opfamily OIDs. */ Assert(indexRelation->rd_am->amcanorder); info->sortopfamily = info->opfamily; info->reverse_sort = (bool *) palloc(sizeof(bool) * ncolumns); info->nulls_first = (bool *) palloc(sizeof(bool) * ncolumns); for (i = 0; i < ncolumns; i++) { int16 opt = indexRelation->rd_indoption[i]; info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0; info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0; } } else if (indexRelation->rd_am->amcanorder) { /* * Otherwise, identify the corresponding btree opfamilies by * trying to map this index's "<" operators into btree. Since * "<" uniquely defines the behavior of a sort order, this is * a sufficient test. * * XXX This method is rather slow and also requires the * undesirable assumption that the other index AM numbers its * strategies the same as btree. It'd be better to have a way * to explicitly declare the corresponding btree opfamily for * each opfamily of the other index type. But given the lack * of current or foreseeable amcanorder index types, it's not * worth expending more effort on now. 
*/ info->sortopfamily = (Oid *) palloc(sizeof(Oid) * ncolumns); info->reverse_sort = (bool *) palloc(sizeof(bool) * ncolumns); info->nulls_first = (bool *) palloc(sizeof(bool) * ncolumns); for (i = 0; i < ncolumns; i++) { int16 opt = indexRelation->rd_indoption[i]; Oid ltopr; Oid btopfamily; Oid btopcintype; int16 btstrategy; info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0; info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0; ltopr = get_opfamily_member(info->opfamily[i], info->opcintype[i], info->opcintype[i], BTLessStrategyNumber); if (OidIsValid(ltopr) && get_ordering_op_properties(ltopr, &btopfamily, &btopcintype, &btstrategy) && btopcintype == info->opcintype[i] && btstrategy == BTLessStrategyNumber) { /* Successful mapping */ info->sortopfamily[i] = btopfamily; } else { /* Fail ... quietly treat index as unordered */ info->sortopfamily = NULL; info->reverse_sort = NULL; info->nulls_first = NULL; break; } } } else { info->sortopfamily = NULL; info->reverse_sort = NULL; info->nulls_first = NULL; } /* * Fetch the index expressions and predicate, if any. We must * modify the copies we obtain from the relcache to have the * correct varno for the parent relation, so that they match up * correctly against qual clauses. */ info->indexprs = RelationGetIndexExpressions(indexRelation); info->indpred = RelationGetIndexPredicate(indexRelation); if (info->indexprs && varno != 1) ChangeVarNodes((Node *) info->indexprs, 1, varno, 0); if (info->indpred && varno != 1) ChangeVarNodes((Node *) info->indpred, 1, varno, 0); /* Build targetlist using the completed indexprs data */ info->indextlist = build_index_tlist(root, info, relation); info->predOK = false; /* set later in indxpath.c */ info->unique = index->indisunique; info->immediate = index->indimmediate; info->hypothetical = false; /* * Estimate the index size. If it's not a partial index, we lock * the number-of-tuples estimate to equal the parent table; if it * is partial then we have to use the same methods as we would for * a table, except we can be sure that the index is not larger * than the table. */ if (info->indpred == NIL) { info->pages = RelationGetNumberOfBlocks(indexRelation); info->tuples = rel->tuples; } else { double allvisfrac; /* dummy */ estimate_rel_size(indexRelation, NULL, &info->pages, &info->tuples, &allvisfrac); if (info->tuples > rel->tuples) info->tuples = rel->tuples; } index_close(indexRelation, NoLock); indexinfos = lcons(info, indexinfos); } list_free(indexoidlist); }
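/*
 * Distilled sketch of the sort-order mapping performed above for amcanorder
 * index types other than btree (illustration only, not planner code; the
 * helper name is ours).  Given one index column's opfamily and opclass input
 * type, it returns the btree opfamily whose "<" operator defines the same
 * ordering, or InvalidOid if the column must be treated as unordered.
 */
#include "postgres.h"
#include "access/stratnum.h"	/* BTLessStrategyNumber (access/skey.h on older releases) */
#include "utils/lsyscache.h"

static Oid
btree_sort_opfamily_for_column(Oid opfamily, Oid opcintype)
{
	Oid			ltopr;
	Oid			btopfamily;
	Oid			btopcintype;
	int16		btstrategy;

	/* the index AM's "<" operator for this column, if it has one */
	ltopr = get_opfamily_member(opfamily, opcintype, opcintype,
								BTLessStrategyNumber);

	if (OidIsValid(ltopr) &&
		get_ordering_op_properties(ltopr,
								   &btopfamily, &btopcintype, &btstrategy) &&
		btopcintype == opcintype &&
		btstrategy == BTLessStrategyNumber)
		return btopfamily;		/* successful mapping */

	return InvalidOid;			/* quietly treat the index as unordered */
}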