/* * Open block directory relation, initialize scan keys and minipages * for ALTER TABLE ADD COLUMN operation. */ void AppendOnlyBlockDirectory_Init_addCol( AppendOnlyBlockDirectory *blockDirectory, AppendOnlyEntry *aoEntry, Snapshot appendOnlyMetaDataSnapshot, FileSegInfo *segmentFileInfo, Relation aoRel, int segno, int numColumnGroups, bool isAOCol) { Assert(aoEntry != NULL); blockDirectory->aoRel = aoRel; blockDirectory->appendOnlyMetaDataSnapshot = appendOnlyMetaDataSnapshot; if (!OidIsValid(aoEntry->blkdirrelid)) { Assert(!OidIsValid(aoEntry->blkdiridxid)); blockDirectory->blkdirRel = NULL; blockDirectory->blkdirIdx = NULL; blockDirectory->numColumnGroups = 0; return; } blockDirectory->segmentFileInfo = NULL; blockDirectory->totalSegfiles = -1; blockDirectory->currentSegmentFileInfo = segmentFileInfo; blockDirectory->currentSegmentFileNum = segno; blockDirectory->numColumnGroups = numColumnGroups; blockDirectory->isAOCol = isAOCol; blockDirectory->proj = NULL; Assert(OidIsValid(aoEntry->blkdirrelid)); /* * TODO: refactor the *_addCol* interface so that opening of * blockdirectory relation and index, init_internal and * corresponding cleanup in *_End_addCol() is called only once * during the add-column operation. Currently, this is being * called for every appendonly segment. */ blockDirectory->blkdirRel = heap_open(aoEntry->blkdirrelid, RowExclusiveLock); Assert(OidIsValid(aoEntry->blkdiridxid)); blockDirectory->blkdirIdx = index_open(aoEntry->blkdiridxid, RowExclusiveLock); init_internal(blockDirectory); }
/* ----------
 * toast_delete_datum -
 *
 *	Remove every chunk of one externally stored value from its toast
 *	relation.  Does nothing if the datum is not external.
 *
 *	The "rel" argument is not referenced by this function.
 * ----------
 */
static void
toast_delete_datum(Relation rel, Datum value)
{
	struct varlena *datum_ptr = (struct varlena *) DatumGetPointer(value);
	struct varatt_external ext_ptr;
	Relation	chunk_rel;
	Relation	chunk_idx;
	ScanKeyData valueid_key;
	SysScanDesc chunk_scan;
	HeapTuple	chunk;

	/* Only externally stored values have chunks to remove. */
	if (!VARATT_IS_EXTERNAL(datum_ptr))
		return;

	/* The toast pointer must be copied out before its fields are read. */
	VARATT_EXTERNAL_GET_POINTER(ext_ptr, datum_ptr);

	/* Open the toast relation, then the index named in its pg_class row. */
	chunk_rel = heap_open(ext_ptr.va_toastrelid, RowExclusiveLock);
	chunk_idx = index_open(chunk_rel->rd_rel->reltoastidxid, RowExclusiveLock);

	/* Match chunks whose first index column equals our va_valueid. */
	ScanKeyInit(&valueid_key,
				(AttrNumber) 1,
				BTEqualStrategyNumber, F_OIDEQ,
				ObjectIdGetDatum(ext_ptr.va_valueid));

	/*
	 * Visit all matching chunks.  The order in which they are seen does not
	 * matter; the ordered scan is used simply because the index is already
	 * open and locked.
	 */
	chunk_scan = systable_beginscan_ordered(chunk_rel, chunk_idx,
											SnapshotToast, 1, &valueid_key);
	for (;;)
	{
		chunk = systable_getnext_ordered(chunk_scan, ForwardScanDirection);
		if (chunk == NULL)
			break;

		/* Found a chunk: delete it. */
		simple_heap_delete(chunk_rel, &chunk->t_self);
	}

	/* Finish the scan and close index and relation. */
	systable_endscan_ordered(chunk_scan);
	index_close(chunk_idx, RowExclusiveLock);
	heap_close(chunk_rel, RowExclusiveLock);
}
/* * AppendOnlyBlockDirectory_Init_forSearch * * Initialize the block directory to handle the lookup. * * If the block directory relation for this appendonly relation * does not exist before calling this function, set blkdirRel * and blkdirIdx to NULL, and return. */ void AppendOnlyBlockDirectory_Init_forSearch( AppendOnlyBlockDirectory *blockDirectory, AppendOnlyEntry *aoEntry, Snapshot appendOnlyMetaDataSnapshot, FileSegInfo **segmentFileInfo, int totalSegfiles, Relation aoRel, int numColumnGroups, bool isAOCol, bool *proj) { Assert(aoEntry != NULL); blockDirectory->aoRel = aoRel; if (!OidIsValid(aoEntry->blkdirrelid)) { Assert(!OidIsValid(aoEntry->blkdiridxid)); blockDirectory->blkdirRel = NULL; blockDirectory->blkdirIdx = NULL; return; } ereportif(Debug_appendonly_print_blockdirectory, LOG, (errmsg("Append-only block directory init for search: " "(totalSegfiles, numColumnGroups, isAOCol)=" "(%d, %d, %d)", totalSegfiles, numColumnGroups, isAOCol))); blockDirectory->segmentFileInfo = segmentFileInfo; blockDirectory->totalSegfiles = totalSegfiles; blockDirectory->aoRel = aoRel; blockDirectory->appendOnlyMetaDataSnapshot = appendOnlyMetaDataSnapshot; blockDirectory->numColumnGroups = numColumnGroups; blockDirectory->isAOCol = isAOCol; blockDirectory->proj = proj; blockDirectory->currentSegmentFileNum = -1; Assert(OidIsValid(aoEntry->blkdirrelid)); blockDirectory->blkdirRel = heap_open(aoEntry->blkdirrelid, AccessShareLock); Assert(OidIsValid(aoEntry->blkdiridxid)); blockDirectory->blkdirIdx = index_open(aoEntry->blkdiridxid, AccessShareLock); init_internal(blockDirectory); }
/*
 * GetNewOid
 *		Produce a new OID that is unique within the given relation.
 *
 * The caller must hold a suitable lock on the relation.
 *
 * Uniqueness is only promised when the relation has a unique index on OID.
 * All system catalogs with OIDs have one; user tables might not.  We are
 * in effect assuming that the table holds far fewer than 2^32 entries and
 * has no very long runs of consecutive existing OIDs -- reasonable for
 * system catalogs, less so for user tables.
 *
 * The OID is not inserted into the table right away, so there is a race:
 * somebody else would have to cycle through 2^32 OIDs and generate the
 * same one before our row is inserted.  That seems unlikely to matter.  If
 * the row had to be *committed* to close the race the risk would be rather
 * higher, which is why the test uses SnapshotDirty and therefore sees
 * uncommitted rows.
 */
Oid
GetNewOid(Relation relation)
{
	Oid			result;
	Oid			oidIndexId;
	Relation	oidIndexRel;

	/* A relation without OIDs means the caller is confused. */
	Assert(relation->rd_rel->relhasoids);

	/* No indexes are usable in bootstrap mode. */
	if (IsBootstrapProcessingMode())
		return GetNewObjectId();

	/* The relcache remembers which index, if any, is the OID index. */
	oidIndexId = RelationGetOidIndex(relation);

	if (OidIsValid(oidIndexId))
	{
		/* Let the index tell us which candidate OIDs are already taken. */
		oidIndexRel = index_open(oidIndexId, AccessShareLock);
		result = GetNewOidWithIndex(relation, oidIndexRel);
		index_close(oidIndexRel, AccessShareLock);

		/*
		 * Most catalog objects must carry the same OID on the master and
		 * on every segment.  The master allocates the OID for a new object
		 * and tells the segments to use it, so a segment should never need
		 * to allocate one by itself.  Allocating one here on a segment for
		 * a relation that needs synchronized OIDs is reported at PANIC
		 * level.  (There are a few exceptions, see
		 * RelationNeedsSynchronizedOIDs()).
		 *
		 * NOTE(review): the comment this replaces said "give a WARNING",
		 * while the code uses PANIC; confirm which level is intended.
		 */
		if (Gp_role == GP_ROLE_EXECUTE && RelationNeedsSynchronizedOIDs(relation))
			elog(PANIC, "allocated OID %u for relation \"%s\" in segment",
				 newOidForLog(result), RelationGetRelationName(relation));

		return result;
	}

	/*
	 * No OID index: just hand back the next OID counter value.
	 *
	 * System catalogs with OIDs should *always* have a unique OID index, so
	 * only user tables ought to get here.  Warn if it looks as though
	 * somebody forgot an index.
	 */
	if (IsSystemRelation(relation))
		elog(WARNING, "generating possibly-non-unique OID for \"%s\"",
			 RelationGetRelationName(relation));

	return GetNewObjectId();
}
/* ----------
 * toast_delete_datum -
 *
 *	Remove every chunk of one externally stored value from its toast
 *	relation.  Does nothing if the datum is not external.
 *
 *	The "rel" argument is unused (and declared as such).
 * ----------
 */
static void
toast_delete_datum(Relation rel __attribute__((unused)), Datum value)
{
	varattrib  *ext_attr = (varattrib *) DatumGetPointer(value);
	Relation	chunk_rel;
	Relation	chunk_idx;
	ScanKeyData valueid_key;
	IndexScanDesc chunk_scan;
	HeapTuple	chunk;

	/* Only externally stored values have chunks to remove. */
	if (!VARATT_IS_EXTERNAL(ext_attr))
		return;

	/* Open the toast relation, then the index named in its pg_class row. */
	chunk_rel = heap_open(ext_attr->va_external.va_toastrelid,
						  RowExclusiveLock);
	chunk_idx = index_open(chunk_rel->rd_rel->reltoastidxid,
						   RowExclusiveLock);

	/*
	 * Build a key that selects index entries by va_valueid.  The order in
	 * which the chunks come back is of no interest.
	 */
	ScanKeyInit(&valueid_key,
				(AttrNumber) 1,
				BTEqualStrategyNumber, F_OIDEQ,
				ObjectIdGetDatum(ext_attr->va_external.va_valueid));

	/* Walk all matching chunks with a plain index scan. */
	chunk_scan = index_beginscan(chunk_rel, chunk_idx,
								 SnapshotToast, 1, &valueid_key);
	for (;;)
	{
		chunk = index_getnext(chunk_scan, ForwardScanDirection);
		if (chunk == NULL)
			break;

		/* Found a chunk: delete it. */
		simple_heap_delete(chunk_rel, &chunk->t_self);
	}

	/* Finish the scan and close index and relation. */
	index_endscan(chunk_scan);
	index_close(chunk_idx, RowExclusiveLock);
	heap_close(chunk_rel, RowExclusiveLock);
}
/* * systable_beginscan --- set up for heap-or-index scan * * rel: catalog to scan, already opened and suitably locked * indexId: OID of index to conditionally use * indexOK: if false, forces a heap scan (see notes below) * snapshot: time qual to use (usually should be SnapshotNow) * nkeys, key: scan keys * * The attribute numbers in the scan key should be set for the heap case. * If we choose to index, we reset them to 1..n to reference the index * columns. Note this means there must be one scankey qualification per * index column! This is checked by the Asserts in the normal, index-using * case, but won't be checked if the heapscan path is taken. * * The routine checks the normal cases for whether an indexscan is safe, * but caller can make additional checks and pass indexOK=false if needed. * In standard case indexOK can simply be constant TRUE. */ SysScanDesc systable_beginscan(Relation heapRelation, Oid indexId, bool indexOK, Snapshot snapshot, int nkeys, ScanKey key) { SysScanDesc sysscan; Relation irel; if (indexOK && !IgnoreSystemIndexes && !ReindexIsProcessingIndex(indexId)) irel = index_open(indexId, AccessShareLock); else irel = NULL; sysscan = (SysScanDesc) palloc(sizeof(SysScanDescData)); sysscan->heap_rel = heapRelation; sysscan->irel = irel; if (irel) { int i; /* * Change attribute numbers to be index column numbers. * * This code could be generalized to search for the index key numbers * to substitute, but for now there's no need. */ for (i = 0; i < nkeys; i++) { Assert(key[i].sk_attno == irel->rd_index->indkey.values[i]); key[i].sk_attno = i + 1; } sysscan->iscan = index_beginscan(heapRelation, irel, snapshot, nkeys, key); sysscan->scan = NULL; } else { sysscan->scan = heap_beginscan(heapRelation, snapshot, nkeys, key); sysscan->iscan = NULL; } return sysscan; }
/* ----------
 * toast_delete_datum -
 *
 *	Remove every chunk of one externally stored value from its toast
 *	relation.  Does nothing if the datum is not external.
 *
 *	The "rel" argument is not referenced by this function.
 * ----------
 */
static void
toast_delete_datum(Relation rel, Datum value)
{
	varattrib  *ext_attr = (varattrib *) DatumGetPointer(value);
	Relation	chunk_rel;
	Relation	chunk_idx;
	ScanKeyData valueid_key;
	IndexScanDesc chunk_scan;
	HeapTuple	chunk;

	/* Only externally stored values have chunks to remove. */
	if (!VARATT_IS_EXTERNAL(ext_attr))
		return;

	/* Open the toast relation and its index. */
	chunk_rel = heap_open(ext_attr->va_content.va_external.va_toastrelid,
						  RowExclusiveLock);
	chunk_idx = index_open(chunk_rel->rd_rel->reltoastidxid);

	/*
	 * Build a key that selects index entries by va_valueid.  The order in
	 * which the chunks come back is of no interest.
	 */
	ScanKeyEntryInitialize(&valueid_key,
						   (bits16) 0,
						   (AttrNumber) 1,
						   (RegProcedure) F_OIDEQ,
						   ObjectIdGetDatum(ext_attr->va_content.va_external.va_valueid));

	/* Locate the chunks through the index. */
	chunk_scan = index_beginscan(chunk_rel, chunk_idx,
								 SnapshotToast, 1, &valueid_key);
	for (;;)
	{
		chunk = index_getnext(chunk_scan, ForwardScanDirection);
		if (chunk == NULL)
			break;

		/* Found a chunk: delete it. */
		simple_heap_delete(chunk_rel, &chunk->t_self);
	}

	/* Finish the scan and close index and relation. */
	index_endscan(chunk_scan);
	index_close(chunk_idx);
	heap_close(chunk_rel, RowExclusiveLock);
}
/*
 * InitCatCachePhase2 -- external interface for CatalogCacheInitializeCache
 *
 * Callers use this, among other things, to make sure the relcache holds
 * entries for every catalog and index the catcaches refer to.  For that
 * purpose touch_index asks for the cache's index to be opened (and closed
 * again) in addition to initializing the cache itself.  The indexes on
 * pg_am are skipped because we don't use them (cf. IndexScanOK).
 */
void
InitCatCachePhase2(CatCache *cache, bool touch_index)
{
	Relation	cacheIndex;

	/* Finish initializing the cache if that has not happened yet. */
	if (cache->cc_tupdesc == NULL)
		CatalogCacheInitializeCache(cache);

	if (!touch_index)
		return;

	/* Leave the pg_am indexes alone. */
	if (cache->id == AMOID || cache->id == AMNAME)
		return;

	/* Open and immediately close the index. */
	cacheIndex = index_open(cache->cc_indexoid, AccessShareLock);
	index_close(cacheIndex, AccessShareLock);
}
/* * Inits the visimap store. * The store is ready for usage after this function call. * * Assumes a zero-allocated visimap store data structure. * Assumes that the visimap memory context is active. */ void AppendOnlyVisimapStore_Init( AppendOnlyVisimapStore *visiMapStore, Oid visimapRelid, Oid visimapIdxid, LOCKMODE lockmode, Snapshot snapshot, MemoryContext memoryContext) { TupleDesc heapTupleDesc; ScanKey scanKey; Assert(visiMapStore); Assert(CurrentMemoryContext == memoryContext); Assert(OidIsValid(visimapRelid)); Assert(OidIsValid(visimapIdxid)); visiMapStore->snapshot = snapshot; visiMapStore->memoryContext = memoryContext; visiMapStore->visimapRelation = heap_open( visimapRelid, lockmode); visiMapStore->visimapIndex = index_open( visimapIdxid, lockmode); heapTupleDesc = RelationGetDescr(visiMapStore->visimapRelation); Assert(heapTupleDesc->natts == Natts_pg_aovisimap); visiMapStore->scanKeys = palloc0(sizeof(ScanKeyData) * APPENDONLY_VISIMAP_INDEX_SCAN_KEY_NUM); // scan key: segno scanKey = visiMapStore->scanKeys; ScanKeyInit(scanKey, Anum_pg_aovisimap_segno, /* segno */ BTEqualStrategyNumber, F_INT4EQ, 0); // scan key: firstRowNum scanKey++; ScanKeyInit(scanKey, Anum_pg_aovisimap_firstrownum, /* attribute number to scan */ BTEqualStrategyNumber, /* strategy */ F_INT8EQ, /* reg proc to use */ 0); }
/* * SQL-callable function to clean the insert pending list */ Datum gin_clean_pending_list(PG_FUNCTION_ARGS) { Oid indexoid = PG_GETARG_OID(0); Relation indexRel = index_open(indexoid, AccessShareLock); IndexBulkDeleteResult stats; GinState ginstate; if (RecoveryInProgress()) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("recovery is in progress"), errhint("GIN pending list cannot be cleaned up during recovery."))); /* Must be a GIN index */ if (indexRel->rd_rel->relkind != RELKIND_INDEX || indexRel->rd_rel->relam != GIN_AM_OID) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is not a GIN index", RelationGetRelationName(indexRel)))); /* * Reject attempts to read non-local temporary relations; we would be * likely to get wrong data since we have no visibility into the owning * session's local buffers. */ if (RELATION_IS_OTHER_TEMP(indexRel)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot access temporary indexes of other sessions"))); /* User must own the index (comparable to privileges needed for VACUUM) */ if (!pg_class_ownercheck(indexoid, GetUserId())) aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_CLASS, RelationGetRelationName(indexRel)); memset(&stats, 0, sizeof(stats)); initGinState(&ginstate, indexRel); ginInsertCleanup(&ginstate, true, true, &stats); index_close(indexRel, AccessShareLock); PG_RETURN_INT64((int64) stats.pages_deleted); }
/*
 * enum_endpoint: shared implementation of enum_first/enum_last
 *
 * Returns the OID of the first pg_enum row found for enumtypoid when
 * scanning in the given direction, or InvalidOid if there is no such row.
 */
static Oid
enum_endpoint(Oid enumtypoid, ScanDirection direction)
{
	Relation	pgEnum;
	Relation	sortOrderIdx;
	SysScanDesc scan;
	HeapTuple	member;
	ScanKeyData typidKey;
	Oid			result = InvalidOid;	/* stays so only for an empty enum */

	/*
	 * Locate the first/last member through pg_enum_typid_sortorder_index.
	 * The syscache must not be used for this; the comments for
	 * RenumberEnumType in catalog/pg_enum.c explain why.
	 */
	ScanKeyInit(&typidKey,
				Anum_pg_enum_enumtypid,
				BTEqualStrategyNumber, F_OIDEQ,
				ObjectIdGetDatum(enumtypoid));

	pgEnum = heap_open(EnumRelationId, AccessShareLock);
	sortOrderIdx = index_open(EnumTypIdSortOrderIndexId, AccessShareLock);

	scan = systable_beginscan_ordered(pgEnum, sortOrderIdx, NULL,
									  1, &typidKey);
	member = systable_getnext_ordered(scan, direction);

	if (HeapTupleIsValid(member))
	{
		/* Make sure the value is safe to use in SQL before returning it. */
		check_safe_enum_use(member);
		result = HeapTupleGetOid(member);
	}

	systable_endscan_ordered(scan);
	index_close(sortOrderIdx, AccessShareLock);
	heap_close(pgEnum, AccessShareLock);

	return result;
}
/* * SQL-callable function to scan through an index and summarize all ranges * that are not currently summarized. */ Datum brin_summarize_new_values(PG_FUNCTION_ARGS) { Oid indexoid = PG_GETARG_OID(0); Relation indexRel; Relation heapRel; double numSummarized = 0; heapRel = heap_open(IndexGetRelation(indexoid, false), ShareUpdateExclusiveLock); indexRel = index_open(indexoid, ShareUpdateExclusiveLock); brinsummarize(indexRel, heapRel, &numSummarized, NULL); relation_close(indexRel, ShareUpdateExclusiveLock); relation_close(heapRel, ShareUpdateExclusiveLock); PG_RETURN_INT32((int32) numSummarized); }
/*
 * AppendOnlyBlockDirectory_DeleteSegmentFile
 *
 * Removes all block directory entries that belong to the given segment
 * file of an append-only relation.
 *
 * Entries are found through the block directory index by matching its
 * first column against segno.  The columnGroupNo argument is not used.
 */
void
AppendOnlyBlockDirectory_DeleteSegmentFile(
	AppendOnlyEntry *aoEntry,
	Snapshot snapshot,
	int segno,
	int columnGroupNo)
{
	Relation	dirRel;
	Relation	dirIdx;
	ScanKeyData segnoKey;
	IndexScanDesc scan;
	HeapTuple	entry;

	Assert(aoEntry);
	Assert(OidIsValid(aoEntry->blkdirrelid));
	Assert(OidIsValid(aoEntry->blkdiridxid));

	dirRel = heap_open(aoEntry->blkdirrelid, RowExclusiveLock);
	dirIdx = index_open(aoEntry->blkdiridxid, RowExclusiveLock);

	/* Index column 1 is segno. */
	ScanKeyInit(&segnoKey,
				1,
				BTEqualStrategyNumber,
				F_INT4EQ,
				Int32GetDatum(segno));

	scan = index_beginscan(dirRel, dirIdx, snapshot, 1, &segnoKey);

	/* Delete every entry the scan returns. */
	for (;;)
	{
		entry = index_getnext(scan, ForwardScanDirection);
		if (entry == NULL)
			break;

		simple_heap_delete(dirRel, &entry->t_self);
	}

	index_endscan(scan);
	index_close(dirIdx, RowExclusiveLock);
	heap_close(dirRel, RowExclusiveLock);
}
/*
 * enum_endpoint: shared implementation of enum_first/enum_last
 *
 * Returns the OID of the first pg_enum row found for enumtypoid when
 * scanning in the given direction, or InvalidOid if there is no such row.
 */
static Oid
enum_endpoint(Oid enumtypoid, ScanDirection direction)
{
	Relation	pgEnum;
	Relation	sortOrderIdx;
	SysScanDesc scan;
	HeapTuple	member;
	ScanKeyData typidKey;
	Oid			result = InvalidOid;

	/*
	 * Locate the first/last member through pg_enum_typid_sortorder_index.
	 * The syscache must not be used, and the scan has to run under an MVCC
	 * snapshot; the comments for RenumberEnumType in catalog/pg_enum.c
	 * explain why.
	 */
	ScanKeyInit(&typidKey,
				Anum_pg_enum_enumtypid,
				BTEqualStrategyNumber, F_OIDEQ,
				ObjectIdGetDatum(enumtypoid));

	pgEnum = heap_open(EnumRelationId, AccessShareLock);
	sortOrderIdx = index_open(EnumTypIdSortOrderIndexId, AccessShareLock);

	scan = systable_beginscan_ordered(pgEnum, sortOrderIdx,
									  GetTransactionSnapshot(),
									  1, &typidKey);
	member = systable_getnext_ordered(scan, direction);

	if (HeapTupleIsValid(member))
		result = HeapTupleGetOid(member);

	systable_endscan_ordered(scan);
	index_close(sortOrderIdx, AccessShareLock);
	heap_close(pgEnum, AccessShareLock);

	return result;
}
/*
 * rebuildheap
 *
 * Copies the tuples of the old heap into the new heap, visiting them in the
 * order produced by a forward scan of the old index.
 */
void
rebuildheap(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex)
{
	Relation			newHeap;
	Relation			oldHeap;
	Relation			oldIndex;
	IndexScanDesc		scan;
	RetrieveIndexResult	hit;
	ItemPointer			tid;
	HeapTuple			tuple;
	Buffer				buffer;
	Oid					insertedOid;

	/* Open both heaps and the index that dictates the copy order. */
	newHeap = (Relation) heap_open(OIDNewHeap);
	oldHeap = (Relation) heap_open(OIDOldHeap);
	oldIndex = (Relation) index_open(OIDOldIndex);

	scan = index_beginscan(oldIndex, false, 0, (ScanKey) NULL);

	for (;;)
	{
		hit = index_getnext(scan, ForwardScanDirection);
		if (hit == NULL)
			break;

		/* Fetch the tuple the index entry points at and re-insert it. */
		tid = &hit->heap_iptr;
		tuple = heap_fetch(oldHeap, 0, tid, &buffer);
		insertedOid = heap_insert(newHeap, tuple);

		/* Release the index result and the buffer from the fetch. */
		pfree(hit);
		ReleaseBuffer(buffer);
	}

	index_close(oldIndex);
	heap_close(oldHeap);
	heap_close(newHeap);
}
/*
 * InitCatCachePhase2 -- external interface for CatalogCacheInitializeCache
 *
 * Callers use this, among other things, to make sure the relcache holds
 * entries for every catalog and index the catcaches refer to.  For that
 * purpose touch_index asks for the cache's index to be opened (and closed
 * again) in addition to initializing the cache itself.  The indexes on
 * pg_am are skipped because we don't use them (cf. IndexScanOK).
 */
void
InitCatCachePhase2(CatCache *cache, bool touch_index)
{
	Relation	cacheIndex;

	/* Finish initializing the cache if that has not happened yet. */
	if (cache->cc_tupdesc == NULL)
		CatalogCacheInitializeCache(cache);

	if (!touch_index)
		return;

	/* Leave the pg_am indexes alone. */
	if (cache->id == AMOID || cache->id == AMNAME)
		return;

	/*
	 * Take the lock on the underlying catalog before opening the index.
	 * index_open may end up reading this very catalog, and anyone who
	 * exclusive-locks the catalog and the index does so in that order, so
	 * locking the other way round could deadlock.
	 */
	LockRelationOid(cache->cc_reloid, AccessShareLock);

	cacheIndex = index_open(cache->cc_indexoid, AccessShareLock);
	index_close(cacheIndex, AccessShareLock);

	UnlockRelationOid(cache->cc_reloid, AccessShareLock);
}
/* ----------------------------------------------------------------
 *		ExecInitBitmapIndexScan
 *
 *		Builds and returns the executor state for a bitmap index scan
 *		node.  For EXPLAIN-only execution the state is returned before
 *		the index is opened.
 * ----------------------------------------------------------------
 */
BitmapIndexScanState *
ExecInitBitmapIndexScan(BitmapIndexScan *node, EState *estate, int eflags)
{
	BitmapIndexScanState *biss;
	bool		haveDeferredKeys;

	/* Backward scan and mark/restore are not supported here. */
	Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));

	/* Build the state node and link it to plan and executor state. */
	biss = makeNode(BitmapIndexScanState);
	biss->ss.ps.plan = (Plan *) node;
	biss->ss.ps.state = estate;

	/* The result bitmap is normally not created before run time. */
	biss->biss_result = NULL;

	/*
	 * No standard exprcontext is set up for this node (a runtime-key
	 * exprcontext may be created further down), and neither targetlist nor
	 * qual is initialized since neither is used.  Of the indexqual
	 * expressions only the sub-parts belonging to runtime keys get
	 * initialized, see below.
	 *
	 * The base relation is neither opened nor locked here: an ancestor
	 * BitmapHeapScan node is assumed to hold AccessShareLock (or better) on
	 * the heap for as long as the plan tree executes.
	 */
	biss->ss.ss_currentRelation = NULL;
	biss->ss.ss_currentScanDesc = NULL;

	/*
	 * For plain EXPLAIN (the plan will not be run) we are done.  Stopping
	 * here lets an index-advisor plugin EXPLAIN plans that reference
	 * indexes which do not exist.
	 */
	if (eflags & EXEC_FLAG_EXPLAIN_ONLY)
		return biss;

	/*
	 * Open the index.  When the parent table is a target relation of the
	 * query, InitPlan has already opened and write-locked the index, so no
	 * further lock is taken; otherwise take a normal reader's lock.
	 */
	if (ExecRelationIsTargetRelation(estate, node->scan.scanrelid))
		biss->biss_RelationDesc = index_open(node->indexid, NoLock);
	else
		biss->biss_RelationDesc = index_open(node->indexid, AccessShareLock);

	/* Reset the index-specific scan state. */
	biss->biss_RuntimeKeysReady = false;
	biss->biss_RuntimeKeys = NULL;
	biss->biss_NumRuntimeKeys = 0;

	/* Turn the index qualification into scan keys. */
	ExecIndexBuildScanKeys((PlanState *) biss,
						   biss->biss_RelationDesc,
						   node->indexqual,
						   false,
						   &biss->biss_ScanKeys,
						   &biss->biss_NumScanKeys,
						   &biss->biss_RuntimeKeys,
						   &biss->biss_NumRuntimeKeys,
						   &biss->biss_ArrayKeys,
						   &biss->biss_NumArrayKeys);

	/* Are there keys whose values are only known at run time? */
	haveDeferredKeys = (biss->biss_NumRuntimeKeys != 0 ||
						biss->biss_NumArrayKeys != 0);

	if (!haveDeferredKeys)
	{
		biss->biss_RuntimeContext = NULL;
	}
	else
	{
		/*
		 * Runtime or array keys need an ExprContext for their evaluation.
		 * A "standard" plan node exprcontext would do, but a separate one
		 * keeps this code similar to nodeIndexscan.c: create a context,
		 * keep it as the runtime context, and put the node's previous
		 * ps_ExprContext back.
		 */
		ExprContext *savedContext = biss->ss.ps.ps_ExprContext;

		ExecAssignExprContext(estate, &biss->ss.ps);
		biss->biss_RuntimeContext = biss->ss.ps.ps_ExprContext;
		biss->ss.ps.ps_ExprContext = savedContext;
	}

	/* Create the scan descriptor. */
	biss->biss_ScanDesc =
		index_beginscan_bitmap(biss->biss_RelationDesc,
							   estate->es_snapshot,
							   biss->biss_NumScanKeys);

	/*
	 * With nothing left to compute at run time, the scan keys can be handed
	 * to the index AM right away.
	 */
	if (!haveDeferredKeys)
		index_rescan(biss->biss_ScanDesc,
					 biss->biss_ScanKeys, biss->biss_NumScanKeys,
					 NULL, 0);

	return biss;
}
/* * get_relation_info - * Retrieves catalog information for a given relation. * * Given the Oid of the relation, return the following info into fields * of the RelOptInfo struct: * * min_attr lowest valid AttrNumber * max_attr highest valid AttrNumber * indexlist list of IndexOptInfos for relation's indexes * pages number of pages * tuples number of tuples * * Also, initialize the attr_needed[] and attr_widths[] arrays. In most * cases these are left as zeroes, but sometimes we need to compute attr * widths here, and we may as well cache the results for costsize.c. * * If inhparent is true, all we need to do is set up the attr arrays: * the RelOptInfo actually represents the appendrel formed by an inheritance * tree, and so the parent rel's physical size and index information isn't * important for it. */ void get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, RelOptInfo *rel) { Index varno = rel->relid; Relation relation; bool hasindex; List *indexinfos = NIL; /* * We need not lock the relation since it was already locked, either by * the rewriter or when expand_inherited_rtentry() added it to the query's * rangetable. */ relation = heap_open(relationObjectId, NoLock); rel->min_attr = FirstLowInvalidHeapAttributeNumber + 1; rel->max_attr = RelationGetNumberOfAttributes(relation); rel->reltablespace = RelationGetForm(relation)->reltablespace; Assert(rel->max_attr >= rel->min_attr); rel->attr_needed = (Relids *) palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(Relids)); rel->attr_widths = (int32 *) palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(int32)); /* * Estimate relation size --- unless it's an inheritance parent, in which * case the size will be computed later in set_append_rel_pathlist, and we * must leave it zero for now to avoid bollixing the total_table_pages * calculation. */ if (!inhparent) estimate_rel_size(relation, rel->attr_widths - rel->min_attr, &rel->pages, &rel->tuples); /* * Make list of indexes. 
Ignore indexes on system catalogs if told to. * Don't bother with indexes for an inheritance parent, either. */ if (inhparent || (IgnoreSystemIndexes && IsSystemClass(relation->rd_rel))) hasindex = false; else hasindex = relation->rd_rel->relhasindex; if (hasindex) { List *indexoidlist; ListCell *l; LOCKMODE lmode; indexoidlist = RelationGetIndexList(relation); /* * For each index, we get the same type of lock that the executor will * need, and do not release it. This saves a couple of trips to the * shared lock manager while not creating any real loss of * concurrency, because no schema changes could be happening on the * index while we hold lock on the parent rel, and neither lock type * blocks any other kind of index operation. */ if (rel->relid == root->parse->resultRelation) lmode = RowExclusiveLock; else lmode = AccessShareLock; foreach(l, indexoidlist) { Oid indexoid = lfirst_oid(l); Relation indexRelation; Form_pg_index index; IndexOptInfo *info; int ncolumns; int i; /* * Extract info from the relation descriptor for the index. */ indexRelation = index_open(indexoid, lmode); index = indexRelation->rd_index; /* * Ignore invalid indexes, since they can't safely be used for * queries. Note that this is OK because the data structure we * are constructing is only used by the planner --- the executor * still needs to insert into "invalid" indexes! */ if (!index->indisvalid) { index_close(indexRelation, NoLock); continue; } /* * If the index is valid, but cannot yet be used, ignore it; but * mark the plan we are generating as transient. See * src/backend/access/heap/README.HOT for discussion. 
*/ if (index->indcheckxmin && !TransactionIdPrecedes(HeapTupleHeaderGetXmin(indexRelation->rd_indextuple->t_data), TransactionXmin)) { root->glob->transientPlan = true; index_close(indexRelation, NoLock); continue; } info = makeNode(IndexOptInfo); info->indexoid = index->indexrelid; info->reltablespace = RelationGetForm(indexRelation)->reltablespace; info->rel = rel; info->ncolumns = ncolumns = index->indnatts; /* * Allocate per-column info arrays. To save a few palloc cycles * we allocate all the Oid-type arrays in one request. Note that * the opfamily array needs an extra, terminating zero at the end. * We pre-zero the ordering info in case the index is unordered. */ info->indexkeys = (int *) palloc(sizeof(int) * ncolumns); info->opfamily = (Oid *) palloc0(sizeof(Oid) * (4 * ncolumns + 1)); info->opcintype = info->opfamily + (ncolumns + 1); info->fwdsortop = info->opcintype + ncolumns; info->revsortop = info->fwdsortop + ncolumns; info->nulls_first = (bool *) palloc0(sizeof(bool) * ncolumns); for (i = 0; i < ncolumns; i++) { info->indexkeys[i] = index->indkey.values[i]; info->opfamily[i] = indexRelation->rd_opfamily[i]; info->opcintype[i] = indexRelation->rd_opcintype[i]; } info->relam = indexRelation->rd_rel->relam; info->amcostestimate = indexRelation->rd_am->amcostestimate; info->amoptionalkey = indexRelation->rd_am->amoptionalkey; info->amsearchnulls = indexRelation->rd_am->amsearchnulls; info->amhasgettuple = OidIsValid(indexRelation->rd_am->amgettuple); info->amhasgetbitmap = OidIsValid(indexRelation->rd_am->amgetbitmap); /* * Fetch the ordering operators associated with the index, if any. * We expect that all ordering-capable indexes use btree's * strategy numbers for the ordering operators. 
*/ if (indexRelation->rd_am->amcanorder) { int nstrat = indexRelation->rd_am->amstrategies; for (i = 0; i < ncolumns; i++) { int16 opt = indexRelation->rd_indoption[i]; int fwdstrat; int revstrat; if (opt & INDOPTION_DESC) { fwdstrat = BTGreaterStrategyNumber; revstrat = BTLessStrategyNumber; } else { fwdstrat = BTLessStrategyNumber; revstrat = BTGreaterStrategyNumber; } /* * Index AM must have a fixed set of strategies for it to * make sense to specify amcanorder, so we need not allow * the case amstrategies == 0. */ if (fwdstrat > 0) { Assert(fwdstrat <= nstrat); info->fwdsortop[i] = indexRelation->rd_operator[i * nstrat + fwdstrat - 1]; } if (revstrat > 0) { Assert(revstrat <= nstrat); info->revsortop[i] = indexRelation->rd_operator[i * nstrat + revstrat - 1]; } info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0; } } /* * Fetch the index expressions and predicate, if any. We must * modify the copies we obtain from the relcache to have the * correct varno for the parent relation, so that they match up * correctly against qual clauses. */ info->indexprs = RelationGetIndexExpressions(indexRelation); info->indpred = RelationGetIndexPredicate(indexRelation); if (info->indexprs && varno != 1) ChangeVarNodes((Node *) info->indexprs, 1, varno, 0); if (info->indpred && varno != 1) ChangeVarNodes((Node *) info->indpred, 1, varno, 0); info->predOK = false; /* set later in indxpath.c */ info->unique = index->indisunique; /* * Estimate the index size. If it's not a partial index, we lock * the number-of-tuples estimate to equal the parent table; if it * is partial then we have to use the same methods as we would for * a table, except we can be sure that the index is not larger * than the table. 
*/ if (info->indpred == NIL) { info->pages = RelationGetNumberOfBlocks(indexRelation); info->tuples = rel->tuples; } else { estimate_rel_size(indexRelation, NULL, &info->pages, &info->tuples); if (info->tuples > rel->tuples) info->tuples = rel->tuples; } index_close(indexRelation, NoLock); indexinfos = lcons(info, indexinfos); } list_free(indexoidlist); }
/*
 * get_relation_info -
 *    Retrieves catalog information for a given relation.
 *
 * Given the Oid of the relation, return the following info into fields
 * of the RelOptInfo struct:
 *
 *  min_attr    lowest valid AttrNumber
 *  max_attr    highest valid AttrNumber
 *  indexlist   list of IndexOptInfos for relation's indexes
 *  pages       number of pages
 *  tuples      number of tuples
 *
 * Also, initialize the attr_needed[] and attr_widths[] arrays.  In most
 * cases these are left as zeroes, but sometimes we need to compute attr
 * widths here, and we may as well cache the results for costsize.c.
 *
 * If inhparent is true, all we need to do is set up the attr arrays:
 * the RelOptInfo actually represents the appendrel formed by an inheritance
 * tree, and so the parent rel's physical size and index information isn't
 * important for it.
 *
 * Parameters:
 *  root              planner state; read for parse->resultRelation and
 *                    updated via glob->transientPlan
 *  relationObjectId  OID of the relation to open and inspect
 *  inhparent         true when rel stands for an inheritance parent
 *  rel               RelOptInfo to fill in; rel->relid must already be set
 */
void
get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
                  RelOptInfo *rel)
{
    Index       varno = rel->relid;
    Relation    relation;
    bool        hasindex;
    List       *indexinfos = NIL;

    /*
     * We need not lock the relation since it was already locked, either by
     * the rewriter or when expand_inherited_rtentry() added it to the query's
     * rangetable.
     */
    relation = heap_open(relationObjectId, NoLock);

    /* Temporary and unlogged relations are inaccessible during recovery. */
    if (!RelationNeedsWAL(relation) && RecoveryInProgress())
        ereport(ERROR,
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                 errmsg("cannot access temporary or unlogged relations during recovery")));

    rel->min_attr = FirstLowInvalidHeapAttributeNumber + 1;
    rel->max_attr = RelationGetNumberOfAttributes(relation);
    rel->reltablespace = RelationGetForm(relation)->reltablespace;

    /*
     * Both per-attribute arrays get one zero-filled slot for every attribute
     * number in min_attr..max_attr inclusive.
     */
    Assert(rel->max_attr >= rel->min_attr);
    rel->attr_needed = (Relids *)
        palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(Relids));
    rel->attr_widths = (int32 *)
        palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(int32));

    /*
     * Estimate relation size --- unless it's an inheritance parent, in which
     * case the size will be computed later in set_append_rel_pathlist, and we
     * must leave it zero for now to avoid bollixing the total_table_pages
     * calculation.
     *
     * NOTE(review): the widths array is passed offset by -min_attr,
     * presumably so the callee can index it directly by attribute number ---
     * confirm against estimate_rel_size().
     */
    if (!inhparent)
        estimate_rel_size(relation, rel->attr_widths - rel->min_attr,
                          &rel->pages, &rel->tuples, &rel->allvisfrac);

    /*
     * Make list of indexes.  Ignore indexes on system catalogs if told to.
     * Don't bother with indexes for an inheritance parent, either.
     */
    if (inhparent ||
        (IgnoreSystemIndexes && IsSystemClass(relation->rd_rel)))
        hasindex = false;
    else
        hasindex = relation->rd_rel->relhasindex;

    if (hasindex)
    {
        List       *indexoidlist;
        ListCell   *l;
        LOCKMODE    lmode;

        indexoidlist = RelationGetIndexList(relation);

        /*
         * For each index, we get the same type of lock that the executor will
         * need, and do not release it.  This saves a couple of trips to the
         * shared lock manager while not creating any real loss of
         * concurrency, because no schema changes could be happening on the
         * index while we hold lock on the parent rel, and neither lock type
         * blocks any other kind of index operation.
         */
        if (rel->relid == root->parse->resultRelation)
            lmode = RowExclusiveLock;
        else
            lmode = AccessShareLock;

        foreach(l, indexoidlist)
        {
            Oid         indexoid = lfirst_oid(l);
            Relation    indexRelation;
            Form_pg_index index;
            IndexOptInfo *info;
            int         ncolumns;
            int         i;

            /*
             * Extract info from the relation descriptor for the index.
             */
            indexRelation = index_open(indexoid, lmode);
            index = indexRelation->rd_index;

            /*
             * Ignore invalid indexes, since they can't safely be used for
             * queries.  Note that this is OK because the data structure we
             * are constructing is only used by the planner --- the executor
             * still needs to insert into "invalid" indexes!
             *
             * (Closing with NoLock keeps the lock taken by index_open above.)
             */
            if (!index->indisvalid)
            {
                index_close(indexRelation, NoLock);
                continue;
            }

            /*
             * If the index is valid, but cannot yet be used, ignore it; but
             * mark the plan we are generating as transient.  See
             * src/backend/access/heap/README.HOT for discussion.
             */
            if (index->indcheckxmin &&
                !TransactionIdPrecedes(HeapTupleHeaderGetXmin(indexRelation->rd_indextuple->t_data),
                                       TransactionXmin))
            {
                root->glob->transientPlan = true;
                index_close(indexRelation, NoLock);
                continue;
            }

            info = makeNode(IndexOptInfo);

            info->indexoid = index->indexrelid;
            info->reltablespace =
                RelationGetForm(indexRelation)->reltablespace;
            info->rel = rel;
            info->ncolumns = ncolumns = index->indnatts;
            info->indexkeys = (int *) palloc(sizeof(int) * ncolumns);
            info->indexcollations = (Oid *) palloc(sizeof(Oid) * ncolumns);
            info->opfamily = (Oid *) palloc(sizeof(Oid) * ncolumns);
            info->opcintype = (Oid *) palloc(sizeof(Oid) * ncolumns);

            /* Copy the per-column data out of the index's relcache entry */
            for (i = 0; i < ncolumns; i++)
            {
                info->indexkeys[i] = index->indkey.values[i];
                info->indexcollations[i] = indexRelation->rd_indcollation[i];
                info->opfamily[i] = indexRelation->rd_opfamily[i];
                info->opcintype[i] = indexRelation->rd_opcintype[i];
            }

            /* Copy the access method's identity and capability fields */
            info->relam = indexRelation->rd_rel->relam;
            info->amcostestimate = indexRelation->rd_am->amcostestimate;
            info->canreturn = index_can_return(indexRelation);
            info->amcanorderbyop = indexRelation->rd_am->amcanorderbyop;
            info->amoptionalkey = indexRelation->rd_am->amoptionalkey;
            info->amsearcharray = indexRelation->rd_am->amsearcharray;
            info->amsearchnulls = indexRelation->rd_am->amsearchnulls;
            info->amhasgettuple = OidIsValid(indexRelation->rd_am->amgettuple);
            info->amhasgetbitmap = OidIsValid(indexRelation->rd_am->amgetbitmap);

            /*
             * Fetch the ordering information for the index, if any.
             */
            if (info->relam == BTREE_AM_OID)
            {
                /*
                 * If it's a btree index, we can use its opfamily OIDs
                 * directly as the sort ordering opfamily OIDs.
                 *
                 * Note sortopfamily aliases the opfamily array here rather
                 * than copying it.
                 */
                Assert(indexRelation->rd_am->amcanorder);

                info->sortopfamily = info->opfamily;
                info->reverse_sort = (bool *) palloc(sizeof(bool) * ncolumns);
                info->nulls_first = (bool *) palloc(sizeof(bool) * ncolumns);

                for (i = 0; i < ncolumns; i++)
                {
                    int16       opt = indexRelation->rd_indoption[i];

                    info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
                    info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
                }
            }
            else if (indexRelation->rd_am->amcanorder)
            {
                /*
                 * Otherwise, identify the corresponding btree opfamilies by
                 * trying to map this index's "<" operators into btree.  Since
                 * "<" uniquely defines the behavior of a sort order, this is
                 * a sufficient test.
                 *
                 * XXX This method is rather slow and also requires the
                 * undesirable assumption that the other index AM numbers its
                 * strategies the same as btree.  It'd be better to have a way
                 * to explicitly declare the corresponding btree opfamily for
                 * each opfamily of the other index type.  But given the lack
                 * of current or foreseeable amcanorder index types, it's not
                 * worth expending more effort on now.
                 */
                info->sortopfamily = (Oid *) palloc(sizeof(Oid) * ncolumns);
                info->reverse_sort = (bool *) palloc(sizeof(bool) * ncolumns);
                info->nulls_first = (bool *) palloc(sizeof(bool) * ncolumns);

                for (i = 0; i < ncolumns; i++)
                {
                    int16       opt = indexRelation->rd_indoption[i];
                    Oid         ltopr;
                    Oid         btopfamily;
                    Oid         btopcintype;
                    int16       btstrategy;

                    info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
                    info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;

                    ltopr = get_opfamily_member(info->opfamily[i],
                                                info->opcintype[i],
                                                info->opcintype[i],
                                                BTLessStrategyNumber);
                    if (OidIsValid(ltopr) &&
                        get_ordering_op_properties(ltopr,
                                                   &btopfamily,
                                                   &btopcintype,
                                                   &btstrategy) &&
                        btopcintype == info->opcintype[i] &&
                        btstrategy == BTLessStrategyNumber)
                    {
                        /* Successful mapping */
                        info->sortopfamily[i] = btopfamily;
                    }
                    else
                    {
                        /*
                         * Fail ... quietly treat index as unordered
                         *
                         * NOTE(review): the three arrays palloc'd just above
                         * are dropped here without pfree; presumably they are
                         * left for memory-context cleanup --- confirm.
                         */
                        info->sortopfamily = NULL;
                        info->reverse_sort = NULL;
                        info->nulls_first = NULL;
                        break;
                    }
                }
            }
            else
            {
                /* Access method does not claim amcanorder: no sort info */
                info->sortopfamily = NULL;
                info->reverse_sort = NULL;
                info->nulls_first = NULL;
            }

            /*
             * Fetch the index expressions and predicate, if any.  We must
             * modify the copies we obtain from the relcache to have the
             * correct varno for the parent relation, so that they match up
             * correctly against qual clauses.
             */
            info->indexprs = RelationGetIndexExpressions(indexRelation);
            info->indpred = RelationGetIndexPredicate(indexRelation);
            if (info->indexprs && varno != 1)
                ChangeVarNodes((Node *) info->indexprs, 1, varno, 0);
            if (info->indpred && varno != 1)
                ChangeVarNodes((Node *) info->indpred, 1, varno, 0);

            /* Build targetlist using the completed indexprs data */
            info->indextlist = build_index_tlist(root, info, relation);

            info->predOK = false;       /* set later in indxpath.c */
            info->unique = index->indisunique;
            info->immediate = index->indimmediate;
            info->hypothetical = false;

            /*
             * Estimate the index size.  If it's not a partial index, we lock
             * the number-of-tuples estimate to equal the parent table; if it
             * is partial then we have to use the same methods as we would for
             * a table, except we can be sure that the index is not larger
             * than the table.
             */
            if (info->indpred == NIL)
            {
                info->pages = RelationGetNumberOfBlocks(indexRelation);
                info->tuples = rel->tuples;
            }
            else
            {
                double      allvisfrac; /* dummy */

                estimate_rel_size(indexRelation, NULL,
                                  &info->pages, &info->tuples, &allvisfrac);
                if (info->tuples > rel->tuples)
                    info->tuples = rel->tuples;
            }

            /* Drop the relcache reference only; the lock stays held */
            index_close(indexRelation, NoLock);

            /*
             * NOTE(review): lcons() presumably adds at the head of the list,
             * which would leave indexinfos in the reverse of indexoidlist's
             * order --- confirm if ordering matters to callers.
             */
            indexinfos = lcons(info, indexinfos);
        }

        list_free(indexoidlist);
    }
Datum spgstat(PG_FUNCTION_ARGS) { text *name=PG_GETARG_TEXT_P(0); char *relname=text_to_cstring(name); RangeVar *relvar; Relation index; List *relname_list; Oid relOid; BlockNumber blkno = SPGIST_HEAD_BLKNO; BlockNumber totalPages = 0, innerPages = 0, emptyPages = 0; double usedSpace = 0.0; char res[1024]; int bufferSize = -1; int64 innerTuples = 0, leafTuples = 0; relname_list = stringToQualifiedNameList(relname); relvar = makeRangeVarFromNameList(relname_list); relOid = RangeVarGetRelid(relvar, false); index = index_open(relOid, AccessExclusiveLock); if ( index->rd_am == NULL ) elog(ERROR, "Relation %s.%s is not an index", get_namespace_name(RelationGetNamespace(index)), RelationGetRelationName(index) ); totalPages = RelationGetNumberOfBlocks(index); for(blkno=SPGIST_HEAD_BLKNO; blkno<totalPages; blkno++) { Buffer buffer; Page page; buffer = ReadBuffer(index, blkno); LockBuffer(buffer, BUFFER_LOCK_SHARE); page = BufferGetPage(buffer); if (SpGistPageIsLeaf(page)) { leafTuples += SpGistPageGetMaxOffset(page); } else { innerPages++; innerTuples += SpGistPageGetMaxOffset(page); } if (bufferSize < 0) bufferSize = BufferGetPageSize(buffer) - MAXALIGN(sizeof(SpGistPageOpaqueData)) - SizeOfPageHeaderData; usedSpace += bufferSize - (PageGetFreeSpace(page) + sizeof(ItemIdData)); if (PageGetFreeSpace(page) + sizeof(ItemIdData) == bufferSize) emptyPages++; UnlockReleaseBuffer(buffer); } index_close(index, AccessExclusiveLock); totalPages--; /* metapage */ snprintf(res, sizeof(res), "totalPages: %u\n" "innerPages: %u\n" "leafPages: %u\n" "emptyPages: %u\n" "usedSpace: %.2f kbytes\n" "freeSpace: %.2f kbytes\n" "fillRatio: %.2f%c\n" "leafTuples: %lld\n" "innerTuples: %lld", totalPages, innerPages, totalPages - innerPages, emptyPages, usedSpace / 1024.0, (( (double) bufferSize ) * ( (double) totalPages ) - usedSpace) / 1024, 100.0 * ( usedSpace / (( (double) bufferSize ) * ( (double) totalPages )) ), '%', leafTuples, innerTuples ); PG_RETURN_TEXT_P(CStringGetTextDatum(res)); }
/* ------------------------------------------------------ * pgstathashindex() * * Usage: SELECT * FROM pgstathashindex('hashindex'); * ------------------------------------------------------ */ Datum pgstathashindex(PG_FUNCTION_ARGS) { Oid relid = PG_GETARG_OID(0); BlockNumber nblocks; BlockNumber blkno; Relation rel; HashIndexStat stats; BufferAccessStrategy bstrategy; HeapTuple tuple; TupleDesc tupleDesc; Datum values[8]; bool nulls[8]; Buffer metabuf; HashMetaPage metap; float8 free_percent; uint64 total_space; rel = index_open(relid, AccessShareLock); /* index_open() checks that it's an index */ if (!IS_HASH(rel)) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("relation \"%s\" is not a HASH index", RelationGetRelationName(rel)))); /* * Reject attempts to read non-local temporary relations; we would be * likely to get wrong data since we have no visibility into the owning * session's local buffers. */ if (RELATION_IS_OTHER_TEMP(rel)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot access temporary indexes of other sessions"))); /* Get the information we need from the metapage. 
*/ memset(&stats, 0, sizeof(stats)); metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); stats.version = metap->hashm_version; stats.space_per_page = metap->hashm_bsize; _hash_relbuf(rel, metabuf); /* Get the current relation length */ nblocks = RelationGetNumberOfBlocks(rel); /* prepare access strategy for this index */ bstrategy = GetAccessStrategy(BAS_BULKREAD); /* Start from blkno 1 as 0th block is metapage */ for (blkno = 1; blkno < nblocks; blkno++) { Buffer buf; Page page; CHECK_FOR_INTERRUPTS(); buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy); LockBuffer(buf, BUFFER_LOCK_SHARE); page = (Page) BufferGetPage(buf); if (PageIsNew(page)) stats.unused_pages++; else if (PageGetSpecialSize(page) != MAXALIGN(sizeof(HashPageOpaqueData))) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("index \"%s\" contains corrupted page at block %u", RelationGetRelationName(rel), BufferGetBlockNumber(buf)))); else { HashPageOpaque opaque; int pagetype; opaque = (HashPageOpaque) PageGetSpecialPointer(page); pagetype = opaque->hasho_flag & LH_PAGE_TYPE; if (pagetype == LH_BUCKET_PAGE) { stats.bucket_pages++; GetHashPageStats(page, &stats); } else if (pagetype == LH_OVERFLOW_PAGE) { stats.overflow_pages++; GetHashPageStats(page, &stats); } else if (pagetype == LH_BITMAP_PAGE) stats.bitmap_pages++; else if (pagetype == LH_UNUSED_PAGE) stats.unused_pages++; else ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("unexpected page type 0x%04X in HASH index \"%s\" block %u", opaque->hasho_flag, RelationGetRelationName(rel), BufferGetBlockNumber(buf)))); } UnlockReleaseBuffer(buf); } /* Done accessing the index */ index_close(rel, AccessShareLock); /* Count unused pages as free space. */ stats.free_space += stats.unused_pages * stats.space_per_page; /* * Total space available for tuples excludes the metapage and the bitmap * pages. 
*/ total_space = (nblocks - (stats.bitmap_pages + 1)) * stats.space_per_page; if (total_space == 0) free_percent = 0.0; else free_percent = 100.0 * stats.free_space / total_space; /* * Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); tupleDesc = BlessTupleDesc(tupleDesc); /* * Build and return the tuple */ MemSet(nulls, 0, sizeof(nulls)); values[0] = Int32GetDatum(stats.version); values[1] = Int64GetDatum((int64) stats.bucket_pages); values[2] = Int64GetDatum((int64) stats.overflow_pages); values[3] = Int64GetDatum((int64) stats.bitmap_pages); values[4] = Int64GetDatum((int64) stats.unused_pages); values[5] = Int64GetDatum(stats.live_items); values[6] = Int64GetDatum(stats.dead_items); values[7] = Float8GetDatum(free_percent); tuple = heap_form_tuple(tupleDesc, values, nulls); PG_RETURN_DATUM(HeapTupleGetDatum(tuple)); }
/* ----------
 * toast_fetch_datum_slice -
 *
 *  Reconstruct a segment of a Datum from the chunks saved
 *  in the toast relation
 *
 *  attr:        external (on-disk) toast pointer; must not be compressed
 *  sliceoffset: byte offset of the first wanted byte; an offset at or past
 *               the end of the value yields an empty result
 *  length:      number of bytes wanted; a negative value, or one that would
 *               run past the end of the value, means "up to the end"
 *
 *  Returns a freshly palloc'd varlena holding the requested bytes.
 *  Raises ERROR if the chunks found do not match the expected sequence
 *  numbers or sizes.
 * ----------
 */
static struct varlena *
toast_fetch_datum_slice(struct varlena * attr, int32 sliceoffset, int32 length)
{
    Relation    toastrel;
    Relation    toastidx;
    ScanKeyData toastkey[3];
    int         nscankeys;
    SysScanDesc toastscan;
    HeapTuple   ttup;
    TupleDesc   toasttupDesc;
    struct varlena *result;
    struct varatt_external toast_pointer;
    int32       attrsize;
    int32       residx;
    int32       nextidx;
    int         numchunks;
    int         startchunk;
    int         endchunk;
    int32       startoffset;
    int32       endoffset;
    int         totalchunks;
    Pointer     chunk;
    bool        isnull;
    char       *chunkdata;
    int32       chunksize;
    int32       chcpystrt;
    int32       chcpyend;

    Assert(VARATT_IS_EXTERNAL(attr));

    /* Must copy to access aligned fields */
    VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr);

    /*
     * It's nonsense to fetch slices of a compressed datum -- this isn't lo_*
     * we can't return a compressed datum which is meaningful to toast later
     */
    Assert(!VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer));

    attrsize = toast_pointer.va_extsize;
    totalchunks = ((attrsize - 1) / TOAST_MAX_CHUNK_SIZE) + 1;

    if (sliceoffset >= attrsize)
    {
        sliceoffset = 0;
        length = 0;
    }

    /*
     * Clamp the slice to the end of the value.  The test is written as
     * "length > attrsize - sliceoffset" rather than
     * "sliceoffset + length > attrsize" because the sum can overflow int32
     * when the caller passes a very large length, whereas the difference
     * cannot: sliceoffset < attrsize holds at this point.  (Assumes
     * sliceoffset is non-negative, as before.)
     */
    if (length < 0 || length > attrsize - sliceoffset)
        length = attrsize - sliceoffset;

    result = (struct varlena *) palloc(length + VARHDRSZ);

    if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
        SET_VARSIZE_COMPRESSED(result, length + VARHDRSZ);
    else
        SET_VARSIZE(result, length + VARHDRSZ);

    if (length == 0)
        return result;          /* Can save a lot of work at this point! */

    startchunk = sliceoffset / TOAST_MAX_CHUNK_SIZE;
    endchunk = (sliceoffset + length - 1) / TOAST_MAX_CHUNK_SIZE;
    numchunks = (endchunk - startchunk) + 1;

    startoffset = sliceoffset % TOAST_MAX_CHUNK_SIZE;
    endoffset = (sliceoffset + length - 1) % TOAST_MAX_CHUNK_SIZE;

    /*
     * Open the toast relation and its index
     */
    toastrel = heap_open(toast_pointer.va_toastrelid, AccessShareLock);
    toasttupDesc = toastrel->rd_att;
    toastidx = index_open(toastrel->rd_rel->reltoastidxid, AccessShareLock);

    /*
     * Setup a scan key to fetch from the index. This is either two keys or
     * three depending on the number of chunks.
     */
    ScanKeyInit(&toastkey[0],
                (AttrNumber) 1,
                BTEqualStrategyNumber, F_OIDEQ,
                ObjectIdGetDatum(toast_pointer.va_valueid));

    /*
     * Use equality condition for one chunk, a range condition otherwise:
     */
    if (numchunks == 1)
    {
        ScanKeyInit(&toastkey[1],
                    (AttrNumber) 2,
                    BTEqualStrategyNumber, F_INT4EQ,
                    Int32GetDatum(startchunk));
        nscankeys = 2;
    }
    else
    {
        ScanKeyInit(&toastkey[1],
                    (AttrNumber) 2,
                    BTGreaterEqualStrategyNumber, F_INT4GE,
                    Int32GetDatum(startchunk));
        ScanKeyInit(&toastkey[2],
                    (AttrNumber) 2,
                    BTLessEqualStrategyNumber, F_INT4LE,
                    Int32GetDatum(endchunk));
        nscankeys = 3;
    }

    /*
     * Read the chunks by index
     *
     * The index is on (valueid, chunkidx) so they will come in order
     */
    nextidx = startchunk;

    toastscan = systable_beginscan_ordered(toastrel, toastidx,
                                           SnapshotToast, nscankeys, toastkey);
    while ((ttup = systable_getnext_ordered(toastscan, ForwardScanDirection)) != NULL)
    {
        /*
         * Have a chunk, extract the sequence number and the data
         */
        residx = DatumGetInt32(fastgetattr(ttup, 2, toasttupDesc, &isnull));
        Assert(!isnull);
        chunk = DatumGetPointer(fastgetattr(ttup, 3, toasttupDesc, &isnull));
        Assert(!isnull);
        if (!VARATT_IS_EXTENDED(chunk))
        {
            chunksize = VARSIZE(chunk) - VARHDRSZ;
            chunkdata = VARDATA(chunk);
        }
        else if (VARATT_IS_SHORT(chunk))
        {
            /* could happen due to heap_form_tuple doing its thing */
            chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT;
            chunkdata = VARDATA_SHORT(chunk);
        }
        else
        {
            /* should never happen */
            elog(ERROR, "found toasted toast chunk for toast value %u in %s",
                 toast_pointer.va_valueid,
                 RelationGetRelationName(toastrel));
            chunksize = 0;      /* keep compiler quiet */
            chunkdata = NULL;
        }

        /*
         * Some checks on the data we've found
         */
        if ((residx != nextidx) || (residx > endchunk) || (residx < startchunk))
            elog(ERROR, "unexpected chunk number %d (expected %d) for toast value %u in %s",
                 residx, nextidx,
                 toast_pointer.va_valueid,
                 RelationGetRelationName(toastrel));
        if (residx < totalchunks - 1)
        {
            /* Every chunk but the last must be exactly full */
            if (chunksize != TOAST_MAX_CHUNK_SIZE)
                elog(ERROR, "unexpected chunk size %d (expected %d) in chunk %d of %d for toast value %u in %s when fetching slice",
                     chunksize, (int) TOAST_MAX_CHUNK_SIZE,
                     residx, totalchunks,
                     toast_pointer.va_valueid,
                     RelationGetRelationName(toastrel));
        }
        else if (residx == totalchunks - 1)
        {
            /* The last chunk must account for exactly the remaining bytes */
            if ((residx * TOAST_MAX_CHUNK_SIZE + chunksize) != attrsize)
                elog(ERROR, "unexpected chunk size %d (expected %d) in final chunk %d for toast value %u in %s when fetching slice",
                     chunksize,
                     (int) (attrsize - residx * TOAST_MAX_CHUNK_SIZE),
                     residx,
                     toast_pointer.va_valueid,
                     RelationGetRelationName(toastrel));
        }
        else
            elog(ERROR, "unexpected chunk number %d (out of range %d..%d) for toast value %u in %s",
                 residx,
                 0, totalchunks - 1,
                 toast_pointer.va_valueid,
                 RelationGetRelationName(toastrel));

        /*
         * Copy the data into proper place in our result
         *
         * Only the first and last chunks of the slice can be partial; trim
         * the copied byte range of those to the slice boundaries.
         */
        chcpystrt = 0;
        chcpyend = chunksize - 1;
        if (residx == startchunk)
            chcpystrt = startoffset;
        if (residx == endchunk)
            chcpyend = endoffset;

        memcpy(VARDATA(result) +
               (residx * TOAST_MAX_CHUNK_SIZE - sliceoffset) + chcpystrt,
               chunkdata + chcpystrt,
               (chcpyend - chcpystrt) + 1);

        nextidx++;
    }

    /*
     * Final checks that we successfully fetched the datum
     */
    if (nextidx != (endchunk + 1))
        elog(ERROR, "missing chunk number %d for toast value %u in %s",
             nextidx,
             toast_pointer.va_valueid,
             RelationGetRelationName(toastrel));

    /*
     * End scan and close relations
     */
    systable_endscan_ordered(toastscan);
    index_close(toastidx, AccessShareLock);
    heap_close(toastrel, AccessShareLock);

    return result;
}
/* ----------
 * toast_save_datum -
 *
 *  Write one datum out of line into the toast relation of "rel", split
 *  into chunks of at most TOAST_MAX_CHUNK_SIZE bytes, and hand back a
 *  Datum that points at a newly palloc'd TOAST pointer describing it.
 *
 *  rel:     relation whose toast table (rd_rel->reltoastrelid) receives
 *           the chunks
 *  value:   the varlena datum to store
 *  options: passed through unchanged to heap_insert()
 * ----------
 */
static Datum
toast_save_datum(Relation rel, Datum value, int options)
{
    Relation    toast_heap;
    Relation    toast_index;
    HeapTuple   chunk_tuple;
    TupleDesc   toast_desc;
    Datum       col_values[3];
    bool        col_isnull[3];
    CommandId   cid = GetCurrentCommandId(true);
    struct varlena *pointer_datum;
    struct varatt_external ext;
    struct
    {
        struct varlena hdr;
        char        data[TOAST_MAX_CHUNK_SIZE]; /* make struct big enough */
        int32       align_it;   /* ensure struct is aligned well enough */
    }           chunk_buf;
    int32       this_len;
    int32       seq;
    char       *src;
    int32       remaining;
    Pointer     raw = DatumGetPointer(value);

    /*
     * The toast table and its index are both needed.  Besides receiving the
     * new entries, the index lets us verify that the OID picked for the
     * value is unused, even though it indexes more than just the OID.
     */
    toast_heap = heap_open(rel->rd_rel->reltoastrelid, RowExclusiveLock);
    toast_desc = toast_heap->rd_att;
    toast_index = index_open(toast_heap->rd_rel->reltoastidxid,
                             RowExclusiveLock);

    /*
     * Locate the payload and work out the two sizes kept in the pointer:
     * va_rawsize is the size the datum would have fully uncompressed with a
     * normal header; va_extsize is the number of payload bytes actually
     * written to the toast table.
     */
    if (VARATT_IS_SHORT(raw))
    {
        src = VARDATA_SHORT(raw);
        remaining = VARSIZE_SHORT(raw) - VARHDRSZ_SHORT;
        /* report the raw size as though the header were full-sized */
        ext.va_rawsize = remaining + VARHDRSZ;
        ext.va_extsize = remaining;
    }
    else if (VARATT_IS_COMPRESSED(raw))
    {
        src = VARDATA(raw);
        remaining = VARSIZE(raw) - VARHDRSZ;
        /* a compressed datum records only its payload size; add the header */
        ext.va_rawsize = VARRAWSIZE_4B_C(raw) + VARHDRSZ;
        ext.va_extsize = remaining;
        /* the sizes just stored should themselves indicate compression */
        Assert(VARATT_EXTERNAL_IS_COMPRESSED(ext));
    }
    else
    {
        src = VARDATA(raw);
        remaining = VARSIZE(raw) - VARHDRSZ;
        ext.va_rawsize = VARSIZE(raw);
        ext.va_extsize = remaining;
    }

    /*
     * Record which toast table the pointer refers to.  Usually that is the
     * table just opened; table-rewriting operations such as CLUSTER instead
     * set rd_toastoid to the OID of the permanent toast table, and that
     * takes precedence when valid.
     */
    ext.va_toastrelid = OidIsValid(rel->rd_toastoid)
        ? rel->rd_toastoid
        : RelationGetRelid(toast_heap);

    /* Pick a value OID that is not yet present in the toast table */
    ext.va_valueid = GetNewOidWithIndex(toast_heap,
                                        RelationGetRelid(toast_index),
                                        (AttrNumber) 1);

    /*
     * Columns 0 (value OID) and 2 (chunk buffer) are the same for every
     * chunk; column 1 (sequence number) is filled in per chunk below.
     */
    col_values[0] = ObjectIdGetDatum(ext.va_valueid);
    col_values[2] = PointerGetDatum(&chunk_buf);
    col_isnull[0] = col_isnull[1] = col_isnull[2] = false;

    /*
     * Emit one heap tuple plus one index entry per chunk
     */
    for (seq = 0; remaining > 0; seq++)
    {
        /* this chunk holds as much as fits, or whatever is left */
        this_len = Min(TOAST_MAX_CHUNK_SIZE, remaining);

        /* form and insert the chunk's heap tuple */
        col_values[1] = Int32GetDatum(seq);
        SET_VARSIZE(&chunk_buf, this_len + VARHDRSZ);
        memcpy(VARDATA(&chunk_buf), src, this_len);
        chunk_tuple = heap_form_tuple(toast_desc, col_values, col_isnull);

        heap_insert(toast_heap, chunk_tuple, cid, options, NULL);

        /*
         * Add the matching index entry.  FormIndexDatum is bypassed on the
         * knowledge that the index columns are simply the leading columns
         * of the table.
         *
         * This also means a user-created index on the TOAST table would not
         * be maintained, since nothing else is updated here.
         */
        index_insert(toast_index, col_values, col_isnull,
                     &(chunk_tuple->t_self),
                     toast_heap,
                     toast_index->rd_index->indisunique ?
                     UNIQUE_CHECK_YES : UNIQUE_CHECK_NO);

        /* the tuple copy is no longer needed */
        heap_freetuple(chunk_tuple);

        /* advance past the bytes just stored */
        remaining -= this_len;
        src += this_len;
    }

    /*
     * All chunks written; release the index and the toast table
     */
    index_close(toast_index, RowExclusiveLock);
    heap_close(toast_heap, RowExclusiveLock);

    /*
     * Build the TOAST pointer that is returned to the caller
     */
    pointer_datum = (struct varlena *) palloc(TOAST_POINTER_SIZE);
    SET_VARSIZE_EXTERNAL(pointer_datum, TOAST_POINTER_SIZE);
    memcpy(VARDATA_EXTERNAL(pointer_datum), &ext, sizeof(ext));

    return PointerGetDatum(pointer_datum);
}
/* ----------
 * toast_save_datum -
 *
 *  Store one datum in the toast relation of "rel", cut into chunks of at
 *  most TOAST_MAX_CHUNK_SIZE bytes, and return a Datum pointing at a
 *  palloc'd varattrib reference that describes the stored value.
 * ----------
 */
static Datum
toast_save_datum(Relation rel, Datum value)
{
    Relation    toast_heap;
    Relation    toast_index;
    HeapTuple   chunk_tuple;
    InsertIndexResult ins_result;
    TupleDesc   toast_desc;
    Datum       col_values[3];
    char        col_nulls[3];
    varattrib  *ref;
    struct
    {
        struct varlena hdr;
        char        data[TOAST_MAX_CHUNK_SIZE];
    }           chunk_buf;
    int32       this_len;
    int32       seq;
    char       *src;
    int32       remaining;

    /*
     * Build the external reference first: header flags, sizes, a new value
     * OID and the OID of the toast relation.
     */
    ref = (varattrib *) palloc(sizeof(varattrib));

    ref->va_header = sizeof(varattrib) | VARATT_FLAG_EXTERNAL;
    if (!VARATT_IS_COMPRESSED(value))
        ref->va_content.va_external.va_rawsize = VARATT_SIZE(value);
    else
    {
        /* carry over the compressed flag and the recorded raw size */
        ref->va_header |= VARATT_FLAG_COMPRESSED;
        ref->va_content.va_external.va_rawsize =
            ((varattrib *) value)->va_content.va_compressed.va_rawsize;
    }

    ref->va_content.va_external.va_extsize =
        VARATT_SIZE(value) - VARHDRSZ;
    ref->va_content.va_external.va_valueid = newoid();
    ref->va_content.va_external.va_toastrelid =
        rel->rd_rel->reltoastrelid;

    /*
     * Columns 0 (value OID) and 2 (chunk buffer) stay fixed for every
     * chunk; none of the three columns is marked null.
     */
    col_values[0] = ObjectIdGetDatum(ref->va_content.va_external.va_valueid);
    col_values[2] = PointerGetDatum(&chunk_buf);
    col_nulls[0] = col_nulls[1] = col_nulls[2] = ' ';

    /*
     * Payload to be written out
     */
    src = VARATT_DATA(value);
    remaining = VARATT_SIZE(value) - VARHDRSZ;

    /*
     * Open the toast relation and its index
     */
    toast_heap = heap_open(rel->rd_rel->reltoastrelid, RowExclusiveLock);
    toast_desc = toast_heap->rd_att;
    toast_index = index_open(toast_heap->rd_rel->reltoastidxid);

    /*
     * One heap tuple and one index entry per chunk
     */
    for (seq = 0; remaining > 0; seq++)
    {
        /* take a full chunk, or whatever is left */
        this_len = Min(TOAST_MAX_CHUNK_SIZE, remaining);

        /* form the chunk tuple and insert it into the toast heap */
        col_values[1] = Int32GetDatum(seq);
        VARATT_SIZEP(&chunk_buf) = this_len + VARHDRSZ;
        memcpy(VARATT_DATA(&chunk_buf), src, this_len);
        chunk_tuple = heap_formtuple(toast_desc, col_values, col_nulls);
        if (!HeapTupleIsValid(chunk_tuple))
            elog(ERROR, "failed to build TOAST tuple");

        simple_heap_insert(toast_heap, chunk_tuple);

        /*
         * Add the index entry.  FormIndexDatum is skipped on the knowledge
         * that the index columns are just the leading columns of the table.
         *
         * As a consequence, a user-created index on the TOAST table would
         * not be maintained: nothing else is updated here.
         */
        ins_result = index_insert(toast_index, col_values, col_nulls,
                                  &(chunk_tuple->t_self),
                                  toast_heap,
                                  toast_index->rd_index->indisunique);
        if (ins_result == NULL)
            elog(ERROR, "failed to insert index entry for TOAST tuple");

        /* release the per-chunk allocations */
        pfree(ins_result);
        heap_freetuple(chunk_tuple);

        /* step past the bytes just stored */
        remaining -= this_len;
        src += this_len;
    }

    /*
     * Finished: close index and toast relation, return the reference
     */
    index_close(toast_index);
    heap_close(toast_heap, RowExclusiveLock);

    return PointerGetDatum(ref);
}
/* ----------
 * toast_fetch_datum_slice -
 *
 *  Reconstruct a segment of a varattrib from the chunks saved
 *  in the toast relation
 *
 *  attr:        external toast reference to read from
 *  sliceoffset: byte offset of the first wanted byte; an offset at or past
 *               the end of the value yields an empty result
 *  length:      number of bytes wanted; a negative value, or one that would
 *               run past the end of the value, means "up to the end"
 *
 *  Returns a freshly palloc'd varattrib holding the requested bytes.
 *  Raises ERROR if the chunks found do not match the expected sequence
 *  numbers or sizes.
 * ----------
 */
static varattrib *
toast_fetch_datum_slice(varattrib *attr, int32 sliceoffset, int32 length)
{
    Relation    toastrel;
    Relation    toastidx;
    ScanKeyData toastkey[3];
    int         nscankeys;
    IndexScanDesc toastscan;
    HeapTuple   ttup;
    TupleDesc   toasttupDesc;
    varattrib  *result;
    int32       attrsize;
    int32       residx;
    int32       nextidx;
    int         numchunks;
    int         startchunk;
    int         endchunk;
    int32       startoffset;
    int32       endoffset;
    int         totalchunks;
    Pointer     chunk;
    bool        isnull;
    int32       chunksize;
    int32       chcpystrt;
    int32       chcpyend;

    attrsize = attr->va_content.va_external.va_extsize;
    totalchunks = ((attrsize - 1) / TOAST_MAX_CHUNK_SIZE) + 1;

    if (sliceoffset >= attrsize)
    {
        sliceoffset = 0;
        length = 0;
    }

    /*
     * Clamp the slice to the end of the value.  The test is written as
     * "length > attrsize - sliceoffset" rather than
     * "sliceoffset + length > attrsize" because the sum can overflow int32
     * when the caller passes a very large length, whereas the difference
     * cannot: sliceoffset < attrsize holds at this point.  (Assumes
     * sliceoffset is non-negative, as before.)
     */
    if (length < 0 || length > attrsize - sliceoffset)
        length = attrsize - sliceoffset;

    result = (varattrib *) palloc(length + VARHDRSZ);
    VARATT_SIZEP(result) = length + VARHDRSZ;

    if (VARATT_IS_COMPRESSED(attr))
        VARATT_SIZEP(result) |= VARATT_FLAG_COMPRESSED;

    if (length == 0)
        return result;          /* Can save a lot of work at this point! */

    startchunk = sliceoffset / TOAST_MAX_CHUNK_SIZE;
    endchunk = (sliceoffset + length - 1) / TOAST_MAX_CHUNK_SIZE;
    numchunks = (endchunk - startchunk) + 1;

    startoffset = sliceoffset % TOAST_MAX_CHUNK_SIZE;
    endoffset = (sliceoffset + length - 1) % TOAST_MAX_CHUNK_SIZE;

    /*
     * Open the toast relation and its index
     */
    toastrel = heap_open(attr->va_content.va_external.va_toastrelid,
                         AccessShareLock);
    toasttupDesc = toastrel->rd_att;
    toastidx = index_open(toastrel->rd_rel->reltoastidxid);

    /*
     * Setup a scan key to fetch from the index. This is either two keys
     * or three depending on the number of chunks.
     */
    ScanKeyEntryInitialize(&toastkey[0],
                           (bits16) 0,
                           (AttrNumber) 1,
                           (RegProcedure) F_OIDEQ,
                           ObjectIdGetDatum(attr->va_content.va_external.va_valueid));

    /*
     * Now dependent on number of chunks: equality on the chunk number for a
     * single chunk, a >= / <= range otherwise.
     */
    if (numchunks == 1)
    {
        ScanKeyEntryInitialize(&toastkey[1],
                               (bits16) 0,
                               (AttrNumber) 2,
                               (RegProcedure) F_INT4EQ,
                               Int32GetDatum(startchunk));
        nscankeys = 2;
    }
    else
    {
        ScanKeyEntryInitialize(&toastkey[1],
                               (bits16) 0,
                               (AttrNumber) 2,
                               (RegProcedure) F_INT4GE,
                               Int32GetDatum(startchunk));
        ScanKeyEntryInitialize(&toastkey[2],
                               (bits16) 0,
                               (AttrNumber) 2,
                               (RegProcedure) F_INT4LE,
                               Int32GetDatum(endchunk));
        nscankeys = 3;
    }

    /*
     * Read the chunks by index
     *
     * The index is on (valueid, chunkidx) so they will come in order
     */
    nextidx = startchunk;

    toastscan = index_beginscan(toastrel, toastidx, SnapshotToast,
                                nscankeys, toastkey);
    while ((ttup = index_getnext(toastscan, ForwardScanDirection)) != NULL)
    {
        /*
         * Have a chunk, extract the sequence number and the data
         */
        residx = DatumGetInt32(heap_getattr(ttup, 2, toasttupDesc, &isnull));
        Assert(!isnull);
        chunk = DatumGetPointer(heap_getattr(ttup, 3, toasttupDesc, &isnull));
        Assert(!isnull);
        chunksize = VARATT_SIZE(chunk) - VARHDRSZ;

        /*
         * Some checks on the data we've found
         */
        if ((residx != nextidx) || (residx > endchunk) || (residx < startchunk))
            elog(ERROR, "unexpected chunk number %d (expected %d) for toast value %u",
                 residx, nextidx,
                 attr->va_content.va_external.va_valueid);
        if (residx < totalchunks - 1)
        {
            /* Every chunk but the last must be exactly full */
            if (chunksize != TOAST_MAX_CHUNK_SIZE)
                elog(ERROR, "unexpected chunk size %d in chunk %d for toast value %u",
                     chunksize, residx,
                     attr->va_content.va_external.va_valueid);
        }
        else
        {
            /* The last chunk must account for exactly the remaining bytes */
            if ((residx * TOAST_MAX_CHUNK_SIZE + chunksize) != attrsize)
                elog(ERROR, "unexpected chunk size %d in chunk %d for toast value %u",
                     chunksize, residx,
                     attr->va_content.va_external.va_valueid);
        }

        /*
         * Copy the data into proper place in our result
         *
         * Only the first and last chunks of the slice can be partial; trim
         * the copied byte range of those to the slice boundaries.
         */
        chcpystrt = 0;
        chcpyend = chunksize - 1;
        if (residx == startchunk)
            chcpystrt = startoffset;
        if (residx == endchunk)
            chcpyend = endoffset;

        memcpy(((char *) VARATT_DATA(result)) +
               (residx * TOAST_MAX_CHUNK_SIZE - sliceoffset) + chcpystrt,
               VARATT_DATA(chunk) + chcpystrt,
               (chcpyend - chcpystrt) + 1);

        nextidx++;
    }

    /*
     * Final checks that we successfully fetched the datum
     */
    if (nextidx != (endchunk + 1))
        elog(ERROR, "missing chunk number %d for toast value %u",
             nextidx,
             attr->va_content.va_external.va_valueid);

    /*
     * End scan and close relations
     */
    index_endscan(toastscan);
    index_close(toastidx);
    heap_close(toastrel, AccessShareLock);

    return result;
}
/* ----------
 * toast_fetch_datum -
 *
 *	Read back every chunk of an externally stored value from its toast
 *	relation and assemble them into one palloc'd varattrib.
 *
 *	The va_external fields of attr (toast relation OID, value OID and
 *	stored size) are read unconditionally.  The size word of the result
 *	carries the compressed flag whenever attr itself tests as compressed.
 *	A chunk that is out of sequence, out of range or of the wrong size,
 *	as well as a missing chunk, is reported through elog(ERROR).
 * ----------
 */
static varattrib *
toast_fetch_datum(varattrib *attr)
{
	int32		ressize = attr->va_content.va_external.va_extsize;
	int32		numchunks = ((ressize - 1) / TOAST_MAX_CHUNK_SIZE) + 1;
	int32		lastchunk = numchunks - 1;
	int32		expected;
	int32		seqno;
	int32		datalen;
	varattrib  *result;
	char	   *dest;
	Relation	toast_rel;
	Relation	toast_idx;
	TupleDesc	toast_desc;
	ScanKeyData key;
	IndexScanDesc scan;
	HeapTuple	chunktup;
	Pointer		chunkptr;
	bool		isnull;

	/*
	 * Allocate the output value and stamp its size word, including the
	 * compressed flag if the source reference has it.
	 */
	result = (varattrib *) palloc(ressize + VARHDRSZ);
	if (VARATT_IS_COMPRESSED(attr))
		VARATT_SIZEP(result) = (ressize + VARHDRSZ) | VARATT_FLAG_COMPRESSED;
	else
		VARATT_SIZEP(result) = ressize + VARHDRSZ;
	dest = (char *) VARATT_DATA(result);

	/*
	 * Open the toast relation and the index named by its reltoastidxid
	 */
	toast_rel = heap_open(attr->va_content.va_external.va_toastrelid,
						  AccessShareLock);
	toast_desc = toast_rel->rd_att;
	toast_idx = index_open(toast_rel->rd_rel->reltoastidxid);

	/*
	 * Single scan key: first index column equals our va_valueid
	 */
	ScanKeyEntryInitialize(&key,
						   (bits16) 0,
						   (AttrNumber) 1,
						   (RegProcedure) F_OIDEQ,
						   ObjectIdGetDatum(attr->va_content.va_external.va_valueid));

	/*
	 * Walk the chunks through the index.
	 *
	 * The index covers (valueid, chunkidx), so the chunks arrive in chunkidx
	 * order without asking for it; "expected" tracks the sequence number
	 * the next chunk has to carry.
	 */
	scan = index_beginscan(toast_rel, toast_idx, SnapshotToast, 1, &key);
	for (expected = 0;
		 (chunktup = index_getnext(scan, ForwardScanDirection)) != NULL;
		 expected++)
	{
		/*
		 * Column 2 is the chunk sequence number, column 3 the chunk data
		 */
		seqno = DatumGetInt32(heap_getattr(chunktup, 2, toast_desc, &isnull));
		Assert(!isnull);
		chunkptr = DatumGetPointer(heap_getattr(chunktup, 3, toast_desc, &isnull));
		Assert(!isnull);
		datalen = VARATT_SIZE(chunkptr) - VARHDRSZ;

		/*
		 * Sanity checks: right sequence number, inside the expected range,
		 * and of the size its position implies.
		 */
		if (seqno != expected)
			elog(ERROR, "unexpected chunk number %d (expected %d) for toast value %u",
				 seqno, expected,
				 attr->va_content.va_external.va_valueid);

		if (seqno > lastchunk)
			elog(ERROR, "unexpected chunk number %d for toast value %u",
				 seqno,
				 attr->va_content.va_external.va_valueid);

		if (seqno < lastchunk)
		{
			/* every chunk but the last must be completely full */
			if (datalen != TOAST_MAX_CHUNK_SIZE)
				elog(ERROR, "unexpected chunk size %d in chunk %d for toast value %u",
					 datalen, seqno,
					 attr->va_content.va_external.va_valueid);
		}
		else
		{
			/* the last chunk must end exactly at the stored size */
			if ((seqno * TOAST_MAX_CHUNK_SIZE + datalen) != ressize)
				elog(ERROR, "unexpected chunk size %d in chunk %d for toast value %u",
					 datalen, seqno,
					 attr->va_content.va_external.va_valueid);
		}

		/*
		 * Drop the chunk payload into its slot of the result
		 */
		memcpy(dest + seqno * TOAST_MAX_CHUNK_SIZE,
			   VARATT_DATA(chunkptr),
			   datalen);
	}

	/*
	 * The scan must have delivered exactly numchunks chunks
	 */
	if (expected != numchunks)
		elog(ERROR, "missing chunk number %d for toast value %u",
			 expected,
			 attr->va_content.va_external.va_valueid);

	/*
	 * Shut down the scan and release both relations
	 */
	index_endscan(scan);
	index_close(toast_idx);
	heap_close(toast_rel, AccessShareLock);

	return result;
}
/* ----------------------------------------------------------------
 *		ExecInitIndexScan
 *
 *		Build the executor state for an index scan plan node: set up
 *		the expression and tuple-slot machinery, open the scanned
 *		relation, and -- unless this is an EXPLAIN-only run -- open the
 *		index, derive the scan keys and start the index scan.
 *
 *		Note: two relations are tracked in the state, the base
 *		relation being scanned and the index used to scan it.
 * ----------------------------------------------------------------
 */
IndexScanState *
ExecInitIndexScan(IndexScan *node, EState *estate, int eflags)
{
	IndexScanState *indexstate;
	PlanState  *planstate;
	Relation	currentRelation;

	/*
	 * Allocate the state node and hook it up to its plan and EState
	 */
	indexstate = makeNode(IndexScanState);
	planstate = &indexstate->ss.ps;
	planstate->plan = (Plan *) node;
	planstate->state = estate;

	/*
	 * Miscellaneous initialization: give the node its expression context
	 */
	ExecAssignExprContext(estate, planstate);

	planstate->ps_TupFromTlist = false;

	/*
	 * Initialize child expressions.
	 *
	 * The indexqual expression is not initialized as a whole; only the
	 * pieces that act as runtime keys are (see below), and the same holds
	 * for indexorderby.  indexqualorig, on the other hand, is always
	 * initialized although it is needed only in uncommon cases -- any
	 * SubPlans it contains have to be discovered at this point.
	 */
	planstate->targetlist = (List *)
		ExecInitExpr((Expr *) node->scan.plan.targetlist,
					 (PlanState *) indexstate);
	planstate->qual = (List *)
		ExecInitExpr((Expr *) node->scan.plan.qual,
					 (PlanState *) indexstate);
	indexstate->indexqualorig = (List *)
		ExecInitExpr((Expr *) node->indexqualorig,
					 (PlanState *) indexstate);

	/*
	 * Set up the result and scan tuple slots
	 */
	ExecInitResultTupleSlot(estate, planstate);
	ExecInitScanTupleSlot(estate, &indexstate->ss);

	/*
	 * Open the base relation (ExecOpenScanRelation takes care of locking)
	 */
	currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid);

	indexstate->ss.ss_currentScanDesc = NULL;	/* no heap scan here */
	indexstate->ss.ss_currentRelation = currentRelation;

	/*
	 * The scan tuple type comes straight from the relation descriptor
	 */
	ExecAssignScanType(&indexstate->ss, RelationGetDescr(currentRelation));

	/*
	 * Result tuple type and projection info
	 */
	ExecAssignResultTypeFromTL(planstate);
	ExecAssignScanProjectionInfo(&indexstate->ss);

	/*
	 * For a plain EXPLAIN the plan is never run, so quit before touching the
	 * index.  That lets an index-advisor plugin EXPLAIN plans that refer to
	 * indexes which do not exist.
	 */
	if ((eflags & EXEC_FLAG_EXPLAIN_ONLY) != 0)
		return indexstate;

	/*
	 * Open the index relation.
	 *
	 * When the parent table is a target relation of the query, InitPlan has
	 * already opened and write-locked the index, so no further lock is
	 * taken.  In every other case an ordinary reader's lock is needed.
	 */
	if (ExecRelationIsTargetRelation(estate, node->scan.scanrelid))
		indexstate->iss_RelationDesc = index_open(node->indexid, NoLock);
	else
		indexstate->iss_RelationDesc = index_open(node->indexid,
												  AccessShareLock);

	/*
	 * Start with an empty set of runtime keys
	 */
	indexstate->iss_RuntimeKeysReady = false;
	indexstate->iss_RuntimeKeys = NULL;
	indexstate->iss_NumRuntimeKeys = 0;

	/*
	 * Turn the index qualification into scan keys
	 */
	ExecIndexBuildScanKeys((PlanState *) indexstate,
						   indexstate->iss_RelationDesc,
						   node->scan.scanrelid,
						   node->indexqual,
						   false,
						   &indexstate->iss_ScanKeys,
						   &indexstate->iss_NumScanKeys,
						   &indexstate->iss_RuntimeKeys,
						   &indexstate->iss_NumRuntimeKeys,
						   NULL,	/* no ArrayKeys */
						   NULL);

	/*
	 * ORDER BY expressions go through the same conversion; their runtime
	 * keys are added to the same array.
	 */
	ExecIndexBuildScanKeys((PlanState *) indexstate,
						   indexstate->iss_RelationDesc,
						   node->scan.scanrelid,
						   node->indexorderby,
						   true,
						   &indexstate->iss_OrderByKeys,
						   &indexstate->iss_NumOrderByKeys,
						   &indexstate->iss_RuntimeKeys,
						   &indexstate->iss_NumRuntimeKeys,
						   NULL,	/* no ArrayKeys */
						   NULL);

	/*
	 * Runtime keys need an ExprContext of their own, because the node's
	 * standard context is meant to be reset for every tuple.  Build a second
	 * one the same way as the first, stash it as the runtime context, and
	 * put the standard context back in place.
	 */
	if (indexstate->iss_NumRuntimeKeys == 0)
	{
		indexstate->iss_RuntimeContext = NULL;
	}
	else
	{
		ExprContext *stdecontext = planstate->ps_ExprContext;

		ExecAssignExprContext(estate, planstate);
		indexstate->iss_RuntimeContext = planstate->ps_ExprContext;
		planstate->ps_ExprContext = stdecontext;
	}

	/*
	 * Create the scan descriptor
	 */
	indexstate->iss_ScanDesc = index_beginscan(currentRelation,
											   indexstate->iss_RelationDesc,
											   estate->es_snapshot,
											   indexstate->iss_NumScanKeys,
											   indexstate->iss_NumOrderByKeys);

	/*
	 * With no runtime keys to compute, the scan keys can be handed to the
	 * index AM right away.
	 */
	if (indexstate->iss_NumRuntimeKeys == 0)
		index_rescan(indexstate->iss_ScanDesc,
					 indexstate->iss_ScanKeys, indexstate->iss_NumScanKeys,
					 indexstate->iss_OrderByKeys, indexstate->iss_NumOrderByKeys);

	/*
	 * all done.
	 */
	return indexstate;
}
/* ---------- * toast_fetch_datum - * * Reconstruct an in memory Datum from the chunks saved * in the toast relation * ---------- */ static struct varlena * toast_fetch_datum(struct varlena * attr) { Relation toastrel; Relation toastidx; ScanKeyData toastkey; SysScanDesc toastscan; HeapTuple ttup; TupleDesc toasttupDesc; struct varlena *result; struct varatt_external toast_pointer; int32 ressize; int32 residx, nextidx; int32 numchunks; Pointer chunk; bool isnull; char *chunkdata; int32 chunksize; /* Must copy to access aligned fields */ VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); ressize = toast_pointer.va_extsize; numchunks = ((ressize - 1) / TOAST_MAX_CHUNK_SIZE) + 1; result = (struct varlena *) palloc(ressize + VARHDRSZ); if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer)) SET_VARSIZE_COMPRESSED(result, ressize + VARHDRSZ); else SET_VARSIZE(result, ressize + VARHDRSZ); /* * Open the toast relation and its index */ toastrel = heap_open(toast_pointer.va_toastrelid, AccessShareLock); toasttupDesc = toastrel->rd_att; toastidx = index_open(toastrel->rd_rel->reltoastidxid, AccessShareLock); /* * Setup a scan key to fetch from the index by va_valueid */ ScanKeyInit(&toastkey, (AttrNumber) 1, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(toast_pointer.va_valueid)); /* * Read the chunks by index * * Note that because the index is actually on (valueid, chunkidx) we will * see the chunks in chunkidx order, even though we didn't explicitly ask * for it. 
*/ nextidx = 0; toastscan = systable_beginscan_ordered(toastrel, toastidx, SnapshotToast, 1, &toastkey); while ((ttup = systable_getnext_ordered(toastscan, ForwardScanDirection)) != NULL) { /* * Have a chunk, extract the sequence number and the data */ residx = DatumGetInt32(fastgetattr(ttup, 2, toasttupDesc, &isnull)); Assert(!isnull); chunk = DatumGetPointer(fastgetattr(ttup, 3, toasttupDesc, &isnull)); Assert(!isnull); if (!VARATT_IS_EXTENDED(chunk)) { chunksize = VARSIZE(chunk) - VARHDRSZ; chunkdata = VARDATA(chunk); } else if (VARATT_IS_SHORT(chunk)) { /* could happen due to heap_form_tuple doing its thing */ chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT; chunkdata = VARDATA_SHORT(chunk); } else { /* should never happen */ elog(ERROR, "found toasted toast chunk for toast value %u in %s", toast_pointer.va_valueid, RelationGetRelationName(toastrel)); chunksize = 0; /* keep compiler quiet */ chunkdata = NULL; } /* * Some checks on the data we've found */ if (residx != nextidx) elog(ERROR, "unexpected chunk number %d (expected %d) for toast value %u in %s", residx, nextidx, toast_pointer.va_valueid, RelationGetRelationName(toastrel)); if (residx < numchunks - 1) { if (chunksize != TOAST_MAX_CHUNK_SIZE) elog(ERROR, "unexpected chunk size %d (expected %d) in chunk %d of %d for toast value %u in %s", chunksize, (int) TOAST_MAX_CHUNK_SIZE, residx, numchunks, toast_pointer.va_valueid, RelationGetRelationName(toastrel)); } else if (residx == numchunks - 1) { if ((residx * TOAST_MAX_CHUNK_SIZE + chunksize) != ressize) elog(ERROR, "unexpected chunk size %d (expected %d) in final chunk %d for toast value %u in %s", chunksize, (int) (ressize - residx * TOAST_MAX_CHUNK_SIZE), residx, toast_pointer.va_valueid, RelationGetRelationName(toastrel)); } else elog(ERROR, "unexpected chunk number %d (out of range %d..%d) for toast value %u in %s", residx, 0, numchunks - 1, toast_pointer.va_valueid, RelationGetRelationName(toastrel)); /* * Copy the data into proper place in our 
result */ memcpy(VARDATA(result) + residx * TOAST_MAX_CHUNK_SIZE, chunkdata, chunksize); nextidx++; } /* * Final checks that we successfully fetched the datum */ if (nextidx != numchunks) elog(ERROR, "missing chunk number %d for toast value %u in %s", nextidx, toast_pointer.va_valueid, RelationGetRelationName(toastrel)); /* * End scan and close relations */ systable_endscan_ordered(toastscan); index_close(toastidx, AccessShareLock); heap_close(toastrel, AccessShareLock); return result; }
/* * Extract all item values from a BRIN index page * * Usage: SELECT * FROM brin_page_items(get_raw_page('idx', 1), 'idx'::regclass); */ Datum brin_page_items(PG_FUNCTION_ARGS) { bytea *raw_page = PG_GETARG_BYTEA_P(0); Oid indexRelid = PG_GETARG_OID(1); ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; TupleDesc tupdesc; MemoryContext oldcontext; Tuplestorestate *tupstore; Relation indexRel; brin_column_state **columns; BrinDesc *bdesc; BrinMemTuple *dtup; Page page; OffsetNumber offset; AttrNumber attno; bool unusedItem; if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), (errmsg("must be superuser to use raw page functions")))); /* check to see if caller supports us returning a tuplestore */ if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("set-valued function called in context that cannot accept a set"))); if (!(rsinfo->allowedModes & SFRM_Materialize) || rsinfo->expectedDesc == NULL) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("materialize mode required, but it is not allowed in this context"))); /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); /* Build tuplestore to hold the result rows */ oldcontext = MemoryContextSwitchTo(rsinfo->econtext->ecxt_per_query_memory); tupstore = tuplestore_begin_heap(true, false, work_mem); rsinfo->returnMode = SFRM_Materialize; rsinfo->setResult = tupstore; rsinfo->setDesc = tupdesc; MemoryContextSwitchTo(oldcontext); indexRel = index_open(indexRelid, AccessShareLock); bdesc = brin_build_desc(indexRel); /* minimally verify the page we got */ page = verify_brin_page(raw_page, BRIN_PAGETYPE_REGULAR, "regular"); /* * Initialize output functions for all indexed datatypes; simplifies * calling them later. 
*/ columns = palloc(sizeof(brin_column_state *) * RelationGetDescr(indexRel)->natts); for (attno = 1; attno <= bdesc->bd_tupdesc->natts; attno++) { Oid output; bool isVarlena; BrinOpcInfo *opcinfo; int i; brin_column_state *column; opcinfo = bdesc->bd_info[attno - 1]; column = palloc(offsetof(brin_column_state, outputFn) + sizeof(FmgrInfo) * opcinfo->oi_nstored); column->nstored = opcinfo->oi_nstored; for (i = 0; i < opcinfo->oi_nstored; i++) { getTypeOutputInfo(opcinfo->oi_typcache[i]->type_id, &output, &isVarlena); fmgr_info(output, &column->outputFn[i]); } columns[attno - 1] = column; } offset = FirstOffsetNumber; unusedItem = false; dtup = NULL; for (;;) { Datum values[7]; bool nulls[7]; /* * This loop is called once for every attribute of every tuple in the * page. At the start of a tuple, we get a NULL dtup; that's our * signal for obtaining and decoding the next one. If that's not the * case, we output the next attribute. */ if (dtup == NULL) { ItemId itemId; /* verify item status: if there's no data, we can't decode */ itemId = PageGetItemId(page, offset); if (ItemIdIsUsed(itemId)) { dtup = brin_deform_tuple(bdesc, (BrinTuple *) PageGetItem(page, itemId)); attno = 1; unusedItem = false; } else unusedItem = true; } else attno++; MemSet(nulls, 0, sizeof(nulls)); if (unusedItem) { values[0] = UInt16GetDatum(offset); nulls[1] = true; nulls[2] = true; nulls[3] = true; nulls[4] = true; nulls[5] = true; nulls[6] = true; } else { int att = attno - 1; values[0] = UInt16GetDatum(offset); values[1] = UInt32GetDatum(dtup->bt_blkno); values[2] = UInt16GetDatum(attno); values[3] = BoolGetDatum(dtup->bt_columns[att].bv_allnulls); values[4] = BoolGetDatum(dtup->bt_columns[att].bv_hasnulls); values[5] = BoolGetDatum(dtup->bt_placeholder); if (!dtup->bt_columns[att].bv_allnulls) { BrinValues *bvalues = &dtup->bt_columns[att]; StringInfoData s; bool first; int i; initStringInfo(&s); appendStringInfoChar(&s, '{'); first = true; for (i = 0; i < columns[att]->nstored; i++) { 
char *val; if (!first) appendStringInfoString(&s, " .. "); first = false; val = OutputFunctionCall(&columns[att]->outputFn[i], bvalues->bv_values[i]); appendStringInfoString(&s, val); pfree(val); } appendStringInfoChar(&s, '}'); values[6] = CStringGetTextDatum(s.data); pfree(s.data); } else { nulls[6] = true; } } tuplestore_putvalues(tupstore, tupdesc, values, nulls); /* * If the item was unused, jump straight to the next one; otherwise, * the only cleanup needed here is to set our signal to go to the next * tuple in the following iteration, by freeing the current one. */ if (unusedItem) offset = OffsetNumberNext(offset); else if (attno >= bdesc->bd_tupdesc->natts) { pfree(dtup); dtup = NULL; offset = OffsetNumberNext(offset); } /* * If we're beyond the end of the page, we're done. */ if (offset > PageGetMaxOffsetNumber(page)) break; } /* clean up and return the tuplestore */ brin_free_desc(bdesc); tuplestore_donestoring(tupstore); index_close(indexRel, AccessShareLock); return (Datum) 0; }
/* ---------------------------------------------------------------- * ExecOpenIndices * * Find the indices associated with a result relation, open them, * and save information about them in the result ResultRelInfo. * * At entry, caller has already opened and locked * resultRelInfo->ri_RelationDesc. * ---------------------------------------------------------------- */ void ExecOpenIndices(ResultRelInfo *resultRelInfo, bool speculative) { Relation resultRelation = resultRelInfo->ri_RelationDesc; List *indexoidlist; ListCell *l; int len, i; RelationPtr relationDescs; IndexInfo **indexInfoArray; resultRelInfo->ri_NumIndices = 0; /* fast path if no indexes */ if (!RelationGetForm(resultRelation)->relhasindex) return; /* * Get cached list of index OIDs */ indexoidlist = RelationGetIndexList(resultRelation); len = list_length(indexoidlist); if (len == 0) return; /* * allocate space for result arrays */ relationDescs = (RelationPtr) palloc(len * sizeof(Relation)); indexInfoArray = (IndexInfo **) palloc(len * sizeof(IndexInfo *)); resultRelInfo->ri_NumIndices = len; resultRelInfo->ri_IndexRelationDescs = relationDescs; resultRelInfo->ri_IndexRelationInfo = indexInfoArray; /* * For each index, open the index relation and save pg_index info. We * acquire RowExclusiveLock, signifying we will update the index. * * Note: we do this even if the index is not indisready; it's not worth * the trouble to optimize for the case where it isn't. */ i = 0; foreach(l, indexoidlist) { Oid indexOid = lfirst_oid(l); Relation indexDesc; IndexInfo *ii; indexDesc = index_open(indexOid, RowExclusiveLock); /* extract index key information from the index's pg_index info */ ii = BuildIndexInfo(indexDesc); /* * If the indexes are to be used for speculative insertion, add extra * information required by unique index entries. */ if (speculative && ii->ii_Unique) BuildSpeculativeIndexInfo(indexDesc, ii); relationDescs[i] = indexDesc; indexInfoArray[i] = ii; i++; }