/*
 * Calls the user picksplit method for column attno to split the entry vector
 * into two vectors.  May use data from columns attno+n to get a better split.
 * Returns TRUE and sets v->spl_equiv = NULL if the left and right unions of
 * column attno are the same, so the caller may look for a better split.
 * Returns TRUE and sets v->spl_equiv != NULL if there are tuples which may be
 * freely moved to either side.
 */
static bool
gistUserPicksplit(Relation r, GistEntryVector *entryvec, int attno, GistSplitVector *v,
                  IndexTuple *itup, int len, GISTSTATE *giststate)
{
    GIST_SPLITVEC *sv = &v->splitVector;

    /*
     * Now let the user-defined picksplit function set up the split vector;
     * in entryvec there are no null values!
     */
    sv->spl_ldatum_exists = (v->spl_lisnull[attno]) ? false : true;
    sv->spl_rdatum_exists = (v->spl_risnull[attno]) ? false : true;
    sv->spl_ldatum = v->spl_lattr[attno];
    sv->spl_rdatum = v->spl_rattr[attno];
    FunctionCall2Coll(&giststate->picksplitFn[attno],
                      giststate->supportCollation[attno],
                      PointerGetDatum(entryvec),
                      PointerGetDatum(sv));

    if (sv->spl_nleft == 0 || sv->spl_nright == 0)
    {
        ereport(DEBUG1,
                (errcode(ERRCODE_INTERNAL_ERROR),
                 errmsg("picksplit method for column %d of index \"%s\" failed",
                        attno + 1, RelationGetRelationName(r)),
                 errhint("The index is not optimal. To optimize it, contact a developer, or try to use the column as the second one in the CREATE INDEX command.")));

        /*
         * Reinitialize GIST_SPLITVEC.  Although these fields are not used by
         * genericPickSplit(), set them up for further processing.
         */
        sv->spl_ldatum_exists = (v->spl_lisnull[attno]) ? false : true;
        sv->spl_rdatum_exists = (v->spl_risnull[attno]) ? false : true;
        sv->spl_ldatum = v->spl_lattr[attno];
        sv->spl_rdatum = v->spl_rattr[attno];

        genericPickSplit(giststate, entryvec, sv, attno);

        if (sv->spl_ldatum_exists || sv->spl_rdatum_exists)
            supportSecondarySplit(r, giststate, attno, sv,
                                  v->spl_lattr[attno], v->spl_rattr[attno]);
    }
    else
    {
        /* compatibility with old code */
        if (sv->spl_left[sv->spl_nleft - 1] == InvalidOffsetNumber)
            sv->spl_left[sv->spl_nleft - 1] = (OffsetNumber) (entryvec->n - 1);
        if (sv->spl_right[sv->spl_nright - 1] == InvalidOffsetNumber)
            sv->spl_right[sv->spl_nright - 1] = (OffsetNumber) (entryvec->n - 1);

        if (sv->spl_ldatum_exists || sv->spl_rdatum_exists)
        {
            elog(LOG, "picksplit method for column %d of index \"%s\" doesn't support secondary split",
                 attno + 1, RelationGetRelationName(r));

            supportSecondarySplit(r, giststate, attno, sv,
                                  v->spl_lattr[attno], v->spl_rattr[attno]);
        }
    }

    v->spl_lattr[attno] = sv->spl_ldatum;
    v->spl_rattr[attno] = sv->spl_rdatum;
    v->spl_lisnull[attno] = false;
    v->spl_risnull[attno] = false;

    /*
     * If the index is multi-column, we must try to get a smaller bounding
     * box for the subkey(s).
     */
    v->spl_equiv = NULL;

    if (giststate->tupdesc->natts > 1 && attno + 1 != giststate->tupdesc->natts)
    {
        if (gistKeyIsEQ(giststate, attno, sv->spl_ldatum, sv->spl_rdatum))
        {
            /*
             * The left and right key unions are equal, so we can get a
             * better split using the following columns.  Note that the
             * unions for column attno are already done.
             */
            return true;
        }
        else
        {
            int         LenEquiv;

            v->spl_equiv = (bool *) palloc0(sizeof(bool) * (entryvec->n + 1));

            LenEquiv = gistfindgroup(r, giststate, entryvec->vector, v, attno);

            /*
             * If possible, we should distribute the equivalent tuples.
             */
            if (LenEquiv == 0)
            {
                gistunionsubkey(giststate, itup, v, attno + 1);
            }
            else
            {
                cleanupOffsets(sv->spl_left, &sv->spl_nleft, v->spl_equiv, &LenEquiv);
                cleanupOffsets(sv->spl_right, &sv->spl_nright, v->spl_equiv, &LenEquiv);

                gistunionsubkey(giststate, itup, v, attno + 1);
                if (LenEquiv == 1)
                {
                    /*
                     * With only one tuple we just choose left or right by
                     * penalty.  This simplifies the user-defined picksplit.
                     */
                    OffsetNumber toMove = InvalidOffsetNumber;

                    for (toMove = FirstOffsetNumber; toMove < entryvec->n; toMove++)
                        if (v->spl_equiv[toMove])
                            break;
                    Assert(toMove < entryvec->n);

                    placeOne(r, giststate, v, itup[toMove - 1], toMove, attno + 1);

                    /*
                     * Redo gistunionsubkey(): it will not degrade
                     * performance, because this case is very rare.
                     */
                    v->spl_equiv = NULL;
                    gistunionsubkey(giststate, itup, v, attno + 1);

                    return false;
                }
                else if (LenEquiv > 1)
                    return true;
            }
        }
    }

    return false;
}
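/*
 * Illustrative sketch (not PostgreSQL source): the idea behind the
 * genericPickSplit() fallback used above is simply to send the first half
 * of the entries to the left page and the rest to the right page when the
 * opclass picksplit failed to produce a usable split.  The names below
 * (SplitSketch, fallback_split) are hypothetical.
 */
#include <stdlib.h>

typedef struct SplitSketch
{
    int    *left;               /* entry indexes assigned to the left page */
    int     nleft;
    int    *right;              /* entry indexes assigned to the right page */
    int     nright;
} SplitSketch;

static void
fallback_split(int nentries, SplitSketch *res)
{
    int     i;

    res->left = malloc(sizeof(int) * nentries);
    res->right = malloc(sizeof(int) * nentries);
    res->nleft = res->nright = 0;

    for (i = 0; i < nentries; i++)
    {
        if (i < nentries / 2)
            res->left[res->nleft++] = i;
        else
            res->right[res->nright++] = i;
    }
}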
IndexBuildResult * ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) { IndexBuildResult *result; double reltuples; GinBuildState buildstate; Buffer RootBuffer, MetaBuffer; ItemPointerData *list; Datum key; GinNullCategory category; uint32 nlist; MemoryContext oldCtx; OffsetNumber attnum; if (RelationGetNumberOfBlocks(index) != 0) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); initGinState(&buildstate.ginstate, index); buildstate.indtuples = 0; memset(&buildstate.buildStats, 0, sizeof(GinStatsData)); /* initialize the meta page */ MetaBuffer = GinNewBuffer(index); /* initialize the root page */ RootBuffer = GinNewBuffer(index); START_CRIT_SECTION(); GinInitMetabuffer(MetaBuffer); MarkBufferDirty(MetaBuffer); GinInitBuffer(RootBuffer, GIN_LEAF); MarkBufferDirty(RootBuffer); if (RelationNeedsWAL(index)) { XLogRecPtr recptr; Page page; XLogBeginInsert(); XLogRegisterBuffer(0, MetaBuffer, REGBUF_WILL_INIT); XLogRegisterBuffer(1, RootBuffer, REGBUF_WILL_INIT); recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_INDEX); page = BufferGetPage(RootBuffer); PageSetLSN(page, recptr); page = BufferGetPage(MetaBuffer); PageSetLSN(page, recptr); } UnlockReleaseBuffer(MetaBuffer); UnlockReleaseBuffer(RootBuffer); END_CRIT_SECTION(); /* count the root as first entry page */ buildstate.buildStats.nEntryPages++; /* * create a temporary memory context that is used to hold data not yet * dumped out to the index */ buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext, "Gin build temporary context", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); /* * create a temporary memory context that is used for calling * ginExtractEntries(), and can be reset after each tuple */ buildstate.funcCtx = AllocSetContextCreate(CurrentMemoryContext, "Gin build temporary context for user-defined function", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); buildstate.accum.ginstate = &buildstate.ginstate; ginInitBA(&buildstate.accum); /* * Do the heap scan. We disallow sync scan here because dataPlaceToPage * prefers to receive tuples in TID order. */ reltuples = IndexBuildHeapScan(heap, index, indexInfo, false, ginBuildCallback, (void *) &buildstate); /* dump remaining entries to the index */ oldCtx = MemoryContextSwitchTo(buildstate.tmpCtx); ginBeginBAScan(&buildstate.accum); while ((list = ginGetBAEntry(&buildstate.accum, &attnum, &key, &category, &nlist)) != NULL) { /* there could be many entries, so be willing to abort here */ CHECK_FOR_INTERRUPTS(); ginEntryInsert(&buildstate.ginstate, attnum, key, category, list, nlist, &buildstate.buildStats); } MemoryContextSwitchTo(oldCtx); MemoryContextDelete(buildstate.funcCtx); MemoryContextDelete(buildstate.tmpCtx); /* * Update metapage stats */ buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index); ginUpdateStats(index, &buildstate.buildStats); /* * Return statistics */ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; return result; }
/* * lazy_truncate_heap - try to truncate off any empty pages at the end */ static void lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats) { BlockNumber old_rel_pages = vacrelstats->rel_pages; BlockNumber new_rel_pages; PGRUsage ru0; int lock_retry; pg_rusage_init(&ru0); /* * Loop until no more truncating can be done. */ do { /* * We need full exclusive lock on the relation in order to do * truncation. If we can't get it, give up rather than waiting --- we * don't want to block other backends, and we don't want to deadlock * (which is quite possible considering we already hold a lower-grade * lock). */ vacrelstats->lock_waiter_detected = false; lock_retry = 0; while (true) { if (ConditionalLockRelation(onerel, AccessExclusiveLock)) break; /* * Check for interrupts while trying to (re-)acquire the exclusive * lock. */ CHECK_FOR_INTERRUPTS(); if (++lock_retry > (VACUUM_TRUNCATE_LOCK_TIMEOUT / VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL)) { /* * We failed to establish the lock in the specified number of * retries. This means we give up truncating. */ vacrelstats->lock_waiter_detected = true; ereport(elevel, (errmsg("\"%s\": stopping truncate due to conflicting lock request", RelationGetRelationName(onerel)))); return; } pg_usleep(VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL); } /* * Now that we have exclusive lock, look to see if the rel has grown * whilst we were vacuuming with non-exclusive lock. If so, give up; * the newly added pages presumably contain non-deletable tuples. */ new_rel_pages = RelationGetNumberOfBlocks(onerel); if (new_rel_pages != old_rel_pages) { /* * Note: we intentionally don't update vacrelstats->rel_pages with * the new rel size here. If we did, it would amount to assuming * that the new pages are empty, which is unlikely. Leaving the * numbers alone amounts to assuming that the new pages have the * same tuple density as existing ones, which is less unlikely. */ UnlockRelation(onerel, AccessExclusiveLock); return; } /* * Scan backwards from the end to verify that the end pages actually * contain no tuples. This is *necessary*, not optional, because * other backends could have added tuples to these pages whilst we * were vacuuming. */ new_rel_pages = count_nondeletable_pages(onerel, vacrelstats); if (new_rel_pages >= old_rel_pages) { /* can't do anything after all */ UnlockRelation(onerel, AccessExclusiveLock); return; } /* * Okay to truncate. */ RelationTruncate(onerel, new_rel_pages); /* * We can release the exclusive lock as soon as we have truncated. * Other backends can't safely access the relation until they have * processed the smgr invalidation that smgrtruncate sent out ... but * that should happen as part of standard invalidation processing once * they acquire lock on the relation. */ UnlockRelation(onerel, AccessExclusiveLock); /* * Update statistics. Here, it *is* correct to adjust rel_pages * without also touching reltuples, since the tuple count wasn't * changed by the truncation. */ vacrelstats->pages_removed += old_rel_pages - new_rel_pages; vacrelstats->rel_pages = new_rel_pages; ereport(elevel, (errmsg("\"%s\": truncated %u to %u pages", RelationGetRelationName(onerel), old_rel_pages, new_rel_pages), errdetail("%s.", pg_rusage_show(&ru0)))); old_rel_pages = new_rel_pages; } while (new_rel_pages > vacrelstats->nonempty_pages && vacrelstats->lock_waiter_detected); }
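/*
 * Illustrative sketch (not PostgreSQL source): the bounded retry pattern
 * used by lazy_truncate_heap() above.  try_lock(), WAIT_INTERVAL_USEC and
 * TIMEOUT_USEC are hypothetical stand-ins for ConditionalLockRelation()
 * and the VACUUM_TRUNCATE_LOCK_* constants; the actual values are whatever
 * the server defines.
 */
#include <stdbool.h>
#include <unistd.h>

#define WAIT_INTERVAL_USEC  50000       /* sleep between attempts */
#define TIMEOUT_USEC        5000000     /* total time before giving up */

extern bool try_lock(void);             /* non-blocking lock attempt */

static bool
acquire_with_timeout(void)
{
    int     retries = 0;

    while (!try_lock())
    {
        if (++retries > TIMEOUT_USEC / WAIT_INTERVAL_USEC)
            return false;               /* caller gives up truncating */
        usleep(WAIT_INTERVAL_USEC);
    }
    return true;
}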
/*
 * Debugging subroutine
 */
static void
PrintRelCacheLeakWarning(Relation rel)
{
    elog(WARNING, "relcache reference leak: relation \"%s\" not closed",
         RelationGetRelationName(rel));
}
/* * Fetch local cache of AM-specific info about the index, initializing it * if necessary */ SpGistCache * spgGetCache(Relation index) { SpGistCache *cache; if (index->rd_amcache == NULL) { Oid atttype; spgConfigIn in; FmgrInfo *procinfo; Buffer metabuffer; SpGistMetaPageData *metadata; cache = MemoryContextAllocZero(index->rd_indexcxt, sizeof(SpGistCache)); /* SPGiST doesn't support multi-column indexes */ Assert(index->rd_att->natts == 1); /* * Get the actual data type of the indexed column from the index * tupdesc. We pass this to the opclass config function so that * polymorphic opclasses are possible. */ atttype = index->rd_att->attrs[0]->atttypid; /* Call the config function to get config info for the opclass */ in.attType = atttype; procinfo = index_getprocinfo(index, 1, SPGIST_CONFIG_PROC); FunctionCall2Coll(procinfo, index->rd_indcollation[0], PointerGetDatum(&in), PointerGetDatum(&cache->config)); /* Get the information we need about each relevant datatype */ fillTypeDesc(&cache->attType, atttype); fillTypeDesc(&cache->attPrefixType, cache->config.prefixType); fillTypeDesc(&cache->attLabelType, cache->config.labelType); /* Last, get the lastUsedPages data from the metapage */ metabuffer = ReadBuffer(index, SPGIST_METAPAGE_BLKNO); LockBuffer(metabuffer, BUFFER_LOCK_SHARE); metadata = SpGistPageGetMeta(BufferGetPage(metabuffer)); if (metadata->magicNumber != SPGIST_MAGIC_NUMBER) elog(ERROR, "index \"%s\" is not an SP-GiST index", RelationGetRelationName(index)); cache->lastUsedPages = metadata->lastUsedPages; UnlockReleaseBuffer(metabuffer); index->rd_amcache = (void *) cache; } else { /* assume it's up to date */ cache = (SpGistCache *) index->rd_amcache; } return cache; }
/* * hashbuild() -- build a new hash index. */ IndexBuildResult * hashbuild(Relation heap, Relation index, IndexInfo *indexInfo) { IndexBuildResult *result; BlockNumber relpages; double reltuples; double allvisfrac; uint32 num_buckets; long sort_threshold; HashBuildState buildstate; /* * We expect to be called exactly once for any index relation. If that's * not the case, big trouble's what we have. */ if (RelationGetNumberOfBlocks(index) != 0) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); /* Estimate the number of rows currently present in the table */ estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac); /* Initialize the hash index metadata page and initial buckets */ num_buckets = _hash_metapinit(index, reltuples, MAIN_FORKNUM); /* * If we just insert the tuples into the index in scan order, then * (assuming their hash codes are pretty random) there will be no locality * of access to the index, and if the index is bigger than available RAM * then we'll thrash horribly. To prevent that scenario, we can sort the * tuples by (expected) bucket number. However, such a sort is useless * overhead when the index does fit in RAM. We choose to sort if the * initial index size exceeds maintenance_work_mem, or the number of * buffers usable for the index, whichever is less. (Limiting by the * number of buffers should reduce thrashing between PG buffers and kernel * buffers, which seems useful even if no physical I/O results. Limiting * by maintenance_work_mem is useful to allow easy testing of the sort * code path, and may be useful to DBAs as an additional control knob.) * * NOTE: this test will need adjustment if a bucket is ever different from * one page. Also, "initial index size" accounting does not include the * metapage, nor the first bitmap page. */ sort_threshold = (maintenance_work_mem * 1024L) / BLCKSZ; if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP) sort_threshold = Min(sort_threshold, NBuffers); else sort_threshold = Min(sort_threshold, NLocBuffer); if (num_buckets >= (uint32) sort_threshold) buildstate.spool = _h_spoolinit(heap, index, num_buckets); else buildstate.spool = NULL; /* prepare to build the index */ buildstate.indtuples = 0; /* do the heap scan */ reltuples = IndexBuildHeapScan(heap, index, indexInfo, true, hashbuildCallback, (void *) &buildstate); if (buildstate.spool) { /* sort the tuples and insert them into the index */ _h_indexbuild(buildstate.spool); _h_spooldestroy(buildstate.spool); } /* * Return statistics */ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; return result; }
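/*
 * Illustrative sketch (not PostgreSQL source): the sort-threshold decision
 * made by hashbuild() above, in isolation.  maintenance_work_mem is given
 * in kB (as the GUC is), converted to a page count and capped by the number
 * of available buffers; spooling/sorting is chosen only when the estimated
 * number of initial bucket pages exceeds that cap.  should_presort() and
 * BLOCK_SIZE are hypothetical names.
 */
#include <stdbool.h>

#define BLOCK_SIZE 8192

static bool
should_presort(unsigned int num_buckets, long maint_work_mem_kb, long nbuffers)
{
    long    threshold = (maint_work_mem_kb * 1024L) / BLOCK_SIZE;

    if (threshold > nbuffers)
        threshold = nbuffers;           /* cap by the buffer count */

    return num_buckets >= (unsigned int) threshold;
}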
/* * Main entry point to GiST index build. Initially calls insert over and over, * but switches to more efficient buffering build algorithm after a certain * number of tuples (unless buffering mode is disabled). */ Datum gistbuild(PG_FUNCTION_ARGS) { Relation heap = (Relation) PG_GETARG_POINTER(0); Relation index = (Relation) PG_GETARG_POINTER(1); IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2); IndexBuildResult *result; double reltuples; GISTBuildState buildstate; Buffer buffer; Page page; MemoryContext oldcxt = CurrentMemoryContext; int fillfactor; buildstate.indexrel = index; if (index->rd_options) { /* Get buffering mode from the options string */ GiSTOptions *options = (GiSTOptions *) index->rd_options; char *bufferingMode = (char *) options + options->bufferingModeOffset; if (strcmp(bufferingMode, "on") == 0) buildstate.bufferingMode = GIST_BUFFERING_STATS; else if (strcmp(bufferingMode, "off") == 0) buildstate.bufferingMode = GIST_BUFFERING_DISABLED; else buildstate.bufferingMode = GIST_BUFFERING_AUTO; fillfactor = options->fillfactor; } else { /* * By default, switch to buffering mode when the index grows too large * to fit in cache. */ buildstate.bufferingMode = GIST_BUFFERING_AUTO; fillfactor = GIST_DEFAULT_FILLFACTOR; } /* Calculate target amount of free space to leave on pages */ buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100; /* * We expect to be called exactly once for any index relation. If that's * not the case, big trouble's what we have. */ if (RelationGetNumberOfBlocks(index) != 0) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); /* * We can't yet handle unlogged GiST indexes, because we depend on LSNs. * This is duplicative of an error in gistbuildempty, but we want to check * here so as to throw error before doing all the index-build work. */ if (heap->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("unlogged GiST indexes are not supported"))); /* no locking is needed */ buildstate.giststate = initGISTstate(index); /* * Create a temporary memory context that is reset once for each tuple * processed. (Note: we don't bother to make this a child of the * giststate's scanCxt, so we have to delete it separately at the end.) */ buildstate.giststate->tempCxt = createTempGistContext(); /* initialize the root page */ buffer = gistNewBuffer(index); Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO); page = BufferGetPage(buffer); START_CRIT_SECTION(); GISTInitBuffer(buffer, F_LEAF); MarkBufferDirty(buffer); if (RelationNeedsWAL(index)) { XLogRecPtr recptr; XLogRecData rdata; rdata.data = (char *) &(index->rd_node); rdata.len = sizeof(RelFileNode); rdata.buffer = InvalidBuffer; rdata.next = NULL; recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_CREATE_INDEX, &rdata); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } else PageSetLSN(page, GetXLogRecPtrForTemp()); UnlockReleaseBuffer(buffer); END_CRIT_SECTION(); /* build the index */ buildstate.indtuples = 0; buildstate.indtuplesSize = 0; /* * Do the heap scan. */ reltuples = IndexBuildHeapScan(heap, index, indexInfo, true, gistBuildCallback, (void *) &buildstate); /* * If buffering was used, flush out all the tuples that are still in the * buffers. 
*/ if (buildstate.bufferingMode == GIST_BUFFERING_ACTIVE) { elog(DEBUG1, "all tuples processed, emptying buffers"); gistEmptyAllBuffers(&buildstate); } /* okay, all heap tuples are indexed */ MemoryContextSwitchTo(oldcxt); MemoryContextDelete(buildstate.giststate->tempCxt); freeGISTstate(buildstate.giststate); /* * Return statistics */ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); result->heap_tuples = reltuples; result->index_tuples = (double) buildstate.indtuples; PG_RETURN_POINTER(result); }
/*
 * Places the new entry and splits the page.  The original buffer (lbuf) is
 * left untouched; a shadow page of lbuf filled with the new data is returned.
 * Tuples are distributed between the pages by equal total size, not by an
 * equal number of tuples!
 */
static Page
entrySplitPage(RumBtree btree, Buffer lbuf, Buffer rbuf,
               Page lPage, Page rPage, OffsetNumber off)
{
    OffsetNumber i,
                maxoff,
                separator = InvalidOffsetNumber;
    Size        totalsize = 0;
    Size        lsize = 0,
                size;
    char       *ptr;
    IndexTuple  itup,
                leftrightmost = NULL;
    Page        page;
    Page        newlPage = PageGetTempPageCopy(lPage);
    Size        pageSize = PageGetPageSize(newlPage);

    static char tupstore[2 * BLCKSZ];

    entryPreparePage(btree, newlPage, off);

    maxoff = PageGetMaxOffsetNumber(newlPage);
    ptr = tupstore;

    for (i = FirstOffsetNumber; i <= maxoff; i++)
    {
        if (i == off)
        {
            size = MAXALIGN(IndexTupleSize(btree->entry));
            memcpy(ptr, btree->entry, size);
            ptr += size;
            totalsize += size + sizeof(ItemIdData);
        }

        itup = (IndexTuple) PageGetItem(newlPage, PageGetItemId(newlPage, i));
        size = MAXALIGN(IndexTupleSize(itup));
        memcpy(ptr, itup, size);
        ptr += size;
        totalsize += size + sizeof(ItemIdData);
    }

    if (off == maxoff + 1)
    {
        size = MAXALIGN(IndexTupleSize(btree->entry));
        memcpy(ptr, btree->entry, size);
        totalsize += size + sizeof(ItemIdData);
    }

    RumInitPage(rPage, RumPageGetOpaque(newlPage)->flags, pageSize);
    RumInitPage(newlPage, RumPageGetOpaque(rPage)->flags, pageSize);

    ptr = tupstore;
    maxoff++;
    lsize = 0;

    page = newlPage;
    for (i = FirstOffsetNumber; i <= maxoff; i++)
    {
        itup = (IndexTuple) ptr;

        if (lsize > totalsize / 2)
        {
            if (separator == InvalidOffsetNumber)
                separator = i - 1;
            page = rPage;
        }
        else
        {
            leftrightmost = itup;
            lsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
        }

        if (PageAddItem(page, (Item) itup, IndexTupleSize(itup),
                        InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
            elog(ERROR, "failed to add item to index page in \"%s\"",
                 RelationGetRelationName(btree->index));
        ptr += MAXALIGN(IndexTupleSize(itup));
    }

    btree->entry = RumFormInteriorTuple(btree, leftrightmost, newlPage,
                                        BufferGetBlockNumber(lbuf));

    btree->rightblkno = BufferGetBlockNumber(rbuf);

    return newlPage;
}
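/*
 * Illustrative sketch (not RUM source): choosing a split point by
 * accumulated tuple size rather than by tuple count, as entrySplitPage()
 * does above.  Given the individual (aligned) item sizes, the separator is
 * placed where the running total first exceeds half of the overall size;
 * find_separator() is a hypothetical name.
 */
#include <stddef.h>

static int
find_separator(const size_t *sizes, int nitems)
{
    size_t  total = 0;
    size_t  lsize = 0;
    int     i;

    for (i = 0; i < nitems; i++)
        total += sizes[i];

    for (i = 0; i < nitems; i++)
    {
        if (lsize > total / 2)
            return i;           /* items before i go to the left page */
        lsize += sizes[i];
    }
    return nitems;              /* everything stays on the left page */
}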
/* * Handle UPDATE message. * * TODO: FDW support */ static void apply_handle_update(StringInfo s) { LogicalRepRelMapEntry *rel; LogicalRepRelId relid; Oid idxoid; EState *estate; EPQState epqstate; LogicalRepTupleData oldtup; LogicalRepTupleData newtup; bool has_oldtup; TupleTableSlot *localslot; TupleTableSlot *remoteslot; bool found; MemoryContext oldctx; ensure_transaction(); relid = logicalrep_read_update(s, &has_oldtup, &oldtup, &newtup); rel = logicalrep_rel_open(relid, RowExclusiveLock); if (!should_apply_changes_for_rel(rel)) { /* * The relation can't become interesting in the middle of the * transaction so it's safe to unlock it. */ logicalrep_rel_close(rel, RowExclusiveLock); return; } /* Check if we can do the update. */ check_relation_updatable(rel); /* Initialize the executor state. */ estate = create_estate_for_relation(rel); remoteslot = ExecInitExtraTupleSlot(estate, RelationGetDescr(rel->localrel)); localslot = ExecInitExtraTupleSlot(estate, RelationGetDescr(rel->localrel)); EvalPlanQualInit(&epqstate, estate, NULL, NIL, -1); PushActiveSnapshot(GetTransactionSnapshot()); ExecOpenIndices(estate->es_result_relation_info, false); /* Build the search tuple. */ oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); slot_store_cstrings(remoteslot, rel, has_oldtup ? oldtup.values : newtup.values); MemoryContextSwitchTo(oldctx); /* * Try to find tuple using either replica identity index, primary key or * if needed, sequential scan. */ idxoid = GetRelationIdentityOrPK(rel->localrel); Assert(OidIsValid(idxoid) || (rel->remoterel.replident == REPLICA_IDENTITY_FULL && has_oldtup)); if (OidIsValid(idxoid)) found = RelationFindReplTupleByIndex(rel->localrel, idxoid, LockTupleExclusive, remoteslot, localslot); else found = RelationFindReplTupleSeq(rel->localrel, LockTupleExclusive, remoteslot, localslot); ExecClearTuple(remoteslot); /* * Tuple found. * * Note this will fail if there are other conflicting unique indexes. */ if (found) { /* Process and store remote tuple in the slot */ oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); ExecStoreTuple(localslot->tts_tuple, remoteslot, InvalidBuffer, false); slot_modify_cstrings(remoteslot, rel, newtup.values, newtup.changed); MemoryContextSwitchTo(oldctx); EvalPlanQualSetSlot(&epqstate, remoteslot); /* Do the actual update. */ ExecSimpleRelationUpdate(estate, &epqstate, localslot, remoteslot); } else { /* * The tuple to be updated could not be found. * * TODO what to do here, change the log level to LOG perhaps? */ elog(DEBUG1, "logical replication did not find row for update " "in replication target relation \"%s\"", RelationGetRelationName(rel->localrel)); } /* Cleanup. */ ExecCloseIndices(estate->es_result_relation_info); PopActiveSnapshot(); /* Handle queued AFTER triggers. */ AfterTriggerEndQuery(estate); EvalPlanQualEnd(&epqstate); ExecResetTupleTable(estate->es_tupleTable, false); FreeExecutorState(estate); logicalrep_rel_close(rel, NoLock); CommandCounterIncrement(); }
/* * Open a relation during XLOG replay */ Relation XLogOpenRelation(bool redo, RmgrId rmid, RelFileNode rnode) { XLogRelDesc *res; XLogRelCacheEntry *hentry; bool found; hentry = (XLogRelCacheEntry *) hash_search(_xlrelcache, (void *) &rnode, HASH_FIND, NULL); if (hentry) { res = hentry->rdesc; res->lessRecently->moreRecently = res->moreRecently; res->moreRecently->lessRecently = res->lessRecently; } else { res = _xl_new_reldesc(); sprintf(RelationGetRelationName(&(res->reldata)), "%u", rnode.relNode); res->reldata.rd_node = rnode; /* * We set up the lockRelId in case anything tries to lock the * dummy relation. Note that this is fairly bogus since relNode * may be different from the relation's OID. It shouldn't really * matter though, since we are presumably running by ourselves and * can't have any lock conflicts ... */ res->reldata.rd_lockInfo.lockRelId.dbId = rnode.dbNode; res->reldata.rd_lockInfo.lockRelId.relId = rnode.relNode; hentry = (XLogRelCacheEntry *) hash_search(_xlrelcache, (void *) &rnode, HASH_ENTER, &found); if (hentry == NULL) elog(PANIC, "XLogOpenRelation: out of memory for cache"); if (found) elog(PANIC, "XLogOpenRelation: file found on insert into cache"); hentry->rdesc = res; res->reldata.rd_targblock = InvalidBlockNumber; res->reldata.rd_smgr = NULL; RelationOpenSmgr(&(res->reldata)); /* * Create the target file if it doesn't already exist. This lets * us cope if the replay sequence contains writes to a relation * that is later deleted. (The original coding of this routine * would instead return NULL, causing the writes to be suppressed. * But that seems like it risks losing valuable data if the * filesystem loses an inode during a crash. Better to write the * data until we are actually told to delete the file.) */ smgrcreate(res->reldata.rd_smgr, res->reldata.rd_istemp, true); } res->moreRecently = &(_xlrelarr[0]); res->lessRecently = _xlrelarr[0].lessRecently; _xlrelarr[0].lessRecently = res; res->lessRecently->moreRecently = res; return (&(res->reldata)); }
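/*
 * Illustrative sketch (not PostgreSQL source): the pointer manipulation on
 * the circular doubly linked recency list used by XLogOpenRelation() above.
 * Node and lru_touch() are hypothetical; the moreRecently/lessRecently links
 * correspond to the fields of XLogRelDesc, and head stands for the sentinel
 * element _xlrelarr[0].  The node is assumed to be already linked.
 */
typedef struct Node
{
    struct Node *moreRecently;
    struct Node *lessRecently;
} Node;

static void
lru_touch(Node *head, Node *n)
{
    /* unlink the node from its current position */
    n->lessRecently->moreRecently = n->moreRecently;
    n->moreRecently->lessRecently = n->lessRecently;

    /* relink it next to the sentinel, as the code above does on each access */
    n->moreRecently = head;
    n->lessRecently = head->lessRecently;
    head->lessRecently = n;
    n->lessRecently->moreRecently = n;
}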
/* * Insert new publication / relation mapping. */ ObjectAddress publication_add_relation(Oid pubid, Relation targetrel, bool if_not_exists) { Relation rel; HeapTuple tup; Datum values[Natts_pg_publication_rel]; bool nulls[Natts_pg_publication_rel]; Oid relid = RelationGetRelid(targetrel); Oid prrelid; Publication *pub = GetPublication(pubid); ObjectAddress myself, referenced; rel = table_open(PublicationRelRelationId, RowExclusiveLock); /* * Check for duplicates. Note that this does not really prevent * duplicates, it's here just to provide nicer error message in common * case. The real protection is the unique key on the catalog. */ if (SearchSysCacheExists2(PUBLICATIONRELMAP, ObjectIdGetDatum(relid), ObjectIdGetDatum(pubid))) { table_close(rel, RowExclusiveLock); if (if_not_exists) return InvalidObjectAddress; ereport(ERROR, (errcode(ERRCODE_DUPLICATE_OBJECT), errmsg("relation \"%s\" is already member of publication \"%s\"", RelationGetRelationName(targetrel), pub->name))); } check_publication_add_relation(targetrel); /* Form a tuple. */ memset(values, 0, sizeof(values)); memset(nulls, false, sizeof(nulls)); prrelid = GetNewOidWithIndex(rel, PublicationRelObjectIndexId, Anum_pg_publication_rel_oid); values[Anum_pg_publication_rel_oid - 1] = ObjectIdGetDatum(prrelid); values[Anum_pg_publication_rel_prpubid - 1] = ObjectIdGetDatum(pubid); values[Anum_pg_publication_rel_prrelid - 1] = ObjectIdGetDatum(relid); tup = heap_form_tuple(RelationGetDescr(rel), values, nulls); /* Insert tuple into catalog. */ CatalogTupleInsert(rel, tup); heap_freetuple(tup); ObjectAddressSet(myself, PublicationRelRelationId, prrelid); /* Add dependency on the publication */ ObjectAddressSet(referenced, PublicationRelationId, pubid); recordDependencyOn(&myself, &referenced, DEPENDENCY_AUTO); /* Add dependency on the relation */ ObjectAddressSet(referenced, RelationRelationId, relid); recordDependencyOn(&myself, &referenced, DEPENDENCY_AUTO); /* Close the table. */ table_close(rel, RowExclusiveLock); /* Invalidate relcache so that publication info is rebuilt. */ CacheInvalidateRelcache(targetrel); return myself; }
/* * initGinState: fill in an empty GinState struct to describe the index * * Note: assorted subsidiary data is allocated in the CurrentMemoryContext. */ void initGinState(GinState *state, Relation index) { TupleDesc origTupdesc = RelationGetDescr(index); int i; MemSet(state, 0, sizeof(GinState)); state->index = index; state->oneCol = (origTupdesc->natts == 1) ? true : false; state->origTupdesc = origTupdesc; for (i = 0; i < origTupdesc->natts; i++) { if (state->oneCol) state->tupdesc[i] = state->origTupdesc; else { state->tupdesc[i] = CreateTemplateTupleDesc(2, false); TupleDescInitEntry(state->tupdesc[i], (AttrNumber) 1, NULL, INT2OID, -1, 0); TupleDescInitEntry(state->tupdesc[i], (AttrNumber) 2, NULL, origTupdesc->attrs[i]->atttypid, origTupdesc->attrs[i]->atttypmod, origTupdesc->attrs[i]->attndims); TupleDescInitEntryCollation(state->tupdesc[i], (AttrNumber) 2, origTupdesc->attrs[i]->attcollation); } /* * If the compare proc isn't specified in the opclass definition, look * up the index key type's default btree comparator. */ if (index_getprocid(index, i + 1, GIN_COMPARE_PROC) != InvalidOid) { fmgr_info_copy(&(state->compareFn[i]), index_getprocinfo(index, i + 1, GIN_COMPARE_PROC), CurrentMemoryContext); } else { TypeCacheEntry *typentry; typentry = lookup_type_cache(origTupdesc->attrs[i]->atttypid, TYPECACHE_CMP_PROC_FINFO); if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid)) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_FUNCTION), errmsg("could not identify a comparison function for type %s", format_type_be(origTupdesc->attrs[i]->atttypid)))); fmgr_info_copy(&(state->compareFn[i]), &(typentry->cmp_proc_finfo), CurrentMemoryContext); } /* Opclass must always provide extract procs */ fmgr_info_copy(&(state->extractValueFn[i]), index_getprocinfo(index, i + 1, GIN_EXTRACTVALUE_PROC), CurrentMemoryContext); fmgr_info_copy(&(state->extractQueryFn[i]), index_getprocinfo(index, i + 1, GIN_EXTRACTQUERY_PROC), CurrentMemoryContext); /* * Check opclass capability to do tri-state or binary logic consistent * check. */ if (index_getprocid(index, i + 1, GIN_TRICONSISTENT_PROC) != InvalidOid) { fmgr_info_copy(&(state->triConsistentFn[i]), index_getprocinfo(index, i + 1, GIN_TRICONSISTENT_PROC), CurrentMemoryContext); } if (index_getprocid(index, i + 1, GIN_CONSISTENT_PROC) != InvalidOid) { fmgr_info_copy(&(state->consistentFn[i]), index_getprocinfo(index, i + 1, GIN_CONSISTENT_PROC), CurrentMemoryContext); } if (state->consistentFn[i].fn_oid == InvalidOid && state->triConsistentFn[i].fn_oid == InvalidOid) { elog(ERROR, "missing GIN support function (%d or %d) for attribute %d of index \"%s\"", GIN_CONSISTENT_PROC, GIN_TRICONSISTENT_PROC, i + 1, RelationGetRelationName(index)); } /* * Check opclass capability to do partial match. */ if (index_getprocid(index, i + 1, GIN_COMPARE_PARTIAL_PROC) != InvalidOid) { fmgr_info_copy(&(state->comparePartialFn[i]), index_getprocinfo(index, i + 1, GIN_COMPARE_PARTIAL_PROC), CurrentMemoryContext); state->canPartialMatch[i] = true; } else { state->canPartialMatch[i] = false; } /* * If the index column has a specified collation, we should honor that * while doing comparisons. However, we may have a collatable storage * type for a noncollatable indexed data type (for instance, hstore * uses text index entries). If there's no index collation then * specify default collation in case the support functions need * collation. This is harmless if the support functions don't care * about collation, so we just do it unconditionally. 
(We could * alternatively call get_typcollation, but that seems like expensive * overkill --- there aren't going to be any cases where a GIN storage * type has a nondefault collation.) */ if (OidIsValid(index->rd_indcollation[i])) state->supportCollation[i] = index->rd_indcollation[i]; else state->supportCollation[i] = DEFAULT_COLLATION_OID; } }
/* ------------------------------------------------------ * pgstatginindex() * * Usage: SELECT * FROM pgstatginindex('ginindex'); * ------------------------------------------------------ */ Datum pgstatginindex(PG_FUNCTION_ARGS) { Oid relid = PG_GETARG_OID(0); Relation rel; Buffer buffer; Page page; GinMetaPageData *metadata; GinIndexStat stats; HeapTuple tuple; TupleDesc tupleDesc; Datum values[3]; bool nulls[3] = {false, false, false}; Datum result; if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), (errmsg("must be superuser to use pgstattuple functions")))); rel = relation_open(relid, AccessShareLock); if (!IS_INDEX(rel) || !IS_GIN(rel)) elog(ERROR, "relation \"%s\" is not a GIN index", RelationGetRelationName(rel)); /* * Reject attempts to read non-local temporary relations; we would be * likely to get wrong data since we have no visibility into the owning * session's local buffers. */ if (RELATION_IS_OTHER_TEMP(rel)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot access temporary indexes of other sessions"))); /* * Read metapage */ buffer = ReadBuffer(rel, GIN_METAPAGE_BLKNO); LockBuffer(buffer, GIN_SHARE); page = BufferGetPage(buffer); metadata = GinPageGetMeta(page); stats.version = metadata->ginVersion; stats.pending_pages = metadata->nPendingPages; stats.pending_tuples = metadata->nPendingHeapTuples; UnlockReleaseBuffer(buffer); relation_close(rel, AccessShareLock); /* * Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); values[0] = Int32GetDatum(stats.version); values[1] = UInt32GetDatum(stats.pending_pages); values[2] = Int64GetDatum(stats.pending_tuples); /* * Build and return the tuple */ tuple = heap_form_tuple(tupleDesc, values, nulls); result = HeapTupleGetDatum(tuple); PG_RETURN_DATUM(result); }
static Datum pgstatindex_impl(Relation rel, FunctionCallInfo fcinfo) { Datum result; BlockNumber nblocks; BlockNumber blkno; BTIndexStat indexStat; BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD); if (!IS_INDEX(rel) || !IS_BTREE(rel)) elog(ERROR, "relation \"%s\" is not a btree index", RelationGetRelationName(rel)); /* * Reject attempts to read non-local temporary relations; we would be * likely to get wrong data since we have no visibility into the owning * session's local buffers. */ if (RELATION_IS_OTHER_TEMP(rel)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot access temporary tables of other sessions"))); /* * Read metapage */ { Buffer buffer = ReadBufferExtended(rel, MAIN_FORKNUM, 0, RBM_NORMAL, bstrategy); Page page = BufferGetPage(buffer); BTMetaPageData *metad = BTPageGetMeta(page); indexStat.version = metad->btm_version; indexStat.level = metad->btm_level; indexStat.root_blkno = metad->btm_root; ReleaseBuffer(buffer); } /* -- init counters -- */ indexStat.root_pages = 0; indexStat.internal_pages = 0; indexStat.leaf_pages = 0; indexStat.empty_pages = 0; indexStat.deleted_pages = 0; indexStat.max_avail = 0; indexStat.free_space = 0; indexStat.fragments = 0; /* * Scan all blocks except the metapage */ nblocks = RelationGetNumberOfBlocks(rel); for (blkno = 1; blkno < nblocks; blkno++) { Buffer buffer; Page page; BTPageOpaque opaque; CHECK_FOR_INTERRUPTS(); /* Read and lock buffer */ buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy); LockBuffer(buffer, BUFFER_LOCK_SHARE); page = BufferGetPage(buffer); opaque = (BTPageOpaque) PageGetSpecialPointer(page); /* Determine page type, and update totals */ if (P_ISLEAF(opaque)) { int max_avail; max_avail = BLCKSZ - (BLCKSZ - ((PageHeader) page)->pd_special + SizeOfPageHeaderData); indexStat.max_avail += max_avail; indexStat.free_space += PageGetFreeSpace(page); indexStat.leaf_pages++; /* * If the next leaf is on an earlier block, it means a * fragmentation. 
*/ if (opaque->btpo_next != P_NONE && opaque->btpo_next < blkno) indexStat.fragments++; } else if (P_ISDELETED(opaque)) indexStat.deleted_pages++; else if (P_IGNORE(opaque)) indexStat.empty_pages++; else if (P_ISROOT(opaque)) indexStat.root_pages++; else indexStat.internal_pages++; /* Unlock and release buffer */ LockBuffer(buffer, BUFFER_LOCK_UNLOCK); ReleaseBuffer(buffer); } relation_close(rel, AccessShareLock); /*---------------------------- * Build a result tuple *---------------------------- */ { TupleDesc tupleDesc; int j; char *values[10]; HeapTuple tuple; /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); j = 0; values[j] = palloc(32); snprintf(values[j++], 32, "%d", indexStat.version); values[j] = palloc(32); snprintf(values[j++], 32, "%d", indexStat.level); values[j] = palloc(32); snprintf(values[j++], 32, INT64_FORMAT, (indexStat.root_pages + indexStat.leaf_pages + indexStat.internal_pages + indexStat.deleted_pages + indexStat.empty_pages) * BLCKSZ); values[j] = palloc(32); snprintf(values[j++], 32, "%u", indexStat.root_blkno); values[j] = palloc(32); snprintf(values[j++], 32, INT64_FORMAT, indexStat.internal_pages); values[j] = palloc(32); snprintf(values[j++], 32, INT64_FORMAT, indexStat.leaf_pages); values[j] = palloc(32); snprintf(values[j++], 32, INT64_FORMAT, indexStat.empty_pages); values[j] = palloc(32); snprintf(values[j++], 32, INT64_FORMAT, indexStat.deleted_pages); values[j] = palloc(32); if (indexStat.max_avail > 0) snprintf(values[j++], 32, "%.2f", 100.0 - (double) indexStat.free_space / (double) indexStat.max_avail * 100.0); else snprintf(values[j++], 32, "NaN"); values[j] = palloc(32); if (indexStat.leaf_pages > 0) snprintf(values[j++], 32, "%.2f", (double) indexStat.fragments / (double) indexStat.leaf_pages * 100.0); else snprintf(values[j++], 32, "NaN"); tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc), values); result = HeapTupleGetDatum(tuple); } return result; }
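/*
 * Illustrative sketch (not pgstattuple source): the two percentages reported
 * by pgstatindex_impl() above, computed from the collected counters.  A zero
 * denominator yields NaN, mirroring the "NaN" strings the function prints.
 * Both helper names are hypothetical.
 */
#include <math.h>

static double
avg_leaf_density(double free_space, double max_avail)
{
    return (max_avail > 0) ? 100.0 - free_space / max_avail * 100.0 : NAN;
}

static double
leaf_fragmentation(double fragments, double leaf_pages)
{
    return (leaf_pages > 0) ? fragments / leaf_pages * 100.0 : NAN;
}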
/* * _hash_metapinit() -- Initialize the metadata page of a hash index, * the initial buckets, and the initial bitmap page. * * The initial number of buckets is dependent on num_tuples, an estimate * of the number of tuples to be loaded into the index initially. The * chosen number of buckets is returned. * * We are fairly cavalier about locking here, since we know that no one else * could be accessing this index. In particular the rule about not holding * multiple buffer locks is ignored. */ uint32 _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum) { HashMetaPage metap; HashPageOpaque pageopaque; Buffer metabuf; Buffer buf; Page pg; int32 data_width; int32 item_width; int32 ffactor; double dnumbuckets; uint32 num_buckets; uint32 log2_num_buckets; uint32 i; /* safety check */ if (RelationGetNumberOfBlocksInFork(rel, forkNum) != 0) elog(ERROR, "cannot initialize non-empty hash index \"%s\"", RelationGetRelationName(rel)); /* * Determine the target fill factor (in tuples per bucket) for this index. * The idea is to make the fill factor correspond to pages about as full * as the user-settable fillfactor parameter says. We can compute it * exactly since the index datatype (i.e. uint32 hash key) is fixed-width. */ data_width = sizeof(uint32); item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) + sizeof(ItemIdData); /* include the line pointer */ ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / item_width; /* keep to a sane range */ if (ffactor < 10) ffactor = 10; /* * Choose the number of initial bucket pages to match the fill factor * given the estimated number of tuples. We round up the result to the * next power of 2, however, and always force at least 2 bucket pages. The * upper limit is determined by considerations explained in * _hash_expandtable(). */ dnumbuckets = num_tuples / ffactor; if (dnumbuckets <= 2.0) num_buckets = 2; else if (dnumbuckets >= (double) 0x40000000) num_buckets = 0x40000000; else num_buckets = ((uint32) 1) << _hash_log2((uint32) dnumbuckets); log2_num_buckets = _hash_log2(num_buckets); Assert(num_buckets == (((uint32) 1) << log2_num_buckets)); Assert(log2_num_buckets < HASH_MAX_SPLITPOINTS); /* * We initialize the metapage, the first N bucket pages, and the first * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend() * calls to occur. This ensures that the smgr level has the right idea of * the physical index length. */ metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum); pg = BufferGetPage(metabuf); pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg); pageopaque->hasho_prevblkno = InvalidBlockNumber; pageopaque->hasho_nextblkno = InvalidBlockNumber; pageopaque->hasho_bucket = -1; pageopaque->hasho_flag = LH_META_PAGE; pageopaque->hasho_page_id = HASHO_PAGE_ID; metap = HashPageGetMeta(pg); metap->hashm_magic = HASH_MAGIC; metap->hashm_version = HASH_VERSION; metap->hashm_ntuples = 0; metap->hashm_nmaps = 0; metap->hashm_ffactor = ffactor; metap->hashm_bsize = HashGetMaxBitmapSize(pg); /* find largest bitmap array size that will fit in page size */ for (i = _hash_log2(metap->hashm_bsize); i > 0; --i) { if ((1 << i) <= metap->hashm_bsize) break; } Assert(i > 0); metap->hashm_bmsize = 1 << i; metap->hashm_bmshift = i + BYTE_TO_BIT; Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1)); /* * Label the index with its primary hash support function's OID. 
This is * pretty useless for normal operation (in fact, hashm_procid is not used * anywhere), but it might be handy for forensic purposes so we keep it. */ metap->hashm_procid = index_getprocid(rel, 1, HASHPROC); /* * We initialize the index with N buckets, 0 .. N-1, occupying physical * blocks 1 to N. The first freespace bitmap page is in block N+1. Since * N is a power of 2, we can set the masks this way: */ metap->hashm_maxbucket = metap->hashm_lowmask = num_buckets - 1; metap->hashm_highmask = (num_buckets << 1) - 1; MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares)); MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp)); /* Set up mapping for one spare page after the initial splitpoints */ metap->hashm_spares[log2_num_buckets] = 1; metap->hashm_ovflpoint = log2_num_buckets; metap->hashm_firstfree = 0; /* * Release buffer lock on the metapage while we initialize buckets. * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS * won't accomplish anything. It's a bad idea to hold buffer locks for * long intervals in any case, since that can block the bgwriter. */ _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK); /* * Initialize the first N buckets */ for (i = 0; i < num_buckets; i++) { /* Allow interrupts, in case N is huge */ CHECK_FOR_INTERRUPTS(); buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i), forkNum); pg = BufferGetPage(buf); pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg); pageopaque->hasho_prevblkno = InvalidBlockNumber; pageopaque->hasho_nextblkno = InvalidBlockNumber; pageopaque->hasho_bucket = i; pageopaque->hasho_flag = LH_BUCKET_PAGE; pageopaque->hasho_page_id = HASHO_PAGE_ID; _hash_wrtbuf(rel, buf); } /* Now reacquire buffer lock on metapage */ _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); /* * Initialize first bitmap page */ _hash_initbitmap(rel, metap, num_buckets + 1, forkNum); /* all done */ _hash_wrtbuf(rel, metabuf); return num_buckets; }
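/*
 * Illustrative sketch (not PostgreSQL source): how _hash_metapinit() above
 * chooses the initial bucket count.  The estimated tuple count divided by
 * the fill factor is clamped to [2, 0x40000000] and rounded up to a power
 * of two; next_power_of_2() is a hypothetical stand-in for the role
 * _hash_log2() plays in the original.
 */
#include <stdint.h>

static uint32_t
next_power_of_2(uint32_t n)
{
    uint32_t    p = 1;

    while (p < n)
        p <<= 1;
    return p;
}

static uint32_t
choose_num_buckets(double num_tuples, int ffactor)
{
    double  dnumbuckets = num_tuples / ffactor;

    if (dnumbuckets <= 2.0)
        return 2;
    if (dnumbuckets >= (double) 0x40000000)
        return 0x40000000;
    return next_power_of_2((uint32_t) dnumbuckets);
}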
/* * Handle DELETE message. * * TODO: FDW support */ static void apply_handle_delete(StringInfo s) { LogicalRepRelMapEntry *rel; LogicalRepTupleData oldtup; LogicalRepRelId relid; Oid idxoid; EState *estate; EPQState epqstate; TupleTableSlot *remoteslot; TupleTableSlot *localslot; bool found; MemoryContext oldctx; ensure_transaction(); relid = logicalrep_read_delete(s, &oldtup); rel = logicalrep_rel_open(relid, RowExclusiveLock); if (!should_apply_changes_for_rel(rel)) { /* * The relation can't become interesting in the middle of the * transaction so it's safe to unlock it. */ logicalrep_rel_close(rel, RowExclusiveLock); return; } /* Check if we can do the delete. */ check_relation_updatable(rel); /* Initialize the executor state. */ estate = create_estate_for_relation(rel); remoteslot = ExecInitExtraTupleSlot(estate, RelationGetDescr(rel->localrel)); localslot = ExecInitExtraTupleSlot(estate, RelationGetDescr(rel->localrel)); EvalPlanQualInit(&epqstate, estate, NULL, NIL, -1); PushActiveSnapshot(GetTransactionSnapshot()); ExecOpenIndices(estate->es_result_relation_info, false); /* Find the tuple using the replica identity index. */ oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); slot_store_cstrings(remoteslot, rel, oldtup.values); MemoryContextSwitchTo(oldctx); /* * Try to find tuple using either replica identity index, primary key or * if needed, sequential scan. */ idxoid = GetRelationIdentityOrPK(rel->localrel); Assert(OidIsValid(idxoid) || (rel->remoterel.replident == REPLICA_IDENTITY_FULL)); if (OidIsValid(idxoid)) found = RelationFindReplTupleByIndex(rel->localrel, idxoid, LockTupleExclusive, remoteslot, localslot); else found = RelationFindReplTupleSeq(rel->localrel, LockTupleExclusive, remoteslot, localslot); /* If found delete it. */ if (found) { EvalPlanQualSetSlot(&epqstate, localslot); /* Do the actual delete. */ ExecSimpleRelationDelete(estate, &epqstate, localslot); } else { /* The tuple to be deleted could not be found. */ ereport(DEBUG1, (errmsg("logical replication could not find row for delete " "in replication target relation \"%s\"", RelationGetRelationName(rel->localrel)))); } /* Cleanup. */ ExecCloseIndices(estate->es_result_relation_info); PopActiveSnapshot(); /* Handle queued AFTER triggers. */ AfterTriggerEndQuery(estate); EvalPlanQualEnd(&epqstate); ExecResetTupleTable(estate->es_tupleTable, false); FreeExecutorState(estate); logicalrep_rel_close(rel, NoLock); CommandCounterIncrement(); }
/* * hashgettuple() -- Get the next tuple in the scan. */ bool hashgettuple(IndexScanDesc scan, ScanDirection dir) { HashScanOpaque so = (HashScanOpaque) scan->opaque; Relation rel = scan->indexRelation; Buffer buf; Page page; OffsetNumber offnum; ItemPointer current; bool res; /* Hash indexes are always lossy since we store only the hash code */ scan->xs_recheck = true; /* * We hold pin but not lock on current buffer while outside the hash AM. * Reacquire the read lock here. */ if (BufferIsValid(so->hashso_curbuf)) LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE); /* * If we've already initialized this scan, we can just advance it in the * appropriate direction. If we haven't done so yet, we call a routine to * get the first item in the scan. */ current = &(so->hashso_curpos); if (ItemPointerIsValid(current)) { /* * An insertion into the current index page could have happened while * we didn't have read lock on it. Re-find our position by looking * for the TID we previously returned. (Because we hold a pin on the * primary bucket page, no deletions or splits could have occurred; * therefore we can expect that the TID still exists in the current * index page, at an offset >= where we were.) */ OffsetNumber maxoffnum; buf = so->hashso_curbuf; Assert(BufferIsValid(buf)); page = BufferGetPage(buf); maxoffnum = PageGetMaxOffsetNumber(page); for (offnum = ItemPointerGetOffsetNumber(current); offnum <= maxoffnum; offnum = OffsetNumberNext(offnum)) { IndexTuple itup; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); if (ItemPointerEquals(&(so->hashso_heappos), &(itup->t_tid))) break; } if (offnum > maxoffnum) elog(ERROR, "failed to re-find scan position within index \"%s\"", RelationGetRelationName(rel)); ItemPointerSetOffsetNumber(current, offnum); /* * Check to see if we should kill the previously-fetched tuple. */ if (scan->kill_prior_tuple) { /* * Yes, so mark it by setting the LP_DEAD state in the item flags. */ ItemIdMarkDead(PageGetItemId(page, offnum)); /* * Since this can be redone later if needed, mark as a hint. */ MarkBufferDirtyHint(buf, true); } /* * Now continue the scan. */ res = _hash_next(scan, dir); } else res = _hash_first(scan, dir); /* * Skip killed tuples if asked to. */ if (scan->ignore_killed_tuples) { while (res) { offnum = ItemPointerGetOffsetNumber(current); page = BufferGetPage(so->hashso_curbuf); if (!ItemIdIsDead(PageGetItemId(page, offnum))) break; res = _hash_next(scan, dir); } } /* Release read lock on current buffer, but keep it pinned */ if (BufferIsValid(so->hashso_curbuf)) LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK); /* Return current heap TID on success */ scan->xs_ctup.t_self = so->hashso_heappos; return res; }
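/*
 * Illustrative sketch (not PostgreSQL source): re-finding a previously
 * returned item after the page may have gained new entries, as
 * hashgettuple() does above.  The scan resumes at the remembered offset and
 * only moves forward, because insertions can only have pushed the item to a
 * higher offset; items[], nitems and refind_position() are hypothetical.
 */
static int
refind_position(const unsigned int *items, int nitems,
                int saved_off, unsigned int saved_value)
{
    int     off;

    for (off = saved_off; off < nitems; off++)
    {
        if (items[off] == saved_value)
            return off;
    }
    return -1;                  /* caller treats this as an error */
}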
/* ------------------------------------------------ * hash_bitmap_info() * * Get bitmap information for a particular overflow page * * Usage: SELECT * FROM hash_bitmap_info('con_hash_index'::regclass, 5); * ------------------------------------------------ */ Datum hash_bitmap_info(PG_FUNCTION_ARGS) { Oid indexRelid = PG_GETARG_OID(0); uint64 ovflblkno = PG_GETARG_INT64(1); HashMetaPage metap; Buffer metabuf, mapbuf; BlockNumber bitmapblkno; Page mappage; bool bit = false; TupleDesc tupleDesc; Relation indexRel; uint32 ovflbitno; int32 bitmappage, bitmapbit; HeapTuple tuple; int i, j; Datum values[3]; bool nulls[3]; uint32 *freep; if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), (errmsg("must be superuser to use raw page functions")))); indexRel = index_open(indexRelid, AccessShareLock); if (!IS_HASH(indexRel)) elog(ERROR, "relation \"%s\" is not a hash index", RelationGetRelationName(indexRel)); if (RELATION_IS_OTHER_TEMP(indexRel)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot access temporary tables of other sessions"))); if (ovflblkno >= RelationGetNumberOfBlocks(indexRel)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("block number " UINT64_FORMAT " is out of range for relation \"%s\"", ovflblkno, RelationGetRelationName(indexRel)))); /* Read the metapage so we can determine which bitmap page to use */ metabuf = _hash_getbuf(indexRel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); /* * Reject attempt to read the bit for a metapage or bitmap page; this is * only meaningful for overflow pages. */ if (ovflblkno == 0) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid overflow block number %u", (BlockNumber) ovflblkno))); for (i = 0; i < metap->hashm_nmaps; i++) if (metap->hashm_mapp[i] == ovflblkno) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid overflow block number %u", (BlockNumber) ovflblkno))); /* * Identify overflow bit number. This will error out for primary bucket * pages, and we've already rejected the metapage and bitmap pages above. */ ovflbitno = _hash_ovflblkno_to_bitno(metap, (BlockNumber) ovflblkno); bitmappage = ovflbitno >> BMPG_SHIFT(metap); bitmapbit = ovflbitno & BMPG_MASK(metap); if (bitmappage >= metap->hashm_nmaps) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid overflow block number %u", (BlockNumber) ovflblkno))); bitmapblkno = metap->hashm_mapp[bitmappage]; _hash_relbuf(indexRel, metabuf); /* Check the status of bitmap bit for overflow page */ mapbuf = _hash_getbuf(indexRel, bitmapblkno, HASH_READ, LH_BITMAP_PAGE); mappage = BufferGetPage(mapbuf); freep = HashPageGetBitmap(mappage); bit = ISSET(freep, bitmapbit) != 0; _hash_relbuf(indexRel, mapbuf); index_close(indexRel, AccessShareLock); /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); tupleDesc = BlessTupleDesc(tupleDesc); MemSet(nulls, 0, sizeof(nulls)); j = 0; values[j++] = Int64GetDatum((int64) bitmapblkno); values[j++] = Int32GetDatum(bitmapbit); values[j++] = BoolGetDatum(bit); tuple = heap_form_tuple(tupleDesc, values, nulls); PG_RETURN_DATUM(HeapTupleGetDatum(tuple)); }
/*
 * This is where we return the tuple according to its offset in the file.
 */
HeapTuple
csvindex_fetch_heap(IndexScanDesc scan)
{
    ItemPointer tid = &scan->xs_ctup.t_self;
    char       *filename = (char *) palloc0(FILENAME_MAX);
    FILE       *file;
    int64       offset = 0;
    char       *chartuple = NULL;
    int         iNumofAttr = scan->heapRelation->rd_rel->relnatts;
    int         iAttr_size = 500 + 1;
    int32       size = iNumofAttr * iAttr_size + iNumofAttr + 10;
    char       *linearr[iNumofAttr];
    int         i1;

    /* This is shaky: we build the whole path to the CSV file ourselves. */
    sprintf(filename, "%s/csvinput/%s", getenv("HOME"),
            RelationGetRelationName(scan->heapRelation));
    file = fopen(filename, "r");

    offset = (int64) tid->ip_blkid.bi_hi * 65535 * 65535;
    offset += (int64) tid->ip_blkid.bi_lo * 65535;
    /* subtract the 1 that was added to make the tuple-pointer validity assertion work */
    offset += (tid->ip_posid - 1);

    /* Allocate memory for the attribute strings and the raw line. */
    for (i1 = 0; i1 < iNumofAttr; i1++)
        linearr[i1] = (char *) palloc0(iAttr_size);
    chartuple = (char *) palloc0(size);

    if (file != NULL && fseek(file, offset, SEEK_SET) != -1)
    {
        /* Read the row at that offset. */
        if (fgets(chartuple, size, file) == NULL)
        {
            /* No data in the file: return a null tuple. */
            scan->xs_cbuf = InvalidBuffer;
            scan->xs_ctup.t_data = NULL;
        }
        else if (!strlen(chartuple) && chartuple[0] != '\n')
        {
            /* Empty line: return a null tuple. */
            scan->xs_cbuf = InvalidBuffer;
            scan->xs_ctup.t_data = NULL;
        }
        else
        {
            /* Split the comma-separated line and form a tuple. */
            int         attr = 0;
            int         i = 0;
            TupleDesc   td;
            AttInMetadata *md;
            HeapTuple   newTuple;

            while (attr < iNumofAttr && chartuple[i] != '\n')
            {
                int         j = 0;

                while (chartuple[i] != ',' && chartuple[i] != '\n')
                    linearr[attr][j++] = chartuple[i++];
                i++;
                linearr[attr][j] = '\0';
                attr++;
            }

            td = RelationNameGetTupleDesc(RelationGetRelationName(scan->heapRelation));
            md = TupleDescGetAttInMetadata(td);
            newTuple = BuildTupleFromCStrings(md, linearr);

            scan->xs_cbuf = InvalidBuffer;
            scan->xs_ctup.t_data = newTuple->t_data;
            scan->xs_ctup.t_len = newTuple->t_len;

            /*
             * TODO: check whether the tuple is correct as per the index
             * definition; if so return it, otherwise return a null tuple.
             */
        }
    }
    else
    {
        /* Open or seek failed: return a null tuple. */
        scan->xs_cbuf = InvalidBuffer;
        scan->xs_ctup.t_data = NULL;
    }

    if (file)
        fclose(file);

    return &scan->xs_ctup;
}
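/*
 * Illustrative sketch (not part of the index code above): the TID-to-file-
 * offset decoding used by csvindex_fetch_heap(), isolated.  It mirrors the
 * encoding scheme the code assumes (multipliers of 65535 and a one-based
 * position), which must match whatever the corresponding insert path used
 * when the TIDs were built; tid_to_file_offset() is a hypothetical name.
 */
#include <stdint.h>

static int64_t
tid_to_file_offset(uint16_t bi_hi, uint16_t bi_lo, uint16_t posid)
{
    int64_t offset = (int64_t) bi_hi * 65535 * 65535;

    offset += (int64_t) bi_lo * 65535;
    offset += posid - 1;        /* 1 was added so the TID validity assertion holds */
    return offset;
}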
/* * ExecRefreshMatView -- execute a REFRESH MATERIALIZED VIEW command * * This refreshes the materialized view by creating a new table and swapping * the relfilenodes of the new table and the old materialized view, so the OID * of the original materialized view is preserved. Thus we do not lose GRANT * nor references to this materialized view. * * If WITH NO DATA was specified, this is effectively like a TRUNCATE; * otherwise it is like a TRUNCATE followed by an INSERT using the SELECT * statement associated with the materialized view. The statement node's * skipData field shows whether the clause was used. * * Indexes are rebuilt too, via REINDEX. Since we are effectively bulk-loading * the new heap, it's better to create the indexes afterwards than to fill them * incrementally while we load. * * The matview's "populated" state is changed based on whether the contents * reflect the result set of the materialized view's query. */ void ExecRefreshMatView(RefreshMatViewStmt *stmt, const char *queryString, ParamListInfo params, char *completionTag) { Oid matviewOid; Relation matviewRel; RewriteRule *rule; List *actions; Query *dataQuery; Oid tableSpace; Oid owner; Oid OIDNewHeap; DestReceiver *dest; bool concurrent; LOCKMODE lockmode; /* Determine strength of lock needed. */ concurrent = stmt->concurrent; lockmode = concurrent ? ExclusiveLock : AccessExclusiveLock; /* * Get a lock until end of transaction. */ matviewOid = RangeVarGetRelidExtended(stmt->relation, lockmode, false, false, RangeVarCallbackOwnsTable, NULL); matviewRel = heap_open(matviewOid, NoLock); /* Make sure it is a materialized view. */ if (matviewRel->rd_rel->relkind != RELKIND_MATVIEW) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("\"%s\" is not a materialized view", RelationGetRelationName(matviewRel)))); /* Check that CONCURRENTLY is not specified if not populated. */ if (concurrent && !RelationIsPopulated(matviewRel)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("CONCURRENTLY cannot be used when the materialized view is not populated"))); /* Check that conflicting options have not been specified. */ if (concurrent && stmt->skipData) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("CONCURRENTLY and WITH NO DATA options cannot be used together"))); /* We're not using materialized views in the system catalogs. */ Assert(!IsSystemRelation(matviewRel)); /* We don't allow an oid column for a materialized view. */ Assert(!matviewRel->rd_rel->relhasoids); /* * Check that everything is correct for a refresh. Problems at this point * are internal errors, so elog is sufficient. */ if (matviewRel->rd_rel->relhasrules == false || matviewRel->rd_rules->numLocks < 1) elog(ERROR, "materialized view \"%s\" is missing rewrite information", RelationGetRelationName(matviewRel)); if (matviewRel->rd_rules->numLocks > 1) elog(ERROR, "materialized view \"%s\" has too many rules", RelationGetRelationName(matviewRel)); rule = matviewRel->rd_rules->rules[0]; if (rule->event != CMD_SELECT || !(rule->isInstead)) elog(ERROR, "the rule for materialized view \"%s\" is not a SELECT INSTEAD OF rule", RelationGetRelationName(matviewRel)); actions = rule->actions; if (list_length(actions) != 1) elog(ERROR, "the rule for materialized view \"%s\" is not a single action", RelationGetRelationName(matviewRel)); /* * The stored query was rewritten at the time of the MV definition, but * has not been scribbled on by the planner. 
*/ dataQuery = (Query *) linitial(actions); Assert(IsA(dataQuery, Query)); /* * Check for active uses of the relation in the current transaction, such * as open scans. * * NB: We count on this to protect us against problems with refreshing the * data using HEAP_INSERT_FROZEN. */ CheckTableNotInUse(matviewRel, "REFRESH MATERIALIZED VIEW"); /* * Tentatively mark the matview as populated or not (this will roll back * if we fail later). */ SetMatViewPopulatedState(matviewRel, !stmt->skipData); /* Concurrent refresh builds new data in temp tablespace, and does diff. */ if (concurrent) tableSpace = GetDefaultTablespace(RELPERSISTENCE_TEMP); else tableSpace = matviewRel->rd_rel->reltablespace; owner = matviewRel->rd_rel->relowner; heap_close(matviewRel, NoLock); /* Create the transient table that will receive the regenerated data. */ OIDNewHeap = make_new_heap(matviewOid, tableSpace, concurrent, ExclusiveLock); dest = CreateTransientRelDestReceiver(OIDNewHeap); /* Generate the data, if wanted. */ if (!stmt->skipData) refresh_matview_datafill(dest, dataQuery, queryString, owner); /* Make the matview match the newly generated data. */ if (concurrent) { int old_depth = matview_maintenance_depth; PG_TRY(); { refresh_by_match_merge(matviewOid, OIDNewHeap); } PG_CATCH(); { matview_maintenance_depth = old_depth; PG_RE_THROW(); } PG_END_TRY(); Assert(matview_maintenance_depth == old_depth); } else refresh_by_heap_swap(matviewOid, OIDNewHeap); }
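/*
 * Sketch (not part of the backend source): ExecRefreshMatView saves
 * matview_maintenance_depth before calling refresh_by_match_merge() and
 * restores it in PG_CATCH() before re-throwing, because an error raised
 * mid-merge would otherwise leave the counter permanently raised.  PG_TRY /
 * PG_CATCH are built on sigsetjmp/longjmp, so the minimal standalone analogue
 * below shows the same save-and-restore-on-error discipline with plain setjmp;
 * the names maintenance_depth and do_merge are illustrative only.
 */
#include <setjmp.h>
#include <stdio.h>

static jmp_buf catch_ctx;
static int maintenance_depth = 0;

static void
do_merge(int fail)
{
	maintenance_depth++;			/* raised while the merge runs */
	if (fail)
		longjmp(catch_ctx, 1);		/* simulate elog(ERROR, ...) */
	maintenance_depth--;			/* normal exit restores it */
}

int
main(void)
{
	int old_depth = maintenance_depth;

	if (setjmp(catch_ctx) == 0)
		do_merge(1);					/* "PG_TRY" body */
	else
		maintenance_depth = old_depth;	/* "PG_CATCH": undo the increment
										 * (the real code then re-throws) */

	printf("depth after error = %d\n", maintenance_depth);		/* prints 0 */
	return 0;
}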
/* * Delete a single constraint record. */ void RemoveConstraintById(Oid conId) { Relation conDesc; HeapTuple tup; Form_pg_constraint con; conDesc = heap_open(ConstraintRelationId, RowExclusiveLock); tup = SearchSysCache1(CONSTROID, ObjectIdGetDatum(conId)); if (!HeapTupleIsValid(tup)) /* should not happen */ elog(ERROR, "cache lookup failed for constraint %u", conId); con = (Form_pg_constraint) GETSTRUCT(tup); /* * Special processing depending on what the constraint is for. */ if (OidIsValid(con->conrelid)) { Relation rel; /* * If the constraint is for a relation, open and exclusive-lock the * relation it's for. */ rel = heap_open(con->conrelid, AccessExclusiveLock); /* * We need to update the relcheck count if it is a check constraint * being dropped. This update will force backends to rebuild relcache * entries when we commit. */ if (con->contype == CONSTRAINT_CHECK) { Relation pgrel; HeapTuple relTup; Form_pg_class classForm; pgrel = heap_open(RelationRelationId, RowExclusiveLock); relTup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(con->conrelid)); if (!HeapTupleIsValid(relTup)) elog(ERROR, "cache lookup failed for relation %u", con->conrelid); classForm = (Form_pg_class) GETSTRUCT(relTup); if (classForm->relchecks == 0) /* should not happen */ elog(ERROR, "relation \"%s\" has relchecks = 0", RelationGetRelationName(rel)); classForm->relchecks--; simple_heap_update(pgrel, &relTup->t_self, relTup); CatalogUpdateIndexes(pgrel, relTup); heap_freetuple(relTup); heap_close(pgrel, RowExclusiveLock); } /* Keep lock on constraint's rel until end of xact */ heap_close(rel, NoLock); } else if (OidIsValid(con->contypid)) { /* * XXX for now, do nothing special when dropping a domain constraint * * Probably there should be some form of locking on the domain type, * but we have no such concept at the moment. */ } else elog(ERROR, "constraint %u is not of a known type", conId); /* Fry the constraint itself */ simple_heap_delete(conDesc, &tup->t_self); /* Clean up */ ReleaseSysCache(tup); heap_close(conDesc, RowExclusiveLock); }
/* * refresh_by_match_merge * * Refresh a materialized view with transactional semantics, while allowing * concurrent reads. * * This is called after a new version of the data has been created in a * temporary table. It performs a full outer join against the old version of * the data, producing "diff" results. This join cannot work if there are any * duplicated rows in either the old or new versions, in the sense that every * column would compare as equal between the two rows. It does work correctly * in the face of rows which have at least one NULL value, with all non-NULL * columns equal. The behavior of NULLs on equality tests and on UNIQUE * indexes turns out to be quite convenient here; the tests we need to make * are consistent with default behavior. If there is at least one UNIQUE * index on the materialized view, we have exactly the guarantee we need. By * joining based on equality on all columns which are part of any unique * index, we identify the rows on which we can use UPDATE without any problem. * If any column is NULL in either the old or new version of a row (or both), * we must use DELETE and INSERT, since there could be multiple rows which are * NOT DISTINCT FROM each other, and we could otherwise end up with the wrong * number of occurrences in the updated relation. The temporary table used to * hold the diff results contains just the TID of the old record (if matched) * and the ROW from the new table as a single column of complex record type * (if matched). * * Once we have the diff table, we perform set-based DELETE, UPDATE, and * INSERT operations against the materialized view, and discard both temporary * tables. * * Everything from the generation of the new data to applying the differences * takes place under cover of an ExclusiveLock, since it seems as though we * would want to prohibit not only concurrent REFRESH operations, but also * incremental maintenance. It also doesn't seem reasonable or safe to allow * SELECT FOR UPDATE or SELECT FOR SHARE on rows being updated or deleted by * this command. */ static void refresh_by_match_merge(Oid matviewOid, Oid tempOid) { StringInfoData querybuf; Relation matviewRel; Relation tempRel; char *matviewname; char *tempname; char *diffname; TupleDesc tupdesc; bool foundUniqueIndex; List *indexoidlist; ListCell *indexoidscan; int16 relnatts; bool *usedForQual; Oid save_userid; int save_sec_context; int save_nestlevel; initStringInfo(&querybuf); matviewRel = heap_open(matviewOid, NoLock); matviewname = quote_qualified_identifier(get_namespace_name(RelationGetNamespace(matviewRel)), RelationGetRelationName(matviewRel)); tempRel = heap_open(tempOid, NoLock); tempname = quote_qualified_identifier(get_namespace_name(RelationGetNamespace(tempRel)), RelationGetRelationName(tempRel)); diffname = make_temptable_name_n(tempname, 2); relnatts = matviewRel->rd_rel->relnatts; usedForQual = (bool *) palloc0(sizeof(bool) * relnatts); /* Open SPI context. */ if (SPI_connect() != SPI_OK_CONNECT) elog(ERROR, "SPI_connect failed"); /* Analyze the temp table with the new contents. */ appendStringInfo(&querybuf, "ANALYZE %s", tempname); if (SPI_exec(querybuf.data, 0) != SPI_OK_UTILITY) elog(ERROR, "SPI_exec failed: %s", querybuf.data); /* * We need to ensure that there are not duplicate rows without NULLs in * the new data set before we can count on the "diff" results. Check for * that in a way that allows showing the first duplicated row found. 
Even * after we pass this test, a unique index on the materialized view may * find a duplicate key problem. */ resetStringInfo(&querybuf); appendStringInfo(&querybuf, "SELECT x FROM %s x WHERE x IS NOT NULL AND EXISTS " "(SELECT * FROM %s y WHERE y IS NOT NULL " "AND (y.*) = (x.*) AND y.ctid <> x.ctid) LIMIT 1", tempname, tempname); if (SPI_execute(querybuf.data, false, 1) != SPI_OK_SELECT) elog(ERROR, "SPI_exec failed: %s", querybuf.data); if (SPI_processed > 0) { ereport(ERROR, (errcode(ERRCODE_CARDINALITY_VIOLATION), errmsg("new data for \"%s\" contains duplicate rows without any NULL columns", RelationGetRelationName(matviewRel)), errdetail("Row: %s", SPI_getvalue(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 1)))); } /* Start building the query for creating the diff table. */ resetStringInfo(&querybuf); appendStringInfo(&querybuf, "CREATE TEMP TABLE %s AS " "SELECT x.ctid AS tid, y FROM %s x FULL JOIN %s y ON (", diffname, matviewname, tempname); /* * Get the list of index OIDs for the table from the relcache, and look up * each one in the pg_index syscache. We will test for equality on all * columns present in all unique indexes which only reference columns and * include all rows. */ tupdesc = matviewRel->rd_att; foundUniqueIndex = false; indexoidlist = RelationGetIndexList(matviewRel); foreach(indexoidscan, indexoidlist) { Oid indexoid = lfirst_oid(indexoidscan); HeapTuple indexTuple; Form_pg_index index; indexTuple = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(indexoid)); if (!HeapTupleIsValid(indexTuple)) /* should not happen */ elog(ERROR, "cache lookup failed for index %u", indexoid); index = (Form_pg_index) GETSTRUCT(indexTuple); /* We're only interested if it is unique and valid. */ if (index->indisunique && IndexIsValid(index)) { int numatts = index->indnatts; int i; bool expr = false; Relation indexRel; /* Skip any index on an expression. */ for (i = 0; i < numatts; i++) { if (index->indkey.values[i] == 0) { expr = true; break; } } if (expr) { ReleaseSysCache(indexTuple); continue; } /* Skip partial indexes. */ indexRel = index_open(index->indexrelid, RowExclusiveLock); if (RelationGetIndexPredicate(indexRel) != NIL) { index_close(indexRel, NoLock); ReleaseSysCache(indexTuple); continue; } /* Hold the locks, since we're about to run DML which needs them. */ index_close(indexRel, NoLock); /* Add quals for all columns from this index. */ for (i = 0; i < numatts; i++) { int attnum = index->indkey.values[i]; Oid type; Oid op; const char *colname; /* * Only include the column once regardless of how many times * it shows up in how many indexes. * * This is also useful later to omit columns which can not * have changed from the SET clause of the UPDATE statement. */ if (usedForQual[attnum - 1]) continue; usedForQual[attnum - 1] = true; /* * Actually add the qual, ANDed with any others. */ if (foundUniqueIndex) appendStringInfoString(&querybuf, " AND "); colname = quote_identifier(NameStr((tupdesc->attrs[attnum - 1])->attname)); appendStringInfo(&querybuf, "y.%s ", colname); type = attnumTypeId(matviewRel, attnum); op = lookup_type_cache(type, TYPECACHE_EQ_OPR)->eq_opr; mv_GenerateOper(&querybuf, op); appendStringInfo(&querybuf, " x.%s", colname); foundUniqueIndex = true; } } ReleaseSysCache(indexTuple); }
/* * Return a pinned and exclusively locked buffer which can be used to insert an * index item of size itemsz. If oldbuf is a valid buffer, it is also locked * (in a order determined to avoid deadlocks.) * * If there's no existing page with enough free space to accomodate the new * item, the relation is extended. If this happens, *extended is set to true. * * If we find that the old page is no longer a regular index page (because * of a revmap extension), the old buffer is unlocked and we return * InvalidBuffer. */ static Buffer brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz, bool *was_extended) { BlockNumber oldblk; BlockNumber newblk; Page page; int freespace; if (BufferIsValid(oldbuf)) oldblk = BufferGetBlockNumber(oldbuf); else oldblk = InvalidBlockNumber; /* * Loop until we find a page with sufficient free space. By the time we * return to caller out of this loop, both buffers are valid and locked; * if we have to restart here, neither buffer is locked and buf is not a * pinned buffer. */ newblk = RelationGetTargetBlock(irel); if (newblk == InvalidBlockNumber) newblk = GetPageWithFreeSpace(irel, itemsz); for (;;) { Buffer buf; bool extensionLockHeld = false; bool extended = false; CHECK_FOR_INTERRUPTS(); if (newblk == InvalidBlockNumber) { /* * There's not enough free space in any existing index page, * according to the FSM: extend the relation to obtain a shiny new * page. */ if (!RELATION_IS_LOCAL(irel)) { LockRelationForExtension(irel, ExclusiveLock); extensionLockHeld = true; } buf = ReadBuffer(irel, P_NEW); newblk = BufferGetBlockNumber(buf); *was_extended = extended = true; BRIN_elog((DEBUG2, "brin_getinsertbuffer: extending to page %u", BufferGetBlockNumber(buf))); } else if (newblk == oldblk) { /* * There's an odd corner-case here where the FSM is out-of-date, * and gave us the old page. */ buf = oldbuf; } else { buf = ReadBuffer(irel, newblk); } /* * We lock the old buffer first, if it's earlier than the new one; but * before we do, we need to check that it hasn't been turned into a * revmap page concurrently; if we detect that it happened, give up * and tell caller to start over. */ if (BufferIsValid(oldbuf) && oldblk < newblk) { LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE); if (!BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf))) { LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); ReleaseBuffer(buf); return InvalidBuffer; } } LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); if (extensionLockHeld) UnlockRelationForExtension(irel, ExclusiveLock); page = BufferGetPage(buf); if (extended) brin_page_init(page, BRIN_PAGETYPE_REGULAR); /* * We have a new buffer to insert into. Check that the new page has * enough free space, and return it if it does; otherwise start over. * Note that we allow for the FSM to be out of date here, and in that * case we update it and move on. * * (br_page_get_freespace also checks that the FSM didn't hand us a * page that has since been repurposed for the revmap.) */ freespace = br_page_get_freespace(page); if (freespace >= itemsz) { RelationSetTargetBlock(irel, BufferGetBlockNumber(buf)); /* * Since the target block specification can get lost on cache * invalidations, make sure we update the more permanent FSM with * data about it before going away. */ if (extended) RecordPageWithFreeSpace(irel, BufferGetBlockNumber(buf), freespace); /* * Lock the old buffer if not locked already. Note that in this * case we know for sure it's a regular page: it's later than the * new page we just got, which is not a revmap page, and revmap * pages are always consecutive. 
*/ if (BufferIsValid(oldbuf) && oldblk > newblk) { LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE); Assert(BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf))); } return buf; } /* This page is no good. */ /* * If an entirely new page does not contain enough free space for the * new item, then surely that item is oversized. Complain loudly; but * first make sure we record the page as free, for next time. */ if (extended) { RecordPageWithFreeSpace(irel, BufferGetBlockNumber(buf), freespace); ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("index row size %lu exceeds maximum %lu for index \"%s\"", (unsigned long) itemsz, (unsigned long) freespace, RelationGetRelationName(irel)))); return InvalidBuffer; /* keep compiler quiet */ } if (newblk != oldblk) UnlockReleaseBuffer(buf); if (BufferIsValid(oldbuf) && oldblk <= newblk) LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); newblk = RecordAndGetPageWithFreeSpace(irel, newblk, freespace, itemsz); } }
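/*
 * Sketch (not part of the backend source): brin_getinsertbuffer locks the old
 * and new buffers in block-number order so two backends working on the same
 * pair of pages cannot deadlock.  The standalone pthread program below
 * illustrates the same ordering rule with two mutexes that are always taken
 * in ascending "block number" order, whichever way the caller asks for them;
 * it is an analogy for the principle, not a model of the buffer manager.
 */
#include <pthread.h>
#include <stdio.h>

struct page_lock
{
	int			blkno;
	pthread_mutex_t lock;
};

static struct page_lock page1 = {1, PTHREAD_MUTEX_INITIALIZER};
static struct page_lock page2 = {2, PTHREAD_MUTEX_INITIALIZER};

/* Always acquire the lower-numbered page first. */
static void
lock_pair(struct page_lock *x, struct page_lock *y)
{
	struct page_lock *first = (x->blkno < y->blkno) ? x : y;
	struct page_lock *second = (first == x) ? y : x;

	pthread_mutex_lock(&first->lock);
	pthread_mutex_lock(&second->lock);
}

static void
unlock_pair(struct page_lock *x, struct page_lock *y)
{
	pthread_mutex_unlock(&x->lock);
	pthread_mutex_unlock(&y->lock);
}

static void *
worker(void *arg)
{
	/* One thread asks for (1,2), the other for (2,1); neither deadlocks. */
	if (arg == NULL)
		lock_pair(&page1, &page2);
	else
		lock_pair(&page2, &page1);
	unlock_pair(&page1, &page2);
	return NULL;
}

int
main(void)
{
	pthread_t	t1, t2;

	pthread_create(&t1, NULL, worker, NULL);
	pthread_create(&t2, NULL, worker, (void *) 1);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	puts("both workers finished without deadlock");
	return 0;
}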
/* * Bulk deletion of all index entries pointing to a set of heap tuples and * check invalid tuples after crash recovery. * The set of target tuples is specified via a callback routine that tells * whether any given heap tuple (identified by ItemPointer) is being deleted. * * Result: a palloc'd struct containing statistical info for VACUUM displays. */ Datum gistbulkdelete(PG_FUNCTION_ARGS) { IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1); IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2); void *callback_state = (void *) PG_GETARG_POINTER(3); Relation rel = info->index; GistBDItem *stack, *ptr; /* first time through? */ if (stats == NULL) stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); /* we'll re-count the tuples each time */ stats->estimated_count = false; stats->num_index_tuples = 0; stack = (GistBDItem *) palloc0(sizeof(GistBDItem)); stack->blkno = GIST_ROOT_BLKNO; while (stack) { Buffer buffer; Page page; OffsetNumber i, maxoff; IndexTuple idxtuple; ItemId iid; buffer = ReadBufferExtended(rel, MAIN_FORKNUM, stack->blkno, RBM_NORMAL, info->strategy); LockBuffer(buffer, GIST_SHARE); gistcheckpage(rel, buffer); page = (Page) BufferGetPage(buffer); if (GistPageIsLeaf(page)) { OffsetNumber todelete[MaxOffsetNumber]; int ntodelete = 0; LockBuffer(buffer, GIST_UNLOCK); LockBuffer(buffer, GIST_EXCLUSIVE); page = (Page) BufferGetPage(buffer); if (stack->blkno == GIST_ROOT_BLKNO && !GistPageIsLeaf(page)) { /* only the root can become non-leaf during relock */ UnlockReleaseBuffer(buffer); /* one more check */ continue; } /* * check for split proceeded after look at parent, we should check * it after relock */ pushStackIfSplited(page, stack); /* * Remove deletable tuples from page */ maxoff = PageGetMaxOffsetNumber(page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { iid = PageGetItemId(page, i); idxtuple = (IndexTuple) PageGetItem(page, iid); if (callback(&(idxtuple->t_tid), callback_state)) { todelete[ntodelete] = i - ntodelete; ntodelete++; stats->tuples_removed += 1; } else stats->num_index_tuples += 1; } if (ntodelete) { START_CRIT_SECTION(); MarkBufferDirty(buffer); for (i = 0; i < ntodelete; i++) PageIndexTupleDelete(page, todelete[i]); GistMarkTuplesDeleted(page); if (RelationNeedsWAL(rel)) { XLogRecPtr recptr; recptr = gistXLogUpdate(rel->rd_node, buffer, todelete, ntodelete, NULL, 0, InvalidBuffer); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } else PageSetLSN(page, GetXLogRecPtrForTemp()); END_CRIT_SECTION(); } } else { /* check for split proceeded after look at parent */ pushStackIfSplited(page, stack); maxoff = PageGetMaxOffsetNumber(page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { iid = PageGetItemId(page, i); idxtuple = (IndexTuple) PageGetItem(page, iid); ptr = (GistBDItem *) palloc(sizeof(GistBDItem)); ptr->blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); ptr->parentlsn = PageGetLSN(page); ptr->next = stack->next; stack->next = ptr; if (GistTupleIsInvalid(idxtuple)) ereport(LOG, (errmsg("index \"%s\" contains an inner tuple marked as invalid", RelationGetRelationName(rel)), errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."), errhint("Please REINDEX it."))); } } UnlockReleaseBuffer(buffer); ptr = stack->next; pfree(stack); stack = ptr; vacuum_delay_point(); } PG_RETURN_POINTER(stats); }
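/*
 * Sketch (not part of the backend source): in the leaf-page branch above, the
 * code records "todelete[ntodelete] = i - ntodelete" because
 * PageIndexTupleDelete() compacts the item array, shifting every later offset
 * down by one.  Storing each offset minus the number of items already
 * scheduled for deletion keeps the stored offsets valid when they are deleted
 * in ascending order.  The standalone program below shows the same adjustment
 * on a plain array with a delete-and-shift helper.
 */
#include <assert.h>
#include <stdio.h>

#define NITEMS 6

/* Delete the element at 1-based position pos, shifting later elements down. */
static void
array_delete(int *arr, int *len, int pos)
{
	for (int k = pos - 1; k < *len - 1; k++)
		arr[k] = arr[k + 1];
	(*len)--;
}

int
main(void)
{
	int			items[NITEMS] = {10, 20, 30, 40, 50, 60};
	int			len = NITEMS;
	int			dead[NITEMS] = {1, 0, 1, 0, 1, 0};	/* delete 10, 30 and 50 */
	int			todelete[NITEMS];
	int			ntodelete = 0;

	/* First pass: collect positions, pre-adjusted for earlier deletions. */
	for (int i = 1; i <= NITEMS; i++)
	{
		if (dead[i - 1])
		{
			todelete[ntodelete] = i - ntodelete;
			ntodelete++;
		}
	}

	/* Second pass: delete in ascending order, as gistbulkdelete does. */
	for (int i = 0; i < ntodelete; i++)
		array_delete(items, &len, todelete[i]);

	assert(len == 3 && items[0] == 20 && items[1] == 40 && items[2] == 60);
	printf("survivors: %d %d %d\n", items[0], items[1], items[2]);
	return 0;
}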
Datum pgrowlocks(PG_FUNCTION_ARGS) { FuncCallContext *funcctx; HeapScanDesc scan; HeapTuple tuple; TupleDesc tupdesc; AttInMetadata *attinmeta; Datum result; MyData *mydata; Relation rel; if (SRF_IS_FIRSTCALL()) { text *relname; RangeVar *relrv; MemoryContext oldcontext; AclResult aclresult; funcctx = SRF_FIRSTCALL_INIT(); oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); attinmeta = TupleDescGetAttInMetadata(tupdesc); funcctx->attinmeta = attinmeta; relname = PG_GETARG_TEXT_P(0); relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); rel = heap_openrv(relrv, AccessShareLock); /* check permissions: must have SELECT on table */ aclresult = pg_class_aclcheck(RelationGetRelid(rel), GetUserId(), ACL_SELECT); if (aclresult != ACLCHECK_OK) aclcheck_error(aclresult, ACL_KIND_CLASS, RelationGetRelationName(rel)); scan = heap_beginscan(rel, GetActiveSnapshot(), 0, NULL); mydata = palloc(sizeof(*mydata)); mydata->rel = rel; mydata->scan = scan; mydata->ncolumns = tupdesc->natts; funcctx->user_fctx = mydata; MemoryContextSwitchTo(oldcontext); } funcctx = SRF_PERCALL_SETUP(); attinmeta = funcctx->attinmeta; mydata = (MyData *) funcctx->user_fctx; scan = mydata->scan; /* scan the relation */ while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL) { HTSU_Result htsu; TransactionId xmax; uint16 infomask; /* must hold a buffer lock to call HeapTupleSatisfiesUpdate */ LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); htsu = HeapTupleSatisfiesUpdate(tuple, GetCurrentCommandId(false), scan->rs_cbuf); xmax = HeapTupleHeaderGetRawXmax(tuple->t_data); infomask = tuple->t_data->t_infomask; /* * a tuple is locked if HTSU returns BeingUpdated, and if it returns * MayBeUpdated but the Xmax is valid and pointing at us. 
*/ if (htsu == HeapTupleBeingUpdated || (htsu == HeapTupleMayBeUpdated && !(infomask & HEAP_XMAX_INVALID) && !(infomask & HEAP_XMAX_IS_MULTI) && (xmax == GetCurrentTransactionIdIfAny()))) { char **values; values = (char **) palloc(mydata->ncolumns * sizeof(char *)); values[Atnum_tid] = (char *) DirectFunctionCall1(tidout, PointerGetDatum(&tuple->t_self)); values[Atnum_xmax] = palloc(NCHARS * sizeof(char)); snprintf(values[Atnum_xmax], NCHARS, "%d", xmax); if (infomask & HEAP_XMAX_IS_MULTI) { MultiXactMember *members; int nmembers; bool first = true; bool allow_old; values[Atnum_ismulti] = pstrdup("true"); allow_old = !(infomask & HEAP_LOCK_MASK) && (infomask & HEAP_XMAX_LOCK_ONLY); nmembers = GetMultiXactIdMembers(xmax, &members, allow_old, false); if (nmembers == -1) { values[Atnum_xids] = "{0}"; values[Atnum_modes] = "{transient upgrade status}"; values[Atnum_pids] = "{0}"; } else { int j; values[Atnum_xids] = palloc(NCHARS * nmembers); values[Atnum_modes] = palloc(NCHARS * nmembers); values[Atnum_pids] = palloc(NCHARS * nmembers); strcpy(values[Atnum_xids], "{"); strcpy(values[Atnum_modes], "{"); strcpy(values[Atnum_pids], "{"); for (j = 0; j < nmembers; j++) { char buf[NCHARS]; if (!first) { strcat(values[Atnum_xids], ","); strcat(values[Atnum_modes], ","); strcat(values[Atnum_pids], ","); } snprintf(buf, NCHARS, "%d", members[j].xid); strcat(values[Atnum_xids], buf); switch (members[j].status) { case MultiXactStatusUpdate: snprintf(buf, NCHARS, "Update"); break; case MultiXactStatusNoKeyUpdate: snprintf(buf, NCHARS, "No Key Update"); break; case MultiXactStatusForUpdate: snprintf(buf, NCHARS, "For Update"); break; case MultiXactStatusForNoKeyUpdate: snprintf(buf, NCHARS, "For No Key Update"); break; case MultiXactStatusForShare: snprintf(buf, NCHARS, "Share"); break; case MultiXactStatusForKeyShare: snprintf(buf, NCHARS, "Key Share"); break; } strcat(values[Atnum_modes], buf); snprintf(buf, NCHARS, "%d", BackendXidGetPid(members[j].xid)); strcat(values[Atnum_pids], buf); first = false; } strcat(values[Atnum_xids], "}"); strcat(values[Atnum_modes], "}"); strcat(values[Atnum_pids], "}"); } } else { values[Atnum_ismulti] = pstrdup("false"); values[Atnum_xids] = palloc(NCHARS * sizeof(char)); snprintf(values[Atnum_xids], NCHARS, "{%d}", xmax); values[Atnum_modes] = palloc(NCHARS); if (infomask & HEAP_XMAX_LOCK_ONLY) { if (HEAP_XMAX_IS_SHR_LOCKED(infomask)) snprintf(values[Atnum_modes], NCHARS, "{For Share}"); else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask)) snprintf(values[Atnum_modes], NCHARS, "{For Key Share}"); else if (HEAP_XMAX_IS_EXCL_LOCKED(infomask)) { if (tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) snprintf(values[Atnum_modes], NCHARS, "{For Update}"); else snprintf(values[Atnum_modes], NCHARS, "{For No Key Update}"); } else /* neither keyshare nor exclusive bit it set */ snprintf(values[Atnum_modes], NCHARS, "{transient upgrade status}"); } else { if (tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) snprintf(values[Atnum_modes], NCHARS, "{Update}"); else snprintf(values[Atnum_modes], NCHARS, "{No Key Update}"); } values[Atnum_pids] = palloc(NCHARS * sizeof(char)); snprintf(values[Atnum_pids], NCHARS, "{%d}", BackendXidGetPid(xmax)); } LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); /* build a tuple */ tuple = BuildTupleFromCStrings(attinmeta, values); /* make the tuple into a datum */ result = HeapTupleGetDatum(tuple); /* * no need to pfree what we allocated; it's on a short-lived * memory context anyway */ SRF_RETURN_NEXT(funcctx, result); } else { LockBuffer(scan->rs_cbuf, 
BUFFER_LOCK_UNLOCK); } } heap_endscan(scan); heap_close(mydata->rel, AccessShareLock); SRF_RETURN_DONE(funcctx); }
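/*
 * Sketch (not part of the original source): pgrowlocks builds the xids, modes
 * and pids output columns as literal "{a,b,c}" array strings, sizing each
 * buffer as NCHARS * nmembers and appending with strcat.  The standalone
 * helper below shows the same brace-and-comma formatting with explicit length
 * tracking; NCHARS here and the sample values are illustrative.
 */
#include <stdio.h>
#include <string.h>

#define NCHARS 32

/* Append the int members as "{m0,m1,...}" into buf of size buflen. */
static void
format_braced_list(char *buf, size_t buflen, const int *members, int nmembers)
{
	size_t		used = 0;

	used += snprintf(buf + used, buflen - used, "{");
	for (int i = 0; i < nmembers && used < buflen; i++)
		used += snprintf(buf + used, buflen - used,
						 i ? ",%d" : "%d", members[i]);
	if (used < buflen)
		snprintf(buf + used, buflen - used, "}");
}

int
main(void)
{
	int			xids[] = {724, 725, 731};
	char		buf[NCHARS * 3];

	format_braced_list(buf, sizeof(buf), xids, 3);
	printf("%s\n", buf);		/* prints {724,725,731} */
	return 0;
}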
/* * Update the eof and filetupcount of a parquet table. */ void UpdateParquetFileSegInfo(Relation parentrel, AppendOnlyEntry *aoEntry, int segno, int64 eof, int64 eof_uncompressed, int64 tuples_added) { LockAcquireResult acquireResult; Relation pg_parquetseg_rel; TupleDesc pg_parquetseg_dsc; ScanKeyData key[1]; SysScanDesc parquetscan; HeapTuple tuple, new_tuple; Datum filetupcount; Datum new_tuple_count; Datum *new_record; bool *new_record_nulls; bool *new_record_repl; bool isNull; /* overflow sanity checks. don't check the same for tuples_added, * it may be coming as a negative diff from gp_update_ao_master_stats */ Assert(eof >= 0); Insist(Gp_role != GP_ROLE_EXECUTE); elog(DEBUG3, "UpdateParquetFileSegInfo called. segno = %d", segno); if (Gp_role != GP_ROLE_DISPATCH) { /* * Verify we already have the write-lock! */ acquireResult = LockRelationAppendOnlySegmentFile( &parentrel->rd_node, segno, AccessExclusiveLock, /* dontWait */ false); if (acquireResult != LOCKACQUIRE_ALREADY_HELD) { elog(ERROR, "Should already have the (transaction-scope) write-lock on Parquet segment file #%d, " "relation %s", segno, RelationGetRelationName(parentrel)); } } /* * Open the aoseg relation and its index. */ pg_parquetseg_rel = heap_open(aoEntry->segrelid, RowExclusiveLock); pg_parquetseg_dsc = pg_parquetseg_rel->rd_att; /* * Setup a scan key to fetch from the index by segno. */ ScanKeyInit(&key[0], (AttrNumber) Anum_pg_parquetseg_segno, BTEqualStrategyNumber, F_INT4EQ, Int32GetDatum(segno)); parquetscan = systable_beginscan(pg_parquetseg_rel, aoEntry->segidxid, TRUE, SnapshotNow, 1, &key[0]); tuple = systable_getnext(parquetscan); if (!HeapTupleIsValid(tuple)) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("parquet table \"%s\" file segment \"%d\" entry " "does not exist", RelationGetRelationName(parentrel), segno))); new_record = palloc0(sizeof(Datum) * pg_parquetseg_dsc->natts); new_record_nulls = palloc0(sizeof(bool) * pg_parquetseg_dsc->natts); new_record_repl = palloc0(sizeof(bool) * pg_parquetseg_dsc->natts); /* get the current tuple count so we can add to it */ filetupcount = fastgetattr(tuple, Anum_pg_parquetseg_tupcount, pg_parquetseg_dsc, &isNull); if(isNull) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("got invalid pg_aoseg filetupcount value: NULL"))); /* calculate the new tuple count */ new_tuple_count = DirectFunctionCall2(float8pl, filetupcount, Float8GetDatum((float8)tuples_added)); /* * Build a tuple to update */ new_record[Anum_pg_parquetseg_eof - 1] = Float8GetDatum((float8)eof); new_record_repl[Anum_pg_parquetseg_eof - 1] = true; new_record[Anum_pg_parquetseg_tupcount - 1] = new_tuple_count; new_record_repl[Anum_pg_parquetseg_tupcount - 1] = true; new_record[Anum_pg_parquetseg_eofuncompressed - 1] = Float8GetDatum((float8)eof_uncompressed); new_record_repl[Anum_pg_parquetseg_eofuncompressed - 1] = true; /* * update the tuple in the pg_aoseg table */ new_tuple = heap_modify_tuple(tuple, pg_parquetseg_dsc, new_record, new_record_nulls, new_record_repl); simple_heap_update(pg_parquetseg_rel, &tuple->t_self, new_tuple); CatalogUpdateIndexes(pg_parquetseg_rel, new_tuple); heap_freetuple(new_tuple); /* Finish up scan */ systable_endscan(parquetscan); heap_close(pg_parquetseg_rel, RowExclusiveLock); pfree(new_record); pfree(new_record_nulls); pfree(new_record_repl); }
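/*
 * Sketch (not part of the original source): UpdateParquetFileSegInfo stores
 * eof and tupcount in float8 catalog columns and advances the tuple count with
 * float8pl.  A float8 (IEEE double) represents integers exactly only up to
 * 2^53, so the quick standalone check below shows where an int64 survives the
 * round trip and where it stops doing so; typical segment eof values are far
 * below that bound, and the specific values here are illustrative.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int64_t		exact = (int64_t) 1 << 53;	/* last power of two held exactly */
	int64_t		beyond = exact + 1;			/* first integer that cannot be */

	printf("%lld -> %lld\n", (long long) exact, (long long) (double) exact);
	printf("%lld -> %lld\n", (long long) beyond, (long long) (double) beyond);
	return 0;
}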
/* * VACUUM cleanup: update FSM */ Datum gistvacuumcleanup(PG_FUNCTION_ARGS) { IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); GistBulkDeleteResult *stats = (GistBulkDeleteResult *) PG_GETARG_POINTER(1); Relation rel = info->index; BlockNumber npages, blkno; BlockNumber totFreePages; BlockNumber lastBlock = GIST_ROOT_BLKNO, lastFilledBlock = GIST_ROOT_BLKNO; bool needLock; /* No-op in ANALYZE ONLY mode */ if (info->analyze_only) PG_RETURN_POINTER(stats); /* Set up all-zero stats if gistbulkdelete wasn't called */ if (stats == NULL) { stats = (GistBulkDeleteResult *) palloc0(sizeof(GistBulkDeleteResult)); /* use heap's tuple count */ stats->std.num_index_tuples = info->num_heap_tuples; stats->std.estimated_count = info->estimated_count; /* * XXX the above is wrong if index is partial. Would it be OK to just * return NULL, or is there work we must do below? */ } if (stats->needReindex) ereport(NOTICE, (errmsg("index \"%s\" needs VACUUM FULL or REINDEX to finish crash recovery", RelationGetRelationName(rel)))); /* * Need lock unless it's local to this backend. */ needLock = !RELATION_IS_LOCAL(rel); /* try to find deleted pages */ if (needLock) LockRelationForExtension(rel, ExclusiveLock); npages = RelationGetNumberOfBlocks(rel); if (needLock) UnlockRelationForExtension(rel, ExclusiveLock); totFreePages = 0; for (blkno = GIST_ROOT_BLKNO + 1; blkno < npages; blkno++) { Buffer buffer; Page page; vacuum_delay_point(); buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); LockBuffer(buffer, GIST_SHARE); page = (Page) BufferGetPage(buffer); if (PageIsNew(page) || GistPageIsDeleted(page)) { totFreePages++; RecordFreeIndexPage(rel, blkno); } else lastFilledBlock = blkno; UnlockReleaseBuffer(buffer); } lastBlock = npages - 1; /* Finally, vacuum the FSM */ IndexFreeSpaceMapVacuum(info->index); /* return statistics */ stats->std.pages_free = totFreePages; if (needLock) LockRelationForExtension(rel, ExclusiveLock); stats->std.num_pages = RelationGetNumberOfBlocks(rel); if (needLock) UnlockRelationForExtension(rel, ExclusiveLock); PG_RETURN_POINTER(stats); }
/* * GetFileSegInfo * * Get the catalog entry for an appendonly (row-oriented) relation from the * pg_aoseg_* relation that belongs to the currently used * AppendOnly table. * * If a caller intends to append to this file segment entry they must * already hold a relation Append-Only segment file (transaction-scope) lock (tag * LOCKTAG_RELATION_APPENDONLY_SEGMENT_FILE) in order to guarantee * stability of the pg_aoseg information on this segment file and exclusive right * to append data to the segment file. */ ParquetFileSegInfo * GetParquetFileSegInfo(Relation parentrel, AppendOnlyEntry *aoEntry, Snapshot parquetMetaDataSnapshot, int segno) { Relation pg_parquetseg_rel; TupleDesc pg_parquetseg_dsc; HeapTuple tuple; ScanKeyData key[1]; SysScanDesc parquetscan; Datum eof, eof_uncompressed, tupcount; bool isNull; bool indexOK; Oid indexid; ParquetFileSegInfo *fsinfo; /* * Check the pg_paqseg relation to be certain the parquet table segment file * is there. */ pg_parquetseg_rel = heap_open(aoEntry->segrelid, AccessShareLock); pg_parquetseg_dsc = RelationGetDescr(pg_parquetseg_rel); if (Gp_role == GP_ROLE_EXECUTE) { indexOK = FALSE; indexid = InvalidOid; } else { indexOK = TRUE; indexid = aoEntry->segidxid; } /* * Setup a scan key to fetch from the index by segno. */ ScanKeyInit(&key[0], (AttrNumber) Anum_pg_parquetseg_segno, BTEqualStrategyNumber, F_INT4EQ, Int32GetDatum(segno)); parquetscan = systable_beginscan(pg_parquetseg_rel, indexid, indexOK, SnapshotNow, 1, &key[0]); tuple = systable_getnext(parquetscan); if (!HeapTupleIsValid(tuple)) { /* This segment file does not have an entry. */ systable_endscan(parquetscan); heap_close(pg_parquetseg_rel, AccessShareLock); return NULL ; } tuple = heap_copytuple(tuple); systable_endscan(parquetscan); Assert(HeapTupleIsValid(tuple)); fsinfo = (ParquetFileSegInfo *) palloc0(sizeof(ParquetFileSegInfo)); /* get the eof */ eof = fastgetattr(tuple, Anum_pg_parquetseg_eof, pg_parquetseg_dsc, &isNull); if (isNull) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("got invalid eof value: NULL"))); /* get the tupcount */ tupcount = fastgetattr(tuple, Anum_pg_parquetseg_tupcount, pg_parquetseg_dsc, &isNull); if (isNull) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("got invalid tupcount value: NULL"))); /* get the uncompressed eof */ eof_uncompressed = fastgetattr(tuple, Anum_pg_parquetseg_eofuncompressed, pg_parquetseg_dsc, &isNull); /* * Confusing: This eof_uncompressed variable is never used. It appears we only * call fastgetattr to get the isNull value. this variable "eof_uncompressed" is * not at all the same as fsinfo->eof_uncompressed. */ if (isNull) { /* * NULL is allowed. Tables that were created before the release of the * eof_uncompressed catalog column will have a NULL instead of a value. */ fsinfo->eof_uncompressed = InvalidUncompressedEof; } else { fsinfo->eof_uncompressed = (int64) DatumGetFloat8(eof_uncompressed); } fsinfo->segno = segno; fsinfo->eof = (int64) DatumGetFloat8(eof); fsinfo->tupcount = (int64) DatumGetFloat8(tupcount); ItemPointerSetInvalid(&fsinfo->sequence_tid); if (fsinfo->eof < 0) ereport(ERROR, (errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("invalid eof " INT64_FORMAT " for relation %s", fsinfo->eof, RelationGetRelationName(parentrel)))); /* Finish up scan and close appendonly catalog. */ heap_close(pg_parquetseg_rel, AccessShareLock); return fsinfo; }
/* * Rescan end pages to verify that they are (still) empty of tuples. * * Returns number of nondeletable pages (last nonempty page + 1). */ static BlockNumber count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats) { BlockNumber blkno; instr_time starttime; /* Initialize the starttime if we check for conflicting lock requests */ INSTR_TIME_SET_CURRENT(starttime); /* Strange coding of loop control is needed because blkno is unsigned */ blkno = vacrelstats->rel_pages; while (blkno > vacrelstats->nonempty_pages) { Buffer buf; Page page; OffsetNumber offnum, maxoff; bool hastup; /* * Check if another process requests a lock on our relation. We are * holding an AccessExclusiveLock here, so they will be waiting. We * only do this once per VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL, and we * only check if that interval has elapsed once every 32 blocks to * keep the number of system calls and actual shared lock table * lookups to a minimum. */ if ((blkno % 32) == 0) { instr_time currenttime; instr_time elapsed; INSTR_TIME_SET_CURRENT(currenttime); elapsed = currenttime; INSTR_TIME_SUBTRACT(elapsed, starttime); if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000) >= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL) { if (LockHasWaitersRelation(onerel, AccessExclusiveLock)) { ereport(elevel, (errmsg("\"%s\": suspending truncate due to conflicting lock request", RelationGetRelationName(onerel)))); vacrelstats->lock_waiter_detected = true; return blkno; } starttime = currenttime; } } /* * We don't insert a vacuum delay point here, because we have an * exclusive lock on the table which we want to hold for as short a * time as possible. We still need to check for interrupts however. */ CHECK_FOR_INTERRUPTS(); blkno--; buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno, RBM_NORMAL, vac_strategy); /* In this phase we only need shared access to the buffer */ LockBuffer(buf, BUFFER_LOCK_SHARE); page = BufferGetPage(buf); if (PageIsNew(page) || PageIsEmpty(page)) { /* PageIsNew probably shouldn't happen... */ UnlockReleaseBuffer(buf); continue; } hastup = false; maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId itemid; itemid = PageGetItemId(page, offnum); /* * Note: any non-unused item should be taken as a reason to keep * this page. We formerly thought that DEAD tuples could be * thrown away, but that's not so, because we'd not have cleaned * out their index entries. */ if (ItemIdIsUsed(itemid)) { hastup = true; break; /* can stop scanning */ } } /* scan along page */ UnlockReleaseBuffer(buf); /* Done scanning if we found a tuple here */ if (hastup) return blkno + 1; } /* * If we fall out of the loop, all the previously-thought-to-be-empty * pages still are; we need not bother to look at the last known-nonempty * page. */ return vacrelstats->nonempty_pages; }
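/*
 * Sketch (not part of the backend source): count_nondeletable_pages only looks
 * at the clock every 32 blocks, and only consults the lock manager once the
 * configured interval has elapsed, to keep system calls and shared lock-table
 * lookups cheap inside the tight scan loop.  The standalone loop below
 * reproduces that two-level throttle with clock_gettime; the interval and the
 * block count are illustrative.
 */
#include <stdio.h>
#include <time.h>

#define CHECK_INTERVAL_MS 20

static long
elapsed_ms(const struct timespec *a, const struct timespec *b)
{
	return (b->tv_sec - a->tv_sec) * 1000 + (b->tv_nsec - a->tv_nsec) / 1000000;
}

int
main(void)
{
	struct timespec start, now;
	int			clock_reads = 0;
	int			expensive_checks = 0;

	clock_gettime(CLOCK_MONOTONIC, &start);
	for (long blkno = 1000000; blkno > 0; blkno--)
	{
		if ((blkno % 32) == 0)		/* cheap filter: only every 32 blocks */
		{
			clock_gettime(CLOCK_MONOTONIC, &now);
			clock_reads++;
			if (elapsed_ms(&start, &now) >= CHECK_INTERVAL_MS)
			{
				expensive_checks++;	/* stands in for LockHasWaitersRelation() */
				start = now;
			}
		}
	}
	printf("clock reads: %d, expensive checks: %d\n",
		   clock_reads, expensive_checks);
	return 0;
}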
/* * btbuild() -- build a new btree index. */ Datum btbuild(PG_FUNCTION_ARGS) { Relation heap = (Relation) PG_GETARG_POINTER(0); Relation index = (Relation) PG_GETARG_POINTER(1); IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2); IndexBuildResult *result; double reltuples; BTBuildState buildstate; buildstate.isUnique = indexInfo->ii_Unique; buildstate.haveDead = false; buildstate.heapRel = heap; buildstate.spool = NULL; buildstate.spool2 = NULL; buildstate.indtuples = 0; #ifdef BTREE_BUILD_STATS if (log_btree_build_stats) ResetUsage(); #endif /* BTREE_BUILD_STATS */ /* * We expect to be called exactly once for any index relation. If that's * not the case, big trouble's what we have. */ if (RelationGetNumberOfBlocks(index) != 0) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); buildstate.spool = _bt_spoolinit(index, indexInfo->ii_Unique, false); /* * If building a unique index, put dead tuples in a second spool to keep * them out of the uniqueness check. */ if (indexInfo->ii_Unique) buildstate.spool2 = _bt_spoolinit(index, false, true); /* do the heap scan */ reltuples = IndexBuildHeapScan(heap, index, indexInfo, true, btbuildCallback, (void *) &buildstate); /* okay, all heap tuples are indexed */ if (buildstate.spool2 && !buildstate.haveDead) { /* spool2 turns out to be unnecessary */ _bt_spooldestroy(buildstate.spool2); buildstate.spool2 = NULL; } /* * Finish the build by (1) completing the sort of the spool file, (2) * inserting the sorted tuples into btree pages and (3) building the upper * levels. */ _bt_leafbuild(buildstate.spool, buildstate.spool2); _bt_spooldestroy(buildstate.spool); if (buildstate.spool2) _bt_spooldestroy(buildstate.spool2); #ifdef BTREE_BUILD_STATS if (log_btree_build_stats) { ShowUsage("BTREE BUILD STATS"); ResetUsage(); } #endif /* BTREE_BUILD_STATS */ /* * If we are reindexing a pre-existing index, it is critical to send out a * relcache invalidation SI message to ensure all backends re-read the * index metapage. We expect that the caller will ensure that happens * (typically as a side effect of updating index stats, but it must happen * even if the stats don't change!) */ /* * Return statistics */ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; PG_RETURN_POINTER(result); }