/*
 * Create a table space
 *
 * Only superusers can create a tablespace. This seems a reasonable
 * restriction since we're determining the system layout and, anyway, we
 * probably have root if we're doing this kind of activity.
 */
void
CreateTableSpace(CreateTableSpaceStmt *stmt)
{
#ifdef HAVE_SYMLINK
    Relation    rel;
    Datum       values[Natts_pg_tablespace];
    char        nulls[Natts_pg_tablespace];
    HeapTuple   tuple;
    Oid         tablespaceoid;
    char       *location;
    char       *linkloc;
    Oid         ownerId;

    /* validate */

    /* don't call this in a transaction block */
    PreventTransactionChain((void *) stmt, "CREATE TABLESPACE");

    /* Must be superuser */
    if (!superuser())
        ereport(ERROR,
                (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
                 errmsg("permission denied to create tablespace \"%s\"",
                        stmt->tablespacename),
                 errhint("Must be superuser to create a tablespace.")));

    /* However, the eventual owner of the tablespace need not be */
    if (stmt->owner)
        ownerId = get_roleid_checked(stmt->owner);
    else
        ownerId = GetUserId();

    /* Unix-ify the offered path, and strip any trailing slashes */
    location = pstrdup(stmt->location);
    canonicalize_path(location);

    /* disallow quotes, else CREATE DATABASE would be at risk */
    if (strchr(location, '\''))
        ereport(ERROR,
                (errcode(ERRCODE_INVALID_NAME),
                 errmsg("tablespace location may not contain single quotes")));

    /*
     * Allowing relative paths seems risky.
     *
     * This also helps us ensure that location is not empty or whitespace.
     */
    if (!is_absolute_path(location))
        ereport(ERROR,
                (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
                 errmsg("tablespace location must be an absolute path")));

    /*
     * Check that location isn't too long. Remember that we're going to
     * append '/<dboid>/<relid>.<nnn>'.  (XXX but do we ever form the whole
     * path explicitly?  This may be overly conservative.)
     */
    if (strlen(location) >= (MAXPGPATH - 1 - 10 - 1 - 10 - 1 - 10))
        ereport(ERROR,
                (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
                 errmsg("tablespace location \"%s\" is too long",
                        location)));

    /*
     * Disallow creation of tablespaces named "pg_xxx"; we reserve this
     * namespace for system purposes.
     */
    if (!allowSystemTableMods && IsReservedName(stmt->tablespacename))
        ereport(ERROR,
                (errcode(ERRCODE_RESERVED_NAME),
                 errmsg("unacceptable tablespace name \"%s\"",
                        stmt->tablespacename),
                 errdetail("The prefix \"pg_\" is reserved for system tablespaces.")));

    /*
     * Check that there is no other tablespace by this name.  (The unique
     * index would catch this anyway, but might as well give a friendlier
     * message.)
     */
    if (OidIsValid(get_tablespace_oid(stmt->tablespacename)))
        ereport(ERROR,
                (errcode(ERRCODE_DUPLICATE_OBJECT),
                 errmsg("tablespace \"%s\" already exists",
                        stmt->tablespacename)));

    /*
     * Insert tuple into pg_tablespace.  The purpose of doing this first is
     * to lock the proposed tablename against other would-be creators.  The
     * insertion will roll back if we find problems below.
     */
    rel = heap_open(TableSpaceRelationId, RowExclusiveLock);

    MemSet(nulls, ' ', Natts_pg_tablespace);

    values[Anum_pg_tablespace_spcname - 1] =
        DirectFunctionCall1(namein, CStringGetDatum(stmt->tablespacename));
    values[Anum_pg_tablespace_spcowner - 1] =
        ObjectIdGetDatum(ownerId);
    values[Anum_pg_tablespace_spclocation - 1] =
        DirectFunctionCall1(textin, CStringGetDatum(location));
    nulls[Anum_pg_tablespace_spcacl - 1] = 'n';

    tuple = heap_formtuple(rel->rd_att, values, nulls);

    tablespaceoid = simple_heap_insert(rel, tuple);

    CatalogUpdateIndexes(rel, tuple);

    heap_freetuple(tuple);

    /* Record dependency on owner */
    recordDependencyOnOwner(TableSpaceRelationId, tablespaceoid, ownerId);

    /*
     * Attempt to coerce target directory to safe permissions.  If this
     * fails, it doesn't exist or has the wrong owner.
     */
    if (chmod(location, 0700) != 0)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not set permissions on directory \"%s\": %m",
                        location)));

    /*
     * Check the target directory is empty.
     */
    if (!directory_is_empty(location))
        ereport(ERROR,
                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                 errmsg("directory \"%s\" is not empty",
                        location)));

    /*
     * Create the PG_VERSION file in the target directory.  This has several
     * purposes: to make sure we can write in the directory, to prevent
     * someone from creating another tablespace pointing at the same
     * directory (the emptiness check above will fail), and to label
     * tablespace directories by PG version.
     */
    set_short_version(location);

    /*
     * All seems well, create the symlink.
     */
    linkloc = (char *) palloc(10 + 10 + 1);
    sprintf(linkloc, "pg_tblspc/%u", tablespaceoid);

    if (symlink(location, linkloc) < 0)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not create symbolic link \"%s\": %m",
                        linkloc)));

    /* Record the filesystem change in XLOG */
    {
        xl_tblspc_create_rec xlrec;
        XLogRecData rdata[2];

        xlrec.ts_id = tablespaceoid;
        rdata[0].data = (char *) &xlrec;
        rdata[0].len = offsetof(xl_tblspc_create_rec, ts_path);
        rdata[0].buffer = InvalidBuffer;
        rdata[0].next = &(rdata[1]);

        rdata[1].data = (char *) location;
        rdata[1].len = strlen(location) + 1;
        rdata[1].buffer = InvalidBuffer;
        rdata[1].next = NULL;

        (void) XLogInsert(RM_TBLSPC_ID, XLOG_TBLSPC_CREATE, rdata);
    }

    pfree(linkloc);
    pfree(location);

    /* We keep the lock on pg_tablespace until commit */
    heap_close(rel, NoLock);
#else                           /* !HAVE_SYMLINK */
    ereport(ERROR,
            (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
             errmsg("tablespaces are not supported on this platform")));
#endif                          /* HAVE_SYMLINK */
}
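/*
 * Illustrative note (not part of the original source): a SQL-level
 * invocation that reaches the code path above.  The tablespace name and
 * path are hypothetical.
 *
 *     CREATE TABLESPACE fastspace LOCATION '/mnt/ssd/pgdata';
 *
 * Per the checks above, LOCATION must be an absolute path containing no
 * single quotes, and names beginning with "pg_" are reserved.
 */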
/*
 * Build a pending-list page from the given array of tuples, and write it
 * out.
 *
 * Returns amount of free space left on the page.
 */
static int32
writeListPage(Relation index, Buffer buffer,
              IndexTuple *tuples, int32 ntuples, BlockNumber rightlink)
{
    Page        page = BufferGetPage(buffer);
    int32       i,
                freesize,
                size = 0;
    OffsetNumber l,
                off;
    char       *workspace;
    char       *ptr;

    /* workspace could be a local array; we use palloc for alignment */
    workspace = palloc(BLCKSZ);

    START_CRIT_SECTION();

    GinInitBuffer(buffer, GIN_LIST);

    off = FirstOffsetNumber;
    ptr = workspace;

    for (i = 0; i < ntuples; i++)
    {
        int         this_size = IndexTupleSize(tuples[i]);

        memcpy(ptr, tuples[i], this_size);
        ptr += this_size;
        size += this_size;

        l = PageAddItem(page, (Item) tuples[i], this_size, off, false, false);

        if (l == InvalidOffsetNumber)
            elog(ERROR, "failed to add item to index page in \"%s\"",
                 RelationGetRelationName(index));

        off++;
    }

    Assert(size <= BLCKSZ);     /* else we overran workspace */

    GinPageGetOpaque(page)->rightlink = rightlink;

    /*
     * tail page may contain only whole row(s) or final part of row placed
     * on previous pages (a "row" here meaning all the index tuples
     * generated for one heap tuple)
     */
    if (rightlink == InvalidBlockNumber)
    {
        GinPageSetFullRow(page);
        GinPageGetOpaque(page)->maxoff = 1;
    }
    else
    {
        GinPageGetOpaque(page)->maxoff = 0;
    }

    MarkBufferDirty(buffer);

    if (RelationNeedsWAL(index))
    {
        XLogRecData rdata[2];
        ginxlogInsertListPage data;
        XLogRecPtr  recptr;

        data.node = index->rd_node;
        data.blkno = BufferGetBlockNumber(buffer);
        data.rightlink = rightlink;
        data.ntuples = ntuples;

        rdata[0].buffer = InvalidBuffer;
        rdata[0].data = (char *) &data;
        rdata[0].len = sizeof(ginxlogInsertListPage);
        rdata[0].next = rdata + 1;

        rdata[1].buffer = InvalidBuffer;
        rdata[1].data = workspace;
        rdata[1].len = size;
        rdata[1].next = NULL;

        recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT_LISTPAGE, rdata);
        PageSetLSN(page, recptr);
    }

    /* get free space before releasing buffer */
    freesize = PageGetExactFreeSpace(page);

    UnlockReleaseBuffer(buffer);

    END_CRIT_SECTION();

    pfree(workspace);

    return freesize;
}
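/*
 * Illustrative sketch (not part of the original source): the WAL pattern
 * used in writeListPage() recurs throughout this file -- a fixed-size
 * record header in rdata[0], variable-length payload in rdata[1], chain
 * terminated by next = NULL, with buffer = InvalidBuffer meaning "always
 * include this chunk in the record".  The record type, resource manager
 * id, and info code below are hypothetical placeholders.
 */
#ifdef NOT_USED
typedef struct xl_example_rec
{
    BlockNumber blkno;          /* hypothetical fixed-size header */
} xl_example_rec;

static void
logExamplePayload(BlockNumber blkno, char *payload, uint32 len)
{
    xl_example_rec xlrec;
    XLogRecData rdata[2];

    xlrec.blkno = blkno;

    rdata[0].buffer = InvalidBuffer;    /* no buffer: never omitted */
    rdata[0].data = (char *) &xlrec;
    rdata[0].len = sizeof(xl_example_rec);
    rdata[0].next = &rdata[1];

    rdata[1].buffer = InvalidBuffer;
    rdata[1].data = payload;
    rdata[1].len = len;
    rdata[1].next = NULL;

    (void) XLogInsert(RM_EXAMPLE_ID, XLOG_EXAMPLE_RECORD, rdata);
}
#endif                          /* NOT_USED */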
Datum
ginbuild(PG_FUNCTION_ARGS)
{
    Relation    heap = (Relation) PG_GETARG_POINTER(0);
    Relation    index = (Relation) PG_GETARG_POINTER(1);
    IndexInfo  *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
    IndexBuildResult *result;
    double      reltuples;
    GinBuildState buildstate;
    Buffer      RootBuffer,
                MetaBuffer;
    ItemPointerData *list;
    Datum       entry;
    uint32      nlist;
    MemoryContext oldCtx;
    OffsetNumber attnum;

    if (RelationGetNumberOfBlocks(index) != 0)
        elog(ERROR, "index \"%s\" already contains data",
             RelationGetRelationName(index));

    initGinState(&buildstate.ginstate, index);

    /* initialize the meta page */
    MetaBuffer = GinNewBuffer(index);

    /* initialize the root page */
    RootBuffer = GinNewBuffer(index);

    START_CRIT_SECTION();
    GinInitMetabuffer(MetaBuffer);
    MarkBufferDirty(MetaBuffer);
    GinInitBuffer(RootBuffer, GIN_LEAF);
    MarkBufferDirty(RootBuffer);

    if (!index->rd_istemp)
    {
        XLogRecPtr  recptr;
        XLogRecData rdata;
        Page        page;

        rdata.buffer = InvalidBuffer;
        rdata.data = (char *) &(index->rd_node);
        rdata.len = sizeof(RelFileNode);
        rdata.next = NULL;

        recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_INDEX, &rdata);

        page = BufferGetPage(RootBuffer);
        PageSetLSN(page, recptr);
        PageSetTLI(page, ThisTimeLineID);

        page = BufferGetPage(MetaBuffer);
        PageSetLSN(page, recptr);
        PageSetTLI(page, ThisTimeLineID);
    }

    UnlockReleaseBuffer(MetaBuffer);
    UnlockReleaseBuffer(RootBuffer);
    END_CRIT_SECTION();

    /* build the index */
    buildstate.indtuples = 0;

    /*
     * create a temporary memory context that is reset once for each tuple
     * inserted into the index
     */
    buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext,
                                              "Gin build temporary context",
                                              ALLOCSET_DEFAULT_MINSIZE,
                                              ALLOCSET_DEFAULT_INITSIZE,
                                              ALLOCSET_DEFAULT_MAXSIZE);

    buildstate.funcCtx = AllocSetContextCreate(buildstate.tmpCtx,
                    "Gin build temporary context for user-defined function",
                                               ALLOCSET_DEFAULT_MINSIZE,
                                               ALLOCSET_DEFAULT_INITSIZE,
                                               ALLOCSET_DEFAULT_MAXSIZE);

    buildstate.accum.ginstate = &buildstate.ginstate;
    ginInitBA(&buildstate.accum);

    /*
     * Do the heap scan.  We disallow sync scan here because dataPlaceToPage
     * prefers to receive tuples in TID order.
     */
    reltuples = IndexBuildHeapScan(heap, index, indexInfo, false,
                                   ginBuildCallback, (void *) &buildstate);

    /* dump remaining entries to the index */
    oldCtx = MemoryContextSwitchTo(buildstate.tmpCtx);
    ginBeginBAScan(&buildstate.accum);
    while ((list = ginGetEntry(&buildstate.accum, &attnum, &entry, &nlist)) != NULL)
    {
        /* there could be many entries, so be willing to abort here */
        CHECK_FOR_INTERRUPTS();
        ginEntryInsert(index, &buildstate.ginstate, attnum, entry,
                       list, nlist, TRUE);
    }
    MemoryContextSwitchTo(oldCtx);

    MemoryContextDelete(buildstate.tmpCtx);

    /*
     * Return statistics
     */
    result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));

    result->heap_tuples = reltuples;
    result->index_tuples = buildstate.indtuples;

    PG_RETURN_POINTER(result);
}
/*
 * Vacuum a root page when it is also a leaf
 *
 * On the root, we just delete any dead leaf tuples; no fancy business
 */
static void
vacuumLeafRoot(spgBulkDeleteState *bds, Relation index, Buffer buffer)
{
    Page        page = BufferGetPage(buffer);
    spgxlogVacuumRoot xlrec;
    OffsetNumber toDelete[MaxIndexTuplesPerPage];
    OffsetNumber i,
                max = PageGetMaxOffsetNumber(page);

    xlrec.nDelete = 0;

    /* Scan page, identify tuples to delete, accumulate stats */
    for (i = FirstOffsetNumber; i <= max; i++)
    {
        SpGistLeafTuple lt;

        lt = (SpGistLeafTuple) PageGetItem(page,
                                           PageGetItemId(page, i));
        if (lt->tupstate == SPGIST_LIVE)
        {
            Assert(ItemPointerIsValid(&lt->heapPtr));

            if (bds->callback(&lt->heapPtr, bds->callback_state))
            {
                bds->stats->tuples_removed += 1;
                toDelete[xlrec.nDelete] = i;
                xlrec.nDelete++;
            }
            else
            {
                bds->stats->num_index_tuples += 1;
            }
        }
        else
        {
            /* all tuples on root should be live */
            elog(ERROR, "unexpected SPGiST tuple state: %d",
                 lt->tupstate);
        }
    }

    if (xlrec.nDelete == 0)
        return;                 /* nothing more to do */

    /* Do the update */
    START_CRIT_SECTION();

    /* The tuple numbers are in order, so we can use PageIndexMultiDelete */
    PageIndexMultiDelete(page, toDelete, xlrec.nDelete);

    MarkBufferDirty(buffer);

    if (RelationNeedsWAL(index))
    {
        XLogRecPtr  recptr;

        XLogBeginInsert();

        /* Prepare WAL record */
        STORE_STATE(&bds->spgstate, xlrec.stateSrc);

        XLogRegisterData((char *) &xlrec, SizeOfSpgxlogVacuumRoot);
        /* sizeof(xlrec) should be a multiple of sizeof(OffsetNumber) */
        XLogRegisterData((char *) toDelete,
                         sizeof(OffsetNumber) * xlrec.nDelete);

        XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);

        recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_ROOT);

        PageSetLSN(page, recptr);
    }

    END_CRIT_SECTION();
}
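/*
 * Note (not part of the original source): vacuumLeafRoot() uses the newer
 * WAL construction API (XLogBeginInsert / XLogRegisterData /
 * XLogRegisterBuffer, with XLogInsert taking no rdata argument), whereas
 * most functions collected here build XLogRecData chains by hand.  In the
 * newer API, registering a buffer replaces the old convention of pointing
 * an rdata entry at it, and full-page-image decisions are made per
 * registered buffer.
 */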
/*
 * Deletes pending list pages up to (not including) newHead page.
 * If newHead == InvalidBlockNumber then function drops the whole list.
 *
 * metapage is pinned and exclusive-locked throughout this function.
 *
 * Returns true if another cleanup process is running concurrently
 * (if so, we can just abandon our own efforts)
 */
static bool
shiftList(Relation index, Buffer metabuffer, BlockNumber newHead,
          bool fill_fsm, IndexBulkDeleteResult *stats)
{
    Page        metapage;
    GinMetaPageData *metadata;
    BlockNumber blknoToDelete;

    metapage = BufferGetPage(metabuffer);
    metadata = GinPageGetMeta(metapage);
    blknoToDelete = metadata->head;

    do
    {
        Page        page;
        int         i;
        int64       nDeletedHeapTuples = 0;
        ginxlogDeleteListPages data;
        Buffer      buffers[GIN_NDELETE_AT_ONCE];
        BlockNumber freespace[GIN_NDELETE_AT_ONCE];

        data.ndeleted = 0;
        while (data.ndeleted < GIN_NDELETE_AT_ONCE && blknoToDelete != newHead)
        {
            freespace[data.ndeleted] = blknoToDelete;
            buffers[data.ndeleted] = ReadBuffer(index, blknoToDelete);
            LockBuffer(buffers[data.ndeleted], GIN_EXCLUSIVE);
            page = BufferGetPage(buffers[data.ndeleted]);

            data.ndeleted++;

            if (GinPageIsDeleted(page))
            {
                /* concurrent cleanup process is detected */
                for (i = 0; i < data.ndeleted; i++)
                    UnlockReleaseBuffer(buffers[i]);

                return true;
            }

            nDeletedHeapTuples += GinPageGetOpaque(page)->maxoff;
            blknoToDelete = GinPageGetOpaque(page)->rightlink;
        }

        if (stats)
            stats->pages_deleted += data.ndeleted;

        /*
         * This operation touches an unusually large number of pages, so
         * prepare the XLogInsert machinery for that before entering the
         * critical section.
         */
        if (RelationNeedsWAL(index))
            XLogEnsureRecordSpace(data.ndeleted, 0);

        START_CRIT_SECTION();

        metadata->head = blknoToDelete;

        Assert(metadata->nPendingPages >= data.ndeleted);
        metadata->nPendingPages -= data.ndeleted;
        Assert(metadata->nPendingHeapTuples >= nDeletedHeapTuples);
        metadata->nPendingHeapTuples -= nDeletedHeapTuples;

        if (blknoToDelete == InvalidBlockNumber)
        {
            metadata->tail = InvalidBlockNumber;
            metadata->tailFreeSize = 0;
            metadata->nPendingPages = 0;
            metadata->nPendingHeapTuples = 0;
        }

        MarkBufferDirty(metabuffer);

        for (i = 0; i < data.ndeleted; i++)
        {
            page = BufferGetPage(buffers[i]);
            GinPageGetOpaque(page)->flags = GIN_DELETED;
            MarkBufferDirty(buffers[i]);
        }

        if (RelationNeedsWAL(index))
        {
            XLogRecPtr  recptr;

            XLogBeginInsert();
            XLogRegisterBuffer(0, metabuffer, REGBUF_WILL_INIT);
            for (i = 0; i < data.ndeleted; i++)
                XLogRegisterBuffer(i + 1, buffers[i], REGBUF_WILL_INIT);

            memcpy(&data.metadata, metadata, sizeof(GinMetaPageData));

            XLogRegisterData((char *) &data,
                             sizeof(ginxlogDeleteListPages));

            recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_DELETE_LISTPAGE);
            PageSetLSN(metapage, recptr);

            for (i = 0; i < data.ndeleted; i++)
            {
                page = BufferGetPage(buffers[i]);
                PageSetLSN(page, recptr);
            }
        }

        for (i = 0; i < data.ndeleted; i++)
            UnlockReleaseBuffer(buffers[i]);

        END_CRIT_SECTION();

        for (i = 0; fill_fsm && i < data.ndeleted; i++)
            RecordFreeIndexPage(index, freespace[i]);

    } while (blknoToDelete != newHead);

    return false;
}
/*
 * Insert a new item onto a page.
 *
 * Returns true if the insertion was finished.  On false, the page was
 * split and the parent needs to be updated.  (A root split returns true
 * as it doesn't need any further action by the caller to complete.)
 *
 * When inserting a downlink to an internal page, 'childbuf' contains the
 * child page that was split.  Its GIN_INCOMPLETE_SPLIT flag will be
 * cleared atomically with the insert.  Also, the existing item at offset
 * stack->off in the target page is updated to point to updateblkno.
 *
 * stack->buffer is locked on entry, and is kept locked.
 * Likewise for childbuf, if given.
 */
static bool
ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
               void *insertdata, BlockNumber updateblkno,
               Buffer childbuf, GinStatsData *buildStats)
{
    Page        page = BufferGetPage(stack->buffer);
    bool        result;
    GinPlaceToPageRC rc;
    uint16      xlflags = 0;
    Page        childpage = NULL;
    Page        newlpage = NULL,
                newrpage = NULL;
    void       *ptp_workspace = NULL;
    XLogRecData payloadrdata[10];
    MemoryContext tmpCxt;
    MemoryContext oldCxt;

    /*
     * We do all the work of this function and its subfunctions in a
     * temporary memory context.  This avoids leakages and simplifies APIs,
     * since some subfunctions allocate storage that has to survive until
     * we've finished the WAL insertion.
     */
    tmpCxt = AllocSetContextCreate(CurrentMemoryContext,
                                   "ginPlaceToPage temporary context",
                                   ALLOCSET_DEFAULT_MINSIZE,
                                   ALLOCSET_DEFAULT_INITSIZE,
                                   ALLOCSET_DEFAULT_MAXSIZE);
    oldCxt = MemoryContextSwitchTo(tmpCxt);

    if (GinPageIsData(page))
        xlflags |= GIN_INSERT_ISDATA;
    if (GinPageIsLeaf(page))
    {
        xlflags |= GIN_INSERT_ISLEAF;
        Assert(!BufferIsValid(childbuf));
        Assert(updateblkno == InvalidBlockNumber);
    }
    else
    {
        Assert(BufferIsValid(childbuf));
        Assert(updateblkno != InvalidBlockNumber);
        childpage = BufferGetPage(childbuf);
    }

    /*
     * See if the incoming tuple will fit on the page.  beginPlaceToPage
     * will decide if the page needs to be split, and will compute the
     * split contents if so.  See comments for beginPlaceToPage and
     * execPlaceToPage functions for more details of the API here.
     */
    rc = btree->beginPlaceToPage(btree, stack->buffer, stack,
                                 insertdata, updateblkno,
                                 &ptp_workspace,
                                 &newlpage, &newrpage,
                                 payloadrdata);

    if (rc == GPTP_NO_WORK)
    {
        /* Nothing to do */
        result = true;
    }
    else if (rc == GPTP_INSERT)
    {
        /* It will fit, perform the insertion */
        START_CRIT_SECTION();

        /* Perform the page update, and set up WAL data about it */
        btree->execPlaceToPage(btree, stack->buffer, stack,
                               insertdata, updateblkno,
                               ptp_workspace, payloadrdata);

        MarkBufferDirty(stack->buffer);

        /* An insert to an internal page finishes the split of the child. */
        if (BufferIsValid(childbuf))
        {
            GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT;
            MarkBufferDirty(childbuf);
        }

        if (RelationNeedsWAL(btree->index))
        {
            XLogRecPtr  recptr;
            XLogRecData rdata[3];
            ginxlogInsert xlrec;
            BlockIdData childblknos[2];

            xlrec.node = btree->index->rd_node;
            xlrec.blkno = BufferGetBlockNumber(stack->buffer);
            xlrec.flags = xlflags;

            rdata[0].buffer = InvalidBuffer;
            rdata[0].data = (char *) &xlrec;
            rdata[0].len = sizeof(ginxlogInsert);

            /*
             * Log information about child if this was an insertion of a
             * downlink.
             */
            if (BufferIsValid(childbuf))
            {
                rdata[0].next = &rdata[1];

                BlockIdSet(&childblknos[0], BufferGetBlockNumber(childbuf));
                BlockIdSet(&childblknos[1],
                           GinPageGetOpaque(childpage)->rightlink);

                rdata[1].buffer = InvalidBuffer;
                rdata[1].data = (char *) childblknos;
                rdata[1].len = sizeof(BlockIdData) * 2;
                rdata[1].next = &rdata[2];

                rdata[2].buffer = childbuf;
                rdata[2].buffer_std = true;
                rdata[2].data = NULL;
                rdata[2].len = 0;
                rdata[2].next = payloadrdata;
            }
            else
                rdata[0].next = payloadrdata;

            recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT, rdata);
            PageSetLSN(page, recptr);
            if (BufferIsValid(childbuf))
                PageSetLSN(childpage, recptr);
        }

        END_CRIT_SECTION();

        /* Insertion is complete. */
        result = true;
    }
    else if (rc == GPTP_SPLIT)
    {
        /*
         * Didn't fit, need to split.  The split has been computed in
         * newlpage and newrpage, which are pointers to palloc'd pages, not
         * associated with buffers.  stack->buffer is not touched yet.
         */
        Buffer      rbuffer;
        BlockNumber savedRightLink;
        ginxlogSplit data;
        Buffer      lbuffer = InvalidBuffer;
        Page        newrootpg = NULL;

        /* Get a new index page to become the right page */
        rbuffer = GinNewBuffer(btree->index);

        /* During index build, count the new page */
        if (buildStats)
        {
            if (btree->isData)
                buildStats->nDataPages++;
            else
                buildStats->nEntryPages++;
        }

        savedRightLink = GinPageGetOpaque(page)->rightlink;

        /* Begin setting up WAL record (which we might not use) */
        data.node = btree->index->rd_node;
        data.rblkno = BufferGetBlockNumber(rbuffer);
        data.flags = xlflags;
        if (BufferIsValid(childbuf))
        {
            data.leftChildBlkno = BufferGetBlockNumber(childbuf);
            data.rightChildBlkno = GinPageGetOpaque(childpage)->rightlink;
        }
        else
            data.leftChildBlkno = data.rightChildBlkno = InvalidBlockNumber;

        if (stack->parent == NULL)
        {
            /*
             * splitting the root, so we need to allocate new left page and
             * place pointers to left and right page on root page.
             */
            lbuffer = GinNewBuffer(btree->index);

            /* During index build, count the new left page */
            if (buildStats)
            {
                if (btree->isData)
                    buildStats->nDataPages++;
                else
                    buildStats->nEntryPages++;
            }

            /*
             * root never has a right-link, so we borrow the rrlink field to
             * store the root block number.
             */
            data.rrlink = BufferGetBlockNumber(stack->buffer);
            data.lblkno = BufferGetBlockNumber(lbuffer);
            data.flags |= GIN_SPLIT_ROOT;

            GinPageGetOpaque(newrpage)->rightlink = InvalidBlockNumber;
            GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer);

            /*
             * Construct a new root page containing downlinks to the new
             * left and right pages.  (Do this in a temporary copy rather
             * than overwriting the original page directly, since we're not
             * in the critical section yet.)
             */
            newrootpg = PageGetTempPage(newrpage);
            GinInitPage(newrootpg,
                        GinPageGetOpaque(newlpage)->flags & ~(GIN_LEAF | GIN_COMPRESSED),
                        BLCKSZ);

            btree->fillRoot(btree, newrootpg,
                            BufferGetBlockNumber(lbuffer), newlpage,
                            BufferGetBlockNumber(rbuffer), newrpage);
        }
        else
        {
            /* splitting a non-root page */
            data.rrlink = savedRightLink;
            data.lblkno = BufferGetBlockNumber(stack->buffer);

            GinPageGetOpaque(newrpage)->rightlink = savedRightLink;
            GinPageGetOpaque(newlpage)->flags |= GIN_INCOMPLETE_SPLIT;
            GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer);
        }

        /*
         * OK, we have the new contents of the left page in a temporary copy
         * now (newlpage), and likewise for the new contents of the
         * newly-allocated right block.  The original page is still
         * unchanged.
         *
         * If this is a root split, we also have a temporary page containing
         * the new contents of the root.
         */
        START_CRIT_SECTION();

        MarkBufferDirty(rbuffer);
        MarkBufferDirty(stack->buffer);

        /*
         * Restore the temporary copies over the real buffers.
         */
        if (stack->parent == NULL)
        {
            /* Splitting the root, three pages to update */
            MarkBufferDirty(lbuffer);
            memcpy(page, newrootpg, BLCKSZ);
            memcpy(BufferGetPage(lbuffer), newlpage, BLCKSZ);
            memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ);
        }
        else
        {
            /* Normal split, only two pages to update */
            memcpy(page, newlpage, BLCKSZ);
            memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ);
        }

        /* We also clear childbuf's INCOMPLETE_SPLIT flag, if passed */
        if (BufferIsValid(childbuf))
        {
            GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT;
            MarkBufferDirty(childbuf);
        }

        /* write WAL record */
        if (RelationNeedsWAL(btree->index))
        {
            XLogRecData rdata[2];
            XLogRecPtr  recptr;

            rdata[0].buffer = InvalidBuffer;
            rdata[0].data = (char *) &data;
            rdata[0].len = sizeof(ginxlogSplit);

            if (BufferIsValid(childbuf))
            {
                rdata[0].next = &rdata[1];

                rdata[1].buffer = childbuf;
                rdata[1].buffer_std = true;
                rdata[1].data = NULL;
                rdata[1].len = 0;
                rdata[1].next = payloadrdata;
            }
            else
                rdata[0].next = payloadrdata;

            recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT, rdata);

            PageSetLSN(page, recptr);
            PageSetLSN(BufferGetPage(rbuffer), recptr);
            if (stack->parent == NULL)
                PageSetLSN(BufferGetPage(lbuffer), recptr);
            if (BufferIsValid(childbuf))
                PageSetLSN(childpage, recptr);
        }

        END_CRIT_SECTION();

        /*
         * We can release the locks/pins on the new pages now, but keep
         * stack->buffer locked.  childbuf doesn't get unlocked either.
         */
        UnlockReleaseBuffer(rbuffer);
        if (stack->parent == NULL)
            UnlockReleaseBuffer(lbuffer);

        /*
         * If we split the root, we're done.  Otherwise the split is not
         * complete until the downlink for the new page has been inserted
         * to the parent.
         */
        result = (stack->parent == NULL);
    }
    else
    {
        elog(ERROR, "invalid return code from GIN placeToPage method: %d", rc);
        result = false;         /* keep compiler quiet */
    }

    /* Clean up temp context */
    MemoryContextSwitchTo(oldCxt);
    MemoryContextDelete(tmpCxt);

    return result;
}
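/*
 * Note (not part of the original source): as implied by the caller above,
 * the beginPlaceToPage/execPlaceToPage methods follow a three-way protocol:
 *
 *   GPTP_NO_WORK - nothing needs to be done; no page is modified.
 *   GPTP_INSERT  - the item fits; execPlaceToPage() applies the change
 *                  inside the critical section and fills payloadrdata.
 *   GPTP_SPLIT   - the item does not fit; beginPlaceToPage() has already
 *                  computed the split into newlpage/newrpage, and the
 *                  caller copies those images over the real buffers.
 */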
/*
 * Writes a WAL record describing the vacuuming of a leaf page.
 */
static void
xlogVacuumPage(Relation index, Buffer buffer)
{
    Page        page = BufferGetPage(buffer);
    XLogRecPtr  recptr;
    XLogRecData rdata[3];
    ginxlogVacuumPage data;
    char       *backup;
    char        itups[BLCKSZ];
    uint32      len = 0;

    Assert(GinPageIsLeaf(page));

    if (index->rd_istemp)
        return;

    data.node = index->rd_node;
    data.blkno = BufferGetBlockNumber(buffer);

    if (GinPageIsData(page))
    {
        backup = GinDataPageGetData(page);
        data.nitem = GinPageGetOpaque(page)->maxoff;
        if (data.nitem)
            len = MAXALIGN(sizeof(ItemPointerData) * data.nitem);
    }
    else
    {
        char       *ptr;
        OffsetNumber i;

        ptr = backup = itups;
        for (i = FirstOffsetNumber; i <= PageGetMaxOffsetNumber(page); i++)
        {
            IndexTuple  itup = (IndexTuple) PageGetItem(page,
                                                        PageGetItemId(page, i));

            memcpy(ptr, itup, IndexTupleSize(itup));
            ptr += MAXALIGN(IndexTupleSize(itup));
        }

        data.nitem = PageGetMaxOffsetNumber(page);
        len = ptr - backup;
    }

    rdata[0].buffer = buffer;
    rdata[0].buffer_std = (GinPageIsData(page)) ? FALSE : TRUE;
    rdata[0].len = 0;
    rdata[0].data = NULL;
    rdata[0].next = rdata + 1;

    rdata[1].buffer = InvalidBuffer;
    rdata[1].len = sizeof(ginxlogVacuumPage);
    rdata[1].data = (char *) &data;

    if (len == 0)
    {
        rdata[1].next = NULL;
    }
    else
    {
        rdata[1].next = rdata + 2;

        rdata[2].buffer = InvalidBuffer;
        rdata[2].len = len;
        rdata[2].data = backup;
        rdata[2].next = NULL;
    }

    recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_VACUUM_PAGE, rdata);
    PageSetLSN(page, recptr);
    PageSetTLI(page, ThisTimeLineID);
}
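/*
 * Note (not part of the original source): rdata[0] above references the
 * buffer itself with no data, which lets XLogInsert() substitute a
 * full-page image for the record payload when one is required after a
 * checkpoint.  buffer_std tells XLogInsert whether the page follows the
 * standard page layout, in which case the unused "hole" between pd_lower
 * and pd_upper can be omitted from the image; GIN data pages do not use
 * the standard layout, hence FALSE for them.
 */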
/*
 * _bitmap_log_updatewords() -- log updating bitmap words in one or
 *  two bitmap pages.
 *
 * If secondBuffer is invalid, we only update one page.
 */
void
_bitmap_log_updatewords(Relation rel,
                        Buffer lovBuffer, OffsetNumber lovOffset,
                        Buffer firstBuffer, Buffer secondBuffer,
                        bool new_lastpage)
{
    Page        firstPage = NULL;
    Page        secondPage = NULL;
    BMBitmap    firstBitmap;
    BMBitmap    secondBitmap;
    BMBitmapOpaque firstOpaque;
    BMBitmapOpaque secondOpaque;
    xl_bm_updatewords xlBitmapWords;
    XLogRecPtr  recptr;
    XLogRecData rdata[1];

    firstPage = BufferGetPage(firstBuffer);
    firstBitmap = (BMBitmap) PageGetContentsMaxAligned(firstPage);
    firstOpaque = (BMBitmapOpaque) PageGetSpecialPointer(firstPage);
    xlBitmapWords.bm_two_pages = false;
    xlBitmapWords.bm_first_blkno = BufferGetBlockNumber(firstBuffer);
    memcpy(&xlBitmapWords.bm_first_cwords,
           firstBitmap->cwords,
           BM_NUM_OF_HRL_WORDS_PER_PAGE * sizeof(BM_HRL_WORD));
    memcpy(&xlBitmapWords.bm_first_hwords,
           firstBitmap->hwords,
           BM_NUM_OF_HEADER_WORDS * sizeof(BM_HRL_WORD));
    xlBitmapWords.bm_first_last_tid = firstOpaque->bm_last_tid_location;
    xlBitmapWords.bm_first_num_cwords = firstOpaque->bm_hrl_words_used;
    xlBitmapWords.bm_next_blkno = firstOpaque->bm_bitmap_next;

    if (BufferIsValid(secondBuffer))
    {
        secondPage = BufferGetPage(secondBuffer);
        secondBitmap = (BMBitmap) PageGetContentsMaxAligned(secondPage);
        secondOpaque = (BMBitmapOpaque) PageGetSpecialPointer(secondPage);

        xlBitmapWords.bm_two_pages = true;
        xlBitmapWords.bm_second_blkno = BufferGetBlockNumber(secondBuffer);

        memcpy(&xlBitmapWords.bm_second_cwords,
               secondBitmap->cwords,
               BM_NUM_OF_HRL_WORDS_PER_PAGE * sizeof(BM_HRL_WORD));
        memcpy(&xlBitmapWords.bm_second_hwords,
               secondBitmap->hwords,
               BM_NUM_OF_HEADER_WORDS * sizeof(BM_HRL_WORD));
        xlBitmapWords.bm_second_last_tid = secondOpaque->bm_last_tid_location;
        xlBitmapWords.bm_second_num_cwords = secondOpaque->bm_hrl_words_used;
        xlBitmapWords.bm_next_blkno = secondOpaque->bm_bitmap_next;
    }

    // Fetch gp_persistent_relation_node information that will be added to XLOG record.
    RelationFetchGpRelationNodeForXLog(rel);

    xlBitmapWords.bm_node = rel->rd_node;
    xlBitmapWords.bm_persistentTid = rel->rd_relationnodeinfo.persistentTid;
    xlBitmapWords.bm_persistentSerialNum = rel->rd_relationnodeinfo.persistentSerialNum;
    xlBitmapWords.bm_lov_blkno = BufferGetBlockNumber(lovBuffer);
    xlBitmapWords.bm_lov_offset = lovOffset;
    xlBitmapWords.bm_new_lastpage = new_lastpage;

    rdata[0].buffer = InvalidBuffer;
    rdata[0].data = (char *) &xlBitmapWords;
    rdata[0].len = sizeof(xl_bm_updatewords);
    rdata[0].next = NULL;

    recptr = XLogInsert(RM_BITMAP_ID, XLOG_BITMAP_UPDATEWORDS, rdata);

    PageSetLSN(firstPage, recptr);
    PageSetTLI(firstPage, ThisTimeLineID);
    if (BufferIsValid(secondBuffer))
    {
        PageSetLSN(secondPage, recptr);
        PageSetTLI(secondPage, ThisTimeLineID);
    }

    if (new_lastpage)
    {
        Page        lovPage = BufferGetPage(lovBuffer);

        PageSetLSN(lovPage, recptr);
        PageSetTLI(lovPage, ThisTimeLineID);
    }
}
/*
 * _bitmap_log_bitmapwords() -- log new bitmap words to be inserted.
 */
void
_bitmap_log_bitmapwords(Relation rel, Buffer bitmapBuffer, Buffer lovBuffer,
                        OffsetNumber lovOffset, BMTIDBuffer *buf,
                        uint64 words_written, uint64 tidnum,
                        BlockNumber nextBlkno,
                        bool isLast, bool isFirst)
{
    Page        bitmapPage;
    BMBitmapOpaque bitmapPageOpaque;
    xl_bm_bitmapwords *xlBitmapWords;
    XLogRecPtr  recptr;
    XLogRecData rdata[1];
    uint64     *lastTids;
    BM_HRL_WORD *cwords;
    BM_HRL_WORD *hwords;
    int         lastTids_size;
    int         cwords_size;
    int         hwords_size;
    Page        lovPage = BufferGetPage(lovBuffer);

    // Fetch gp_persistent_relation_node information that will be added to XLOG record.
    RelationFetchGpRelationNodeForXLog(rel);

    lastTids_size = buf->curword * sizeof(uint64);
    cwords_size = buf->curword * sizeof(BM_HRL_WORD);
    hwords_size = (BM_CALC_H_WORDS(buf->curword)) * sizeof(BM_HRL_WORD);

    bitmapPage = BufferGetPage(bitmapBuffer);
    bitmapPageOpaque = (BMBitmapOpaque) PageGetSpecialPointer(bitmapPage);

    xlBitmapWords = (xl_bm_bitmapwords *)
        palloc0(MAXALIGN(sizeof(xl_bm_bitmapwords)) +
                MAXALIGN(lastTids_size) +
                MAXALIGN(cwords_size) +
                MAXALIGN(hwords_size));

    xlBitmapWords->bm_node = rel->rd_node;
    xlBitmapWords->bm_persistentTid = rel->rd_relationnodeinfo.persistentTid;
    xlBitmapWords->bm_persistentSerialNum = rel->rd_relationnodeinfo.persistentSerialNum;
    xlBitmapWords->bm_blkno = BufferGetBlockNumber(bitmapBuffer);
    xlBitmapWords->bm_next_blkno = nextBlkno;
    xlBitmapWords->bm_last_tid = bitmapPageOpaque->bm_last_tid_location;
    xlBitmapWords->bm_lov_blkno = BufferGetBlockNumber(lovBuffer);
    xlBitmapWords->bm_lov_offset = lovOffset;
    xlBitmapWords->bm_last_compword = buf->last_compword;
    xlBitmapWords->bm_last_word = buf->last_word;
    xlBitmapWords->lov_words_header = (buf->is_last_compword_fill) ? 2 : 0;
    xlBitmapWords->bm_last_setbit = tidnum;
    xlBitmapWords->bm_is_last = isLast;
    xlBitmapWords->bm_is_first = isFirst;
    xlBitmapWords->bm_start_wordno = buf->start_wordno;
    xlBitmapWords->bm_words_written = words_written;
    xlBitmapWords->bm_num_cwords = buf->curword;

    lastTids = (uint64 *) (((char *) xlBitmapWords) +
                           MAXALIGN(sizeof(xl_bm_bitmapwords)));
    memcpy(lastTids, buf->last_tids, buf->curword * sizeof(uint64));

    cwords = (BM_HRL_WORD *) (((char *) xlBitmapWords) +
                              MAXALIGN(sizeof(xl_bm_bitmapwords)) +
                              MAXALIGN(lastTids_size));
    memcpy(cwords, buf->cwords, cwords_size);

    hwords = (BM_HRL_WORD *) (((char *) xlBitmapWords) +
                              MAXALIGN(sizeof(xl_bm_bitmapwords)) +
                              MAXALIGN(lastTids_size) +
                              MAXALIGN(cwords_size));
    memcpy(hwords, buf->hwords, hwords_size);

    rdata[0].buffer = InvalidBuffer;
    rdata[0].data = (char *) xlBitmapWords;
    rdata[0].len = MAXALIGN(sizeof(xl_bm_bitmapwords)) +
        MAXALIGN(lastTids_size) +
        MAXALIGN(cwords_size) +
        MAXALIGN(hwords_size);
    rdata[0].next = NULL;

    recptr = XLogInsert(RM_BITMAP_ID, XLOG_BITMAP_INSERT_WORDS, rdata);

    PageSetLSN(bitmapPage, recptr);
    PageSetTLI(bitmapPage, ThisTimeLineID);
    PageSetLSN(lovPage, recptr);
    PageSetTLI(lovPage, ThisTimeLineID);

    pfree(xlBitmapWords);
}
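/*
 * Note (not part of the original source): the record built above is a
 * single contiguous allocation with this layout (each section MAXALIGN'd):
 *
 *   +--------------------------------+ <- xlBitmapWords
 *   | xl_bm_bitmapwords header       |
 *   +--------------------------------+ <- + MAXALIGN(sizeof(header))
 *   | lastTids: curword uint64s      |
 *   +--------------------------------+ <- + MAXALIGN(lastTids_size)
 *   | cwords: curword BM_HRL_WORDs   |
 *   +--------------------------------+ <- + MAXALIGN(cwords_size)
 *   | hwords: BM_CALC_H_WORDS words  |
 *   +--------------------------------+
 *
 * rdata[0].len must equal this total, as it does above.
 */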
/*
 * Write WAL record of a page split.
 */
XLogRecPtr
gistXLogSplit(RelFileNode node, BlockNumber blkno, bool page_is_leaf,
              SplitedPageLayout *dist,
              BlockNumber origrlink, GistNSN orignsn,
              Buffer leftchildbuf)
{
    XLogRecData *rdata;
    gistxlogPageSplit xlrec;
    SplitedPageLayout *ptr;
    int         npage = 0,
                cur;
    XLogRecPtr  recptr;

    for (ptr = dist; ptr; ptr = ptr->next)
        npage++;

    rdata = (XLogRecData *) palloc(sizeof(XLogRecData) * (npage * 2 + 2));

    xlrec.node = node;
    xlrec.origblkno = blkno;
    xlrec.origrlink = origrlink;
    xlrec.orignsn = orignsn;
    xlrec.origleaf = page_is_leaf;
    xlrec.npage = (uint16) npage;
    xlrec.leftchild =
        BufferIsValid(leftchildbuf) ? BufferGetBlockNumber(leftchildbuf) : InvalidBlockNumber;

    rdata[0].data = (char *) &xlrec;
    rdata[0].len = sizeof(gistxlogPageSplit);
    rdata[0].buffer = InvalidBuffer;

    cur = 1;

    /*
     * Include a full page image of the child buf.  (Only necessary if a
     * checkpoint happened since the child page was split.)
     */
    if (BufferIsValid(leftchildbuf))
    {
        rdata[cur - 1].next = &(rdata[cur]);
        rdata[cur].data = NULL;
        rdata[cur].len = 0;
        rdata[cur].buffer = leftchildbuf;
        rdata[cur].buffer_std = true;
        cur++;
    }

    for (ptr = dist; ptr; ptr = ptr->next)
    {
        rdata[cur - 1].next = &(rdata[cur]);
        rdata[cur].buffer = InvalidBuffer;
        rdata[cur].data = (char *) &(ptr->block);
        rdata[cur].len = sizeof(gistxlogPage);
        cur++;

        rdata[cur - 1].next = &(rdata[cur]);
        rdata[cur].buffer = InvalidBuffer;
        rdata[cur].data = (char *) (ptr->list);
        rdata[cur].len = ptr->lenlist;
        cur++;
    }
    rdata[cur - 1].next = NULL;

    recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_SPLIT, rdata);

    pfree(rdata);
    return recptr;
}
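/*
 * Note (not part of the original source): the rdata allocation above is
 * sized as npage * 2 + 2 because the chain holds one entry for the fixed
 * header, at most one for the left child buffer, and two per split page
 * (its gistxlogPage descriptor plus its tuple list).
 */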
/*
 * Write XLOG record describing a page update.  The update can include any
 * number of deletions and/or insertions of tuples on a single index page.
 *
 * If this update inserts a downlink for a split page, also record that
 * the F_FOLLOW_RIGHT flag on the child page is cleared and NSN set.
 *
 * Note that both the todelete array and the tuples are marked as belonging
 * to the target buffer; they need not be stored in XLOG if XLogInsert
 * decides to log the whole buffer contents instead.  Also, we take care
 * that there's at least one rdata item referencing the buffer, even when
 * ntodelete and ituplen are both zero; this ensures that XLogInsert knows
 * about the buffer.
 */
XLogRecPtr
gistXLogUpdate(RelFileNode node, Buffer buffer,
               OffsetNumber *todelete, int ntodelete,
               IndexTuple *itup, int ituplen,
               Buffer leftchildbuf)
{
    XLogRecData *rdata;
    gistxlogPageUpdate *xlrec;
    int         cur,
                i;
    XLogRecPtr  recptr;

    rdata = (XLogRecData *) palloc(sizeof(XLogRecData) * (4 + ituplen));
    xlrec = (gistxlogPageUpdate *) palloc(sizeof(gistxlogPageUpdate));

    xlrec->node = node;
    xlrec->blkno = BufferGetBlockNumber(buffer);
    xlrec->ntodelete = ntodelete;
    xlrec->leftchild =
        BufferIsValid(leftchildbuf) ? BufferGetBlockNumber(leftchildbuf) : InvalidBlockNumber;

    rdata[0].buffer = buffer;
    rdata[0].buffer_std = true;
    rdata[0].data = NULL;
    rdata[0].len = 0;
    rdata[0].next = &(rdata[1]);

    rdata[1].data = (char *) xlrec;
    rdata[1].len = sizeof(gistxlogPageUpdate);
    rdata[1].buffer = InvalidBuffer;
    rdata[1].next = &(rdata[2]);

    rdata[2].data = (char *) todelete;
    rdata[2].len = sizeof(OffsetNumber) * ntodelete;
    rdata[2].buffer = buffer;
    rdata[2].buffer_std = true;

    cur = 3;

    /* new tuples */
    for (i = 0; i < ituplen; i++)
    {
        rdata[cur - 1].next = &(rdata[cur]);
        rdata[cur].data = (char *) (itup[i]);
        rdata[cur].len = IndexTupleSize(itup[i]);
        rdata[cur].buffer = buffer;
        rdata[cur].buffer_std = true;
        cur++;
    }

    /*
     * Include a full page image of the child buf.  (Only necessary if a
     * checkpoint happened since the child page was split.)
     */
    if (BufferIsValid(leftchildbuf))
    {
        rdata[cur - 1].next = &(rdata[cur]);
        rdata[cur].data = NULL;
        rdata[cur].len = 0;
        rdata[cur].buffer = leftchildbuf;
        rdata[cur].buffer_std = true;
        cur++;
    }
    rdata[cur - 1].next = NULL;

    recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE, rdata);

    pfree(rdata);
    return recptr;
}
/*
 * _bt_pagedel() -- Delete a page from the b-tree.
 *
 * This action unlinks the page from the b-tree structure, removing all
 * pointers leading to it --- but not touching its own left and right links.
 * The page cannot be physically reclaimed right away, since other processes
 * may currently be trying to follow links leading to the page; they have to
 * be allowed to use its right-link to recover.  See nbtree/README.
 *
 * On entry, the target buffer must be pinned and read-locked.  This lock
 * and pin will be dropped before exiting.
 *
 * Returns the number of pages successfully deleted (zero on failure; could
 * be more than one if parent blocks were deleted).
 *
 * NOTE: this leaks memory.  Rather than trying to clean up everything
 * carefully, it's better to run it in a temp context that can be reset
 * frequently.
 */
int
_bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
{
    BlockNumber target,
                leftsib,
                rightsib,
                parent;
    OffsetNumber poffset,
                maxoff;
    uint32      targetlevel,
                ilevel;
    ItemId      itemid;
    BTItem      targetkey,
                btitem;
    ScanKey     itup_scankey;
    BTStack     stack;
    Buffer      lbuf,
                rbuf,
                pbuf;
    bool        parent_half_dead;
    bool        parent_one_child;
    bool        rightsib_empty;
    Buffer      metabuf = InvalidBuffer;
    Page        metapg = NULL;
    BTMetaPageData *metad = NULL;
    Page        page;
    BTPageOpaque opaque;

    /*
     * We can never delete rightmost pages nor root pages.  While at it,
     * check that page is not already deleted and is empty.
     */
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
        P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
    {
        _bt_relbuf(rel, buf);
        return 0;
    }

    /*
     * Save info about page, including a copy of its high key (it must have
     * one, being non-rightmost).
     */
    target = BufferGetBlockNumber(buf);
    targetlevel = opaque->btpo.level;
    leftsib = opaque->btpo_prev;
    itemid = PageGetItemId(page, P_HIKEY);
    targetkey = CopyBTItem((BTItem) PageGetItem(page, itemid));

    /*
     * We need to get an approximate pointer to the page's parent page.  Use
     * the standard search mechanism to search for the page's high key; this
     * will give us a link to either the current parent or someplace to its
     * left (if there are multiple equal high keys).  To avoid deadlocks,
     * we'd better drop the target page lock first.
     */
    _bt_relbuf(rel, buf);
    /* we need a scan key to do our search, so build one */
    itup_scankey = _bt_mkscankey(rel, &(targetkey->bti_itup));
    /* find the leftmost leaf page containing this key */
    stack = _bt_search(rel, rel->rd_rel->relnatts, itup_scankey, false,
                       &lbuf, BT_READ);
    /* don't need a pin on that either */
    _bt_relbuf(rel, lbuf);

    /*
     * If we are trying to delete an interior page, _bt_search did more than
     * we needed.  Locate the stack item pointing to our parent level.
     */
    ilevel = 0;
    for (;;)
    {
        if (stack == NULL)
            elog(ERROR, "not enough stack items");
        if (ilevel == targetlevel)
            break;
        stack = stack->bts_parent;
        ilevel++;
    }

    /*
     * We have to lock the pages we need to modify in the standard order:
     * moving right, then up.  Else we will deadlock against other writers.
     *
     * So, we need to find and write-lock the current left sibling of the
     * target page.  The sibling that was current a moment ago could have
     * split, so we may have to move right.  This search could fail if
     * either the sibling or the target page was deleted by someone else
     * meanwhile; if so, give up.  (Right now, that should never happen,
     * since page deletion is only done in VACUUM and there shouldn't be
     * multiple VACUUMs concurrently on the same table.)
     */
    if (leftsib != P_NONE)
    {
        lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
        page = BufferGetPage(lbuf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        while (P_ISDELETED(opaque) || opaque->btpo_next != target)
        {
            /* step right one page */
            leftsib = opaque->btpo_next;
            _bt_relbuf(rel, lbuf);
            if (leftsib == P_NONE)
            {
                elog(LOG, "no left sibling (concurrent deletion?) in \"%s\"",
                     RelationGetRelationName(rel));
                return 0;
            }
            lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
            page = BufferGetPage(lbuf);
            opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        }
    }
    else
        lbuf = InvalidBuffer;

    /*
     * Next write-lock the target page itself.  It should be okay to take
     * just a write lock not a superexclusive lock, since no scans would
     * stop on an empty page.
     */
    buf = _bt_getbuf(rel, target, BT_WRITE);
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    /*
     * Check page is still empty etc, else abandon deletion.  The empty
     * check is necessary since someone else might have inserted into it
     * while we didn't have it locked; the others are just for paranoia's
     * sake.
     */
    if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
        P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
    {
        _bt_relbuf(rel, buf);
        if (BufferIsValid(lbuf))
            _bt_relbuf(rel, lbuf);
        return 0;
    }
    if (opaque->btpo_prev != leftsib)
        elog(ERROR, "left link changed unexpectedly in block %u of \"%s\"",
             target, RelationGetRelationName(rel));

    /*
     * And next write-lock the (current) right sibling.
     */
    rightsib = opaque->btpo_next;
    rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);

    /*
     * Next find and write-lock the current parent of the target page.
     * This is essentially the same as the corresponding step of splitting.
     * However, it's possible for the search to fail (for reasons explained
     * in README).  If that happens, we recover by searching the whole
     * parent level, which is a tad inefficient but doesn't happen often
     * enough to be a problem.
     */
    ItemPointerSet(&(stack->bts_btitem.bti_itup.t_tid), target, P_HIKEY);
    pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
    if (pbuf == InvalidBuffer)
    {
        /* Find the leftmost page in the parent level */
        pbuf = _bt_get_endpoint(rel, opaque->btpo.level + 1, false);
        stack->bts_blkno = BufferGetBlockNumber(pbuf);
        stack->bts_offset = InvalidOffsetNumber;
        _bt_relbuf(rel, pbuf);
        /* and repeat search from there */
        pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
        if (pbuf == InvalidBuffer)
            elog(ERROR, "failed to re-find parent key in \"%s\" for deletion target page %u",
                 RelationGetRelationName(rel), target);
    }
    parent = stack->bts_blkno;
    poffset = stack->bts_offset;

    /*
     * If the target is the rightmost child of its parent, then we can't
     * delete, unless it's also the only child --- in which case the parent
     * changes to half-dead status.
     */
    page = BufferGetPage(pbuf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    maxoff = PageGetMaxOffsetNumber(page);
    parent_half_dead = false;
    parent_one_child = false;
    if (poffset >= maxoff)
    {
        if (poffset == P_FIRSTDATAKEY(opaque))
            parent_half_dead = true;
        else
        {
            _bt_relbuf(rel, pbuf);
            _bt_relbuf(rel, rbuf);
            _bt_relbuf(rel, buf);
            if (BufferIsValid(lbuf))
                _bt_relbuf(rel, lbuf);
            return 0;
        }
    }
    else
    {
        /* Will there be exactly one child left in this parent? */
        if (OffsetNumberNext(P_FIRSTDATAKEY(opaque)) == maxoff)
            parent_one_child = true;
    }

    /*
     * If we are deleting the next-to-last page on the target's level, then
     * the rightsib is a candidate to become the new fast root.  (In theory,
     * it might be possible to push the fast root even further down, but
     * the odds of doing so are slim, and the locking considerations
     * daunting.)
     *
     * We can safely acquire a lock on the metapage here --- see comments
     * for _bt_newroot().
     */
    if (leftsib == P_NONE)
    {
        page = BufferGetPage(rbuf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        Assert(opaque->btpo.level == targetlevel);
        if (P_RIGHTMOST(opaque))
        {
            /* rightsib will be the only one left on the level */
            metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
            metapg = BufferGetPage(metabuf);
            metad = BTPageGetMeta(metapg);

            /*
             * The expected case here is btm_fastlevel == targetlevel+1; if
             * the fastlevel is <= targetlevel, something is wrong, and we
             * choose to overwrite it to fix it.
             */
            if (metad->btm_fastlevel > targetlevel + 1)
            {
                /* no update wanted */
                _bt_relbuf(rel, metabuf);
                metabuf = InvalidBuffer;
            }
        }
    }

    /*
     * Here we begin doing the deletion.
     */

    /* No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /*
     * Update parent.  The normal case is a tad tricky because we want to
     * delete the target's downlink and the *following* key.  Easiest way
     * is to copy the right sibling's downlink over the target downlink,
     * and then delete the following item.
     */
    page = BufferGetPage(pbuf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    if (parent_half_dead)
    {
        PageIndexTupleDelete(page, poffset);
        opaque->btpo_flags |= BTP_HALF_DEAD;
    }
    else
    {
        OffsetNumber nextoffset;

        itemid = PageGetItemId(page, poffset);
        btitem = (BTItem) PageGetItem(page, itemid);
        Assert(ItemPointerGetBlockNumber(&(btitem->bti_itup.t_tid)) == target);
        ItemPointerSet(&(btitem->bti_itup.t_tid), rightsib, P_HIKEY);

        nextoffset = OffsetNumberNext(poffset);
        /* This part is just for double-checking */
        itemid = PageGetItemId(page, nextoffset);
        btitem = (BTItem) PageGetItem(page, itemid);
        if (ItemPointerGetBlockNumber(&(btitem->bti_itup.t_tid)) != rightsib)
            elog(PANIC, "right sibling is not next child in \"%s\"",
                 RelationGetRelationName(rel));

        PageIndexTupleDelete(page, nextoffset);
    }

    /*
     * Update siblings' side-links.  Note the target page's side-links will
     * continue to point to the siblings.
     */
    if (BufferIsValid(lbuf))
    {
        page = BufferGetPage(lbuf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        Assert(opaque->btpo_next == target);
        opaque->btpo_next = rightsib;
    }
    page = BufferGetPage(rbuf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    Assert(opaque->btpo_prev == target);
    opaque->btpo_prev = leftsib;
    rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));

    /*
     * Mark the page itself deleted.  It can be recycled when all current
     * transactions are gone; or immediately if we're doing VACUUM FULL.
     */
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    opaque->btpo_flags |= BTP_DELETED;
    opaque->btpo.xact =
        vacuum_full ? FrozenTransactionId : ReadNewTransactionId();

    /* And update the metapage, if needed */
    if (BufferIsValid(metabuf))
    {
        metad->btm_fastroot = rightsib;
        metad->btm_fastlevel = targetlevel;
    }

    /* XLOG stuff */
    if (!rel->rd_istemp)
    {
        xl_btree_delete_page xlrec;
        xl_btree_metadata xlmeta;
        uint8       xlinfo;
        XLogRecPtr  recptr;
        XLogRecData rdata[5];
        XLogRecData *nextrdata;

        xlrec.target.node = rel->rd_node;
        ItemPointerSet(&(xlrec.target.tid), parent, poffset);
        xlrec.deadblk = target;
        xlrec.leftblk = leftsib;
        xlrec.rightblk = rightsib;

        rdata[0].data = (char *) &xlrec;
        rdata[0].len = SizeOfBtreeDeletePage;
        rdata[0].buffer = InvalidBuffer;
        rdata[0].next = nextrdata = &(rdata[1]);

        if (BufferIsValid(metabuf))
        {
            xlmeta.root = metad->btm_root;
            xlmeta.level = metad->btm_level;
            xlmeta.fastroot = metad->btm_fastroot;
            xlmeta.fastlevel = metad->btm_fastlevel;

            nextrdata->data = (char *) &xlmeta;
            nextrdata->len = sizeof(xl_btree_metadata);
            nextrdata->buffer = InvalidBuffer;
            nextrdata->next = nextrdata + 1;
            nextrdata++;
            xlinfo = XLOG_BTREE_DELETE_PAGE_META;
        }
        else
            xlinfo = XLOG_BTREE_DELETE_PAGE;

        nextrdata->data = NULL;
        nextrdata->len = 0;
        nextrdata->next = nextrdata + 1;
        nextrdata->buffer = pbuf;
        nextrdata->buffer_std = true;
        nextrdata++;

        nextrdata->data = NULL;
        nextrdata->len = 0;
        nextrdata->buffer = rbuf;
        nextrdata->buffer_std = true;
        nextrdata->next = NULL;

        if (BufferIsValid(lbuf))
        {
            nextrdata->next = nextrdata + 1;
            nextrdata++;
            nextrdata->data = NULL;
            nextrdata->len = 0;
            nextrdata->buffer = lbuf;
            nextrdata->buffer_std = true;
            nextrdata->next = NULL;
        }

        recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata);

        if (BufferIsValid(metabuf))
        {
            PageSetLSN(metapg, recptr);
            PageSetTLI(metapg, ThisTimeLineID);
        }
        page = BufferGetPage(pbuf);
        PageSetLSN(page, recptr);
        PageSetTLI(page, ThisTimeLineID);
        page = BufferGetPage(rbuf);
        PageSetLSN(page, recptr);
        PageSetTLI(page, ThisTimeLineID);
        page = BufferGetPage(buf);
        PageSetLSN(page, recptr);
        PageSetTLI(page, ThisTimeLineID);
        if (BufferIsValid(lbuf))
        {
            page = BufferGetPage(lbuf);
            PageSetLSN(page, recptr);
            PageSetTLI(page, ThisTimeLineID);
        }
    }

    END_CRIT_SECTION();

    /* Write and release buffers */
    if (BufferIsValid(metabuf))
        _bt_wrtbuf(rel, metabuf);
    _bt_wrtbuf(rel, pbuf);
    _bt_wrtbuf(rel, rbuf);
    _bt_wrtbuf(rel, buf);
    if (BufferIsValid(lbuf))
        _bt_wrtbuf(rel, lbuf);

    /*
     * If parent became half dead, recurse to try to delete it.  Otherwise,
     * if right sibling is empty and is now the last child of the parent,
     * recurse to try to delete it.  (These cases cannot apply at the same
     * time, though the second case might itself recurse to the first.)
     */
    if (parent_half_dead)
    {
        buf = _bt_getbuf(rel, parent, BT_READ);
        return _bt_pagedel(rel, buf, vacuum_full) + 1;
    }
    if (parent_one_child && rightsib_empty)
    {
        buf = _bt_getbuf(rel, rightsib, BT_READ);
        return _bt_pagedel(rel, buf, vacuum_full) + 1;
    }

    return 1;
}
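/*
 * Illustrative sketch (not part of the original source): per the header
 * comment, _bt_pagedel() intentionally leaks memory, so callers are
 * expected to run it in a short-lived context.  A hypothetical caller
 * might look like this; the function and context names are illustrative
 * only.
 */
#ifdef NOT_USED
static int
pagedel_in_temp_context(Relation rel, Buffer buf, bool vacuum_full)
{
    MemoryContext pagedelcontext;
    MemoryContext oldcontext;
    int         ndel;

    /* dedicated context absorbs _bt_pagedel's leaked allocations */
    pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
                                           "pagedel sketch context",
                                           ALLOCSET_DEFAULT_MINSIZE,
                                           ALLOCSET_DEFAULT_INITSIZE,
                                           ALLOCSET_DEFAULT_MAXSIZE);
    oldcontext = MemoryContextSwitchTo(pagedelcontext);

    ndel = _bt_pagedel(rel, buf, vacuum_full);

    MemoryContextSwitchTo(oldcontext);
    MemoryContextDelete(pagedelcontext);

    return ndel;
}
#endif                          /* NOT_USED */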
/*
 * _bt_getroot() -- Get the root page of the btree.
 *
 * Since the root page can move around the btree file, we have to read
 * its location from the metadata page, and then read the root page
 * itself.  If no root page exists yet, we have to create one.  The
 * standard class of race conditions exists here; I think I covered
 * them all in the Hopi Indian rain dance of lock requests below.
 *
 * The access type parameter (BT_READ or BT_WRITE) controls whether
 * a new root page will be created or not.  If access = BT_READ,
 * and no root page exists, we just return InvalidBuffer.  For
 * BT_WRITE, we try to create the root page if it doesn't exist.
 * NOTE that the returned root page will have only a read lock set
 * on it even if access = BT_WRITE!
 *
 * The returned page is not necessarily the true root --- it could be
 * a "fast root" (a page that is alone in its level due to deletions).
 * Also, if the root page is split while we are "in flight" to it,
 * what we will return is the old root, which is now just the leftmost
 * page on a probably-not-very-wide level.  For most purposes this is
 * as good as or better than the true root, so we do not bother to
 * insist on finding the true root.  We do, however, guarantee to
 * return a live (not deleted or half-dead) page.
 *
 * On successful return, the root page is pinned and read-locked.
 * The metadata page is not locked or pinned on exit.
 */
Buffer
_bt_getroot(Relation rel, int access)
{
    Buffer      metabuf;
    Page        metapg;
    BTPageOpaque metaopaque;
    Buffer      rootbuf;
    Page        rootpage;
    BTPageOpaque rootopaque;
    BlockNumber rootblkno;
    uint32      rootlevel;
    BTMetaPageData *metad;

    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metapg = BufferGetPage(metabuf);
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
    metad = BTPageGetMeta(metapg);

    /* sanity-check the metapage */
    if (!(metaopaque->btpo_flags & BTP_META) ||
        metad->btm_magic != BTREE_MAGIC)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" is not a btree",
                        RelationGetRelationName(rel))));

    if (metad->btm_version != BTREE_VERSION)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("version mismatch in index \"%s\": file version %d, code version %d",
                        RelationGetRelationName(rel),
                        metad->btm_version, BTREE_VERSION)));

    /* if no root page initialized yet, do it */
    if (metad->btm_root == P_NONE)
    {
        /* If access = BT_READ, caller doesn't want us to create root yet */
        if (access == BT_READ)
        {
            _bt_relbuf(rel, metabuf);
            return InvalidBuffer;
        }

        /* trade in our read lock for a write lock */
        LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
        LockBuffer(metabuf, BT_WRITE);

        /*
         * Race condition: if someone else initialized the metadata between
         * the time we released the read lock and acquired the write lock,
         * we must avoid doing it again.
         */
        if (metad->btm_root != P_NONE)
        {
            /*
             * Metadata initialized by someone else.  In order to guarantee
             * no deadlocks, we have to release the metadata page and start
             * all over again.  (Is that really true?  But it's hardly worth
             * trying to optimize this case.)
             */
            _bt_relbuf(rel, metabuf);
            return _bt_getroot(rel, access);
        }

        /*
         * Get, initialize, write, and leave a lock of the appropriate type
         * on the new root page.  Since this is the first page in the tree,
         * it's a leaf as well as the root.
         */
        rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
        rootblkno = BufferGetBlockNumber(rootbuf);
        rootpage = BufferGetPage(rootbuf);

        _bt_pageinit(rootpage, BufferGetPageSize(rootbuf));
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
        rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
        rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
        rootopaque->btpo.level = 0;

        /* NO ELOG(ERROR) till meta is updated */
        START_CRIT_SECTION();

        metad->btm_root = rootblkno;
        metad->btm_level = 0;
        metad->btm_fastroot = rootblkno;
        metad->btm_fastlevel = 0;

        /* XLOG stuff */
        if (!rel->rd_istemp)
        {
            xl_btree_newroot xlrec;
            XLogRecPtr  recptr;
            XLogRecData rdata;

            xlrec.node = rel->rd_node;
            xlrec.rootblk = rootblkno;
            xlrec.level = 0;

            rdata.data = (char *) &xlrec;
            rdata.len = SizeOfBtreeNewroot;
            rdata.buffer = InvalidBuffer;
            rdata.next = NULL;

            recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, &rdata);

            PageSetLSN(rootpage, recptr);
            PageSetTLI(rootpage, ThisTimeLineID);
            PageSetLSN(metapg, recptr);
            PageSetTLI(metapg, ThisTimeLineID);
        }

        END_CRIT_SECTION();

        _bt_wrtnorelbuf(rel, rootbuf);

        /*
         * swap root write lock for read lock.  There is no danger of
         * anyone else accessing the new root page while it's unlocked,
         * since no one else knows where it is yet.
         */
        LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
        LockBuffer(rootbuf, BT_READ);

        /* okay, metadata is correct, write and release it */
        _bt_wrtbuf(rel, metabuf);
    }
    else
    {
        rootblkno = metad->btm_fastroot;
        Assert(rootblkno != P_NONE);
        rootlevel = metad->btm_fastlevel;

        /*
         * We are done with the metapage; arrange to release it via first
         * _bt_relandgetbuf call
         */
        rootbuf = metabuf;

        for (;;)
        {
            rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
            rootpage = BufferGetPage(rootbuf);
            rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

            if (!P_IGNORE(rootopaque))
                break;

            /* it's dead, Jim.  step right one page */
            if (P_RIGHTMOST(rootopaque))
                elog(ERROR, "no live root page found in \"%s\"",
                     RelationGetRelationName(rel));
            rootblkno = rootopaque->btpo_next;
        }

        /* Note: can't check btpo.level on deleted pages */
        if (rootopaque->btpo.level != rootlevel)
            elog(ERROR, "root page %u of \"%s\" has level %u, expected %u",
                 rootblkno, RelationGetRelationName(rel),
                 rootopaque->btpo.level, rootlevel);
    }

    /*
     * By here, we have a pin and read lock on the root page, and no lock
     * set on the metadata page.  Return the root page's buffer.
     */
    return rootbuf;
}
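/*
 * Note (not part of the original source): the metapage handling above is a
 * check/lock/re-check pattern.  The read lock is traded for a write lock,
 * and btm_root must then be re-examined, because another backend may have
 * created the root page in the window between releasing one lock and
 * acquiring the other.
 */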
/*
 * Drop a table space
 *
 * Be careful to check that the tablespace is empty.
 */
void
DropTableSpace(DropTableSpaceStmt *stmt)
{
#ifdef HAVE_SYMLINK
    char       *tablespacename = stmt->tablespacename;
    HeapScanDesc scandesc;
    Relation    rel;
    HeapTuple   tuple;
    ScanKeyData entry[1];
    Oid         tablespaceoid;

    /* don't call this in a transaction block */
    PreventTransactionChain((void *) stmt, "DROP TABLESPACE");

    /*
     * Acquire ExclusiveLock on pg_tablespace to ensure that no one else is
     * trying to do DROP TABLESPACE or TablespaceCreateDbspace concurrently.
     */
    rel = heap_open(TableSpaceRelationId, ExclusiveLock);

    /*
     * Find the target tuple
     */
    ScanKeyInit(&entry[0],
                Anum_pg_tablespace_spcname,
                BTEqualStrategyNumber, F_NAMEEQ,
                CStringGetDatum(tablespacename));
    scandesc = heap_beginscan(rel, SnapshotNow, 1, entry);
    tuple = heap_getnext(scandesc, ForwardScanDirection);

    if (!HeapTupleIsValid(tuple))
        ereport(ERROR,
                (errcode(ERRCODE_UNDEFINED_OBJECT),
                 errmsg("tablespace \"%s\" does not exist",
                        tablespacename)));

    tablespaceoid = HeapTupleGetOid(tuple);

    /* Must be tablespace owner */
    if (!pg_tablespace_ownercheck(tablespaceoid, GetUserId()))
        aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_TABLESPACE,
                       tablespacename);

    /* Disallow drop of the standard tablespaces, even by superuser */
    if (tablespaceoid == GLOBALTABLESPACE_OID ||
        tablespaceoid == DEFAULTTABLESPACE_OID)
        aclcheck_error(ACLCHECK_NO_PRIV, ACL_KIND_TABLESPACE,
                       tablespacename);

    /*
     * Remove the pg_tablespace tuple (this will roll back if we fail below)
     */
    simple_heap_delete(rel, &tuple->t_self);

    heap_endscan(scandesc);

    /*
     * Remove dependency on owner.
     */
    deleteSharedDependencyRecordsFor(TableSpaceRelationId, tablespaceoid);

    /*
     * Try to remove the physical infrastructure
     */
    if (!remove_tablespace_directories(tablespaceoid, false))
        ereport(ERROR,
                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                 errmsg("tablespace \"%s\" is not empty",
                        tablespacename)));

    /* Record the filesystem change in XLOG */
    {
        xl_tblspc_drop_rec xlrec;
        XLogRecData rdata[1];

        xlrec.ts_id = tablespaceoid;
        rdata[0].data = (char *) &xlrec;
        rdata[0].len = sizeof(xl_tblspc_drop_rec);
        rdata[0].buffer = InvalidBuffer;
        rdata[0].next = NULL;

        (void) XLogInsert(RM_TBLSPC_ID, XLOG_TBLSPC_DROP, rdata);
    }

    /* We keep the lock on pg_tablespace until commit */
    heap_close(rel, NoLock);
#else                           /* !HAVE_SYMLINK */
    ereport(ERROR,
            (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
             errmsg("tablespaces are not supported on this platform")));
#endif                          /* HAVE_SYMLINK */
}
/*
 * Insert value (stored in GinBtree) into the tree described by stack
 */
void
ginInsertValue(GinBtree btree, GinBtreeStack *stack)
{
    GinBtreeStack *parent = stack;
    BlockNumber rootBlkno = InvalidBuffer;
    Page        page,
                rpage,
                lpage;

    /* remember root BlockNumber */
    while (parent)
    {
        rootBlkno = parent->blkno;
        parent = parent->parent;
    }

    while (stack)
    {
        XLogRecData *rdata;
        BlockNumber savedRightLink;

        page = BufferGetPage(stack->buffer);
        savedRightLink = GinPageGetOpaque(page)->rightlink;

        if (btree->isEnoughSpace(btree, stack->buffer, stack->off))
        {
            START_CRIT_SECTION();
            btree->placeToPage(btree, stack->buffer, stack->off, &rdata);

            MarkBufferDirty(stack->buffer);

            if (!btree->index->rd_istemp)
            {
                XLogRecPtr  recptr;

                recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT, rdata);
                PageSetLSN(page, recptr);
                PageSetTLI(page, ThisTimeLineID);
            }

            UnlockReleaseBuffer(stack->buffer);
            END_CRIT_SECTION();

            freeGinBtreeStack(stack->parent);

            return;
        }
        else
        {
            Buffer      rbuffer = GinNewBuffer(btree->index);
            Page        newlpage;

            /*
             * newlpage is a pointer to an in-memory page; it is not
             * associated with a buffer, and stack->buffer should be left
             * untouched
             */
            newlpage = btree->splitPage(btree, stack->buffer, rbuffer,
                                        stack->off, &rdata);

            ((ginxlogSplit *) (rdata->data))->rootBlkno = rootBlkno;

            parent = stack->parent;

            if (parent == NULL)
            {
                /*
                 * splitting the root, so we need to allocate a new left
                 * page and place pointers to the left and right pages on
                 * the root page
                 */
                Buffer      lbuffer = GinNewBuffer(btree->index);

                ((ginxlogSplit *) (rdata->data))->isRootSplit = TRUE;
                ((ginxlogSplit *) (rdata->data))->rrlink = InvalidBlockNumber;

                page = BufferGetPage(stack->buffer);
                lpage = BufferGetPage(lbuffer);
                rpage = BufferGetPage(rbuffer);

                GinPageGetOpaque(rpage)->rightlink = InvalidBlockNumber;
                GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer);
                ((ginxlogSplit *) (rdata->data))->lblkno = BufferGetBlockNumber(lbuffer);

                START_CRIT_SECTION();

                GinInitBuffer(stack->buffer,
                              GinPageGetOpaque(newlpage)->flags & ~GIN_LEAF);
                PageRestoreTempPage(newlpage, lpage);
                btree->fillRoot(btree, stack->buffer, lbuffer, rbuffer);

                MarkBufferDirty(rbuffer);
                MarkBufferDirty(lbuffer);
                MarkBufferDirty(stack->buffer);

                if (!btree->index->rd_istemp)
                {
                    XLogRecPtr  recptr;

                    recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT, rdata);
                    PageSetLSN(page, recptr);
                    PageSetTLI(page, ThisTimeLineID);
                    PageSetLSN(lpage, recptr);
                    PageSetTLI(lpage, ThisTimeLineID);
                    PageSetLSN(rpage, recptr);
                    PageSetTLI(rpage, ThisTimeLineID);
                }

                UnlockReleaseBuffer(rbuffer);
                UnlockReleaseBuffer(lbuffer);
                UnlockReleaseBuffer(stack->buffer);

                END_CRIT_SECTION();

                return;
            }
            else
            {
                /* split non-root page */
                ((ginxlogSplit *) (rdata->data))->isRootSplit = FALSE;
                ((ginxlogSplit *) (rdata->data))->rrlink = savedRightLink;

                lpage = BufferGetPage(stack->buffer);
                rpage = BufferGetPage(rbuffer);

                GinPageGetOpaque(rpage)->rightlink = savedRightLink;
                GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer);

                START_CRIT_SECTION();

                PageRestoreTempPage(newlpage, lpage);

                MarkBufferDirty(rbuffer);
                MarkBufferDirty(stack->buffer);

                if (!btree->index->rd_istemp)
                {
                    XLogRecPtr  recptr;

                    recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT, rdata);
                    PageSetLSN(lpage, recptr);
                    PageSetTLI(lpage, ThisTimeLineID);
                    PageSetLSN(rpage, recptr);
                    PageSetTLI(rpage, ThisTimeLineID);
                }

                UnlockReleaseBuffer(rbuffer);

                END_CRIT_SECTION();
            }
        }

        btree->isDelete = FALSE;

        /* search parent to lock */
        LockBuffer(parent->buffer, GIN_EXCLUSIVE);

        /* move right if needed */
        page = BufferGetPage(parent->buffer);
        while ((parent->off = btree->findChildPtr(btree, page, stack->blkno,
                                                  parent->off)) == InvalidOffsetNumber)
        {
            BlockNumber rightlink = GinPageGetOpaque(page)->rightlink;

            LockBuffer(parent->buffer, GIN_UNLOCK);

            if (rightlink == InvalidBlockNumber)
            {
                /*
                 * rightmost page, but we didn't find the parent; fall back
                 * to a plain search...
                 */
                findParents(btree, stack, rootBlkno);
                parent = stack->parent;
                page = BufferGetPage(parent->buffer);
                break;
            }

            parent->blkno = rightlink;
            parent->buffer = ReleaseAndReadBuffer(parent->buffer,
                                                  btree->index,
                                                  parent->blkno);
            LockBuffer(parent->buffer, GIN_EXCLUSIVE);
            page = BufferGetPage(parent->buffer);
        }

        UnlockReleaseBuffer(stack->buffer);
        pfree(stack);
        stack = parent;
    }
}
/* * Insert an index tuple into the index relation. The revmap is updated to * mark the range containing the given page as pointing to the inserted entry. * A WAL record is written. * * The buffer, if valid, is first checked for free space to insert the new * entry; if there isn't enough, a new buffer is obtained and pinned. No * buffer lock must be held on entry, no buffer lock is held on exit. * * Return value is the offset number where the tuple was inserted. */ OffsetNumber brin_doinsert(Relation idxrel, BlockNumber pagesPerRange, BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk, BrinTuple *tup, Size itemsz) { Page page; BlockNumber blk; OffsetNumber off; Buffer revmapbuf; ItemPointerData tid; bool extended; Assert(itemsz == MAXALIGN(itemsz)); /* If the item is oversized, don't even bother. */ if (itemsz > BrinMaxItemSize) { ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("index row size %lu exceeds maximum %lu for index \"%s\"", (unsigned long) itemsz, (unsigned long) BrinMaxItemSize, RelationGetRelationName(idxrel)))); return InvalidOffsetNumber; /* keep compiler quiet */ } /* Make sure the revmap is long enough to contain the entry we need */ brinRevmapExtend(revmap, heapBlk); /* * Acquire lock on buffer supplied by caller, if any. If it doesn't have * enough space, unpin it to obtain a new one below. */ if (BufferIsValid(*buffer)) { /* * It's possible that another backend (or ourselves!) extended the * revmap over the page we held a pin on, so we cannot assume that * it's still a regular page. */ LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); if (br_page_get_freespace(BufferGetPage(*buffer)) < itemsz) { UnlockReleaseBuffer(*buffer); *buffer = InvalidBuffer; } } /* * If we still don't have a usable buffer, have brin_getinsertbuffer * obtain one for us. */ if (!BufferIsValid(*buffer)) { do *buffer = brin_getinsertbuffer(idxrel, InvalidBuffer, itemsz, &extended); while (!BufferIsValid(*buffer)); } else extended = false; /* Now obtain lock on revmap buffer */ revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk); page = BufferGetPage(*buffer); blk = BufferGetBlockNumber(*buffer); /* Execute the actual insertion */ START_CRIT_SECTION(); if (extended) brin_page_init(BufferGetPage(*buffer), BRIN_PAGETYPE_REGULAR); off = PageAddItem(page, (Item) tup, itemsz, InvalidOffsetNumber, false, false); if (off == InvalidOffsetNumber) elog(ERROR, "could not insert new index tuple to page"); MarkBufferDirty(*buffer); BRIN_elog((DEBUG2, "inserted tuple (%u,%u) for range starting at %u", blk, off, heapBlk)); ItemPointerSet(&tid, blk, off); brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, tid); MarkBufferDirty(revmapbuf); /* XLOG stuff */ if (RelationNeedsWAL(idxrel)) { xl_brin_insert xlrec; XLogRecPtr recptr; uint8 info; info = XLOG_BRIN_INSERT | (extended ? XLOG_BRIN_INIT_PAGE : 0); xlrec.heapBlk = heapBlk; xlrec.pagesPerRange = pagesPerRange; xlrec.offnum = off; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfBrinInsert); XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0)); XLogRegisterBufData(0, (char *) tup, itemsz); XLogRegisterBuffer(1, revmapbuf, 0); recptr = XLogInsert(RM_BRIN_ID, info); PageSetLSN(page, recptr); PageSetLSN(BufferGetPage(revmapbuf), recptr); } END_CRIT_SECTION(); /* Tuple is firmly on buffer; we can release our locks */ LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK); if (extended) FreeSpaceMapVacuum(idxrel); return off; }
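/*
 * Editor's sketch (not from the original source): the newer WAL insertion
 * API used by brin_doinsert() above.  Fixed record data and modified buffers
 * are registered between XLogBeginInsert() and XLogInsert(); per-buffer data
 * travels with its buffer and is dropped when a full-page image is taken.
 * The page modification happens inside a critical section, and the returned
 * LSN is stamped on every registered page.  xl_demo_insert, RM_DEMO_ID and
 * XLOG_DEMO_INSERT are hypothetical placeholders.
 */
typedef struct xl_demo_insert
{
	OffsetNumber offnum;		/* where the tuple was placed */
} xl_demo_insert;

static void
demo_log_page_insert(Relation rel, Buffer buf, OffsetNumber off,
					 char *tupdata, Size tuplen)
{
	xl_demo_insert xlrec;

	xlrec.offnum = off;

	START_CRIT_SECTION();

	/* ... the actual PageAddItem() call would go here ... */
	MarkBufferDirty(buf);

	if (RelationNeedsWAL(rel))
	{
		XLogRecPtr	recptr;

		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec, sizeof(xl_demo_insert));
		XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
		XLogRegisterBufData(0, tupdata, tuplen);
		recptr = XLogInsert(RM_DEMO_ID, XLOG_DEMO_INSERT);
		PageSetLSN(BufferGetPage(buf), recptr);
	}

	END_CRIT_SECTION();
}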
/* * Write a backup block if needed when we are setting a hint. Note that * this may be called for a variety of page types, not just heaps. * * Callable while holding just share lock on the buffer content. * * We can't use the plain backup block mechanism since that relies on the * Buffer being exclusively locked. Since some modifications (setting LSN, hint * bits) are allowed in a share-locked buffer, that can lead to WAL checksum * failures. So instead we copy the page and insert the copied data as normal * record data. * * We only need to do something if the page has not yet been full-page written * in this checkpoint round. The LSN of the inserted WAL record is returned if * we had to write, InvalidXLogRecPtr otherwise. * * It is possible that multiple concurrent backends could attempt to write WAL * records. In that case, multiple copies of the same block would be recorded * in separate WAL records by different backends, though that is still OK from * a correctness perspective. */ XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std) { XLogRecPtr recptr = InvalidXLogRecPtr; XLogRecPtr lsn; XLogRecPtr RedoRecPtr; /* * Ensure no checkpoint can change our view of RedoRecPtr. */ Assert(MyPgXact->delayChkpt); /* * Update RedoRecPtr so that we can make the right decision */ RedoRecPtr = GetRedoRecPtr(); /* * We assume page LSN is first data on *every* page that can be passed to * XLogInsert, whether it has the standard page layout or not. Since we're * only holding a share-lock on the page, we must take the buffer header * lock when we look at the LSN. */ lsn = BufferGetLSNAtomic(buffer); if (lsn <= RedoRecPtr) { int flags; PGAlignedBlock copied_buffer; char *origdata = (char *) BufferGetBlock(buffer); RelFileNode rnode; ForkNumber forkno; BlockNumber blkno; /* * Copy the buffer so we don't have to worry about concurrent hint bit * or lsn updates. We assume pd_lower/upper cannot be changed without an * exclusive lock, so the contents of the backup image are not racy. */ if (buffer_std) { /* Assume we can omit data between pd_lower and pd_upper */ Page page = BufferGetPage(buffer); uint16 lower = ((PageHeader) page)->pd_lower; uint16 upper = ((PageHeader) page)->pd_upper; memcpy(copied_buffer.data, origdata, lower); memcpy(copied_buffer.data + upper, origdata + upper, BLCKSZ - upper); } else memcpy(copied_buffer.data, origdata, BLCKSZ); XLogBeginInsert(); flags = REGBUF_FORCE_IMAGE; if (buffer_std) flags |= REGBUF_STANDARD; BufferGetTag(buffer, &rnode, &forkno, &blkno); XLogRegisterBlock(0, &rnode, forkno, blkno, copied_buffer.data, flags); recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI_FOR_HINT); } return recptr; }
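/*
 * Editor's sketch (not from the original source): the "hole-skipping" copy
 * that XLogSaveBufferForHint() performs for standard pages.  Only the page
 * header plus line pointer array (up to pd_lower) and the tuple area (from
 * pd_upper) carry meaningful data; the gap between them can be left as
 * whatever the destination already contains, since replay never reads it.
 * demo_copy_standard_page is a hypothetical helper name.
 */
static void
demo_copy_standard_page(char *dst, const char *src)
{
	uint16		lower = ((const PageHeaderData *) src)->pd_lower;
	uint16		upper = ((const PageHeaderData *) src)->pd_upper;

	/* header plus line pointers */
	memcpy(dst, src, lower);
	/* tuple data at the end of the page; the hole in between is skipped */
	memcpy(dst + upper, src + upper, BLCKSZ - upper);
}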
/* * Update tuple origtup (size origsz), located in offset oldoff of buffer * oldbuf, to newtup (size newsz) as summary tuple for the page range starting * at heapBlk. oldbuf must not be locked on entry, and is not locked at exit. * * If samepage is true, attempt to put the new tuple in the same page, but if * there's no room, use some other one. * * If the update is successful, return true; the revmap is updated to point to * the new tuple. If the update is not done for whatever reason, return false. * Caller may retry the update if this happens. */ bool brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, BrinRevmap *revmap, BlockNumber heapBlk, Buffer oldbuf, OffsetNumber oldoff, const BrinTuple *origtup, Size origsz, const BrinTuple *newtup, Size newsz, bool samepage) { Page oldpage; ItemId oldlp; BrinTuple *oldtup; Size oldsz; Buffer newbuf; bool extended; Assert(newsz == MAXALIGN(newsz)); /* If the item is oversized, don't bother. */ if (newsz > BrinMaxItemSize) { ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("index row size %lu exceeds maximum %lu for index \"%s\"", (unsigned long) newsz, (unsigned long) BrinMaxItemSize, RelationGetRelationName(idxrel)))); return false; /* keep compiler quiet */ } /* make sure the revmap is long enough to contain the entry we need */ brinRevmapExtend(revmap, heapBlk); if (!samepage) { /* need a page on which to put the item */ newbuf = brin_getinsertbuffer(idxrel, oldbuf, newsz, &extended); if (!BufferIsValid(newbuf)) { Assert(!extended); return false; } /* * Note: it's possible (though unlikely) that the returned newbuf is * the same as oldbuf, if brin_getinsertbuffer determined that the old * buffer does in fact have enough space. */ if (newbuf == oldbuf) { Assert(!extended); newbuf = InvalidBuffer; } } else { LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE); newbuf = InvalidBuffer; extended = false; } oldpage = BufferGetPage(oldbuf); oldlp = PageGetItemId(oldpage, oldoff); /* * Check that the old tuple wasn't updated concurrently: it might have * moved someplace else entirely ... */ if (!ItemIdIsNormal(oldlp)) { LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); /* * If this happens, and the new buffer was obtained by extending the * relation, then we need to ensure we don't leave it uninitialized or * forget about it. */ if (BufferIsValid(newbuf)) { if (extended) brin_initialize_empty_new_buffer(idxrel, newbuf); UnlockReleaseBuffer(newbuf); if (extended) FreeSpaceMapVacuum(idxrel); } return false; } oldsz = ItemIdGetLength(oldlp); oldtup = (BrinTuple *) PageGetItem(oldpage, oldlp); /* * ... or it might have been updated in place to different contents. */ if (!brin_tuples_equal(oldtup, oldsz, origtup, origsz)) { LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); if (BufferIsValid(newbuf)) { if (extended) brin_initialize_empty_new_buffer(idxrel, newbuf); UnlockReleaseBuffer(newbuf); if (extended) FreeSpaceMapVacuum(idxrel); } return false; } /* * Great, the old tuple is intact. We can proceed with the update. * * If there's enough room in the old page for the new tuple, replace it. * * Note that there might now be enough space on the page even though the * caller told us there isn't, if a concurrent update moved another tuple * elsewhere or replaced a tuple with a smaller one. 
*/ if (((BrinPageFlags(oldpage) & BRIN_EVACUATE_PAGE) == 0) && brin_can_do_samepage_update(oldbuf, origsz, newsz)) { if (BufferIsValid(newbuf)) { /* as above */ if (extended) brin_initialize_empty_new_buffer(idxrel, newbuf); UnlockReleaseBuffer(newbuf); } START_CRIT_SECTION(); if (!PageIndexTupleOverwrite(oldpage, oldoff, (Item) newtup, newsz)) elog(ERROR, "failed to replace BRIN tuple"); MarkBufferDirty(oldbuf); /* XLOG stuff */ if (RelationNeedsWAL(idxrel)) { xl_brin_samepage_update xlrec; XLogRecPtr recptr; uint8 info = XLOG_BRIN_SAMEPAGE_UPDATE; xlrec.offnum = oldoff; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfBrinSamepageUpdate); XLogRegisterBuffer(0, oldbuf, REGBUF_STANDARD); XLogRegisterBufData(0, (char *) newtup, newsz); recptr = XLogInsert(RM_BRIN_ID, info); PageSetLSN(oldpage, recptr); } END_CRIT_SECTION(); LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); if (extended) FreeSpaceMapVacuum(idxrel); return true; } else if (newbuf == InvalidBuffer) { /* * Not enough space, but caller said that there was. Tell them to * start over. */ LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); return false; } else { /* * Not enough free space on the oldpage. Put the new tuple on the new * page, and update the revmap. */ Page newpage = BufferGetPage(newbuf); Buffer revmapbuf; ItemPointerData newtid; OffsetNumber newoff; BlockNumber newblk = InvalidBlockNumber; Size freespace = 0; revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk); START_CRIT_SECTION(); /* * We need to initialize the page if it's newly obtained. Note we * will WAL-log the initialization as part of the update, so we don't * need to do that here. */ if (extended) brin_page_init(BufferGetPage(newbuf), BRIN_PAGETYPE_REGULAR); PageIndexTupleDeleteNoCompact(oldpage, oldoff); newoff = PageAddItem(newpage, (Item) newtup, newsz, InvalidOffsetNumber, false, false); if (newoff == InvalidOffsetNumber) elog(ERROR, "failed to add BRIN tuple to new page"); MarkBufferDirty(oldbuf); MarkBufferDirty(newbuf); /* needed to update FSM below */ if (extended) { newblk = BufferGetBlockNumber(newbuf); freespace = br_page_get_freespace(newpage); } ItemPointerSet(&newtid, BufferGetBlockNumber(newbuf), newoff); brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, newtid); MarkBufferDirty(revmapbuf); /* XLOG stuff */ if (RelationNeedsWAL(idxrel)) { xl_brin_update xlrec; XLogRecPtr recptr; uint8 info; info = XLOG_BRIN_UPDATE | (extended ? XLOG_BRIN_INIT_PAGE : 0); xlrec.insert.offnum = newoff; xlrec.insert.heapBlk = heapBlk; xlrec.insert.pagesPerRange = pagesPerRange; xlrec.oldOffnum = oldoff; XLogBeginInsert(); /* new page */ XLogRegisterData((char *) &xlrec, SizeOfBrinUpdate); XLogRegisterBuffer(0, newbuf, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0)); XLogRegisterBufData(0, (char *) newtup, newsz); /* revmap page */ XLogRegisterBuffer(1, revmapbuf, 0); /* old page */ XLogRegisterBuffer(2, oldbuf, REGBUF_STANDARD); recptr = XLogInsert(RM_BRIN_ID, info); PageSetLSN(oldpage, recptr); PageSetLSN(newpage, recptr); PageSetLSN(BufferGetPage(revmapbuf), recptr); } END_CRIT_SECTION(); LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK); LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); UnlockReleaseBuffer(newbuf); if (extended) { Assert(BlockNumberIsValid(newblk)); RecordPageWithFreeSpace(idxrel, newblk, freespace); FreeSpaceMapVacuum(idxrel); } return true; } }
static void ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkno, BlockNumber parentBlkno, OffsetNumber myoff, bool isParentRoot) { Buffer dBuffer; Buffer lBuffer; Buffer pBuffer; Page page, parentPage; dBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, deleteBlkno, RBM_NORMAL, gvs->strategy); if (leftBlkno != InvalidBlockNumber) lBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, leftBlkno, RBM_NORMAL, gvs->strategy); else lBuffer = InvalidBuffer; pBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, parentBlkno, RBM_NORMAL, gvs->strategy); LockBuffer(dBuffer, GIN_EXCLUSIVE); if (!isParentRoot) /* if the parent is the root, it is already locked * by LockBufferForCleanup() */ LockBuffer(pBuffer, GIN_EXCLUSIVE); if (leftBlkno != InvalidBlockNumber) LockBuffer(lBuffer, GIN_EXCLUSIVE); START_CRIT_SECTION(); if (leftBlkno != InvalidBlockNumber) { BlockNumber rightlink; page = BufferGetPage(dBuffer); rightlink = GinPageGetOpaque(page)->rightlink; page = BufferGetPage(lBuffer); GinPageGetOpaque(page)->rightlink = rightlink; } parentPage = BufferGetPage(pBuffer); #ifdef USE_ASSERT_CHECKING do { PostingItem *tod = (PostingItem *) GinDataPageGetItem(parentPage, myoff); Assert(PostingItemGetBlockNumber(tod) == deleteBlkno); } while (0); #endif PageDeletePostingItem(parentPage, myoff); page = BufferGetPage(dBuffer); /* * we must not change the deleted page's rightlink, so that any * concurrent search scan already on this page can still follow it */ GinPageGetOpaque(page)->flags = GIN_DELETED; MarkBufferDirty(pBuffer); if (leftBlkno != InvalidBlockNumber) MarkBufferDirty(lBuffer); MarkBufferDirty(dBuffer); if (!gvs->index->rd_istemp) { XLogRecPtr recptr; XLogRecData rdata[4]; ginxlogDeletePage data; int n; data.node = gvs->index->rd_node; data.blkno = deleteBlkno; data.parentBlkno = parentBlkno; data.parentOffset = myoff; data.leftBlkno = leftBlkno; data.rightLink = GinPageGetOpaque(page)->rightlink; rdata[0].buffer = dBuffer; rdata[0].buffer_std = FALSE; rdata[0].data = NULL; rdata[0].len = 0; rdata[0].next = rdata + 1; rdata[1].buffer = pBuffer; rdata[1].buffer_std = FALSE; rdata[1].data = NULL; rdata[1].len = 0; rdata[1].next = rdata + 2; if (leftBlkno != InvalidBlockNumber) { rdata[2].buffer = lBuffer; rdata[2].buffer_std = FALSE; rdata[2].data = NULL; rdata[2].len = 0; rdata[2].next = rdata + 3; n = 3; } else n = 2; rdata[n].buffer = InvalidBuffer; rdata[n].buffer_std = FALSE; rdata[n].len = sizeof(ginxlogDeletePage); rdata[n].data = (char *) &data; rdata[n].next = NULL; recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_DELETE_PAGE, rdata); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); PageSetLSN(parentPage, recptr); PageSetTLI(parentPage, ThisTimeLineID); if (leftBlkno != InvalidBlockNumber) { page = BufferGetPage(lBuffer); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } } if (!isParentRoot) LockBuffer(pBuffer, GIN_UNLOCK); ReleaseBuffer(pBuffer); if (leftBlkno != InvalidBlockNumber) UnlockReleaseBuffer(lBuffer); UnlockReleaseBuffer(dBuffer); END_CRIT_SECTION(); gvs->result->pages_deleted++; }
static bool gistplacetopage(GISTInsertState *state, GISTSTATE *giststate) { bool is_splitted = false; bool is_leaf = (GistPageIsLeaf(state->stack->page)) ? true : false; MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD; /* * if (!is_leaf) remove old key: This node's key has been modified, either * because a child split occurred or because we needed to adjust our key * for an insert in a child node. Therefore, remove the old version of * this node's key. * * for WAL replay, in the non-split case we handle this by setting up a * one-element todelete array; in the split case, it's handled implicitly * because the tuple vector passed to gistSplit won't include this tuple. * * XXX: If we want to change fillfactors between node and leaf, fillfactor * = (is_leaf ? state->leaf_fillfactor : state->node_fillfactor) */ if (gistnospace(state->stack->page, state->itup, state->ituplen, is_leaf ? InvalidOffsetNumber : state->stack->childoffnum, state->freespace)) { /* no space for insertion */ IndexTuple *itvec; int tlen; SplitedPageLayout *dist = NULL, *ptr; BlockNumber rrlink = InvalidBlockNumber; GistNSN oldnsn; is_splitted = true; /* * Form the index tuple vector to split: remove the old tuple if * needed, and add the new tuples to the vector */ itvec = gistextractpage(state->stack->page, &tlen); if (!is_leaf) { /* on an inner page we must remove the old tuple */ int pos = state->stack->childoffnum - FirstOffsetNumber; tlen--; if (pos != tlen) memmove(itvec + pos, itvec + pos + 1, sizeof(IndexTuple) * (tlen - pos)); } itvec = gistjoinvector(itvec, &tlen, state->itup, state->ituplen); dist = gistSplit(state->r, state->stack->page, itvec, tlen, giststate); state->itup = (IndexTuple *) palloc(sizeof(IndexTuple) * tlen); state->ituplen = 0; if (state->stack->blkno != GIST_ROOT_BLKNO) { /* * for a non-root split we should not allocate a new buffer, but * we must create a temporary page to work on */ dist->buffer = state->stack->buffer; dist->page = PageGetTempPage(BufferGetPage(dist->buffer), sizeof(GISTPageOpaqueData)); /* clean all flags except F_LEAF */ GistPageGetOpaque(dist->page)->flags = (is_leaf) ? F_LEAF : 0; } /* make new pages and fill them */ for (ptr = dist; ptr; ptr = ptr->next) { int i; char *data; /* get new page */ if (ptr->buffer == InvalidBuffer) { ptr->buffer = gistNewBuffer(state->r); GISTInitBuffer(ptr->buffer, (is_leaf) ? F_LEAF : 0); ptr->page = BufferGetPage(ptr->buffer); } ptr->block.blkno = BufferGetBlockNumber(ptr->buffer); /* * fill the page; we can do this because all these pages are new * (i.e. not linked into the tree or masked by a temp page) */ data = (char *) (ptr->list); for (i = 0; i < ptr->block.num; i++) { if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, LP_USED) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(state->r)); data += IndexTupleSize((IndexTuple) data); } /* set up ItemPointer and remember it for parent */ ItemPointerSetBlockNumber(&(ptr->itup->t_tid), ptr->block.blkno); state->itup[state->ituplen] = ptr->itup; state->ituplen++; } /* save the old rightlink */ if (state->stack->blkno != GIST_ROOT_BLKNO) rrlink = GistPageGetOpaque(dist->page)->rightlink; START_CRIT_SECTION(); /* * must mark buffers dirty before XLogInsert, even though we'll still * be changing their opaque fields below. Also set up the right links. */ for (ptr = dist; ptr; ptr = ptr->next) { MarkBufferDirty(ptr->buffer); GistPageGetOpaque(ptr->page)->rightlink = (ptr->next) ?
ptr->next->block.blkno : rrlink; } /* restore the split non-root page */ if (state->stack->blkno != GIST_ROOT_BLKNO) { PageRestoreTempPage(dist->page, BufferGetPage(dist->buffer)); dist->page = BufferGetPage(dist->buffer); } if (!state->r->rd_istemp) { XLogRecPtr recptr; XLogRecData *rdata; rdata = formSplitRdata(state->r, state->stack->blkno, is_leaf, &(state->key), dist); recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_SPLIT, rdata); for (ptr = dist; ptr; ptr = ptr->next) { PageSetLSN(ptr->page, recptr); PageSetTLI(ptr->page, ThisTimeLineID); } } else { for (ptr = dist; ptr; ptr = ptr->next) { PageSetLSN(ptr->page, XLogRecPtrForTemp); } } /* set up NSN */ oldnsn = GistPageGetOpaque(dist->page)->nsn; if (state->stack->blkno == GIST_ROOT_BLKNO) /* for a root split, use the initial value */ oldnsn = PageGetLSN(dist->page); for (ptr = dist; ptr; ptr = ptr->next) { /* only the last page gets oldnsn */ GistPageGetOpaque(ptr->page)->nsn = (ptr->next) ? PageGetLSN(ptr->page) : oldnsn; } /* * release buffers; after a root split release all of them, because we * created them all ourselves */ ptr = (state->stack->blkno == GIST_ROOT_BLKNO) ? dist : dist->next; for (; ptr; ptr = ptr->next) UnlockReleaseBuffer(ptr->buffer); if (state->stack->blkno == GIST_ROOT_BLKNO) { gistnewroot(state->r, state->stack->buffer, state->itup, state->ituplen, &(state->key)); state->needInsertComplete = false; } END_CRIT_SECTION(); } else { /* enough space */ START_CRIT_SECTION(); if (!is_leaf) PageIndexTupleDelete(state->stack->page, state->stack->childoffnum); gistfillbuffer(state->r, state->stack->page, state->itup, state->ituplen, InvalidOffsetNumber); MarkBufferDirty(state->stack->buffer); if (!state->r->rd_istemp) { OffsetNumber noffs = 0, offs[1]; XLogRecPtr recptr; XLogRecData *rdata; if (!is_leaf) { /* only on an inner page do we delete the previous version */ offs[0] = state->stack->childoffnum; noffs = 1; } rdata = formUpdateRdata(state->r, state->stack->buffer, offs, noffs, state->itup, state->ituplen, &(state->key)); recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE, rdata); PageSetLSN(state->stack->page, recptr); PageSetTLI(state->stack->page, ThisTimeLineID); } else PageSetLSN(state->stack->page, XLogRecPtrForTemp); if (state->stack->blkno == GIST_ROOT_BLKNO) state->needInsertComplete = false; END_CRIT_SECTION(); if (state->ituplen > 1) { /* previous is_splitted==true */ /* * the child was split, so we must form a union key for insertion * into the parent */ IndexTuple newtup = gistunion(state->r, state->itup, state->ituplen, giststate); ItemPointerSetBlockNumber(&(newtup->t_tid), state->stack->blkno); state->itup[0] = newtup; state->ituplen = 1; } else if (is_leaf) { /* * itup[0] stores the key used to adjust the parent; mark it valid * so that the GistTupleIsInvalid check in gistgetadjusted() works * correctly */ ItemPointerSetBlockNumber(&(state->itup[0]->t_tid), state->stack->blkno); GistTupleSetValid(state->itup[0]); } } return is_splitted; }
/* * Vacuum a regular (non-root) leaf page * * We must delete tuples that are targeted for deletion by the VACUUM, * but not move any tuples that are referenced by outside links; we assume * those are the ones that are heads of chains. * * If we find a REDIRECT that was made by a concurrently-running transaction, * we must add its target TID to pendingList. (We don't try to visit the * target immediately, first because we don't want VACUUM locking more than * one buffer at a time, and second because the duplicate-filtering logic * in spgAddPendingTID is useful to ensure we can't get caught in an infinite * loop in the face of continuous concurrent insertions.) * * If forPending is true, we are examining the page as a consequence of * chasing a redirect link, not as part of the normal sequential scan. * We still vacuum the page normally, but we don't increment the stats * about live tuples; else we'd double-count those tuples, since the page * has been or will be visited in the sequential scan as well. */ static void vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer, bool forPending) { Page page = BufferGetPage(buffer); spgxlogVacuumLeaf xlrec; OffsetNumber toDead[MaxIndexTuplesPerPage]; OffsetNumber toPlaceholder[MaxIndexTuplesPerPage]; OffsetNumber moveSrc[MaxIndexTuplesPerPage]; OffsetNumber moveDest[MaxIndexTuplesPerPage]; OffsetNumber chainSrc[MaxIndexTuplesPerPage]; OffsetNumber chainDest[MaxIndexTuplesPerPage]; OffsetNumber predecessor[MaxIndexTuplesPerPage + 1]; bool deletable[MaxIndexTuplesPerPage + 1]; int nDeletable; OffsetNumber i, max = PageGetMaxOffsetNumber(page); memset(predecessor, 0, sizeof(predecessor)); memset(deletable, 0, sizeof(deletable)); nDeletable = 0; /* Scan page, identify tuples to delete, accumulate stats */ for (i = FirstOffsetNumber; i <= max; i++) { SpGistLeafTuple lt; lt = (SpGistLeafTuple) PageGetItem(page, PageGetItemId(page, i)); if (lt->tupstate == SPGIST_LIVE) { Assert(ItemPointerIsValid(&lt->heapPtr)); if (bds->callback(&lt->heapPtr, bds->callback_state)) { bds->stats->tuples_removed += 1; deletable[i] = true; nDeletable++; } else { if (!forPending) bds->stats->num_index_tuples += 1; } /* Form predecessor map, too */ if (lt->nextOffset != InvalidOffsetNumber) { /* paranoia about corrupted chain links */ if (lt->nextOffset < FirstOffsetNumber || lt->nextOffset > max || predecessor[lt->nextOffset] != InvalidOffsetNumber) elog(ERROR, "inconsistent tuple chain links in page %u of index \"%s\"", BufferGetBlockNumber(buffer), RelationGetRelationName(index)); predecessor[lt->nextOffset] = i; } } else if (lt->tupstate == SPGIST_REDIRECT) { SpGistDeadTuple dt = (SpGistDeadTuple) lt; Assert(dt->nextOffset == InvalidOffsetNumber); Assert(ItemPointerIsValid(&dt->pointer)); /* * Add target TID to pending list if the redirection could have * happened since VACUUM started. * * Note: we could make a tighter test by seeing if the xid is * "running" according to the active snapshot; but snapmgr.c doesn't * currently export a suitable API, and it's not entirely clear * that a tighter test is worth the cycles anyway. */ if (TransactionIdFollowsOrEquals(dt->xid, bds->myXmin)) spgAddPendingTID(bds, &dt->pointer); } else { Assert(lt->nextOffset == InvalidOffsetNumber); } } if (nDeletable == 0) return; /* nothing more to do */ /*---------- * Figure out exactly what we have to do.
We do this separately from * actually modifying the page, mainly so that we have a representation * that can be dumped into WAL and then the replay code can do exactly * the same thing. The output of this step consists of six arrays * describing four kinds of operations, to be performed in this order: * * toDead[]: tuple numbers to be replaced with DEAD tuples * toPlaceholder[]: tuple numbers to be replaced with PLACEHOLDER tuples * moveSrc[]: tuple numbers that need to be relocated to another offset * (replacing the tuple there) and then replaced with PLACEHOLDER tuples * moveDest[]: new locations for moveSrc tuples * chainSrc[]: tuple numbers whose chain links (nextOffset) need updates * chainDest[]: new values of nextOffset for chainSrc members * * It's easiest to figure out what we have to do by processing tuple * chains, so we iterate over all the tuples (not just the deletable * ones!) to identify chain heads, then chase down each chain and make * work item entries for deletable tuples within the chain. *---------- */ xlrec.nDead = xlrec.nPlaceholder = xlrec.nMove = xlrec.nChain = 0; for (i = FirstOffsetNumber; i <= max; i++) { SpGistLeafTuple head; bool interveningDeletable; OffsetNumber prevLive; OffsetNumber j; head = (SpGistLeafTuple) PageGetItem(page, PageGetItemId(page, i)); if (head->tupstate != SPGIST_LIVE) continue; /* can't be a chain member */ if (predecessor[i] != 0) continue; /* not a chain head */ /* initialize ... */ interveningDeletable = false; prevLive = deletable[i] ? InvalidOffsetNumber : i; /* scan down the chain ... */ j = head->nextOffset; while (j != InvalidOffsetNumber) { SpGistLeafTuple lt; lt = (SpGistLeafTuple) PageGetItem(page, PageGetItemId(page, j)); if (lt->tupstate != SPGIST_LIVE) { /* all tuples in chain should be live */ elog(ERROR, "unexpected SPGiST tuple state: %d", lt->tupstate); } if (deletable[j]) { /* This tuple should be replaced by a placeholder */ toPlaceholder[xlrec.nPlaceholder] = j; xlrec.nPlaceholder++; /* previous live tuple's chain link will need an update */ interveningDeletable = true; } else if (prevLive == InvalidOffsetNumber) { /* * This is the first live tuple in the chain. It has to move * to the head position. */ moveSrc[xlrec.nMove] = j; moveDest[xlrec.nMove] = i; xlrec.nMove++; /* Chain updates will be applied after the move */ prevLive = i; interveningDeletable = false; } else { /* * Second or later live tuple. Arrange to re-chain it to the * previous live one, if there was a gap. */ if (interveningDeletable) { chainSrc[xlrec.nChain] = prevLive; chainDest[xlrec.nChain] = j; xlrec.nChain++; } prevLive = j; interveningDeletable = false; } j = lt->nextOffset; } if (prevLive == InvalidOffsetNumber) { /* The chain is entirely removable, so we need a DEAD tuple */ toDead[xlrec.nDead] = i; xlrec.nDead++; } else if (interveningDeletable) { /* One or more deletions at end of chain, so close it off */ chainSrc[xlrec.nChain] = prevLive; chainDest[xlrec.nChain] = InvalidOffsetNumber; xlrec.nChain++; } } /* sanity check ... 
*/ if (nDeletable != xlrec.nDead + xlrec.nPlaceholder + xlrec.nMove) elog(ERROR, "inconsistent counts of deletable tuples"); /* Do the updates */ START_CRIT_SECTION(); spgPageIndexMultiDelete(&bds->spgstate, page, toDead, xlrec.nDead, SPGIST_DEAD, SPGIST_DEAD, InvalidBlockNumber, InvalidOffsetNumber); spgPageIndexMultiDelete(&bds->spgstate, page, toPlaceholder, xlrec.nPlaceholder, SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER, InvalidBlockNumber, InvalidOffsetNumber); /* * We implement the move step by swapping the line pointers of the source * and target tuples, then replacing the newly-source tuples with * placeholders. This is perhaps unduly friendly with the page data * representation, but it's fast and doesn't risk page overflow when a * tuple to be relocated is large. */ for (i = 0; i < xlrec.nMove; i++) { ItemId idSrc = PageGetItemId(page, moveSrc[i]); ItemId idDest = PageGetItemId(page, moveDest[i]); ItemIdData tmp; tmp = *idSrc; *idSrc = *idDest; *idDest = tmp; } spgPageIndexMultiDelete(&bds->spgstate, page, moveSrc, xlrec.nMove, SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER, InvalidBlockNumber, InvalidOffsetNumber); for (i = 0; i < xlrec.nChain; i++) { SpGistLeafTuple lt; lt = (SpGistLeafTuple) PageGetItem(page, PageGetItemId(page, chainSrc[i])); Assert(lt->tupstate == SPGIST_LIVE); lt->nextOffset = chainDest[i]; } MarkBufferDirty(buffer); if (RelationNeedsWAL(index)) { XLogRecPtr recptr; XLogBeginInsert(); STORE_STATE(&bds->spgstate, xlrec.stateSrc); XLogRegisterData((char *) &xlrec, SizeOfSpgxlogVacuumLeaf); /* sizeof(xlrec) should be a multiple of sizeof(OffsetNumber) */ XLogRegisterData((char *) toDead, sizeof(OffsetNumber) * xlrec.nDead); XLogRegisterData((char *) toPlaceholder, sizeof(OffsetNumber) * xlrec.nPlaceholder); XLogRegisterData((char *) moveSrc, sizeof(OffsetNumber) * xlrec.nMove); XLogRegisterData((char *) moveDest, sizeof(OffsetNumber) * xlrec.nMove); XLogRegisterData((char *) chainSrc, sizeof(OffsetNumber) * xlrec.nChain); XLogRegisterData((char *) chainDest, sizeof(OffsetNumber) * xlrec.nChain); XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_LEAF); PageSetLSN(page, recptr); } END_CRIT_SECTION(); }
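/*
 * Editor's worked example (not from the original source), tracing the
 * chain-repair logic above.  Suppose a chain occupies offsets 2 -> 5 -> 7
 * -> 9 and offsets 2 and 7 are deletable.  Offset 2 is the chain head, so
 * the first surviving member (5) must move into its place: moveSrc[] = {5},
 * moveDest[] = {2}; after the line-pointer swap, the old head tuple sits at
 * offset 5 and is replaced with a placeholder.  Offset 7 simply becomes a
 * placeholder (toPlaceholder[] = {7}), leaving a gap, so the survivors are
 * re-chained: chainSrc[] = {2}, chainDest[] = {9}.  Had every member of the
 * chain been deletable, the head would instead have been replaced with a
 * DEAD tuple via toDead[].
 */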
/* * Routine to build an index. Basically calls insert over and over. * * XXX: it would be nice to implement some sort of bulk-loading * algorithm, but it is not clear how to do that. */ Datum gistbuild(PG_FUNCTION_ARGS) { MIRROREDLOCK_BUFMGR_DECLARE; Relation heap = (Relation) PG_GETARG_POINTER(0); Relation index = (Relation) PG_GETARG_POINTER(1); IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2); IndexBuildResult *result; double reltuples; GISTBuildState buildstate; Buffer buffer; Page page; /* * We expect to be called exactly once for any index relation. If that's * not the case, big trouble's what we have. */ if (RelationGetNumberOfBlocks(index) != 0) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); /* no locking is needed */ initGISTstate(&buildstate.giststate, index); // -------- MirroredLock ---------- MIRROREDLOCK_BUFMGR_LOCK; /* initialize the root page */ buffer = gistNewBuffer(index); Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO); page = BufferGetPage(buffer); START_CRIT_SECTION(); GISTInitBuffer(buffer, F_LEAF); MarkBufferDirty(buffer); if (!index->rd_istemp) { XLogRecPtr recptr; XLogRecData *rdata; rdata = formCreateRData(index); recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_CREATE_INDEX, rdata); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } else PageSetLSN(page, XLogRecPtrForTemp); UnlockReleaseBuffer(buffer); MIRROREDLOCK_BUFMGR_UNLOCK; // -------- MirroredLock ---------- END_CRIT_SECTION(); /* build the index */ buildstate.numindexattrs = indexInfo->ii_NumIndexAttrs; buildstate.indtuples = 0; /* * create a temporary memory context that is reset once for each tuple * inserted into the index */ buildstate.tmpCtx = createTempGistContext(); /* do the heap scan */ reltuples = IndexBuildScan(heap, index, indexInfo, gistbuildCallback, (void *) &buildstate); /* okay, all heap tuples are indexed */ MemoryContextDelete(buildstate.tmpCtx); freeGISTstate(&buildstate.giststate); /* * Return statistics */ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; PG_RETURN_POINTER(result); }
/* * Clean up redirect and placeholder tuples on the given page * * Redirect tuples can be marked placeholder once they're old enough. * Placeholder tuples can be removed if it won't change the offsets of * non-placeholder ones. * * Unlike the routines above, this works on both leaf and inner pages. */ static void vacuumRedirectAndPlaceholder(Relation index, Buffer buffer) { Page page = BufferGetPage(buffer); SpGistPageOpaque opaque = SpGistPageGetOpaque(page); OffsetNumber i, max = PageGetMaxOffsetNumber(page), firstPlaceholder = InvalidOffsetNumber; bool hasNonPlaceholder = false; bool hasUpdate = false; OffsetNumber itemToPlaceholder[MaxIndexTuplesPerPage]; OffsetNumber itemnos[MaxIndexTuplesPerPage]; spgxlogVacuumRedirect xlrec; xlrec.nToPlaceholder = 0; xlrec.newestRedirectXid = InvalidTransactionId; START_CRIT_SECTION(); /* * Scan backwards to convert old redirection tuples to placeholder tuples, * and identify the location of the last non-placeholder tuple while at it. */ for (i = max; i >= FirstOffsetNumber && (opaque->nRedirection > 0 || !hasNonPlaceholder); i--) { SpGistDeadTuple dt; dt = (SpGistDeadTuple) PageGetItem(page, PageGetItemId(page, i)); if (dt->tupstate == SPGIST_REDIRECT && TransactionIdPrecedes(dt->xid, RecentGlobalXmin)) { dt->tupstate = SPGIST_PLACEHOLDER; Assert(opaque->nRedirection > 0); opaque->nRedirection--; opaque->nPlaceholder++; /* remember newest XID among the removed redirects */ if (!TransactionIdIsValid(xlrec.newestRedirectXid) || TransactionIdPrecedes(xlrec.newestRedirectXid, dt->xid)) xlrec.newestRedirectXid = dt->xid; ItemPointerSetInvalid(&dt->pointer); itemToPlaceholder[xlrec.nToPlaceholder] = i; xlrec.nToPlaceholder++; hasUpdate = true; } if (dt->tupstate == SPGIST_PLACEHOLDER) { if (!hasNonPlaceholder) firstPlaceholder = i; } else { hasNonPlaceholder = true; } } /* * Any placeholder tuples at the end of page can safely be removed. We * can't remove ones before the last non-placeholder, though, because we * can't alter the offset numbers of non-placeholder tuples. */ if (firstPlaceholder != InvalidOffsetNumber) { /* * We don't include this array in the WAL record, because it's easy to * recreate at replay. */ for (i = firstPlaceholder; i <= max; i++) itemnos[i - firstPlaceholder] = i; i = max - firstPlaceholder + 1; Assert(opaque->nPlaceholder >= i); opaque->nPlaceholder -= i; /* The array is certainly sorted, so we can use PageIndexMultiDelete */ PageIndexMultiDelete(page, itemnos, i); hasUpdate = true; } xlrec.firstPlaceholder = firstPlaceholder; if (hasUpdate) MarkBufferDirty(buffer); if (hasUpdate && RelationNeedsWAL(index)) { XLogRecPtr recptr; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfSpgxlogVacuumRedirect); XLogRegisterData((char *) itemToPlaceholder, sizeof(OffsetNumber) * xlrec.nToPlaceholder); XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_REDIRECT); PageSetLSN(page, recptr); } END_CRIT_SECTION(); }
/* * brinbuild() -- build a new BRIN index. */ IndexBuildResult * brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) { IndexBuildResult *result; double reltuples; double idxtuples; BrinRevmap *revmap; BrinBuildState *state; Buffer meta; BlockNumber pagesPerRange; /* * We expect to be called exactly once for any index relation. */ if (RelationGetNumberOfBlocks(index) != 0) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); /* * Critical section not required, because on error the creation of the * whole relation will be rolled back. */ meta = ReadBuffer(index, P_NEW); Assert(BufferGetBlockNumber(meta) == BRIN_METAPAGE_BLKNO); LockBuffer(meta, BUFFER_LOCK_EXCLUSIVE); brin_metapage_init(BufferGetPage(meta), BrinGetPagesPerRange(index), BRIN_CURRENT_VERSION); MarkBufferDirty(meta); if (RelationNeedsWAL(index)) { xl_brin_createidx xlrec; XLogRecPtr recptr; Page page; xlrec.version = BRIN_CURRENT_VERSION; xlrec.pagesPerRange = BrinGetPagesPerRange(index); XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfBrinCreateIdx); XLogRegisterBuffer(0, meta, REGBUF_WILL_INIT); recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_CREATE_INDEX); page = BufferGetPage(meta); PageSetLSN(page, recptr); } UnlockReleaseBuffer(meta); /* * Initialize our state, including the deformed tuple state. */ revmap = brinRevmapInitialize(index, &pagesPerRange, NULL); state = initialize_brin_buildstate(index, revmap, pagesPerRange); /* * Now scan the relation. No syncscan allowed here because we want the * heap blocks in physical order. */ reltuples = IndexBuildHeapScan(heap, index, indexInfo, false, brinbuildCallback, (void *) state); /* process the final batch */ form_and_insert_tuple(state); /* release resources */ idxtuples = state->bs_numtuples; brinRevmapTerminate(state->bs_rmAccess); terminate_brin_buildstate(state); /* * Return statistics */ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); result->heap_tuples = reltuples; result->index_tuples = idxtuples; return result; }
/* * Write the index tuples contained in *collector into the index's * pending list. * * This function guarantees that all these tuples will be inserted * consecutively, preserving their order */ void ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) { Relation index = ginstate->index; Buffer metabuffer; Page metapage; GinMetaPageData *metadata = NULL; XLogRecData rdata[2]; Buffer buffer = InvalidBuffer; Page page = NULL; ginxlogUpdateMeta data; bool separateList = false; bool needCleanup = false; if (collector->ntuples == 0) return; data.node = index->rd_node; data.ntuples = 0; data.newRightlink = data.prevTail = InvalidBlockNumber; rdata[0].buffer = InvalidBuffer; rdata[0].data = (char *) &data; rdata[0].len = sizeof(ginxlogUpdateMeta); rdata[0].next = NULL; metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO); metapage = BufferGetPage(metabuffer); if (collector->sumsize + collector->ntuples * sizeof(ItemIdData) > GinListPageSize) { /* * Total size is greater than one page => make sublist */ separateList = true; } else { LockBuffer(metabuffer, GIN_EXCLUSIVE); metadata = GinPageGetMeta(metapage); if (metadata->head == InvalidBlockNumber || collector->sumsize + collector->ntuples * sizeof(ItemIdData) > metadata->tailFreeSize) { /* * Pending list is empty or total size is greater than the free * space on the tail page => make sublist * * We unlock the metabuffer to preserve concurrency */ separateList = true; LockBuffer(metabuffer, GIN_UNLOCK); } } if (separateList) { /* * Build the sublist separately, then append it to the tail */ GinMetaPageData sublist; memset(&sublist, 0, sizeof(GinMetaPageData)); makeSublist(index, collector->tuples, collector->ntuples, &sublist); /* * metapage was unlocked, see above */ LockBuffer(metabuffer, GIN_EXCLUSIVE); metadata = GinPageGetMeta(metapage); if (metadata->head == InvalidBlockNumber) { /* * Main list is empty, so just install the sublist as the main list */ START_CRIT_SECTION(); metadata->head = sublist.head; metadata->tail = sublist.tail; metadata->tailFreeSize = sublist.tailFreeSize; metadata->nPendingPages = sublist.nPendingPages; metadata->nPendingHeapTuples = sublist.nPendingHeapTuples; } else { /* * Merge lists */ data.prevTail = metadata->tail; data.newRightlink = sublist.head; buffer = ReadBuffer(index, metadata->tail); LockBuffer(buffer, GIN_EXCLUSIVE); page = BufferGetPage(buffer); rdata[0].next = rdata + 1; rdata[1].buffer = buffer; rdata[1].buffer_std = true; rdata[1].data = NULL; rdata[1].len = 0; rdata[1].next = NULL; Assert(GinPageGetOpaque(page)->rightlink == InvalidBlockNumber); START_CRIT_SECTION(); GinPageGetOpaque(page)->rightlink = sublist.head; MarkBufferDirty(buffer); metadata->tail = sublist.tail; metadata->tailFreeSize = sublist.tailFreeSize; metadata->nPendingPages += sublist.nPendingPages; metadata->nPendingHeapTuples += sublist.nPendingHeapTuples; } } else { /* * Insert into the tail page; the metapage is already locked */ OffsetNumber l, off; int i, tupsize; char *ptr; buffer = ReadBuffer(index, metadata->tail); LockBuffer(buffer, GIN_EXCLUSIVE); page = BufferGetPage(buffer); off = (PageIsEmpty(page)) ?
FirstOffsetNumber : OffsetNumberNext(PageGetMaxOffsetNumber(page)); rdata[0].next = rdata + 1; rdata[1].buffer = buffer; rdata[1].buffer_std = true; ptr = rdata[1].data = (char *) palloc(collector->sumsize); rdata[1].len = collector->sumsize; rdata[1].next = NULL; data.ntuples = collector->ntuples; START_CRIT_SECTION(); /* * Increment the count of pending heap tuples */ Assert(GinPageGetOpaque(page)->maxoff <= metadata->nPendingHeapTuples); GinPageGetOpaque(page)->maxoff++; metadata->nPendingHeapTuples++; for (i = 0; i < collector->ntuples; i++) { tupsize = IndexTupleSize(collector->tuples[i]); l = PageAddItem(page, (Item) collector->tuples[i], tupsize, off, false, false); if (l == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(index)); memcpy(ptr, collector->tuples[i], tupsize); ptr += tupsize; off++; } Assert((ptr - rdata[1].data) <= collector->sumsize); metadata->tailFreeSize = PageGetExactFreeSpace(page); MarkBufferDirty(buffer); } /* * Write the metabuffer and make the XLOG entry */ MarkBufferDirty(metabuffer); if (RelationNeedsWAL(index)) { XLogRecPtr recptr; memcpy(&data.metadata, metadata, sizeof(GinMetaPageData)); recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_UPDATE_META_PAGE, rdata); PageSetLSN(metapage, recptr); if (buffer != InvalidBuffer) { PageSetLSN(page, recptr); } } if (buffer != InvalidBuffer) UnlockReleaseBuffer(buffer); /* * Force pending-list cleanup when the list becomes too long. * ginInsertCleanup could take a significant amount of time, so we prefer * to call it when it can do all the work in a single collection cycle. In * non-vacuum mode, it shouldn't require maintenance_work_mem, so fire it * while the pending list is still small enough to fit into work_mem. * * ginInsertCleanup() must not be called inside our critical section. */ if (metadata->nPendingPages * GIN_PAGE_FREESIZE > work_mem * 1024L) needCleanup = true; UnlockReleaseBuffer(metabuffer); END_CRIT_SECTION(); if (needCleanup) ginInsertCleanup(ginstate, false, NULL); }
/* * Write out a new shared or local map file with the given contents. * * The magic number and CRC are automatically updated in *newmap. On * success, we copy the data to the appropriate permanent static variable. * * If write_wal is TRUE then an appropriate WAL message is emitted. * (It will be false for bootstrap and WAL replay cases.) * * If send_sinval is TRUE then a SI invalidation message is sent. * (This should be true except in bootstrap case.) * * If preserve_files is TRUE then the storage manager is warned not to * delete the files listed in the map. * * Because this may be called during WAL replay when MyDatabaseId, * DatabasePath, etc aren't valid, we require the caller to pass in suitable * values. The caller is also responsible for being sure no concurrent * map update could be happening. */ static void write_relmap_file(bool shared, RelMapFile *newmap, bool write_wal, bool send_sinval, bool preserve_files, Oid dbid, Oid tsid, const char *dbpath) { int fd; RelMapFile *realmap; char mapfilename[MAXPGPATH]; /* * Fill in the overhead fields and update CRC. */ newmap->magic = RELMAPPER_FILEMAGIC; if (newmap->num_mappings < 0 || newmap->num_mappings > MAX_MAPPINGS) elog(ERROR, "attempt to write bogus relation mapping"); INIT_CRC32C(newmap->crc); COMP_CRC32C(newmap->crc, (char *) newmap, offsetof(RelMapFile, crc)); FIN_CRC32C(newmap->crc); /* * Open the target file. We prefer to do this before entering the * critical section, so that an open() failure need not force PANIC. */ if (shared) { snprintf(mapfilename, sizeof(mapfilename), "global/%s", RELMAPPER_FILENAME); realmap = &shared_map; } else { snprintf(mapfilename, sizeof(mapfilename), "%s/%s", dbpath, RELMAPPER_FILENAME); realmap = &local_map; } fd = OpenTransientFile(mapfilename, O_WRONLY | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR); if (fd < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not open relation mapping file \"%s\": %m", mapfilename))); if (write_wal) { xl_relmap_update xlrec; XLogRecPtr lsn; /* now errors are fatal ... */ START_CRIT_SECTION(); xlrec.dbid = dbid; xlrec.tsid = tsid; xlrec.nbytes = sizeof(RelMapFile); XLogBeginInsert(); XLogRegisterData((char *) (&xlrec), MinSizeOfRelmapUpdate); XLogRegisterData((char *) newmap, sizeof(RelMapFile)); lsn = XLogInsert(RM_RELMAP_ID, XLOG_RELMAP_UPDATE); /* As always, WAL must hit the disk before the data update does */ XLogFlush(lsn); } errno = 0; if (write(fd, newmap, sizeof(RelMapFile)) != sizeof(RelMapFile)) { /* if write didn't set errno, assume problem is no disk space */ if (errno == 0) errno = ENOSPC; ereport(ERROR, (errcode_for_file_access(), errmsg("could not write to relation mapping file \"%s\": %m", mapfilename))); } /* * We choose to fsync the data to disk before considering the task done. * It would be possible to relax this if it turns out to be a performance * issue, but it would complicate checkpointing --- see notes for * CheckPointRelationMap. */ if (pg_fsync(fd) != 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not fsync relation mapping file \"%s\": %m", mapfilename))); if (CloseTransientFile(fd)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not close relation mapping file \"%s\": %m", mapfilename))); /* * Now that the file is safely on disk, send sinval message to let other * backends know to re-read it. We must do this inside the critical * section: if for some reason we fail to send the message, we have to * force a database-wide PANIC. 
Otherwise other backends might continue * execution with stale mapping information, which would be catastrophic * as soon as others began to use the now-committed data. */ if (send_sinval) CacheInvalidateRelmap(dbid); /* * Make sure that the files listed in the map are not deleted if the outer * transaction aborts. This had better be within the critical section * too: it's not likely to fail, but if it did, we'd arrive at transaction * abort with the files still vulnerable. PANICing will leave things in a * good state on-disk. * * Note: we're cheating a little bit here by assuming that mapped files * are either in pg_global or the database's default tablespace. */ if (preserve_files) { int32 i; for (i = 0; i < newmap->num_mappings; i++) { RelFileNode rnode; rnode.spcNode = tsid; rnode.dbNode = dbid; rnode.relNode = newmap->mappings[i].mapfilenode; RelationPreserveStorage(rnode, false); } } /* Success, update permanent copy */ memcpy(realmap, newmap, sizeof(RelMapFile)); /* Critical section done */ if (write_wal) END_CRIT_SECTION(); }
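/*
 * Editor's sketch (not from the original source): the CRC idiom used by
 * write_relmap_file() above.  The crc field is placed last in the struct,
 * so the checksum can be computed over everything that precedes it via
 * offsetof(); readers recompute it the same way and compare.
 * demo_file_data and demo_set_crc are hypothetical placeholder names.
 */
typedef struct demo_file_data
{
	int32		magic;			/* format identifier */
	int32		payload;		/* the data being protected */
	pg_crc32c	crc;			/* CRC of all preceding fields */
} demo_file_data;

static void
demo_set_crc(demo_file_data *data)
{
	INIT_CRC32C(data->crc);
	COMP_CRC32C(data->crc, (char *) data, offsetof(demo_file_data, crc));
	FIN_CRC32C(data->crc);
}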
/* * Deletes pending list pages up to (not including) newHead page. * If newHead == InvalidBlockNumber then function drops the whole list. * * metapage is pinned and exclusive-locked throughout this function. * * Returns true if another cleanup process is running concurrently * (if so, we can just abandon our own efforts) */ static bool shiftList(Relation index, Buffer metabuffer, BlockNumber newHead, IndexBulkDeleteResult *stats) { Page metapage; GinMetaPageData *metadata; BlockNumber blknoToDelete; metapage = BufferGetPage(metabuffer); metadata = GinPageGetMeta(metapage); blknoToDelete = metadata->head; do { Page page; int i; int64 nDeletedHeapTuples = 0; ginxlogDeleteListPages data; XLogRecData rdata[1]; Buffer buffers[GIN_NDELETE_AT_ONCE]; data.node = index->rd_node; rdata[0].buffer = InvalidBuffer; rdata[0].data = (char *) &data; rdata[0].len = sizeof(ginxlogDeleteListPages); rdata[0].next = NULL; data.ndeleted = 0; while (data.ndeleted < GIN_NDELETE_AT_ONCE && blknoToDelete != newHead) { data.toDelete[data.ndeleted] = blknoToDelete; buffers[data.ndeleted] = ReadBuffer(index, blknoToDelete); LockBuffer(buffers[data.ndeleted], GIN_EXCLUSIVE); page = BufferGetPage(buffers[data.ndeleted]); data.ndeleted++; if (GinPageIsDeleted(page)) { /* concurrent cleanup process is detected */ for (i = 0; i < data.ndeleted; i++) UnlockReleaseBuffer(buffers[i]); return true; } nDeletedHeapTuples += GinPageGetOpaque(page)->maxoff; blknoToDelete = GinPageGetOpaque(page)->rightlink; } if (stats) stats->pages_deleted += data.ndeleted; START_CRIT_SECTION(); metadata->head = blknoToDelete; Assert(metadata->nPendingPages >= data.ndeleted); metadata->nPendingPages -= data.ndeleted; Assert(metadata->nPendingHeapTuples >= nDeletedHeapTuples); metadata->nPendingHeapTuples -= nDeletedHeapTuples; if (blknoToDelete == InvalidBlockNumber) { metadata->tail = InvalidBlockNumber; metadata->tailFreeSize = 0; metadata->nPendingPages = 0; metadata->nPendingHeapTuples = 0; } MarkBufferDirty(metabuffer); for (i = 0; i < data.ndeleted; i++) { page = BufferGetPage(buffers[i]); GinPageGetOpaque(page)->flags = GIN_DELETED; MarkBufferDirty(buffers[i]); } if (RelationNeedsWAL(index)) { XLogRecPtr recptr; memcpy(&data.metadata, metadata, sizeof(GinMetaPageData)); recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_DELETE_LISTPAGE, rdata); PageSetLSN(metapage, recptr); for (i = 0; i < data.ndeleted; i++) { page = BufferGetPage(buffers[i]); PageSetLSN(page, recptr); } } for (i = 0; i < data.ndeleted; i++) UnlockReleaseBuffer(buffers[i]); END_CRIT_SECTION(); } while (blknoToDelete != newHead); return false; }
/* * Drop a table space * * Be careful to check that the tablespace is empty. */ void DropTableSpace(DropTableSpaceStmt *stmt) { #ifdef HAVE_SYMLINK char *tablespacename = stmt->tablespacename; HeapScanDesc scandesc; Relation rel; HeapTuple tuple; ScanKeyData entry[1]; Oid tablespaceoid; /* * Find the target tuple */ rel = heap_open(TableSpaceRelationId, RowExclusiveLock); ScanKeyInit(&entry[0], Anum_pg_tablespace_spcname, BTEqualStrategyNumber, F_NAMEEQ, CStringGetDatum(tablespacename)); scandesc = heap_beginscan(rel, SnapshotNow, 1, entry); tuple = heap_getnext(scandesc, ForwardScanDirection); if (!HeapTupleIsValid(tuple)) { if (!stmt->missing_ok) { ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("tablespace \"%s\" does not exist", tablespacename))); } else { ereport(NOTICE, (errmsg("tablespace \"%s\" does not exist, skipping", tablespacename))); /* release the scan and close pg_tablespace before returning */ heap_endscan(scandesc); heap_close(rel, NoLock); } return; } tablespaceoid = HeapTupleGetOid(tuple); /* Must be tablespace owner */ if (!pg_tablespace_ownercheck(tablespaceoid, GetUserId())) aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_TABLESPACE, tablespacename); /* Disallow drop of the standard tablespaces, even by superuser */ if (tablespaceoid == GLOBALTABLESPACE_OID || tablespaceoid == DEFAULTTABLESPACE_OID) aclcheck_error(ACLCHECK_NO_PRIV, ACL_KIND_TABLESPACE, tablespacename); /* * Remove the pg_tablespace tuple (this will roll back if we fail below) */ simple_heap_delete(rel, &tuple->t_self); heap_endscan(scandesc); /* * Remove any comments on this tablespace. */ DeleteSharedComments(tablespaceoid, TableSpaceRelationId); /* * Remove dependency on owner. */ deleteSharedDependencyRecordsFor(TableSpaceRelationId, tablespaceoid, 0); /* * Acquire TablespaceCreateLock to ensure that no TablespaceCreateDbspace * is running concurrently. */ LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE); /* * Try to remove the physical infrastructure. */ if (!destroy_tablespace_directories(tablespaceoid, false)) { /* * Not all files deleted? However, there can be lingering empty files * in the directories, left behind by, for example, DROP TABLE, that * have been scheduled for deletion at the next checkpoint (see comments * in mdunlink() for details). We could just delete them immediately, * but we can't tell them apart from important data files that we * mustn't delete. So instead, we force a checkpoint which will clean * out any lingering files, and try again. */ RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT); if (!destroy_tablespace_directories(tablespaceoid, false)) { /* Still not empty, the files must be important then */ ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("tablespace \"%s\" is not empty", tablespacename))); } } /* Record the filesystem change in XLOG */ { xl_tblspc_drop_rec xlrec; XLogRecData rdata[1]; xlrec.ts_id = tablespaceoid; rdata[0].data = (char *) &xlrec; rdata[0].len = sizeof(xl_tblspc_drop_rec); rdata[0].buffer = InvalidBuffer; rdata[0].next = NULL; (void) XLogInsert(RM_TBLSPC_ID, XLOG_TBLSPC_DROP, rdata); } /* * Note: because we checked that the tablespace was empty, there should be * no need to worry about flushing shared buffers or free space map * entries for relations in the tablespace. */ /* * Force synchronous commit, to minimize the window between removing the * files on-disk and marking the transaction committed.
It's not great * that there is any window at all, but definitely we don't want to make * it larger than necessary. */ ForceSyncCommit(); /* * Allow TablespaceCreateDbspace again. */ LWLockRelease(TablespaceCreateLock); /* We keep the lock on pg_tablespace until commit */ heap_close(rel, NoLock); #else /* !HAVE_SYMLINK */ ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("tablespaces are not supported on this platform"))); #endif /* HAVE_SYMLINK */ }
/* * Create a table space * * Only superusers can create a tablespace. This seems a reasonable restriction * since we're determining the system layout and, anyway, we probably have * root if we're doing this kind of activity */ Oid CreateTableSpace(CreateTableSpaceStmt *stmt) { #ifdef HAVE_SYMLINK Relation rel; Datum values[Natts_pg_tablespace]; bool nulls[Natts_pg_tablespace]; HeapTuple tuple; Oid tablespaceoid; char *location; Oid ownerId; Datum newOptions; /* Must be super user */ if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("permission denied to create tablespace \"%s\"", stmt->tablespacename), errhint("Must be superuser to create a tablespace."))); /* However, the eventual owner of the tablespace need not be */ if (stmt->owner) ownerId = get_rolespec_oid(stmt->owner, false); else ownerId = GetUserId(); /* Unix-ify the offered path, and strip any trailing slashes */ location = pstrdup(stmt->location); canonicalize_path(location); /* disallow quotes, else CREATE DATABASE would be at risk */ if (strchr(location, '\'')) ereport(ERROR, (errcode(ERRCODE_INVALID_NAME), errmsg("tablespace location cannot contain single quotes"))); /* * Allowing relative paths seems risky * * this also helps us ensure that location is not empty or whitespace */ if (!is_absolute_path(location)) ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("tablespace location must be an absolute path"))); /* * Check that location isn't too long. Remember that we're going to append * 'PG_XXX/<dboid>/<relid>_<fork>.<nnn>'. FYI, we never actually * reference the whole path here, but mkdir() uses the first two parts. */ if (strlen(location) + 1 + strlen(TABLESPACE_VERSION_DIRECTORY) + 1 + OIDCHARS + 1 + OIDCHARS + 1 + FORKNAMECHARS + 1 + OIDCHARS > MAXPGPATH) ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("tablespace location \"%s\" is too long", location))); /* Warn if the tablespace is in the data directory. */ if (path_is_prefix_of_path(DataDir, location)) ereport(WARNING, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("tablespace location should not be inside the data directory"))); /* * Disallow creation of tablespaces named "pg_xxx"; we reserve this * namespace for system purposes. */ if (!allowSystemTableMods && IsReservedName(stmt->tablespacename)) ereport(ERROR, (errcode(ERRCODE_RESERVED_NAME), errmsg("unacceptable tablespace name \"%s\"", stmt->tablespacename), errdetail("The prefix \"pg_\" is reserved for system tablespaces."))); /* * Check that there is no other tablespace by this name. (The unique * index would catch this anyway, but might as well give a friendlier * message.) */ if (OidIsValid(get_tablespace_oid(stmt->tablespacename, true))) ereport(ERROR, (errcode(ERRCODE_DUPLICATE_OBJECT), errmsg("tablespace \"%s\" already exists", stmt->tablespacename))); /* * Insert tuple into pg_tablespace. The purpose of doing this first is to * lock the proposed tablename against other would-be creators. The * insertion will roll back if we find problems below. 
	 */
	rel = heap_open(TableSpaceRelationId, RowExclusiveLock);

	MemSet(nulls, false, sizeof(nulls));

	values[Anum_pg_tablespace_spcname - 1] =
		DirectFunctionCall1(namein, CStringGetDatum(stmt->tablespacename));
	values[Anum_pg_tablespace_spcowner - 1] =
		ObjectIdGetDatum(ownerId);
	nulls[Anum_pg_tablespace_spcacl - 1] = true;

	/* Generate new proposed spcoptions (text array) */
	newOptions = transformRelOptions((Datum) 0,
									 stmt->options,
									 NULL, NULL, false, false);
	(void) tablespace_reloptions(newOptions, true);
	if (newOptions != (Datum) 0)
		values[Anum_pg_tablespace_spcoptions - 1] = newOptions;
	else
		nulls[Anum_pg_tablespace_spcoptions - 1] = true;

	tuple = heap_form_tuple(rel->rd_att, values, nulls);

	tablespaceoid = simple_heap_insert(rel, tuple);

	CatalogUpdateIndexes(rel, tuple);

	heap_freetuple(tuple);

	/* Record dependency on owner */
	recordDependencyOnOwner(TableSpaceRelationId, tablespaceoid, ownerId);

	/* Post creation hook for new tablespace */
	InvokeObjectPostCreateHook(TableSpaceRelationId, tablespaceoid, 0);

	create_tablespace_directories(location, tablespaceoid);

	/* Record the filesystem change in XLOG */
	{
		xl_tblspc_create_rec xlrec;

		xlrec.ts_id = tablespaceoid;

		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec,
						 offsetof(xl_tblspc_create_rec, ts_path));
		XLogRegisterData((char *) location, strlen(location) + 1);

		(void) XLogInsert(RM_TBLSPC_ID, XLOG_TBLSPC_CREATE);
	}

	/*
	 * Force synchronous commit, to minimize the window between creating the
	 * symlink on-disk and marking the transaction committed.  It's not great
	 * that there is any window at all, but we definitely don't want to make
	 * it larger than necessary.
	 */
	ForceSyncCommit();

	pfree(location);

	/* We keep the lock on pg_tablespace until commit */
	heap_close(rel, NoLock);

	return tablespaceoid;
#else							/* !HAVE_SYMLINK */
	ereport(ERROR,
			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
			 errmsg("tablespaces are not supported on this platform")));
	return InvalidOid;			/* keep compiler quiet */
#endif							/* HAVE_SYMLINK */
}
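/*
 * For orientation, a minimal sketch of what create_tablespace_directories(),
 * called above, is expected to do: lock down permissions on the target
 * directory, create the per-version subdirectory that will hold relation
 * files, and point pg_tblspc/<oid> at the location.  The real routine has
 * considerably more error handling (including recovery-time cases); the
 * control flow below is an illustrative assumption.
 */
static void
create_tablespace_directories_sketch(const char *location, Oid tablespaceoid)
{
	char	   *linkloc = psprintf("pg_tblspc/%u", tablespaceoid);
	char	   *location_with_version_dir = psprintf("%s/%s", location,
											TABLESPACE_VERSION_DIRECTORY);

	/* Coerce the target directory to safe permissions; it must already exist */
	if (chmod(location, S_IRWXU) != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not set permissions on directory \"%s\": %m",
						location)));

	/* Create the version-specific subdirectory where relations will live */
	if (mkdir(location_with_version_dir, S_IRWXU) < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not create directory \"%s\": %m",
						location_with_version_dir)));

	/* Finally, create the symlink that the backend uses to find the files */
	if (symlink(location, linkloc) < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not create symbolic link \"%s\": %m",
						linkloc)));

	pfree(linkloc);
	pfree(location_with_version_dir);
}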
/*
 * Apply changes represented by GenericXLogState to the actual buffers,
 * and emit a generic xlog record.
 */
XLogRecPtr
GenericXLogFinish(GenericXLogState *state)
{
	XLogRecPtr	lsn;
	int			i;

	if (state->isLogged)
	{
		/* Logged relation: make xlog record in critical section. */
		XLogBeginInsert();

		START_CRIT_SECTION();

		for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++)
		{
			PageData   *pageData = &state->pages[i];
			Page		page;
			PageHeader	pageHeader;

			if (BufferIsInvalid(pageData->buffer))
				continue;

			page = BufferGetPage(pageData->buffer);
			pageHeader = (PageHeader) pageData->image;

			if (pageData->flags & GENERIC_XLOG_FULL_IMAGE)
			{
				/*
				 * A full-page image does not require us to supply any xlog
				 * data.  Just apply the image, being careful to zero the
				 * "hole" between pd_lower and pd_upper in order to avoid
				 * divergence between actual page state and what replay
				 * would produce.
				 */
				memcpy(page, pageData->image, pageHeader->pd_lower);
				memset(page + pageHeader->pd_lower, 0,
					   pageHeader->pd_upper - pageHeader->pd_lower);
				memcpy(page + pageHeader->pd_upper,
					   pageData->image + pageHeader->pd_upper,
					   BLCKSZ - pageHeader->pd_upper);

				XLogRegisterBuffer(i, pageData->buffer,
								   REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
			}
			else
			{
				/*
				 * In normal mode, calculate delta and write it as xlog data
				 * associated with this page.
				 */
				computeDelta(pageData, page, (Page) pageData->image);

				/* Apply the image, with zeroed "hole" as above */
				memcpy(page, pageData->image, pageHeader->pd_lower);
				memset(page + pageHeader->pd_lower, 0,
					   pageHeader->pd_upper - pageHeader->pd_lower);
				memcpy(page + pageHeader->pd_upper,
					   pageData->image + pageHeader->pd_upper,
					   BLCKSZ - pageHeader->pd_upper);

				XLogRegisterBuffer(i, pageData->buffer, REGBUF_STANDARD);
				XLogRegisterBufData(i, pageData->delta, pageData->deltaLen);
			}
		}

		/* Insert xlog record */
		lsn = XLogInsert(RM_GENERIC_ID, 0);

		/* Set LSN and mark buffers dirty */
		for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++)
		{
			PageData   *pageData = &state->pages[i];

			if (BufferIsInvalid(pageData->buffer))
				continue;
			PageSetLSN(BufferGetPage(pageData->buffer), lsn);
			MarkBufferDirty(pageData->buffer);
		}
		END_CRIT_SECTION();
	}
	else
	{
		/* Unlogged relation: skip xlog-related stuff */
		START_CRIT_SECTION();
		for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++)
		{
			PageData   *pageData = &state->pages[i];

			if (BufferIsInvalid(pageData->buffer))
				continue;
			memcpy(BufferGetPage(pageData->buffer),
				   pageData->image,
				   BLCKSZ);
			/* We don't worry about zeroing the "hole" in this case */
			MarkBufferDirty(pageData->buffer);
		}
		END_CRIT_SECTION();

		/* We don't have an LSN to return in this case */
		lsn = InvalidXLogRecPtr;
	}

	for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++)
		pfree(state->pages[i].image);
	pfree(state);

	return lsn;
}
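/*
 * Example usage of the generic WAL interface that GenericXLogFinish()
 * completes.  An access method starts a state with GenericXLogStart(),
 * obtains a modifiable image of each page it wants to change through
 * GenericXLogRegisterBuffer(), edits the image (not the buffer itself), and
 * then calls GenericXLogFinish() to apply and WAL-log all changes
 * atomically.  This is a sketch: the caller is assumed to hold an exclusive
 * lock on the buffer, and the tuple-insertion details are placeholders.
 */
static void
example_generic_xlog_insert(Relation rel, Buffer buffer,
							IndexTuple itup, Size itemsize)
{
	GenericXLogState *state;
	Page		page;

	state = GenericXLogStart(rel);

	/* Edits made through "page" affect only the image until Finish runs */
	page = GenericXLogRegisterBuffer(state, buffer, 0);

	if (PageAddItem(page, (Item) itup, itemsize,
					InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
	{
		/* Throw the image away; the shared buffer was never touched */
		GenericXLogAbort(state);
		elog(ERROR, "failed to add item to index page");
	}

	/* Applies images to buffers, marks them dirty, and emits the record */
	GenericXLogFinish(state);
}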