/* * Delete item(s) from a btree page. * * This must only be used for deleting leaf items. Deleting an item on a * non-leaf page has to be done as part of an atomic action that includes * deleting the page it points to. * * This routine assumes that the caller has pinned and locked the buffer. * Also, the given itemnos *must* appear in increasing order in the array. */ void _bt_delitems(Relation rel, Buffer buf, OffsetNumber *itemnos, int nitems, bool inVacuum) { Page page; BTPageOpaque opaque; MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD; page = BufferGetPage(buf); // Fetch gp_persistent_relation_node information that will be added to XLOG record. RelationFetchGpRelationNodeForXLog(rel); /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); /* Fix the page */ PageIndexMultiDelete(page, itemnos, nitems); /* * If this is within VACUUM, we can clear the vacuum cycleID since this * page has certainly been processed by the current vacuum scan. */ opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (inVacuum) opaque->btpo_cycleid = 0; /* * Mark the page as not containing any LP_DELETE items. This is not * certainly true (there might be some that have recently been marked, but * weren't included in our target-item list), but it will almost always be * true and it doesn't seem worth an additional page scan to check it. * Remember that BTP_HAS_GARBAGE is only a hint anyway. */ opaque->btpo_flags &= ~BTP_HAS_GARBAGE; MarkBufferDirty(buf); /* XLOG stuff */ if (!rel->rd_istemp) { xl_btree_delete xlrec; XLogRecPtr recptr; XLogRecData rdata[2]; xl_btreenode_set(&(xlrec.btreenode), rel); xlrec.block = BufferGetBlockNumber(buf); rdata[0].data = (char *) &xlrec; rdata[0].len = SizeOfBtreeDelete; rdata[0].buffer = InvalidBuffer; rdata[0].next = &(rdata[1]); /* * The target-offsets array is not in the buffer, but pretend that it * is. When XLogInsert stores the whole buffer, the offsets array * need not be stored too. */ if (nitems > 0) { rdata[1].data = (char *) itemnos; rdata[1].len = nitems * sizeof(OffsetNumber); } else { rdata[1].data = NULL; rdata[1].len = 0; } rdata[1].buffer = buf; rdata[1].buffer_std = true; rdata[1].next = NULL; recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } END_CRIT_SECTION(); }
/* * Shared functionality between saving and creating a replication slot. */ static void SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel) { char tmppath[MAXPGPATH]; char path[MAXPGPATH]; int fd; ReplicationSlotOnDisk cp; bool was_dirty; /* first check whether there's something to write out */ { volatile ReplicationSlot *vslot = slot; SpinLockAcquire(&vslot->mutex); was_dirty = vslot->dirty; vslot->just_dirtied = false; SpinLockRelease(&vslot->mutex); } /* and don't do anything if there's nothing to write */ if (!was_dirty) return; LWLockAcquire(slot->io_in_progress_lock, LW_EXCLUSIVE); /* silence valgrind :( */ memset(&cp, 0, sizeof(ReplicationSlotOnDisk)); sprintf(tmppath, "%s/state.tmp", dir); sprintf(path, "%s/state", dir); fd = OpenTransientFile(tmppath, O_CREAT | O_EXCL | O_WRONLY | PG_BINARY, S_IRUSR | S_IWUSR); if (fd < 0) { ereport(elevel, (errcode_for_file_access(), errmsg("could not create file \"%s\": %m", tmppath))); return; } cp.magic = SLOT_MAGIC; INIT_CRC32C(cp.checksum); cp.version = SLOT_VERSION; cp.length = ReplicationSlotOnDiskV2Size; SpinLockAcquire(&slot->mutex); memcpy(&cp.slotdata, &slot->data, sizeof(ReplicationSlotPersistentData)); SpinLockRelease(&slot->mutex); COMP_CRC32C(cp.checksum, (char *) (&cp) + SnapBuildOnDiskNotChecksummedSize, SnapBuildOnDiskChecksummedSize); FIN_CRC32C(cp.checksum); if ((write(fd, &cp, sizeof(cp))) != sizeof(cp)) { int save_errno = errno; CloseTransientFile(fd); errno = save_errno; ereport(elevel, (errcode_for_file_access(), errmsg("could not write to file \"%s\": %m", tmppath))); return; } /* fsync the temporary file */ if (pg_fsync(fd) != 0) { int save_errno = errno; CloseTransientFile(fd); errno = save_errno; ereport(elevel, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", tmppath))); return; } CloseTransientFile(fd); /* rename to permanent file, fsync file and directory */ if (rename(tmppath, path) != 0) { ereport(elevel, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", tmppath, path))); return; } /* Check CreateSlot() for the reasoning of using a crit. section. */ START_CRIT_SECTION(); fsync_fname(path, false); fsync_fname((char *) dir, true); fsync_fname("pg_replslot", true); END_CRIT_SECTION(); /* * Successfully wrote, unset dirty bit, unless somebody dirtied again * already. */ { volatile ReplicationSlot *vslot = slot; SpinLockAcquire(&vslot->mutex); if (!vslot->just_dirtied) vslot->dirty = false; SpinLockRelease(&vslot->mutex); } LWLockRelease(slot->io_in_progress_lock); }
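/*
 * Illustrative sketch, not backend code: the crash-safe overwrite pattern
 * that SaveSlotToPath relies on.  Write the new contents to a temporary
 * file, fsync it, rename it over the permanent name, then fsync the
 * containing directory so the rename itself survives a crash.  The file
 * names and payload below are hypothetical.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int
durable_overwrite(const char *dir, const char *path, const char *tmppath,
				  const void *buf, size_t len)
{
	int			fd;
	int			dirfd;

	/* O_EXCL: refuse to reuse a leftover temp file from an earlier crash */
	fd = open(tmppath, O_CREAT | O_EXCL | O_WRONLY, 0600);
	if (fd < 0)
		return -1;

	if (write(fd, buf, len) != (ssize_t) len || fsync(fd) != 0)
	{
		close(fd);
		return -1;
	}
	if (close(fd) != 0)
		return -1;

	/* atomically replace the old file with the fully written new one */
	if (rename(tmppath, path) != 0)
		return -1;

	/* the rename is durable only once the directory entry is flushed */
	dirfd = open(dir, O_RDONLY);
	if (dirfd < 0)
		return -1;
	if (fsync(dirfd) != 0)
	{
		close(dirfd);
		return -1;
	}
	return close(dirfd);
}

int
main(void)
{
	const char *payload = "slot state v1\n";

	if (durable_overwrite(".", "state", "state.tmp",
						  payload, strlen(payload)) != 0)
		perror("durable_overwrite");
	return 0;
}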
/* * Physical write of a page from a buffer slot * * On failure, we cannot just ereport(ERROR) since caller has put state in * shared memory that must be undone. So, we return FALSE and save enough * info in static variables to let SlruReportIOError make the report. * * For now, assume it's not worth keeping a file pointer open across * independent read/write operations. We do batch operations during * SimpleLruFlush, though. * * fdata is NULL for a standalone write, pointer to open-file info during * SimpleLruFlush. */ static bool SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata) { SlruShared shared = ctl->shared; int segno = pageno / SLRU_PAGES_PER_SEGMENT; int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; int offset = rpageno * BLCKSZ; char path[MAXPGPATH]; int fd = -1; /* * Honor the write-WAL-before-data rule, if appropriate, so that we do not * write out data before associated WAL records. This is the same action * performed during FlushBuffer() in the main buffer manager. */ if (shared->group_lsn != NULL) { /* * We must determine the largest async-commit LSN for the page. This * is a bit tedious, but since this entire function is a slow path * anyway, it seems better to do this here than to maintain a per-page * LSN variable (which'd need an extra comparison in the * transaction-commit path). */ XLogRecPtr max_lsn; int lsnindex, lsnoff; lsnindex = slotno * shared->lsn_groups_per_page; max_lsn = shared->group_lsn[lsnindex++]; for (lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++) { XLogRecPtr this_lsn = shared->group_lsn[lsnindex++]; if (max_lsn < this_lsn) max_lsn = this_lsn; } if (!XLogRecPtrIsInvalid(max_lsn)) { /* * As noted above, elog(ERROR) is not acceptable here, so if * XLogFlush were to fail, we must PANIC. This isn't much of a * restriction because XLogFlush is just about all critical * section anyway, but let's make sure. */ START_CRIT_SECTION(); XLogFlush(max_lsn); END_CRIT_SECTION(); } } /* * During a Flush, we may already have the desired file open. */ if (fdata) { int i; for (i = 0; i < fdata->num_files; i++) { if (fdata->segno[i] == segno) { fd = fdata->fd[i]; break; } } } if (fd < 0) { /* * If the file doesn't already exist, we should create it. It is * possible for this to need to happen when writing a page that's not * first in its segment; we assume the OS can cope with that. (Note: * it might seem that it'd be okay to create files only when * SimpleLruZeroPage is called for the first page of a segment. * However, if after a crash and restart the REDO logic elects to * replay the log from a checkpoint before the latest one, then it's * possible that we will get commands to set transaction status of * transactions that have already been truncated from the commit log. * Easiest way to deal with that is to accept references to * nonexistent files here and in SlruPhysicalReadPage.) * * Note: it is possible for more than one backend to be executing this * code simultaneously for different pages of the same file. Hence, * don't use O_EXCL or O_TRUNC or anything like that. */ SlruFileName(ctl, path, segno); fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR); if (fd < 0) { slru_errcause = SLRU_OPEN_FAILED; slru_errno = errno; return false; } if (fdata) { if (fdata->num_files < MAX_FLUSH_BUFFERS) { fdata->fd[fdata->num_files] = fd; fdata->segno[fdata->num_files] = segno; fdata->num_files++; } else { /* * In the unlikely event that we exceed MAX_FLUSH_BUFFERS, * fall back to treating it as a standalone write. 
*/ fdata = NULL; } } } if (lseek(fd, (off_t) offset, SEEK_SET) < 0) { slru_errcause = SLRU_SEEK_FAILED; slru_errno = errno; if (!fdata) CloseTransientFile(fd); return false; } errno = 0; if (write(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ) { /* if write didn't set errno, assume problem is no disk space */ if (errno == 0) errno = ENOSPC; slru_errcause = SLRU_WRITE_FAILED; slru_errno = errno; if (!fdata) CloseTransientFile(fd); return false; } /* * If not part of Flush, need to fsync now. We assume this happens * infrequently enough that it's not a performance issue. */ if (!fdata) { if (ctl->do_fsync && pg_fsync(fd)) { slru_errcause = SLRU_FSYNC_FAILED; slru_errno = errno; CloseTransientFile(fd); return false; } if (CloseTransientFile(fd)) { slru_errcause = SLRU_CLOSE_FAILED; slru_errno = errno; return false; } } return true; }
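/*
 * Illustrative sketch, not backend code: how SlruPhysicalWritePage maps a
 * logical page number onto a segment file and a byte offset within it.
 * SLRU_PAGES_PER_SEGMENT and BLCKSZ below use the usual defaults, but treat
 * the exact values as assumptions of this sketch.
 */
#include <stdio.h>

#define SLRU_PAGES_PER_SEGMENT 32
#define BLCKSZ 8192

int
main(void)
{
	int			pageno = 1234;

	int			segno = pageno / SLRU_PAGES_PER_SEGMENT;	/* which segment file */
	int			rpageno = pageno % SLRU_PAGES_PER_SEGMENT;	/* page within the segment */
	long		offset = (long) rpageno * BLCKSZ;			/* where lseek() positions */

	/* segment files are conventionally named by the segment number in hex */
	printf("page %d -> segment %04X, page %d in segment, byte offset %ld\n",
		   pageno, segno, rpageno, offset);
	return 0;
}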
/* * Build a pending-list page from the given array of tuples, and write it out. * * Returns amount of free space left on the page. */ static int32 writeListPage(Relation index, Buffer buffer, IndexTuple *tuples, int32 ntuples, BlockNumber rightlink) { Page page = BufferGetPage(buffer); int32 i, freesize, size = 0; OffsetNumber l, off; char *workspace; char *ptr; /* workspace could be a local array; we use palloc for alignment */ workspace = palloc(BLCKSZ); START_CRIT_SECTION(); GinInitBuffer(buffer, GIN_LIST); off = FirstOffsetNumber; ptr = workspace; for (i = 0; i < ntuples; i++) { int this_size = IndexTupleSize(tuples[i]); memcpy(ptr, tuples[i], this_size); ptr += this_size; size += this_size; l = PageAddItem(page, (Item) tuples[i], this_size, off, false, false); if (l == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(index)); off++; } Assert(size <= BLCKSZ); /* else we overran workspace */ GinPageGetOpaque(page)->rightlink = rightlink; /* * tail page may contain only whole row(s) or final part of row placed on * previous pages (a "row" here meaning all the index tuples generated for * one heap tuple) */ if (rightlink == InvalidBlockNumber) { GinPageSetFullRow(page); GinPageGetOpaque(page)->maxoff = 1; } else { GinPageGetOpaque(page)->maxoff = 0; } MarkBufferDirty(buffer); if (RelationNeedsWAL(index)) { ginxlogInsertListPage data; XLogRecPtr recptr; data.rightlink = rightlink; data.ntuples = ntuples; XLogBeginInsert(); XLogRegisterData((char *) &data, sizeof(ginxlogInsertListPage)); XLogRegisterBuffer(0, buffer, REGBUF_WILL_INIT); XLogRegisterBufData(0, workspace, size); recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT_LISTPAGE); PageSetLSN(page, recptr); } /* get free space before releasing buffer */ freesize = PageGetExactFreeSpace(page); UnlockReleaseBuffer(buffer); END_CRIT_SECTION(); pfree(workspace); return freesize; }
/* * Load a single slot from disk into memory. */ static void RestoreSlotFromDisk(const char *name) { ReplicationSlotOnDisk cp; int i; char path[MAXPGPATH]; int fd; bool restored = false; int readBytes; pg_crc32c checksum; /* no need to lock here, no concurrent access allowed yet */ /* delete temp file if it exists */ sprintf(path, "pg_replslot/%s/state.tmp", name); if (unlink(path) < 0 && errno != ENOENT) ereport(PANIC, (errcode_for_file_access(), errmsg("could not remove file \"%s\": %m", path))); sprintf(path, "pg_replslot/%s/state", name); elog(DEBUG1, "restoring replication slot from \"%s\"", path); fd = OpenTransientFile(path, O_RDWR | PG_BINARY, 0); /* * We do not need to handle this as we are rename()ing the directory into * place only after we fsync()ed the state file. */ if (fd < 0) ereport(PANIC, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", path))); /* * Sync state file before we're reading from it. We might have crashed * while it wasn't synced yet and we shouldn't continue on that basis. */ if (pg_fsync(fd) != 0) { CloseTransientFile(fd); ereport(PANIC, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", path))); } /* Also sync the parent directory */ START_CRIT_SECTION(); fsync_fname(path, true); END_CRIT_SECTION(); /* read part of statefile that's guaranteed to be version independent */ readBytes = read(fd, &cp, ReplicationSlotOnDiskConstantSize); if (readBytes != ReplicationSlotOnDiskConstantSize) { int saved_errno = errno; CloseTransientFile(fd); errno = saved_errno; ereport(PANIC, (errcode_for_file_access(), errmsg("could not read file \"%s\", read %d of %u: %m", path, readBytes, (uint32) ReplicationSlotOnDiskConstantSize))); } /* verify magic */ if (cp.magic != SLOT_MAGIC) ereport(PANIC, (errcode_for_file_access(), errmsg("replication slot file \"%s\" has wrong magic %u instead of %u", path, cp.magic, SLOT_MAGIC))); /* verify version */ if (cp.version != SLOT_VERSION) ereport(PANIC, (errcode_for_file_access(), errmsg("replication slot file \"%s\" has unsupported version %u", path, cp.version))); /* boundary check on length */ if (cp.length != ReplicationSlotOnDiskV2Size) ereport(PANIC, (errcode_for_file_access(), errmsg("replication slot file \"%s\" has corrupted length %u", path, cp.length))); /* Now that we know the size, read the entire file */ readBytes = read(fd, (char *) &cp + ReplicationSlotOnDiskConstantSize, cp.length); if (readBytes != cp.length) { int saved_errno = errno; CloseTransientFile(fd); errno = saved_errno; ereport(PANIC, (errcode_for_file_access(), errmsg("could not read file \"%s\", read %d of %u: %m", path, readBytes, cp.length))); } CloseTransientFile(fd); /* now verify the CRC */ INIT_CRC32C(checksum); COMP_CRC32C(checksum, (char *) &cp + SnapBuildOnDiskNotChecksummedSize, SnapBuildOnDiskChecksummedSize); FIN_CRC32C(checksum); if (!EQ_CRC32C(checksum, cp.checksum)) ereport(PANIC, (errmsg("replication slot file %s: checksum mismatch, is %u, should be %u", path, checksum, cp.checksum))); /* * If we crashed with an ephemeral slot active, don't restore but delete * it. 
*/ if (cp.slotdata.persistency != RS_PERSISTENT) { sprintf(path, "pg_replslot/%s", name); if (!rmtree(path, true)) { ereport(WARNING, (errcode_for_file_access(), errmsg("could not remove directory \"%s\"", path))); } fsync_fname("pg_replslot", true); return; } /* nothing can be active yet, don't lock anything */ for (i = 0; i < max_replication_slots; i++) { ReplicationSlot *slot; slot = &ReplicationSlotCtl->replication_slots[i]; if (slot->in_use) continue; /* restore the entire set of persistent data */ memcpy(&slot->data, &cp.slotdata, sizeof(ReplicationSlotPersistentData)); /* initialize in memory state */ slot->effective_xmin = cp.slotdata.xmin; slot->effective_catalog_xmin = cp.slotdata.catalog_xmin; slot->candidate_catalog_xmin = InvalidTransactionId; slot->candidate_xmin_lsn = InvalidXLogRecPtr; slot->candidate_restart_lsn = InvalidXLogRecPtr; slot->candidate_restart_valid = InvalidXLogRecPtr; slot->in_use = true; slot->active_pid = 0; restored = true; break; } if (!restored) ereport(PANIC, (errmsg("too many replication slots active before shutdown"), errhint("Increase max_replication_slots and try again."))); }
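/*
 * Illustrative sketch, not backend code: the on-disk layout idea behind the
 * checksum handling in SaveSlotToPath/RestoreSlotFromDisk.  A short header
 * (magic plus the checksum field itself) is excluded, and the CRC covers
 * everything from a fixed offset to the end of the struct.  The struct, the
 * magic value, and the plain bitwise CRC-32 below (PostgreSQL uses CRC-32C)
 * are stand-ins for illustration only.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct OnDiskRecord
{
	uint32_t	magic;			/* not covered by the checksum */
	uint32_t	checksum;		/* not covered by the checksum */
	uint32_t	version;		/* covered */
	uint32_t	length;			/* covered */
	char		payload[32];	/* covered */
} OnDiskRecord;

#define NOT_CHECKSUMMED_SIZE	offsetof(OnDiskRecord, version)
#define CHECKSUMMED_SIZE		(sizeof(OnDiskRecord) - NOT_CHECKSUMMED_SIZE)

/* plain reflected CRC-32, polynomial 0xEDB88320; illustration only */
static uint32_t
crc32_buf(const unsigned char *buf, size_t len)
{
	uint32_t	crc = 0xFFFFFFFFu;

	for (size_t i = 0; i < len; i++)
	{
		crc ^= buf[i];
		for (int bit = 0; bit < 8; bit++)
			crc = (crc >> 1) ^ (0xEDB88320u & (-(crc & 1u)));
	}
	return crc ^ 0xFFFFFFFFu;
}

int
main(void)
{
	OnDiskRecord rec;
	uint32_t	verify;

	memset(&rec, 0, sizeof(rec));
	rec.magic = 0xDEADBEEF;		/* arbitrary magic for this sketch */
	rec.version = 2;
	rec.length = sizeof(rec.payload);
	strcpy(rec.payload, "persistent slot data");

	/* checksum only the portion after the header prefix */
	rec.checksum = crc32_buf((unsigned char *) &rec + NOT_CHECKSUMMED_SIZE,
							 CHECKSUMMED_SIZE);
	printf("stored checksum: %08X\n", rec.checksum);

	/* a reader recomputes over the same byte range and compares */
	verify = crc32_buf((unsigned char *) &rec + NOT_CHECKSUMMED_SIZE,
					   CHECKSUMMED_SIZE);
	printf("verify matches: %s\n", verify == rec.checksum ? "yes" : "no");
	return 0;
}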
IndexBuildResult * ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) { IndexBuildResult *result; double reltuples; GinBuildState buildstate; Buffer RootBuffer, MetaBuffer; ItemPointerData *list; Datum key; GinNullCategory category; uint32 nlist; MemoryContext oldCtx; OffsetNumber attnum; if (RelationGetNumberOfBlocks(index) != 0) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); initGinState(&buildstate.ginstate, index); buildstate.indtuples = 0; memset(&buildstate.buildStats, 0, sizeof(GinStatsData)); /* initialize the meta page */ MetaBuffer = GinNewBuffer(index); /* initialize the root page */ RootBuffer = GinNewBuffer(index); START_CRIT_SECTION(); GinInitMetabuffer(MetaBuffer); MarkBufferDirty(MetaBuffer); GinInitBuffer(RootBuffer, GIN_LEAF); MarkBufferDirty(RootBuffer); if (RelationNeedsWAL(index)) { XLogRecPtr recptr; Page page; XLogBeginInsert(); XLogRegisterBuffer(0, MetaBuffer, REGBUF_WILL_INIT | REGBUF_STANDARD); XLogRegisterBuffer(1, RootBuffer, REGBUF_WILL_INIT); recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_INDEX); page = BufferGetPage(RootBuffer); PageSetLSN(page, recptr); page = BufferGetPage(MetaBuffer); PageSetLSN(page, recptr); } UnlockReleaseBuffer(MetaBuffer); UnlockReleaseBuffer(RootBuffer); END_CRIT_SECTION(); /* count the root as first entry page */ buildstate.buildStats.nEntryPages++; /* * create a temporary memory context that is used to hold data not yet * dumped out to the index */ buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext, "Gin build temporary context", ALLOCSET_DEFAULT_SIZES); /* * create a temporary memory context that is used for calling * ginExtractEntries(), and can be reset after each tuple */ buildstate.funcCtx = AllocSetContextCreate(CurrentMemoryContext, "Gin build temporary context for user-defined function", ALLOCSET_DEFAULT_SIZES); buildstate.accum.ginstate = &buildstate.ginstate; ginInitBA(&buildstate.accum); /* * Do the heap scan. We disallow sync scan here because dataPlaceToPage * prefers to receive tuples in TID order. */ reltuples = IndexBuildHeapScan(heap, index, indexInfo, false, ginBuildCallback, (void *) &buildstate); /* dump remaining entries to the index */ oldCtx = MemoryContextSwitchTo(buildstate.tmpCtx); ginBeginBAScan(&buildstate.accum); while ((list = ginGetBAEntry(&buildstate.accum, &attnum, &key, &category, &nlist)) != NULL) { /* there could be many entries, so be willing to abort here */ CHECK_FOR_INTERRUPTS(); ginEntryInsert(&buildstate.ginstate, attnum, key, category, list, nlist, &buildstate.buildStats); } MemoryContextSwitchTo(oldCtx); MemoryContextDelete(buildstate.funcCtx); MemoryContextDelete(buildstate.tmpCtx); /* * Update metapage stats */ buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index); ginUpdateStats(index, &buildstate.buildStats); /* * Return statistics */ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; return result; }
/* * Write the index tuples contained in *collector into the index's * pending list. * * Function guarantees that all these tuples will be inserted consecutively, * preserving order */ void ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) { Relation index = ginstate->index; Buffer metabuffer; Page metapage; GinMetaPageData *metadata = NULL; Buffer buffer = InvalidBuffer; Page page = NULL; ginxlogUpdateMeta data; bool separateList = false; bool needCleanup = false; int cleanupSize; bool needWal; if (collector->ntuples == 0) return; needWal = RelationNeedsWAL(index); data.node = index->rd_node; data.ntuples = 0; data.newRightlink = data.prevTail = InvalidBlockNumber; metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO); metapage = BufferGetPage(metabuffer); if (collector->sumsize + collector->ntuples * sizeof(ItemIdData) > GinListPageSize) { /* * Total size is greater than one page => make sublist */ separateList = true; } else { LockBuffer(metabuffer, GIN_EXCLUSIVE); metadata = GinPageGetMeta(metapage); if (metadata->head == InvalidBlockNumber || collector->sumsize + collector->ntuples * sizeof(ItemIdData) > metadata->tailFreeSize) { /* * Pending list is empty or total size is greater than freespace * on tail page => make sublist * * We unlock metabuffer to keep high concurrency */ separateList = true; LockBuffer(metabuffer, GIN_UNLOCK); } } if (separateList) { /* * We should make sublist separately and append it to the tail */ GinMetaPageData sublist; memset(&sublist, 0, sizeof(GinMetaPageData)); makeSublist(index, collector->tuples, collector->ntuples, &sublist); if (needWal) XLogBeginInsert(); /* * metapage was unlocked, see above */ LockBuffer(metabuffer, GIN_EXCLUSIVE); metadata = GinPageGetMeta(metapage); if (metadata->head == InvalidBlockNumber) { /* * Main list is empty, so just insert sublist as main list */ START_CRIT_SECTION(); metadata->head = sublist.head; metadata->tail = sublist.tail; metadata->tailFreeSize = sublist.tailFreeSize; metadata->nPendingPages = sublist.nPendingPages; metadata->nPendingHeapTuples = sublist.nPendingHeapTuples; } else { /* * Merge lists */ data.prevTail = metadata->tail; data.newRightlink = sublist.head; buffer = ReadBuffer(index, metadata->tail); LockBuffer(buffer, GIN_EXCLUSIVE); page = BufferGetPage(buffer); Assert(GinPageGetOpaque(page)->rightlink == InvalidBlockNumber); START_CRIT_SECTION(); GinPageGetOpaque(page)->rightlink = sublist.head; MarkBufferDirty(buffer); metadata->tail = sublist.tail; metadata->tailFreeSize = sublist.tailFreeSize; metadata->nPendingPages += sublist.nPendingPages; metadata->nPendingHeapTuples += sublist.nPendingHeapTuples; if (needWal) XLogRegisterBuffer(1, buffer, REGBUF_STANDARD); } } else { /* * Insert into tail page. Metapage is already locked */ OffsetNumber l, off; int i, tupsize; char *ptr; char *collectordata; buffer = ReadBuffer(index, metadata->tail); LockBuffer(buffer, GIN_EXCLUSIVE); page = BufferGetPage(buffer); off = (PageIsEmpty(page)) ? 
FirstOffsetNumber : OffsetNumberNext(PageGetMaxOffsetNumber(page)); collectordata = ptr = (char *) palloc(collector->sumsize); data.ntuples = collector->ntuples; if (needWal) XLogBeginInsert(); START_CRIT_SECTION(); /* * Increase counter of heap tuples */ Assert(GinPageGetOpaque(page)->maxoff <= metadata->nPendingHeapTuples); GinPageGetOpaque(page)->maxoff++; metadata->nPendingHeapTuples++; for (i = 0; i < collector->ntuples; i++) { tupsize = IndexTupleSize(collector->tuples[i]); l = PageAddItem(page, (Item) collector->tuples[i], tupsize, off, false, false); if (l == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(index)); memcpy(ptr, collector->tuples[i], tupsize); ptr += tupsize; off++; } Assert((ptr - collectordata) <= collector->sumsize); if (needWal) { XLogRegisterBuffer(1, buffer, REGBUF_STANDARD); XLogRegisterBufData(1, collectordata, collector->sumsize); } metadata->tailFreeSize = PageGetExactFreeSpace(page); MarkBufferDirty(buffer); } /* * Write metabuffer, make xlog entry */ MarkBufferDirty(metabuffer); if (needWal) { XLogRecPtr recptr; memcpy(&data.metadata, metadata, sizeof(GinMetaPageData)); XLogRegisterBuffer(0, metabuffer, REGBUF_WILL_INIT); XLogRegisterData((char *) &data, sizeof(ginxlogUpdateMeta)); recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_UPDATE_META_PAGE); PageSetLSN(metapage, recptr); if (buffer != InvalidBuffer) { PageSetLSN(page, recptr); } } if (buffer != InvalidBuffer) UnlockReleaseBuffer(buffer); /* * Force pending list cleanup when it becomes too long. And, * ginInsertCleanup could take significant amount of time, so we prefer to * call it when it can do all the work in a single collection cycle. In * non-vacuum mode, it shouldn't require maintenance_work_mem, so fire it * while pending list is still small enough to fit into * gin_pending_list_limit. * * ginInsertCleanup() should not be called inside our CRIT_SECTION. */ cleanupSize = GinGetPendingListCleanupSize(index); if (metadata->nPendingPages * GIN_PAGE_FREESIZE > cleanupSize * 1024L) needCleanup = true; UnlockReleaseBuffer(metabuffer); END_CRIT_SECTION(); if (needCleanup) ginInsertCleanup(ginstate, true, NULL); }
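/*
 * Illustrative sketch, not backend code: the shape of the two size checks
 * that drive the separateList decision in ginHeapTupleFastInsert.
 * LIST_PAGE_CAPACITY and ITEM_ID_BYTES are placeholder values; in the
 * backend the corresponding quantities come from GinListPageSize and
 * sizeof(ItemIdData).
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define LIST_PAGE_CAPACITY	8000	/* usable bytes on an empty pending-list page */
#define ITEM_ID_BYTES		4		/* per-tuple line-pointer overhead */

/*
 * Decide whether a collected batch of tuples can be appended to the tail
 * page of the pending list, or whether a separate sublist must be built
 * first and then linked onto the end.
 */
static bool
needs_sublist(uint32_t sumsize, uint32_t ntuples, uint32_t tail_free)
{
	uint32_t	needed = sumsize + ntuples * ITEM_ID_BYTES;

	if (needed > LIST_PAGE_CAPACITY)
		return true;			/* can never fit on a single page */
	if (needed > tail_free)
		return true;			/* does not fit on the current tail page */
	return false;
}

int
main(void)
{
	printf("small batch, roomy tail: %s\n",
		   needs_sublist(500, 10, 6000) ? "sublist" : "append to tail");
	printf("small batch, full tail:  %s\n",
		   needs_sublist(500, 10, 100) ? "sublist" : "append to tail");
	printf("oversized batch:         %s\n",
		   needs_sublist(9000, 50, 6000) ? "sublist" : "append to tail");
	return 0;
}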
/* * Initialize a sequence's relation with the specified tuple as content */ static void fill_seq_with_data(Relation rel, HeapTuple tuple) { Buffer buf; Page page; sequence_magic *sm; OffsetNumber offnum; /* Initialize first page of relation with special magic number */ buf = ReadBuffer(rel, P_NEW); Assert(BufferGetBlockNumber(buf) == 0); page = BufferGetPage(buf); PageInit(page, BufferGetPageSize(buf), sizeof(sequence_magic)); sm = (sequence_magic *) PageGetSpecialPointer(page); sm->magic = SEQ_MAGIC; /* Now insert sequence tuple */ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); /* * Since VACUUM does not process sequences, we have to force the tuple to * have xmin = FrozenTransactionId now. Otherwise it would become * invisible to SELECTs after 2G transactions. It is okay to do this * because if the current transaction aborts, no other xact will ever * examine the sequence tuple anyway. */ HeapTupleHeaderSetXmin(tuple->t_data, FrozenTransactionId); HeapTupleHeaderSetXminFrozen(tuple->t_data); HeapTupleHeaderSetCmin(tuple->t_data, FirstCommandId); HeapTupleHeaderSetXmax(tuple->t_data, InvalidTransactionId); tuple->t_data->t_infomask |= HEAP_XMAX_INVALID; ItemPointerSet(&tuple->t_data->t_ctid, 0, FirstOffsetNumber); /* check the comment above nextval_internal()'s equivalent call. */ if (RelationNeedsWAL(rel)) GetTopTransactionId(); START_CRIT_SECTION(); MarkBufferDirty(buf); offnum = PageAddItem(page, (Item) tuple->t_data, tuple->t_len, InvalidOffsetNumber, false, false); if (offnum != FirstOffsetNumber) elog(ERROR, "failed to add sequence tuple to page"); /* XLOG stuff */ if (RelationNeedsWAL(rel)) { xl_seq_rec xlrec; XLogRecPtr recptr; XLogBeginInsert(); XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT); xlrec.node = rel->rd_node; XLogRegisterData((char *) &xlrec, sizeof(xl_seq_rec)); XLogRegisterData((char *) tuple->t_data, tuple->t_len); recptr = XLogInsert(RM_SEQ_ID, XLOG_SEQ_LOG); PageSetLSN(page, recptr); } END_CRIT_SECTION(); UnlockReleaseBuffer(buf); }
/* * AlterSequence * * Modify the definition of a sequence relation */ ObjectAddress AlterSequence(ParseState *pstate, AlterSeqStmt *stmt) { Oid relid; SeqTable elm; Relation seqrel; Buffer buf; HeapTupleData seqdatatuple; Form_pg_sequence seqform; Form_pg_sequence_data seqdata; FormData_pg_sequence_data newseqdata; List *owned_by; ObjectAddress address; Relation rel; HeapTuple tuple; /* Open and lock sequence. */ relid = RangeVarGetRelid(stmt->sequence, AccessShareLock, stmt->missing_ok); if (relid == InvalidOid) { ereport(NOTICE, (errmsg("relation \"%s\" does not exist, skipping", stmt->sequence->relname))); return InvalidObjectAddress; } init_sequence(relid, &elm, &seqrel); /* allow ALTER to sequence owner only */ if (!pg_class_ownercheck(relid, GetUserId())) aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_CLASS, stmt->sequence->relname); /* lock page' buffer and read tuple into new sequence structure */ seqdata = read_seq_tuple(seqrel, &buf, &seqdatatuple); /* Copy old values of options into workspace */ memcpy(&newseqdata, seqdata, sizeof(FormData_pg_sequence_data)); rel = heap_open(SequenceRelationId, RowExclusiveLock); tuple = SearchSysCacheCopy1(SEQRELID, ObjectIdGetDatum(relid)); if (!HeapTupleIsValid(tuple)) elog(ERROR, "cache lookup failed for sequence %u", relid); seqform = (Form_pg_sequence) GETSTRUCT(tuple); /* Check and set new values */ init_params(pstate, stmt->options, false, seqform, &newseqdata, &owned_by); /* Clear local cache so that we don't think we have cached numbers */ /* Note that we do not change the currval() state */ elm->cached = elm->last; /* check the comment above nextval_internal()'s equivalent call. */ if (RelationNeedsWAL(seqrel)) GetTopTransactionId(); /* Now okay to update the on-disk tuple */ START_CRIT_SECTION(); memcpy(seqdata, &newseqdata, sizeof(FormData_pg_sequence_data)); MarkBufferDirty(buf); /* XLOG stuff */ if (RelationNeedsWAL(seqrel)) { xl_seq_rec xlrec; XLogRecPtr recptr; Page page = BufferGetPage(buf); XLogBeginInsert(); XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT); xlrec.node = seqrel->rd_node; XLogRegisterData((char *) &xlrec, sizeof(xl_seq_rec)); XLogRegisterData((char *) seqdatatuple.t_data, seqdatatuple.t_len); recptr = XLogInsert(RM_SEQ_ID, XLOG_SEQ_LOG); PageSetLSN(page, recptr); } END_CRIT_SECTION(); UnlockReleaseBuffer(buf); /* process OWNED BY if given */ if (owned_by) process_owned_by(seqrel, owned_by); InvokeObjectPostAlterHook(RelationRelationId, relid, 0); ObjectAddressSet(address, RelationRelationId, relid); relation_close(seqrel, NoLock); simple_heap_update(rel, &tuple->t_self, tuple); CatalogUpdateIndexes(rel, tuple); heap_close(rel, RowExclusiveLock); return address; }
/* * _bt_pagedel() -- Delete a page from the b-tree, if legal to do so. * * This action unlinks the page from the b-tree structure, removing all * pointers leading to it --- but not touching its own left and right links. * The page cannot be physically reclaimed right away, since other processes * may currently be trying to follow links leading to the page; they have to * be allowed to use its right-link to recover. See nbtree/README. * * On entry, the target buffer must be pinned and locked (either read or write * lock is OK). This lock and pin will be dropped before exiting. * * The "stack" argument can be a search stack leading (approximately) to the * target page, or NULL --- outside callers typically pass NULL since they * have not done such a search, but internal recursion cases pass the stack * to avoid duplicated search effort. * * Returns the number of pages successfully deleted (zero if page cannot * be deleted now; could be more than one if parent pages were deleted too). * * NOTE: this leaks memory. Rather than trying to clean up everything * carefully, it's better to run it in a temp context that can be reset * frequently. */ int _bt_pagedel(Relation rel, Buffer buf, BTStack stack) { int result; BlockNumber target, leftsib, rightsib, parent; OffsetNumber poffset, maxoff; uint32 targetlevel, ilevel; ItemId itemid; IndexTuple targetkey, itup; ScanKey itup_scankey; Buffer lbuf, rbuf, pbuf; bool parent_half_dead; bool parent_one_child; bool rightsib_empty; Buffer metabuf = InvalidBuffer; Page metapg = NULL; BTMetaPageData *metad = NULL; Page page; BTPageOpaque opaque; /* * We can never delete rightmost pages nor root pages. While at it, check * that page is not already deleted and is empty. */ page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) || P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page)) { /* Should never fail to delete a half-dead page */ Assert(!P_ISHALFDEAD(opaque)); _bt_relbuf(rel, buf); return 0; } /* * Save info about page, including a copy of its high key (it must have * one, being non-rightmost). */ target = BufferGetBlockNumber(buf); targetlevel = opaque->btpo.level; leftsib = opaque->btpo_prev; itemid = PageGetItemId(page, P_HIKEY); targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid)); /* * To avoid deadlocks, we'd better drop the target page lock before going * further. */ _bt_relbuf(rel, buf); /* * We need an approximate pointer to the page's parent page. We use the * standard search mechanism to search for the page's high key; this will * give us a link to either the current parent or someplace to its left * (if there are multiple equal high keys). In recursion cases, the * caller already generated a search stack and we can just re-use that * work. */ if (stack == NULL) { if (!InRecovery) { /* we need an insertion scan key to do our search, so build one */ itup_scankey = _bt_mkscankey(rel, targetkey); /* find the leftmost leaf page containing this key */ stack = _bt_search(rel, rel->rd_rel->relnatts, itup_scankey, false, &lbuf, BT_READ); /* don't need a pin on that either */ _bt_relbuf(rel, lbuf); /* * If we are trying to delete an interior page, _bt_search did * more than we needed. Locate the stack item pointing to our * parent level. 
*/ ilevel = 0; for (;;) { if (stack == NULL) elog(ERROR, "not enough stack items"); if (ilevel == targetlevel) break; stack = stack->bts_parent; ilevel++; } } else { /* * During WAL recovery, we can't use _bt_search (for one reason, * it might invoke user-defined comparison functions that expect * facilities not available in recovery mode). Instead, just set * up a dummy stack pointing to the left end of the parent tree * level, from which _bt_getstackbuf will walk right to the parent * page. Painful, but we don't care too much about performance in * this scenario. */ pbuf = _bt_get_endpoint(rel, targetlevel + 1, false); stack = (BTStack) palloc(sizeof(BTStackData)); stack->bts_blkno = BufferGetBlockNumber(pbuf); stack->bts_offset = InvalidOffsetNumber; /* bts_btentry will be initialized below */ stack->bts_parent = NULL; _bt_relbuf(rel, pbuf); } } /* * We cannot delete a page that is the rightmost child of its immediate * parent, unless it is the only child --- in which case the parent has to * be deleted too, and the same condition applies recursively to it. We * have to check this condition all the way up before trying to delete. We * don't need to re-test when deleting a non-leaf page, though. */ if (targetlevel == 0 && !_bt_parent_deletion_safe(rel, target, stack)) return 0; /* * We have to lock the pages we need to modify in the standard order: * moving right, then up. Else we will deadlock against other writers. * * So, we need to find and write-lock the current left sibling of the * target page. The sibling that was current a moment ago could have * split, so we may have to move right. This search could fail if either * the sibling or the target page was deleted by someone else meanwhile; * if so, give up. (Right now, that should never happen, since page * deletion is only done in VACUUM and there shouldn't be multiple VACUUMs * concurrently on the same table.) */ if (leftsib != P_NONE) { lbuf = _bt_getbuf(rel, leftsib, BT_WRITE); page = BufferGetPage(lbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); while (P_ISDELETED(opaque) || opaque->btpo_next != target) { /* step right one page */ leftsib = opaque->btpo_next; _bt_relbuf(rel, lbuf); if (leftsib == P_NONE) { elog(LOG, "no left sibling (concurrent deletion?) in \"%s\"", RelationGetRelationName(rel)); return 0; } lbuf = _bt_getbuf(rel, leftsib, BT_WRITE); page = BufferGetPage(lbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); } } else lbuf = InvalidBuffer; /* * Next write-lock the target page itself. It should be okay to take just * a write lock not a superexclusive lock, since no scans would stop on an * empty page. */ buf = _bt_getbuf(rel, target, BT_WRITE); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); /* * Check page is still empty etc, else abandon deletion. The empty check * is necessary since someone else might have inserted into it while we * didn't have it locked; the others are just for paranoia's sake. */ if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) || P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page)) { _bt_relbuf(rel, buf); if (BufferIsValid(lbuf)) _bt_relbuf(rel, lbuf); return 0; } if (opaque->btpo_prev != leftsib) elog(ERROR, "left link changed unexpectedly in block %u of index \"%s\"", target, RelationGetRelationName(rel)); /* * And next write-lock the (current) right sibling. 
*/ rightsib = opaque->btpo_next; rbuf = _bt_getbuf(rel, rightsib, BT_WRITE); page = BufferGetPage(rbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (opaque->btpo_prev != target) elog(ERROR, "right sibling's left-link doesn't match: " "block %u links to %u instead of expected %u in index \"%s\"", rightsib, opaque->btpo_prev, target, RelationGetRelationName(rel)); /* * Next find and write-lock the current parent of the target page. This is * essentially the same as the corresponding step of splitting. */ ItemPointerSet(&(stack->bts_btentry.t_tid), target, P_HIKEY); pbuf = _bt_getstackbuf(rel, stack, BT_WRITE); if (pbuf == InvalidBuffer) elog(ERROR, "failed to re-find parent key in index \"%s\" for deletion target page %u", RelationGetRelationName(rel), target); parent = stack->bts_blkno; poffset = stack->bts_offset; /* * If the target is the rightmost child of its parent, then we can't * delete, unless it's also the only child --- in which case the parent * changes to half-dead status. The "can't delete" case should have been * detected by _bt_parent_deletion_safe, so complain if we see it now. */ page = BufferGetPage(pbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); maxoff = PageGetMaxOffsetNumber(page); parent_half_dead = false; parent_one_child = false; if (poffset >= maxoff) { if (poffset == P_FIRSTDATAKEY(opaque)) parent_half_dead = true; else elog(ERROR, "failed to delete rightmost child %u of block %u in index \"%s\"", target, parent, RelationGetRelationName(rel)); } else { /* Will there be exactly one child left in this parent? */ if (OffsetNumberNext(P_FIRSTDATAKEY(opaque)) == maxoff) parent_one_child = true; } /* * If we are deleting the next-to-last page on the target's level, then * the rightsib is a candidate to become the new fast root. (In theory, it * might be possible to push the fast root even further down, but the odds * of doing so are slim, and the locking considerations daunting.) * * We don't support handling this in the case where the parent is becoming * half-dead, even though it theoretically could occur. * * We can safely acquire a lock on the metapage here --- see comments for * _bt_newroot(). */ if (leftsib == P_NONE && !parent_half_dead) { page = BufferGetPage(rbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); Assert(opaque->btpo.level == targetlevel); if (P_RIGHTMOST(opaque)) { /* rightsib will be the only one left on the level */ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); metapg = BufferGetPage(metabuf); metad = BTPageGetMeta(metapg); /* * The expected case here is btm_fastlevel == targetlevel+1; if * the fastlevel is <= targetlevel, something is wrong, and we * choose to overwrite it to fix it. */ if (metad->btm_fastlevel > targetlevel + 1) { /* no update wanted */ _bt_relbuf(rel, metabuf); metabuf = InvalidBuffer; } } } /* * Check that the parent-page index items we're about to delete/overwrite * contain what we expect. This can fail if the index has become * corrupt for some reason. We want to throw any error before entering * the critical section --- otherwise it'd be a PANIC. * * The test on the target item is just an Assert because _bt_getstackbuf * should have guaranteed it has the expected contents. The test on the * next-child downlink is known to sometimes fail in the field, though. 
*/ page = BufferGetPage(pbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); #ifdef USE_ASSERT_CHECKING itemid = PageGetItemId(page, poffset); itup = (IndexTuple) PageGetItem(page, itemid); Assert(ItemPointerGetBlockNumber(&(itup->t_tid)) == target); #endif if (!parent_half_dead) { OffsetNumber nextoffset; nextoffset = OffsetNumberNext(poffset); itemid = PageGetItemId(page, nextoffset); itup = (IndexTuple) PageGetItem(page, itemid); if (ItemPointerGetBlockNumber(&(itup->t_tid)) != rightsib) elog(ERROR, "right sibling %u of block %u is not next child %u of block %u in index \"%s\"", rightsib, target, ItemPointerGetBlockNumber(&(itup->t_tid)), parent, RelationGetRelationName(rel)); } /* * Here we begin doing the deletion. */ /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); /* * Update parent. The normal case is a tad tricky because we want to * delete the target's downlink and the *following* key. Easiest way is * to copy the right sibling's downlink over the target downlink, and then * delete the following item. */ if (parent_half_dead) { PageIndexTupleDelete(page, poffset); opaque->btpo_flags |= BTP_HALF_DEAD; } else { OffsetNumber nextoffset; itemid = PageGetItemId(page, poffset); itup = (IndexTuple) PageGetItem(page, itemid); ItemPointerSet(&(itup->t_tid), rightsib, P_HIKEY); nextoffset = OffsetNumberNext(poffset); PageIndexTupleDelete(page, nextoffset); } /* * Update siblings' side-links. Note the target page's side-links will * continue to point to the siblings. Asserts here are just rechecking * things we already verified above. */ if (BufferIsValid(lbuf)) { page = BufferGetPage(lbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); Assert(opaque->btpo_next == target); opaque->btpo_next = rightsib; } page = BufferGetPage(rbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); Assert(opaque->btpo_prev == target); opaque->btpo_prev = leftsib; rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page)); /* * Mark the page itself deleted. It can be recycled when all current * transactions are gone. 
*/ page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); opaque->btpo_flags &= ~BTP_HALF_DEAD; opaque->btpo_flags |= BTP_DELETED; opaque->btpo.xact = ReadNewTransactionId(); /* And update the metapage, if needed */ if (BufferIsValid(metabuf)) { metad->btm_fastroot = rightsib; metad->btm_fastlevel = targetlevel; MarkBufferDirty(metabuf); } /* Must mark buffers dirty before XLogInsert */ MarkBufferDirty(pbuf); MarkBufferDirty(rbuf); MarkBufferDirty(buf); if (BufferIsValid(lbuf)) MarkBufferDirty(lbuf); /* XLOG stuff */ if (!rel->rd_istemp) { xl_btree_delete_page xlrec; xl_btree_metadata xlmeta; uint8 xlinfo; XLogRecPtr recptr; XLogRecData rdata[5]; XLogRecData *nextrdata; xlrec.target.node = rel->rd_node; ItemPointerSet(&(xlrec.target.tid), parent, poffset); xlrec.deadblk = target; xlrec.leftblk = leftsib; xlrec.rightblk = rightsib; xlrec.btpo_xact = opaque->btpo.xact; rdata[0].data = (char *) &xlrec; rdata[0].len = SizeOfBtreeDeletePage; rdata[0].buffer = InvalidBuffer; rdata[0].next = nextrdata = &(rdata[1]); if (BufferIsValid(metabuf)) { xlmeta.root = metad->btm_root; xlmeta.level = metad->btm_level; xlmeta.fastroot = metad->btm_fastroot; xlmeta.fastlevel = metad->btm_fastlevel; nextrdata->data = (char *) &xlmeta; nextrdata->len = sizeof(xl_btree_metadata); nextrdata->buffer = InvalidBuffer; nextrdata->next = nextrdata + 1; nextrdata++; xlinfo = XLOG_BTREE_DELETE_PAGE_META; } else if (parent_half_dead) xlinfo = XLOG_BTREE_DELETE_PAGE_HALF; else xlinfo = XLOG_BTREE_DELETE_PAGE; nextrdata->data = NULL; nextrdata->len = 0; nextrdata->next = nextrdata + 1; nextrdata->buffer = pbuf; nextrdata->buffer_std = true; nextrdata++; nextrdata->data = NULL; nextrdata->len = 0; nextrdata->buffer = rbuf; nextrdata->buffer_std = true; nextrdata->next = NULL; if (BufferIsValid(lbuf)) { nextrdata->next = nextrdata + 1; nextrdata++; nextrdata->data = NULL; nextrdata->len = 0; nextrdata->buffer = lbuf; nextrdata->buffer_std = true; nextrdata->next = NULL; } recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata); if (BufferIsValid(metabuf)) { PageSetLSN(metapg, recptr); PageSetTLI(metapg, ThisTimeLineID); } page = BufferGetPage(pbuf); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); page = BufferGetPage(rbuf); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); page = BufferGetPage(buf); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); if (BufferIsValid(lbuf)) { page = BufferGetPage(lbuf); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } } END_CRIT_SECTION(); /* release metapage; send out relcache inval if metapage changed */ if (BufferIsValid(metabuf)) { CacheInvalidateRelcache(rel); _bt_relbuf(rel, metabuf); } /* can always release leftsib immediately */ if (BufferIsValid(lbuf)) _bt_relbuf(rel, lbuf); /* * If parent became half dead, recurse to delete it. Otherwise, if right * sibling is empty and is now the last child of the parent, recurse to * try to delete it. (These cases cannot apply at the same time, though * the second case might itself recurse to the first.) * * When recursing to parent, we hold the lock on the target page until * done. This delays any insertions into the keyspace that was just * effectively reassigned to the parent's right sibling. If we allowed * that, and there were enough such insertions before we finish deleting * the parent, page splits within that keyspace could lead to inserting * out-of-order keys into the grandparent level. 
It is thought that that * wouldn't have any serious consequences, but it still seems like a * pretty bad idea. */ if (parent_half_dead) { /* recursive call will release pbuf */ _bt_relbuf(rel, rbuf); result = _bt_pagedel(rel, pbuf, stack->bts_parent) + 1; _bt_relbuf(rel, buf); } else if (parent_one_child && rightsib_empty) { _bt_relbuf(rel, pbuf); _bt_relbuf(rel, buf); /* recursive call will release rbuf */ result = _bt_pagedel(rel, rbuf, stack) + 1; } else { _bt_relbuf(rel, pbuf); _bt_relbuf(rel, buf); _bt_relbuf(rel, rbuf); result = 1; } return result; }
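/*
 * Illustrative sketch, not backend code: the deadlock-avoidance discipline
 * described in _bt_pagedel ("moving right, then up") reduced to its essence:
 * every writer acquires the locks it needs in one agreed global order.  The
 * pthread example below is a generic illustration of that rule, not b-tree
 * code; compile with -lpthread.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t left_page = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t right_page = PTHREAD_MUTEX_INITIALIZER;

static void *
worker(void *arg)
{
	const char *name = arg;

	/* standard order: left, then right -- never the reverse */
	pthread_mutex_lock(&left_page);
	pthread_mutex_lock(&right_page);
	printf("%s holds both pages\n", name);
	pthread_mutex_unlock(&right_page);
	pthread_mutex_unlock(&left_page);
	return NULL;
}

int
main(void)
{
	pthread_t	a,
				b;

	/* two concurrent writers; with a single lock order they cannot deadlock */
	pthread_create(&a, NULL, worker, "worker A");
	pthread_create(&b, NULL, worker, "worker B");
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}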
/* * Creates new posting tree containing the given TIDs. Returns the page * number of the root of the new posting tree. * * items[] must be in sorted order with no duplicates. */ BlockNumber createPostingTree(Relation index, ItemPointerData *items, uint32 nitems, GinStatsData *buildStats) { BlockNumber blkno; Buffer buffer; Page page; int nrootitems; /* Calculate how many TIDs will fit on first page. */ nrootitems = Min(nitems, GinMaxLeafDataItems); /* * Create the root page. */ buffer = GinNewBuffer(index); page = BufferGetPage(buffer); blkno = BufferGetBlockNumber(buffer); START_CRIT_SECTION(); GinInitBuffer(buffer, GIN_DATA | GIN_LEAF); memcpy(GinDataPageGetData(page), items, sizeof(ItemPointerData) * nrootitems); GinPageGetOpaque(page)->maxoff = nrootitems; MarkBufferDirty(buffer); if (RelationNeedsWAL(index)) { XLogRecPtr recptr; XLogRecData rdata[2]; ginxlogCreatePostingTree data; data.node = index->rd_node; data.blkno = blkno; data.nitem = nrootitems; rdata[0].buffer = InvalidBuffer; rdata[0].data = (char *) &data; rdata[0].len = sizeof(ginxlogCreatePostingTree); rdata[0].next = &rdata[1]; rdata[1].buffer = InvalidBuffer; rdata[1].data = (char *) items; rdata[1].len = sizeof(ItemPointerData) * nrootitems; rdata[1].next = NULL; recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_PTREE, rdata); PageSetLSN(page, recptr); } UnlockReleaseBuffer(buffer); END_CRIT_SECTION(); /* During index build, count the newly-added data page */ if (buildStats) buildStats->nDataPages++; /* * Add any remaining TIDs to the newly-created posting tree. */ if (nitems > nrootitems) { ginInsertItemPointers(index, blkno, items + nrootitems, nitems - nrootitems, buildStats); } return blkno; }
/* * _bt_getroot() -- Get the root page of the btree. * * Since the root page can move around the btree file, we have to read * its location from the metadata page, and then read the root page * itself. If no root page exists yet, we have to create one. The * standard class of race conditions exists here; I think I covered * them all in the Hopi Indian rain dance of lock requests below. * * The access type parameter (BT_READ or BT_WRITE) controls whether * a new root page will be created or not. If access = BT_READ, * and no root page exists, we just return InvalidBuffer. For * BT_WRITE, we try to create the root page if it doesn't exist. * NOTE that the returned root page will have only a read lock set * on it even if access = BT_WRITE! * * The returned page is not necessarily the true root --- it could be * a "fast root" (a page that is alone in its level due to deletions). * Also, if the root page is split while we are "in flight" to it, * what we will return is the old root, which is now just the leftmost * page on a probably-not-very-wide level. For most purposes this is * as good as or better than the true root, so we do not bother to * insist on finding the true root. We do, however, guarantee to * return a live (not deleted or half-dead) page. * * On successful return, the root page is pinned and read-locked. * The metadata page is not locked or pinned on exit. */ Buffer _bt_getroot(Relation rel, int access) { Buffer metabuf; Page metapg; BTPageOpaque metaopaque; Buffer rootbuf; Page rootpage; BTPageOpaque rootopaque; BlockNumber rootblkno; uint32 rootlevel; BTMetaPageData *metad; /* * Try to use previously-cached metapage data to find the root. This * normally saves one buffer access per index search, which is a very * helpful savings in bufmgr traffic and hence contention. */ if (rel->rd_amcache != NULL) { metad = (BTMetaPageData *) rel->rd_amcache; /* We shouldn't have cached it if any of these fail */ Assert(metad->btm_magic == BTREE_MAGIC); Assert(metad->btm_version == BTREE_VERSION); Assert(metad->btm_root != P_NONE); rootblkno = metad->btm_fastroot; Assert(rootblkno != P_NONE); rootlevel = metad->btm_fastlevel; rootbuf = _bt_getbuf(rel, rootblkno, BT_READ); rootpage = BufferGetPage(rootbuf); rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); /* * Since the cache might be stale, we check the page more carefully * here than normal. We *must* check that it's not deleted. If it's * not alone on its level, then we reject too --- this may be overly * paranoid but better safe than sorry. Note we don't check P_ISROOT, * because that's not set in a "fast root". 
*/ if (!P_IGNORE(rootopaque) && rootopaque->btpo.level == rootlevel && P_LEFTMOST(rootopaque) && P_RIGHTMOST(rootopaque)) { /* OK, accept cached page as the root */ return rootbuf; } _bt_relbuf(rel, rootbuf); /* Cache is stale, throw it away */ if (rel->rd_amcache) pfree(rel->rd_amcache); rel->rd_amcache = NULL; } metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); metapg = BufferGetPage(metabuf); metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg); metad = BTPageGetMeta(metapg); /* sanity-check the metapage */ if (!(metaopaque->btpo_flags & BTP_META) || metad->btm_magic != BTREE_MAGIC) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("index \"%s\" is not a btree", RelationGetRelationName(rel)))); if (metad->btm_version != BTREE_VERSION) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("version mismatch in index \"%s\": file version %d, code version %d", RelationGetRelationName(rel), metad->btm_version, BTREE_VERSION))); /* if no root page initialized yet, do it */ if (metad->btm_root == P_NONE) { /* If access = BT_READ, caller doesn't want us to create root yet */ if (access == BT_READ) { _bt_relbuf(rel, metabuf); return InvalidBuffer; } /* trade in our read lock for a write lock */ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); LockBuffer(metabuf, BT_WRITE); /* * Race condition: if someone else initialized the metadata between * the time we released the read lock and acquired the write lock, we * must avoid doing it again. */ if (metad->btm_root != P_NONE) { /* * Metadata initialized by someone else. In order to guarantee no * deadlocks, we have to release the metadata page and start all * over again. (Is that really true? But it's hardly worth trying * to optimize this case.) */ _bt_relbuf(rel, metabuf); return _bt_getroot(rel, access); } /* * Get, initialize, write, and leave a lock of the appropriate type on * the new root page. Since this is the first page in the tree, it's * a leaf as well as the root. */ rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); rootblkno = BufferGetBlockNumber(rootbuf); rootpage = BufferGetPage(rootbuf); rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE; rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT); rootopaque->btpo.level = 0; rootopaque->btpo_cycleid = 0; /* NO ELOG(ERROR) till meta is updated */ START_CRIT_SECTION(); metad->btm_root = rootblkno; metad->btm_level = 0; metad->btm_fastroot = rootblkno; metad->btm_fastlevel = 0; MarkBufferDirty(rootbuf); MarkBufferDirty(metabuf); /* XLOG stuff */ if (!rel->rd_istemp) { xl_btree_newroot xlrec; XLogRecPtr recptr; XLogRecData rdata; xlrec.node = rel->rd_node; xlrec.rootblk = rootblkno; xlrec.level = 0; rdata.data = (char *) &xlrec; rdata.len = SizeOfBtreeNewroot; rdata.buffer = InvalidBuffer; rdata.next = NULL; recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, &rdata); PageSetLSN(rootpage, recptr); PageSetTLI(rootpage, ThisTimeLineID); PageSetLSN(metapg, recptr); PageSetTLI(metapg, ThisTimeLineID); } END_CRIT_SECTION(); /* * Send out relcache inval for metapage change (probably unnecessary * here, but let's be safe). */ CacheInvalidateRelcache(rel); /* * swap root write lock for read lock. There is no danger of anyone * else accessing the new root page while it's unlocked, since no one * else knows where it is yet. 
*/ LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK); LockBuffer(rootbuf, BT_READ); /* okay, metadata is correct, release lock on it */ _bt_relbuf(rel, metabuf); } else { rootblkno = metad->btm_fastroot; Assert(rootblkno != P_NONE); rootlevel = metad->btm_fastlevel; /* * Cache the metapage data for next time */ rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt, sizeof(BTMetaPageData)); memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData)); /* * We are done with the metapage; arrange to release it via first * _bt_relandgetbuf call */ rootbuf = metabuf; for (;;) { rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ); rootpage = BufferGetPage(rootbuf); rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); if (!P_IGNORE(rootopaque)) break; /* it's dead, Jim. step right one page */ if (P_RIGHTMOST(rootopaque)) elog(ERROR, "no live root page found in index \"%s\"", RelationGetRelationName(rel)); rootblkno = rootopaque->btpo_next; } /* Note: can't check btpo.level on deleted pages */ if (rootopaque->btpo.level != rootlevel) elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u", rootblkno, RelationGetRelationName(rel), rootopaque->btpo.level, rootlevel); } /* * By here, we have a pin and read lock on the root page, and no lock set * on the metadata page. Return the root page's buffer. */ return rootbuf; }
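/*
 * Illustrative sketch, not backend code: the cached-metapage fast path of
 * _bt_getroot in miniature.  Trust the cache only after re-validating it,
 * and on a mismatch discard the cache and fall back to the authoritative
 * lookup.  All names and values below are hypothetical.
 */
#include <stdbool.h>
#include <stdio.h>

static int	true_root = 7;		/* authoritative root location */
static int	cached_root = -1;	/* per-"relation" cache; -1 means empty */

/* pretend validation: the cached value is usable only if it still matches */
static bool
root_still_valid(int blkno)
{
	return blkno == true_root;
}

static int
get_root(void)
{
	/* fast path: use the cache, but only after re-checking it */
	if (cached_root != -1)
	{
		if (root_still_valid(cached_root))
			return cached_root;
		cached_root = -1;		/* stale: throw it away and fall through */
	}

	/* slow path: consult the authoritative source and refill the cache */
	cached_root = true_root;
	return cached_root;
}

int
main(void)
{
	printf("first lookup (slow path): %d\n", get_root());
	printf("second lookup (cached):   %d\n", get_root());

	true_root = 9;				/* the root moved; the cache is now stale */
	printf("after move (revalidated): %d\n", get_root());
	return 0;
}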
void _bt_delitems_delete(Relation rel, Buffer buf, OffsetNumber *itemnos, int nitems, Relation heapRel) { Page page = BufferGetPage(buf); BTPageOpaque opaque; Assert(nitems > 0); /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); /* Fix the page */ PageIndexMultiDelete(page, itemnos, nitems); /* * We can clear the vacuum cycle ID since this page has certainly been * processed by the current vacuum scan. */ opaque = (BTPageOpaque) PageGetSpecialPointer(page); opaque->btpo_cycleid = 0; /* * Mark the page as not containing any LP_DEAD items. This is not * certainly true (there might be some that have recently been marked, but * weren't included in our target-item list), but it will almost always be * true and it doesn't seem worth an additional page scan to check it. * Remember that BTP_HAS_GARBAGE is only a hint anyway. */ opaque->btpo_flags &= ~BTP_HAS_GARBAGE; MarkBufferDirty(buf); /* XLOG stuff */ if (!rel->rd_istemp) { XLogRecPtr recptr; XLogRecData rdata[3]; xl_btree_delete xlrec_delete; xlrec_delete.node = rel->rd_node; xlrec_delete.hnode = heapRel->rd_node; xlrec_delete.block = BufferGetBlockNumber(buf); xlrec_delete.nitems = nitems; rdata[0].data = (char *) &xlrec_delete; rdata[0].len = SizeOfBtreeDelete; rdata[0].buffer = InvalidBuffer; rdata[0].next = &(rdata[1]); /* * We need the target-offsets array whether or not we store the whole * buffer, to allow us to find the latestRemovedXid on a standby server. */ rdata[1].data = (char *) itemnos; rdata[1].len = nitems * sizeof(OffsetNumber); rdata[1].buffer = InvalidBuffer; rdata[1].next = &(rdata[2]); rdata[2].data = NULL; rdata[2].len = 0; rdata[2].buffer = buf; rdata[2].buffer_std = true; rdata[2].next = NULL; recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } END_CRIT_SECTION(); }
/* * Delete item(s) from a btree page. * * This must only be used for deleting leaf items. Deleting an item on a * non-leaf page has to be done as part of an atomic action that includes * deleting the page it points to. * * This routine assumes that the caller has pinned and locked the buffer. * Also, the given itemnos *must* appear in increasing order in the array. * * We record VACUUMs and b-tree deletes differently in WAL. InHotStandby * we need to be able to pin all of the blocks in the btree in physical * order when replaying the effects of a VACUUM, just as we do for the * original VACUUM itself. lastBlockVacuumed allows us to tell whether an * intermediate range of blocks has had no changes at all by VACUUM, * and so must be scanned anyway during replay. We always write a WAL record * for the last block in the index, whether or not it contained any items * to be removed. This allows us to scan right up to end of index to * ensure correct locking. */ void _bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *itemnos, int nitems, BlockNumber lastBlockVacuumed) { Page page = BufferGetPage(buf); BTPageOpaque opaque; /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); /* Fix the page */ if (nitems > 0) PageIndexMultiDelete(page, itemnos, nitems); /* * We can clear the vacuum cycle ID since this page has certainly been * processed by the current vacuum scan. */ opaque = (BTPageOpaque) PageGetSpecialPointer(page); opaque->btpo_cycleid = 0; /* * Mark the page as not containing any LP_DEAD items. This is not * certainly true (there might be some that have recently been marked, but * weren't included in our target-item list), but it will almost always be * true and it doesn't seem worth an additional page scan to check it. * Remember that BTP_HAS_GARBAGE is only a hint anyway. */ opaque->btpo_flags &= ~BTP_HAS_GARBAGE; MarkBufferDirty(buf); /* XLOG stuff */ if (!rel->rd_istemp) { XLogRecPtr recptr; XLogRecData rdata[2]; xl_btree_vacuum xlrec_vacuum; xlrec_vacuum.node = rel->rd_node; xlrec_vacuum.block = BufferGetBlockNumber(buf); xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed; rdata[0].data = (char *) &xlrec_vacuum; rdata[0].len = SizeOfBtreeVacuum; rdata[0].buffer = InvalidBuffer; rdata[0].next = &(rdata[1]); /* * The target-offsets array is not in the buffer, but pretend that it * is. When XLogInsert stores the whole buffer, the offsets array * need not be stored too. */ if (nitems > 0) { rdata[1].data = (char *) itemnos; rdata[1].len = nitems * sizeof(OffsetNumber); } else { rdata[1].data = NULL; rdata[1].len = 0; } rdata[1].buffer = buf; rdata[1].buffer_std = true; rdata[1].next = NULL; recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM, rdata); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } END_CRIT_SECTION(); }
/* * Delete a posting tree page. */ static void ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkno, BlockNumber parentBlkno, OffsetNumber myoff, bool isParentRoot) { Buffer dBuffer; Buffer lBuffer; Buffer pBuffer; Page page, parentPage; BlockNumber rightlink; /* * Lock the pages in the same order as an insertion would, to avoid * deadlocks: left, then right, then parent. */ lBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, leftBlkno, RBM_NORMAL, gvs->strategy); dBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, deleteBlkno, RBM_NORMAL, gvs->strategy); pBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, parentBlkno, RBM_NORMAL, gvs->strategy); LockBuffer(lBuffer, GIN_EXCLUSIVE); LockBuffer(dBuffer, GIN_EXCLUSIVE); if (!isParentRoot) /* parent is already locked by * LockBufferForCleanup() */ LockBuffer(pBuffer, GIN_EXCLUSIVE); START_CRIT_SECTION(); /* Unlink the page by changing left sibling's rightlink */ page = BufferGetPage(dBuffer); rightlink = GinPageGetOpaque(page)->rightlink; page = BufferGetPage(lBuffer); GinPageGetOpaque(page)->rightlink = rightlink; /* Delete downlink from parent */ parentPage = BufferGetPage(pBuffer); #ifdef USE_ASSERT_CHECKING do { PostingItem *tod = GinDataPageGetPostingItem(parentPage, myoff); Assert(PostingItemGetBlockNumber(tod) == deleteBlkno); } while (0); #endif GinPageDeletePostingItem(parentPage, myoff); page = BufferGetPage(dBuffer); /* * we shouldn't change rightlink field to save workability of running * search scan */ GinPageGetOpaque(page)->flags = GIN_DELETED; MarkBufferDirty(pBuffer); MarkBufferDirty(lBuffer); MarkBufferDirty(dBuffer); if (RelationNeedsWAL(gvs->index)) { XLogRecPtr recptr; ginxlogDeletePage data; /* * We can't pass REGBUF_STANDARD for the deleted page, because we * didn't set pd_lower on pre-9.4 versions. The page might've been * binary-upgraded from an older version, and hence not have pd_lower * set correctly. Ditto for the left page, but removing the item from * the parent updated its pd_lower, so we know that's OK at this * point. */ XLogBeginInsert(); XLogRegisterBuffer(0, dBuffer, 0); XLogRegisterBuffer(1, pBuffer, REGBUF_STANDARD); XLogRegisterBuffer(2, lBuffer, 0); data.parentOffset = myoff; data.rightLink = GinPageGetOpaque(page)->rightlink; XLogRegisterData((char *) &data, sizeof(ginxlogDeletePage)); recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_DELETE_PAGE); PageSetLSN(page, recptr); PageSetLSN(parentPage, recptr); PageSetLSN(BufferGetPage(lBuffer), recptr); } if (!isParentRoot) LockBuffer(pBuffer, GIN_UNLOCK); ReleaseBuffer(pBuffer); UnlockReleaseBuffer(lBuffer); UnlockReleaseBuffer(dBuffer); END_CRIT_SECTION(); gvs->result->pages_deleted++; }
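/*
 * Illustrative sketch (not PostgreSQL code): ginDeletePage() above locks the
 * left sibling, then the page being deleted, then the parent, mirroring the
 * order an insertion would take, so that no two backends can ever wait on
 * each other in a cycle.  The standalone toy below shows the same idea with
 * pthread mutexes; lock_pages_in_order() is a hypothetical helper invented
 * for the demo, not part of any real API.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t page_locks[3] = {
    PTHREAD_MUTEX_INITIALIZER,
    PTHREAD_MUTEX_INITIALIZER,
    PTHREAD_MUTEX_INITIALIZER
};

/*
 * Acquire the three requested page locks in ascending index order, whatever
 * order the caller names them in; a single global order rules out deadlock.
 */
static void
lock_pages_in_order(int a, int b, int c)
{
    int     want[3] = {a, b, c};
    int     i, j;

    for (i = 1; i < 3; i++)         /* insertion sort, three items */
        for (j = i; j > 0 && want[j - 1] > want[j]; j--)
        {
            int tmp = want[j];

            want[j] = want[j - 1];
            want[j - 1] = tmp;
        }

    for (i = 0; i < 3; i++)
        pthread_mutex_lock(&page_locks[want[i]]);
}

static void
unlock_pages(void)
{
    int     i;

    for (i = 0; i < 3; i++)
        pthread_mutex_unlock(&page_locks[i]);
}

int
main(void)
{
    lock_pages_in_order(2, 0, 1);   /* parent, left, deleted: still safe */
    printf("all three page locks held without deadlock risk\n");
    unlock_pages();
    return 0;
}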
static int64 nextval_internal(Oid relid) { SeqTable elm; Relation seqrel; Buffer buf; Page page; HeapTuple pgstuple; Form_pg_sequence pgsform; HeapTupleData seqdatatuple; Form_pg_sequence_data seq; int64 incby, maxv, minv, cache, log, fetch, last; int64 result, next, rescnt = 0; bool cycle; bool logit = false; /* open and AccessShareLock sequence */ init_sequence(relid, &elm, &seqrel); if (pg_class_aclcheck(elm->relid, GetUserId(), ACL_USAGE | ACL_UPDATE) != ACLCHECK_OK) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("permission denied for sequence %s", RelationGetRelationName(seqrel)))); /* read-only transactions may only modify temp sequences */ if (!seqrel->rd_islocaltemp) PreventCommandIfReadOnly("nextval()"); /* * Forbid this during parallel operation because, to make it work, the * cooperating backends would need to share the backend-local cached * sequence information. Currently, we don't support that. */ PreventCommandIfParallelMode("nextval()"); if (elm->last != elm->cached) /* some numbers were cached */ { Assert(elm->last_valid); Assert(elm->increment != 0); elm->last += elm->increment; relation_close(seqrel, NoLock); last_used_seq = elm; return elm->last; } pgstuple = SearchSysCache1(SEQRELID, ObjectIdGetDatum(relid)); if (!HeapTupleIsValid(pgstuple)) elog(ERROR, "cache lookup failed for sequence %u", relid); pgsform = (Form_pg_sequence) GETSTRUCT(pgstuple); cycle = pgsform->seqcycle; incby = pgsform->seqincrement; maxv = pgsform->seqmax; minv = pgsform->seqmin; cache = pgsform->seqcache; ReleaseSysCache(pgstuple); /* lock page' buffer and read tuple */ seq = read_seq_tuple(seqrel, &buf, &seqdatatuple); page = BufferGetPage(buf); elm->increment = incby; last = next = result = seq->last_value; fetch = cache; log = seq->log_cnt; if (!seq->is_called) { rescnt++; /* return last_value if not is_called */ fetch--; } /* * Decide whether we should emit a WAL log record. If so, force up the * fetch count to grab SEQ_LOG_VALS more values than we actually need to * cache. (These will then be usable without logging.) * * If this is the first nextval after a checkpoint, we must force a new * WAL record to be written anyway, else replay starting from the * checkpoint would fail to advance the sequence past the logged values. * In this case we may as well fetch extra values. 
*/ if (log < fetch || !seq->is_called) { /* forced log to satisfy local demand for values */ fetch = log = fetch + SEQ_LOG_VALS; logit = true; } else { XLogRecPtr redoptr = GetRedoRecPtr(); if (PageGetLSN(page) <= redoptr) { /* last update of seq was before checkpoint */ fetch = log = fetch + SEQ_LOG_VALS; logit = true; } } while (fetch) /* try to fetch cache [+ log ] numbers */ { /* * Check MAXVALUE for ascending sequences and MINVALUE for descending * sequences */ if (incby > 0) { /* ascending sequence */ if ((maxv >= 0 && next > maxv - incby) || (maxv < 0 && next + incby > maxv)) { if (rescnt > 0) break; /* stop fetching */ if (!cycle) { char buf[100]; snprintf(buf, sizeof(buf), INT64_FORMAT, maxv); ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("nextval: reached maximum value of sequence \"%s\" (%s)", RelationGetRelationName(seqrel), buf))); } next = minv; } else next += incby; } else { /* descending sequence */ if ((minv < 0 && next < minv - incby) || (minv >= 0 && next + incby < minv)) { if (rescnt > 0) break; /* stop fetching */ if (!cycle) { char buf[100]; snprintf(buf, sizeof(buf), INT64_FORMAT, minv); ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("nextval: reached minimum value of sequence \"%s\" (%s)", RelationGetRelationName(seqrel), buf))); } next = maxv; } else next += incby; } fetch--; if (rescnt < cache) { log--; rescnt++; last = next; if (rescnt == 1) /* if it's first result - */ result = next; /* it's what to return */ } } log -= fetch; /* adjust for any unfetched numbers */ Assert(log >= 0); /* save info in local cache */ elm->last = result; /* last returned number */ elm->cached = last; /* last fetched number */ elm->last_valid = true; last_used_seq = elm; /* * If something needs to be WAL logged, acquire an xid, so this * transaction's commit will trigger a WAL flush and wait for syncrep. * It's sufficient to ensure the toplevel transaction has an xid, no need * to assign xids subxacts, that'll already trigger an appropriate wait. * (Have to do that here, so we're outside the critical section) */ if (logit && RelationNeedsWAL(seqrel)) GetTopTransactionId(); /* ready to change the on-disk (or really, in-buffer) tuple */ START_CRIT_SECTION(); /* * We must mark the buffer dirty before doing XLogInsert(); see notes in * SyncOneBuffer(). However, we don't apply the desired changes just yet. * This looks like a violation of the buffer update protocol, but it is in * fact safe because we hold exclusive lock on the buffer. Any other * process, including a checkpoint, that tries to examine the buffer * contents will block until we release the lock, and then will see the * final state that we install below. */ MarkBufferDirty(buf); /* XLOG stuff */ if (logit && RelationNeedsWAL(seqrel)) { xl_seq_rec xlrec; XLogRecPtr recptr; /* * We don't log the current state of the tuple, but rather the state * as it would appear after "log" more fetches. This lets us skip * that many future WAL records, at the cost that we lose those * sequence values if we crash. 
*/ XLogBeginInsert(); XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT); /* set values that will be saved in xlog */ seq->last_value = next; seq->is_called = true; seq->log_cnt = 0; xlrec.node = seqrel->rd_node; XLogRegisterData((char *) &xlrec, sizeof(xl_seq_rec)); XLogRegisterData((char *) seqdatatuple.t_data, seqdatatuple.t_len); recptr = XLogInsert(RM_SEQ_ID, XLOG_SEQ_LOG); PageSetLSN(page, recptr); } /* Now update sequence tuple to the intended final state */ seq->last_value = last; /* last fetched number */ seq->is_called = true; seq->log_cnt = log; /* how much is logged */ END_CRIT_SECTION(); UnlockReleaseBuffer(buf); relation_close(seqrel, NoLock); return result; }
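/*
 * Illustrative sketch (not PostgreSQL code): when nextval_internal() above
 * decides it has to write WAL at all, it pre-logs SEQ_LOG_VALS values beyond
 * what it hands out, so the following calls can advance the sequence without
 * any WAL record.  The standalone simulation below counts WAL records for a
 * run of calls under that scheme; it assumes a cache of 1 and ignores the
 * forced re-log after a checkpoint.
 */
#include <stdio.h>

#define SEQ_LOG_VALS 32             /* assumed to match sequence.c's default */

int
main(void)
{
    long    last_value = 0;         /* value most recently handed out */
    long    log_cnt = 0;            /* values already covered by WAL */
    long    wal_records = 0;
    int     calls;

    for (calls = 0; calls < 1000; calls++)
    {
        if (log_cnt == 0)
        {
            log_cnt = SEQ_LOG_VALS; /* would emit one WAL record here */
            wal_records++;
        }
        last_value++;
        log_cnt--;
    }

    printf("%d calls handed out values 1..%ld using %ld WAL records (about 1 per %d)\n",
           calls, last_value, wal_records, SEQ_LOG_VALS);
    return 0;
}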
IndexBulkDeleteResult * ginbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state) { Relation index = info->index; BlockNumber blkno = GIN_ROOT_BLKNO; GinVacuumState gvs; Buffer buffer; BlockNumber rootOfPostingTree[BLCKSZ / (sizeof(IndexTupleData) + sizeof(ItemId))]; uint32 nRoot; gvs.tmpCxt = AllocSetContextCreate(CurrentMemoryContext, "Gin vacuum temporary context", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); gvs.index = index; gvs.callback = callback; gvs.callback_state = callback_state; gvs.strategy = info->strategy; initGinState(&gvs.ginstate, index); /* first time through? */ if (stats == NULL) { /* Yes, so initialize stats to zeroes */ stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); /* * and cleanup any pending inserts */ ginInsertCleanup(&gvs.ginstate, !IsAutoVacuumWorkerProcess(), false, stats); } /* we'll re-count the tuples each time */ stats->num_index_tuples = 0; gvs.result = stats; buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); /* find leaf page */ for (;;) { Page page = BufferGetPage(buffer); IndexTuple itup; LockBuffer(buffer, GIN_SHARE); Assert(!GinPageIsData(page)); if (GinPageIsLeaf(page)) { LockBuffer(buffer, GIN_UNLOCK); LockBuffer(buffer, GIN_EXCLUSIVE); if (blkno == GIN_ROOT_BLKNO && !GinPageIsLeaf(page)) { LockBuffer(buffer, GIN_UNLOCK); continue; /* check it one more */ } break; } Assert(PageGetMaxOffsetNumber(page) >= FirstOffsetNumber); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, FirstOffsetNumber)); blkno = GinGetDownlink(itup); Assert(blkno != InvalidBlockNumber); UnlockReleaseBuffer(buffer); buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); } /* right now we found leftmost page in entry's BTree */ for (;;) { Page page = BufferGetPage(buffer); Page resPage; uint32 i; Assert(!GinPageIsData(page)); resPage = ginVacuumEntryPage(&gvs, buffer, rootOfPostingTree, &nRoot); blkno = GinPageGetOpaque(page)->rightlink; if (resPage) { START_CRIT_SECTION(); PageRestoreTempPage(resPage, page); MarkBufferDirty(buffer); xlogVacuumPage(gvs.index, buffer); UnlockReleaseBuffer(buffer); END_CRIT_SECTION(); } else { UnlockReleaseBuffer(buffer); } vacuum_delay_point(); for (i = 0; i < nRoot; i++) { ginVacuumPostingTree(&gvs, rootOfPostingTree[i]); vacuum_delay_point(); } if (blkno == InvalidBlockNumber) /* rightmost page */ break; buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); LockBuffer(buffer, GIN_EXCLUSIVE); } MemoryContextDelete(gvs.tmpCxt); return gvs.result; }
/* * Main internal procedure that handles 2 & 3 arg forms of SETVAL. * * Note that the 3 arg version (which sets the is_called flag) is * only for use in pg_dump, and setting the is_called flag may not * work if multiple users are attached to the database and referencing * the sequence (unlikely if pg_dump is restoring it). * * It is necessary to have the 3 arg version so that pg_dump can * restore the state of a sequence exactly during data-only restores - * it is the only way to clear the is_called flag in an existing * sequence. */ static void do_setval(Oid relid, int64 next, bool iscalled) { SeqTable elm; Relation seqrel; Buffer buf; HeapTupleData seqdatatuple; Form_pg_sequence_data seq; HeapTuple pgstuple; Form_pg_sequence pgsform; int64 maxv, minv; /* open and AccessShareLock sequence */ init_sequence(relid, &elm, &seqrel); if (pg_class_aclcheck(elm->relid, GetUserId(), ACL_UPDATE) != ACLCHECK_OK) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("permission denied for sequence %s", RelationGetRelationName(seqrel)))); pgstuple = SearchSysCache1(SEQRELID, ObjectIdGetDatum(relid)); if (!HeapTupleIsValid(pgstuple)) elog(ERROR, "cache lookup failed for sequence %u", relid); pgsform = (Form_pg_sequence) GETSTRUCT(pgstuple); maxv = pgsform->seqmax; minv = pgsform->seqmin; ReleaseSysCache(pgstuple); /* read-only transactions may only modify temp sequences */ if (!seqrel->rd_islocaltemp) PreventCommandIfReadOnly("setval()"); /* * Forbid this during parallel operation because, to make it work, the * cooperating backends would need to share the backend-local cached * sequence information. Currently, we don't support that. */ PreventCommandIfParallelMode("setval()"); /* lock page' buffer and read tuple */ seq = read_seq_tuple(seqrel, &buf, &seqdatatuple); if ((next < minv) || (next > maxv)) { char bufv[100], bufm[100], bufx[100]; snprintf(bufv, sizeof(bufv), INT64_FORMAT, next); snprintf(bufm, sizeof(bufm), INT64_FORMAT, minv); snprintf(bufx, sizeof(bufx), INT64_FORMAT, maxv); ereport(ERROR, (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), errmsg("setval: value %s is out of bounds for sequence \"%s\" (%s..%s)", bufv, RelationGetRelationName(seqrel), bufm, bufx))); } /* Set the currval() state only if iscalled = true */ if (iscalled) { elm->last = next; /* last returned number */ elm->last_valid = true; } /* In any case, forget any future cached numbers */ elm->cached = elm->last; /* check the comment above nextval_internal()'s equivalent call. */ if (RelationNeedsWAL(seqrel)) GetTopTransactionId(); /* ready to change the on-disk (or really, in-buffer) tuple */ START_CRIT_SECTION(); seq->last_value = next; /* last fetched number */ seq->is_called = iscalled; seq->log_cnt = 0; MarkBufferDirty(buf); /* XLOG stuff */ if (RelationNeedsWAL(seqrel)) { xl_seq_rec xlrec; XLogRecPtr recptr; Page page = BufferGetPage(buf); XLogBeginInsert(); XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT); xlrec.node = seqrel->rd_node; XLogRegisterData((char *) &xlrec, sizeof(xl_seq_rec)); XLogRegisterData((char *) seqdatatuple.t_data, seqdatatuple.t_len); recptr = XLogInsert(RM_SEQ_ID, XLOG_SEQ_LOG); PageSetLSN(page, recptr); } END_CRIT_SECTION(); UnlockReleaseBuffer(buf); relation_close(seqrel, NoLock); }
/* * Deletes pending list pages up to (not including) newHead page. * If newHead == InvalidBlockNumber then function drops the whole list. * * metapage is pinned and exclusive-locked throughout this function. * * Returns true if another cleanup process is running concurrently * (if so, we can just abandon our own efforts) */ static bool shiftList(Relation index, Buffer metabuffer, BlockNumber newHead, IndexBulkDeleteResult *stats) { Page metapage; GinMetaPageData *metadata; BlockNumber blknoToDelete; metapage = BufferGetPage(metabuffer); metadata = GinPageGetMeta(metapage); blknoToDelete = metadata->head; do { Page page; int i; int64 nDeletedHeapTuples = 0; ginxlogDeleteListPages data; XLogRecData rdata[1]; Buffer buffers[GIN_NDELETE_AT_ONCE]; data.node = index->rd_node; rdata[0].buffer = InvalidBuffer; rdata[0].data = (char *) &data; rdata[0].len = sizeof(ginxlogDeleteListPages); rdata[0].next = NULL; data.ndeleted = 0; while (data.ndeleted < GIN_NDELETE_AT_ONCE && blknoToDelete != newHead) { data.toDelete[data.ndeleted] = blknoToDelete; buffers[data.ndeleted] = ReadBuffer(index, blknoToDelete); LockBuffer(buffers[data.ndeleted], GIN_EXCLUSIVE); page = BufferGetPage(buffers[data.ndeleted]); data.ndeleted++; if (GinPageIsDeleted(page)) { /* concurrent cleanup process is detected */ for (i = 0; i < data.ndeleted; i++) UnlockReleaseBuffer(buffers[i]); return true; } nDeletedHeapTuples += GinPageGetOpaque(page)->maxoff; blknoToDelete = GinPageGetOpaque(page)->rightlink; } if (stats) stats->pages_deleted += data.ndeleted; START_CRIT_SECTION(); metadata->head = blknoToDelete; Assert(metadata->nPendingPages >= data.ndeleted); metadata->nPendingPages -= data.ndeleted; Assert(metadata->nPendingHeapTuples >= nDeletedHeapTuples); metadata->nPendingHeapTuples -= nDeletedHeapTuples; if (blknoToDelete == InvalidBlockNumber) { metadata->tail = InvalidBlockNumber; metadata->tailFreeSize = 0; metadata->nPendingPages = 0; metadata->nPendingHeapTuples = 0; } MarkBufferDirty(metabuffer); for (i = 0; i < data.ndeleted; i++) { page = BufferGetPage(buffers[i]); GinPageGetOpaque(page)->flags = GIN_DELETED; MarkBufferDirty(buffers[i]); } if (RelationNeedsWAL(index)) { XLogRecPtr recptr; memcpy(&data.metadata, metadata, sizeof(GinMetaPageData)); recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_DELETE_LISTPAGE, rdata); PageSetLSN(metapage, recptr); for (i = 0; i < data.ndeleted; i++) { page = BufferGetPage(buffers[i]); PageSetLSN(page, recptr); } } for (i = 0; i < data.ndeleted; i++) UnlockReleaseBuffer(buffers[i]); END_CRIT_SECTION(); } while (blknoToDelete != newHead); return false; }
/* * visibilitymap_set - set a bit on a previously pinned page * * recptr is the LSN of the XLOG record we're replaying, if we're in recovery, * or InvalidXLogRecPtr in normal running. The page LSN is advanced to the * one provided; in normal running, we generate a new XLOG record and set the * page LSN to that value. cutoff_xid is the largest xmin on the page being * marked all-visible; it is needed for Hot Standby, and can be * InvalidTransactionId if the page contains no tuples. * * Caller is expected to set the heap page's PD_ALL_VISIBLE bit before calling * this function. Except in recovery, caller should also pass the heap * buffer. When checksums are enabled and we're not in recovery, we must add * the heap buffer to the WAL chain to protect it from being torn. * * You must pass a buffer containing the correct map page to this function. * Call visibilitymap_pin first to pin the right one. This function doesn't do * any I/O. */ void visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid) { BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk); uint8 mapBit = HEAPBLK_TO_MAPBIT(heapBlk); Page page; char *map; #ifdef TRACE_VISIBILITYMAP elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk); #endif Assert(InRecovery || XLogRecPtrIsInvalid(recptr)); Assert(InRecovery || BufferIsValid(heapBuf)); /* Check that we have the right heap page pinned, if present */ if (BufferIsValid(heapBuf) && BufferGetBlockNumber(heapBuf) != heapBlk) elog(ERROR, "wrong heap buffer passed to visibilitymap_set"); /* Check that we have the right VM page pinned */ if (!BufferIsValid(vmBuf) || BufferGetBlockNumber(vmBuf) != mapBlock) elog(ERROR, "wrong VM buffer passed to visibilitymap_set"); page = BufferGetPage(vmBuf); map = PageGetContents(page); LockBuffer(vmBuf, BUFFER_LOCK_EXCLUSIVE); if (!(map[mapByte] & (1 << mapBit))) { START_CRIT_SECTION(); map[mapByte] |= (1 << mapBit); MarkBufferDirty(vmBuf); if (RelationNeedsWAL(rel)) { if (XLogRecPtrIsInvalid(recptr)) { Assert(!InRecovery); recptr = log_heap_visible(rel->rd_node, heapBuf, vmBuf, cutoff_xid); /* * If data checksums are enabled, we need to protect the heap * page from being torn. */ if (DataChecksumsEnabled()) { Page heapPage = BufferGetPage(heapBuf); /* caller is expected to set PD_ALL_VISIBLE first */ Assert(PageIsAllVisible(heapPage)); PageSetLSN(heapPage, recptr); } } PageSetLSN(page, recptr); } END_CRIT_SECTION(); } LockBuffer(vmBuf, BUFFER_LOCK_UNLOCK); }
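/*
 * Illustrative sketch (not PostgreSQL code): visibilitymap_set() above finds
 * the one bit that represents a heap block inside the visibility map fork.
 * The arithmetic below mirrors the HEAPBLK_TO_MAPBLOCK / HEAPBLK_TO_MAPBYTE /
 * HEAPBLK_TO_MAPBIT macros, assuming an 8 kB block, a 24-byte page header,
 * and one visibility bit per heap block as in the code shown here.
 */
#include <stdio.h>
#include <stdint.h>

#define BLCKSZ              8192
#define PAGE_HEADER_SIZE    24      /* assumed SizeOfPageHeaderData */
#define MAPSIZE             (BLCKSZ - PAGE_HEADER_SIZE)
#define HEAPBLOCKS_PER_BYTE 8
#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)

int
main(void)
{
    uint32_t    heapBlk = 200000;   /* arbitrary heap block number */
    uint32_t    mapBlock = heapBlk / HEAPBLOCKS_PER_PAGE;
    uint32_t    mapByte = (heapBlk % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE;
    uint32_t    mapBit = heapBlk % HEAPBLOCKS_PER_BYTE;

    printf("heap block %u -> map block %u, byte %u, bit %u (mask 0x%02x)\n",
           heapBlk, mapBlock, mapByte, mapBit, (unsigned) (1 << mapBit));
    return 0;
}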
/* * Deletes pending list pages up to (not including) newHead page. * If newHead == InvalidBlockNumber then function drops the whole list. * * metapage is pinned and exclusive-locked throughout this function. * * Returns true if another cleanup process is running concurrently * (if so, we can just abandon our own efforts) */ static bool shiftList(Relation index, Buffer metabuffer, BlockNumber newHead, bool fill_fsm, IndexBulkDeleteResult *stats) { Page metapage; GinMetaPageData *metadata; BlockNumber blknoToDelete; metapage = BufferGetPage(metabuffer); metadata = GinPageGetMeta(metapage); blknoToDelete = metadata->head; do { Page page; int i; int64 nDeletedHeapTuples = 0; ginxlogDeleteListPages data; Buffer buffers[GIN_NDELETE_AT_ONCE]; BlockNumber freespace[GIN_NDELETE_AT_ONCE]; data.ndeleted = 0; while (data.ndeleted < GIN_NDELETE_AT_ONCE && blknoToDelete != newHead) { freespace[data.ndeleted] = blknoToDelete; buffers[data.ndeleted] = ReadBuffer(index, blknoToDelete); LockBuffer(buffers[data.ndeleted], GIN_EXCLUSIVE); page = BufferGetPage(buffers[data.ndeleted]); data.ndeleted++; if (GinPageIsDeleted(page)) { /* concurrent cleanup process is detected */ for (i = 0; i < data.ndeleted; i++) UnlockReleaseBuffer(buffers[i]); return true; } nDeletedHeapTuples += GinPageGetOpaque(page)->maxoff; blknoToDelete = GinPageGetOpaque(page)->rightlink; } if (stats) stats->pages_deleted += data.ndeleted; /* * This operation touches an unusually large number of pages, so * prepare the XLogInsert machinery for that before entering the * critical section. */ if (RelationNeedsWAL(index)) XLogEnsureRecordSpace(data.ndeleted, 0); START_CRIT_SECTION(); metadata->head = blknoToDelete; Assert(metadata->nPendingPages >= data.ndeleted); metadata->nPendingPages -= data.ndeleted; Assert(metadata->nPendingHeapTuples >= nDeletedHeapTuples); metadata->nPendingHeapTuples -= nDeletedHeapTuples; if (blknoToDelete == InvalidBlockNumber) { metadata->tail = InvalidBlockNumber; metadata->tailFreeSize = 0; metadata->nPendingPages = 0; metadata->nPendingHeapTuples = 0; } MarkBufferDirty(metabuffer); for (i = 0; i < data.ndeleted; i++) { page = BufferGetPage(buffers[i]); GinPageGetOpaque(page)->flags = GIN_DELETED; MarkBufferDirty(buffers[i]); } if (RelationNeedsWAL(index)) { XLogRecPtr recptr; XLogBeginInsert(); XLogRegisterBuffer(0, metabuffer, REGBUF_WILL_INIT); for (i = 0; i < data.ndeleted; i++) XLogRegisterBuffer(i + 1, buffers[i], REGBUF_WILL_INIT); memcpy(&data.metadata, metadata, sizeof(GinMetaPageData)); XLogRegisterData((char *) &data, sizeof(ginxlogDeleteListPages)); recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_DELETE_LISTPAGE); PageSetLSN(metapage, recptr); for (i = 0; i < data.ndeleted; i++) { page = BufferGetPage(buffers[i]); PageSetLSN(page, recptr); } } for (i = 0; i < data.ndeleted; i++) UnlockReleaseBuffer(buffers[i]); END_CRIT_SECTION(); for (i = 0; fill_fsm && i < data.ndeleted; i++) RecordFreeIndexPage(index, freespace[i]); } while (blknoToDelete != newHead); return false; }
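/*
 * Illustrative sketch (not PostgreSQL code): shiftList() above retires the
 * pending-list pages in batches of GIN_NDELETE_AT_ONCE so that each critical
 * section and WAL record covers a bounded number of pages.  The toy below
 * walks a pretend list the same way; the batch size is an assumption for the
 * demo, not necessarily the server's value.
 */
#include <stdio.h>

#define NDELETE_AT_ONCE 16

int
main(void)
{
    int     npages = 53;            /* pretend pending list length */
    int     processed = 0;
    int     batches = 0;

    while (processed < npages)
    {
        int     n = 0;

        while (n < NDELETE_AT_ONCE && processed < npages)
        {
            n++;                    /* "lock one page and mark it deleted" */
            processed++;
        }
        batches++;                  /* one critical section + WAL record */
    }

    printf("%d pages retired in %d batches of at most %d\n",
           npages, batches, NDELETE_AT_ONCE);
    return 0;
}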
/* * Attempt to expand the hash table by creating one new bucket. * * This will silently do nothing if it cannot get the needed locks. * * The caller should hold no locks on the hash index. * * The caller must hold a pin, but no lock, on the metapage buffer. * The buffer is returned in the same state. */ void _hash_expandtable(Relation rel, Buffer metabuf) { HashMetaPage metap; Bucket old_bucket; Bucket new_bucket; uint32 spare_ndx; BlockNumber start_oblkno; BlockNumber start_nblkno; uint32 maxbucket; uint32 highmask; uint32 lowmask; /* * Write-lock the meta page. It used to be necessary to acquire a * heavyweight lock to begin a split, but that is no longer required. */ _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); _hash_checkpage(rel, metabuf, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); /* * Check to see if split is still needed; someone else might have already * done one while we waited for the lock. * * Make sure this stays in sync with _hash_doinsert() */ if (metap->hashm_ntuples <= (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1)) goto fail; /* * Can't split anymore if maxbucket has reached its maximum possible * value. * * Ideally we'd allow bucket numbers up to UINT_MAX-1 (no higher because * the calculation maxbucket+1 mustn't overflow). Currently we restrict * to half that because of overflow looping in _hash_log2() and * insufficient space in hashm_spares[]. It's moot anyway because an * index with 2^32 buckets would certainly overflow BlockNumber and hence * _hash_alloc_buckets() would fail, but if we supported buckets smaller * than a disk block then this would be an independent constraint. * * If you change this, see also the maximum initial number of buckets in * _hash_metapinit(). */ if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE) goto fail; /* * Determine which bucket is to be split, and attempt to lock the old * bucket. If we can't get the lock, give up. * * The lock protects us against other backends, but not against our own * backend. Must check for active scans separately. */ new_bucket = metap->hashm_maxbucket + 1; old_bucket = (new_bucket & metap->hashm_lowmask); start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket); if (_hash_has_active_scan(rel, old_bucket)) goto fail; if (!_hash_try_getlock(rel, start_oblkno, HASH_EXCLUSIVE)) goto fail; /* * Likewise lock the new bucket (should never fail). * * Note: it is safe to compute the new bucket's blkno here, even though we * may still need to update the BUCKET_TO_BLKNO mapping. This is because * the current value of hashm_spares[hashm_ovflpoint] correctly shows * where we are going to put a new splitpoint's worth of buckets. */ start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket); if (_hash_has_active_scan(rel, new_bucket)) elog(ERROR, "scan in progress on supposedly new bucket"); if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE)) elog(ERROR, "could not get lock on supposedly new bucket"); /* * If the split point is increasing (hashm_maxbucket's log base 2 * increases), we need to allocate a new batch of bucket pages. */ spare_ndx = _hash_log2(new_bucket + 1); if (spare_ndx > metap->hashm_ovflpoint) { Assert(spare_ndx == metap->hashm_ovflpoint + 1); /* * The number of buckets in the new splitpoint is equal to the total * number already in existence, i.e. new_bucket. Currently this maps * one-to-one to blocks required, but someday we may need a more * complicated calculation here. 
*/ if (!_hash_alloc_buckets(rel, start_nblkno, new_bucket)) { /* can't split due to BlockNumber overflow */ _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE); _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE); goto fail; } } /* * Okay to proceed with split. Update the metapage bucket mapping info. * * Since we are scribbling on the metapage data right in the shared * buffer, any failure in this next little bit leaves us with a big * problem: the metapage is effectively corrupt but could get written back * to disk. We don't really expect any failure, but just to be sure, * establish a critical section. */ START_CRIT_SECTION(); metap->hashm_maxbucket = new_bucket; if (new_bucket > metap->hashm_highmask) { /* Starting a new doubling */ metap->hashm_lowmask = metap->hashm_highmask; metap->hashm_highmask = new_bucket | metap->hashm_lowmask; } /* * If the split point is increasing (hashm_maxbucket's log base 2 * increases), we need to adjust the hashm_spares[] array and * hashm_ovflpoint so that future overflow pages will be created beyond * this new batch of bucket pages. */ if (spare_ndx > metap->hashm_ovflpoint) { metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint]; metap->hashm_ovflpoint = spare_ndx; } /* Done mucking with metapage */ END_CRIT_SECTION(); /* * Copy bucket mapping info now; this saves re-accessing the meta page * inside _hash_splitbucket's inner loop. Note that once we drop the * split lock, other splits could begin, so these values might be out of * date before _hash_splitbucket finishes. That's okay, since all it * needs is to tell which of these two buckets to map hashkeys into. */ maxbucket = metap->hashm_maxbucket; highmask = metap->hashm_highmask; lowmask = metap->hashm_lowmask; /* Write out the metapage and drop lock, but keep pin */ _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK); /* Relocate records to the new bucket */ _hash_splitbucket(rel, metabuf, old_bucket, new_bucket, start_oblkno, start_nblkno, maxbucket, highmask, lowmask); /* Release bucket locks, allowing others to access them */ _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE); _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE); return; /* Here if decide not to split or fail to acquire old bucket lock */ fail: /* We didn't write the metapage, so just drop lock */ _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); }
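/*
 * Illustrative sketch (not PostgreSQL code): _hash_expandtable() above picks
 * the bucket to split as new_bucket & hashm_lowmask and starts a new
 * "doubling" (shifting both masks up) whenever new_bucket passes
 * hashm_highmask.  The standalone program below replays that arithmetic for
 * a few splits, starting from an assumed 4-bucket index (maxbucket 3,
 * lowmask 3, highmask 7); the starting values are illustrative, not read
 * from a real metapage.
 */
#include <stdio.h>
#include <stdint.h>

/* smallest i such that (1 << i) >= num, mirroring _hash_log2() */
static uint32_t
hash_log2(uint32_t num)
{
    uint32_t    i = 0;
    uint32_t    limit = 1;

    while (limit < num)
    {
        i++;
        limit <<= 1;
    }
    return i;
}

int
main(void)
{
    uint32_t    maxbucket = 3;
    uint32_t    lowmask = 3;
    uint32_t    highmask = 7;
    int         split;

    for (split = 0; split < 6; split++)
    {
        uint32_t    new_bucket = maxbucket + 1;
        uint32_t    old_bucket = new_bucket & lowmask;  /* bucket to split */
        uint32_t    spare_ndx = hash_log2(new_bucket + 1);

        maxbucket = new_bucket;
        if (new_bucket > highmask)
        {
            /* starting a new doubling */
            lowmask = highmask;
            highmask = new_bucket | lowmask;
        }

        printf("split bucket %u -> new bucket %u (splitpoint %u, lowmask %u, highmask %u)\n",
               old_bucket, new_bucket, spare_ndx, lowmask, highmask);
    }
    return 0;
}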
/* * Main entry point to GiST index build. Initially calls insert over and over, * but switches to more efficient buffering build algorithm after a certain * number of tuples (unless buffering mode is disabled). */ Datum gistbuild(PG_FUNCTION_ARGS) { Relation heap = (Relation) PG_GETARG_POINTER(0); Relation index = (Relation) PG_GETARG_POINTER(1); IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2); IndexBuildResult *result; double reltuples; GISTBuildState buildstate; Buffer buffer; Page page; MemoryContext oldcxt = CurrentMemoryContext; int fillfactor; buildstate.indexrel = index; if (index->rd_options) { /* Get buffering mode from the options string */ GiSTOptions *options = (GiSTOptions *) index->rd_options; char *bufferingMode = (char *) options + options->bufferingModeOffset; if (strcmp(bufferingMode, "on") == 0) buildstate.bufferingMode = GIST_BUFFERING_STATS; else if (strcmp(bufferingMode, "off") == 0) buildstate.bufferingMode = GIST_BUFFERING_DISABLED; else buildstate.bufferingMode = GIST_BUFFERING_AUTO; fillfactor = options->fillfactor; } else { /* * By default, switch to buffering mode when the index grows too large * to fit in cache. */ buildstate.bufferingMode = GIST_BUFFERING_AUTO; fillfactor = GIST_DEFAULT_FILLFACTOR; } /* Calculate target amount of free space to leave on pages */ buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100; /* * We expect to be called exactly once for any index relation. If that's * not the case, big trouble's what we have. */ if (RelationGetNumberOfBlocks(index) != 0) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); /* no locking is needed */ buildstate.giststate = initGISTstate(index); /* * Create a temporary memory context that is reset once for each tuple * processed. (Note: we don't bother to make this a child of the * giststate's scanCxt, so we have to delete it separately at the end.) */ buildstate.giststate->tempCxt = createTempGistContext(); /* initialize the root page */ buffer = gistNewBuffer(index); Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO); page = BufferGetPage(buffer); START_CRIT_SECTION(); GISTInitBuffer(buffer, F_LEAF); MarkBufferDirty(buffer); if (RelationNeedsWAL(index)) { XLogRecPtr recptr; XLogRecData rdata; rdata.data = (char *) &(index->rd_node); rdata.len = sizeof(RelFileNode); rdata.buffer = InvalidBuffer; rdata.next = NULL; recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_CREATE_INDEX, &rdata); PageSetLSN(page, recptr); } else PageSetLSN(page, gistGetFakeLSN(heap)); UnlockReleaseBuffer(buffer); END_CRIT_SECTION(); /* build the index */ buildstate.indtuples = 0; buildstate.indtuplesSize = 0; /* * Do the heap scan. */ reltuples = IndexBuildHeapScan(heap, index, indexInfo, true, gistBuildCallback, (void *) &buildstate); /* * If buffering was used, flush out all the tuples that are still in the * buffers. */ if (buildstate.bufferingMode == GIST_BUFFERING_ACTIVE) { elog(DEBUG1, "all tuples processed, emptying buffers"); gistEmptyAllBuffers(&buildstate); gistFreeBuildBuffers(buildstate.gfbb); } /* okay, all heap tuples are indexed */ MemoryContextSwitchTo(oldcxt); MemoryContextDelete(buildstate.giststate->tempCxt); freeGISTstate(buildstate.giststate); /* * Return statistics */ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); result->heap_tuples = reltuples; result->index_tuples = (double) buildstate.indtuples; PG_RETURN_POINTER(result); }
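/*
 * Illustrative sketch (not PostgreSQL code): gistbuild() above turns the
 * index fillfactor into a per-page free-space target with
 * freespace = BLCKSZ * (100 - fillfactor) / 100.  The tiny program below
 * prints that target for a few settings, assuming the usual 8 kB block size.
 */
#include <stdio.h>

#define BLCKSZ 8192

int
main(void)
{
    int     fillfactors[] = {100, 90, 75, 50};
    int     i;

    for (i = 0; i < 4; i++)
    {
        int     fillfactor = fillfactors[i];
        int     freespace = BLCKSZ * (100 - fillfactor) / 100;

        printf("fillfactor %3d%% -> leave %4d of %d bytes free per page\n",
               fillfactor, freespace, BLCKSZ);
    }
    return 0;
}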
/* * Vacuum a regular (non-root) leaf page * * We must delete tuples that are targeted for deletion by the VACUUM, * but not move any tuples that are referenced by outside links; we assume * those are the ones that are heads of chains. * * If we find a REDIRECT that was made by a concurrently-running transaction, * we must add its target TID to pendingList. (We don't try to visit the * target immediately, first because we don't want VACUUM locking more than * one buffer at a time, and second because the duplicate-filtering logic * in spgAddPendingTID is useful to ensure we can't get caught in an infinite * loop in the face of continuous concurrent insertions.) * * If forPending is true, we are examining the page as a consequence of * chasing a redirect link, not as part of the normal sequential scan. * We still vacuum the page normally, but we don't increment the stats * about live tuples; else we'd double-count those tuples, since the page * has been or will be visited in the sequential scan as well. */ static void vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer, bool forPending) { Page page = BufferGetPage(buffer); spgxlogVacuumLeaf xlrec; OffsetNumber toDead[MaxIndexTuplesPerPage]; OffsetNumber toPlaceholder[MaxIndexTuplesPerPage]; OffsetNumber moveSrc[MaxIndexTuplesPerPage]; OffsetNumber moveDest[MaxIndexTuplesPerPage]; OffsetNumber chainSrc[MaxIndexTuplesPerPage]; OffsetNumber chainDest[MaxIndexTuplesPerPage]; OffsetNumber predecessor[MaxIndexTuplesPerPage + 1]; bool deletable[MaxIndexTuplesPerPage + 1]; int nDeletable; OffsetNumber i, max = PageGetMaxOffsetNumber(page); memset(predecessor, 0, sizeof(predecessor)); memset(deletable, 0, sizeof(deletable)); nDeletable = 0; /* Scan page, identify tuples to delete, accumulate stats */ for (i = FirstOffsetNumber; i <= max; i++) { SpGistLeafTuple lt; lt = (SpGistLeafTuple) PageGetItem(page, PageGetItemId(page, i)); if (lt->tupstate == SPGIST_LIVE) { Assert(ItemPointerIsValid(&lt->heapPtr)); if (bds->callback(&lt->heapPtr, bds->callback_state)) { bds->stats->tuples_removed += 1; deletable[i] = true; nDeletable++; } else { if (!forPending) bds->stats->num_index_tuples += 1; } /* Form predecessor map, too */ if (lt->nextOffset != InvalidOffsetNumber) { /* paranoia about corrupted chain links */ if (lt->nextOffset < FirstOffsetNumber || lt->nextOffset > max || predecessor[lt->nextOffset] != InvalidOffsetNumber) elog(ERROR, "inconsistent tuple chain links in page %u of index \"%s\"", BufferGetBlockNumber(buffer), RelationGetRelationName(index)); predecessor[lt->nextOffset] = i; } } else if (lt->tupstate == SPGIST_REDIRECT) { SpGistDeadTuple dt = (SpGistDeadTuple) lt; Assert(dt->nextOffset == InvalidOffsetNumber); Assert(ItemPointerIsValid(&dt->pointer)); /* * Add target TID to pending list if the redirection could have * happened since VACUUM started. * * Note: we could make a tighter test by seeing if the xid is * "running" according to the active snapshot; but tqual.c doesn't * currently export a suitable API, and it's not entirely clear * that a tighter test is worth the cycles anyway. */ if (TransactionIdFollowsOrEquals(dt->xid, bds->myXmin)) spgAddPendingTID(bds, &dt->pointer); } else { Assert(lt->nextOffset == InvalidOffsetNumber); } } if (nDeletable == 0) return; /* nothing more to do */ /*---------- * Figure out exactly what we have to do.
We do this separately from * actually modifying the page, mainly so that we have a representation * that can be dumped into WAL and then the replay code can do exactly * the same thing. The output of this step consists of six arrays * describing four kinds of operations, to be performed in this order: * * toDead[]: tuple numbers to be replaced with DEAD tuples * toPlaceholder[]: tuple numbers to be replaced with PLACEHOLDER tuples * moveSrc[]: tuple numbers that need to be relocated to another offset * (replacing the tuple there) and then replaced with PLACEHOLDER tuples * moveDest[]: new locations for moveSrc tuples * chainSrc[]: tuple numbers whose chain links (nextOffset) need updates * chainDest[]: new values of nextOffset for chainSrc members * * It's easiest to figure out what we have to do by processing tuple * chains, so we iterate over all the tuples (not just the deletable * ones!) to identify chain heads, then chase down each chain and make * work item entries for deletable tuples within the chain. *---------- */ xlrec.nDead = xlrec.nPlaceholder = xlrec.nMove = xlrec.nChain = 0; for (i = FirstOffsetNumber; i <= max; i++) { SpGistLeafTuple head; bool interveningDeletable; OffsetNumber prevLive; OffsetNumber j; head = (SpGistLeafTuple) PageGetItem(page, PageGetItemId(page, i)); if (head->tupstate != SPGIST_LIVE) continue; /* can't be a chain member */ if (predecessor[i] != 0) continue; /* not a chain head */ /* initialize ... */ interveningDeletable = false; prevLive = deletable[i] ? InvalidOffsetNumber : i; /* scan down the chain ... */ j = head->nextOffset; while (j != InvalidOffsetNumber) { SpGistLeafTuple lt; lt = (SpGistLeafTuple) PageGetItem(page, PageGetItemId(page, j)); if (lt->tupstate != SPGIST_LIVE) { /* all tuples in chain should be live */ elog(ERROR, "unexpected SPGiST tuple state: %d", lt->tupstate); } if (deletable[j]) { /* This tuple should be replaced by a placeholder */ toPlaceholder[xlrec.nPlaceholder] = j; xlrec.nPlaceholder++; /* previous live tuple's chain link will need an update */ interveningDeletable = true; } else if (prevLive == InvalidOffsetNumber) { /* * This is the first live tuple in the chain. It has to move * to the head position. */ moveSrc[xlrec.nMove] = j; moveDest[xlrec.nMove] = i; xlrec.nMove++; /* Chain updates will be applied after the move */ prevLive = i; interveningDeletable = false; } else { /* * Second or later live tuple. Arrange to re-chain it to the * previous live one, if there was a gap. */ if (interveningDeletable) { chainSrc[xlrec.nChain] = prevLive; chainDest[xlrec.nChain] = j; xlrec.nChain++; } prevLive = j; interveningDeletable = false; } j = lt->nextOffset; } if (prevLive == InvalidOffsetNumber) { /* The chain is entirely removable, so we need a DEAD tuple */ toDead[xlrec.nDead] = i; xlrec.nDead++; } else if (interveningDeletable) { /* One or more deletions at end of chain, so close it off */ chainSrc[xlrec.nChain] = prevLive; chainDest[xlrec.nChain] = InvalidOffsetNumber; xlrec.nChain++; } } /* sanity check ... 
*/ if (nDeletable != xlrec.nDead + xlrec.nPlaceholder + xlrec.nMove) elog(ERROR, "inconsistent counts of deletable tuples"); /* Do the updates */ START_CRIT_SECTION(); spgPageIndexMultiDelete(&bds->spgstate, page, toDead, xlrec.nDead, SPGIST_DEAD, SPGIST_DEAD, InvalidBlockNumber, InvalidOffsetNumber); spgPageIndexMultiDelete(&bds->spgstate, page, toPlaceholder, xlrec.nPlaceholder, SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER, InvalidBlockNumber, InvalidOffsetNumber); /* * We implement the move step by swapping the item pointers of the source * and target tuples, then replacing the newly-source tuples with * placeholders. This is perhaps unduly friendly with the page data * representation, but it's fast and doesn't risk page overflow when a * tuple to be relocated is large. */ for (i = 0; i < xlrec.nMove; i++) { ItemId idSrc = PageGetItemId(page, moveSrc[i]); ItemId idDest = PageGetItemId(page, moveDest[i]); ItemIdData tmp; tmp = *idSrc; *idSrc = *idDest; *idDest = tmp; } spgPageIndexMultiDelete(&bds->spgstate, page, moveSrc, xlrec.nMove, SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER, InvalidBlockNumber, InvalidOffsetNumber); for (i = 0; i < xlrec.nChain; i++) { SpGistLeafTuple lt; lt = (SpGistLeafTuple) PageGetItem(page, PageGetItemId(page, chainSrc[i])); Assert(lt->tupstate == SPGIST_LIVE); lt->nextOffset = chainDest[i]; } MarkBufferDirty(buffer); if (RelationNeedsWAL(index)) { XLogRecPtr recptr; XLogBeginInsert(); STORE_STATE(&bds->spgstate, xlrec.stateSrc); XLogRegisterData((char *) &xlrec, SizeOfSpgxlogVacuumLeaf); /* sizeof(xlrec) should be a multiple of sizeof(OffsetNumber) */ XLogRegisterData((char *) toDead, sizeof(OffsetNumber) * xlrec.nDead); XLogRegisterData((char *) toPlaceholder, sizeof(OffsetNumber) * xlrec.nPlaceholder); XLogRegisterData((char *) moveSrc, sizeof(OffsetNumber) * xlrec.nMove); XLogRegisterData((char *) moveDest, sizeof(OffsetNumber) * xlrec.nMove); XLogRegisterData((char *) chainSrc, sizeof(OffsetNumber) * xlrec.nChain); XLogRegisterData((char *) chainDest, sizeof(OffsetNumber) * xlrec.nChain); XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_LEAF); PageSetLSN(page, recptr); } END_CRIT_SECTION(); }
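/*
 * Illustrative sketch (not PostgreSQL code): the "move" step above relocates
 * the first live tuple of a chain to the chain-head offset by swapping the
 * two line pointers rather than copying tuple bodies, so even a large tuple
 * can be moved without risking page overflow.  The toy below swaps entries
 * of an assumed, simplified line-pointer array.
 */
#include <stdio.h>
#include <stdint.h>

typedef struct
{
    uint16_t    lp_off;             /* offset of the tuple body in the page */
    uint16_t    lp_len;             /* length of the tuple body */
} ToyItemId;

int
main(void)
{
    /* item 1 is the dead chain head, item 3 is the first live member */
    ToyItemId   items[4] = {{0, 0}, {7800, 40}, {7600, 200}, {7000, 600}};
    ToyItemId   tmp;

    tmp = items[1];                 /* swap the two line pointers only */
    items[1] = items[3];
    items[3] = tmp;

    printf("item 1 now points at off=%u len=%u; item 3 at off=%u len=%u\n",
           (unsigned) items[1].lp_off, (unsigned) items[1].lp_len,
           (unsigned) items[3].lp_off, (unsigned) items[3].lp_len);
    return 0;
}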
/* * Permanently drop the currently acquired replication slot which will be * released by the point this function returns. */ static void ReplicationSlotDropAcquired(void) { char path[MAXPGPATH]; char tmppath[MAXPGPATH]; ReplicationSlot *slot = MyReplicationSlot; Assert(MyReplicationSlot != NULL); /* slot isn't acquired anymore */ MyReplicationSlot = NULL; /* * If some other backend ran this code concurrently with us, we might try * to delete a slot with a certain name while someone else was trying to * create a slot with the same name. */ LWLockAcquire(ReplicationSlotAllocationLock, LW_EXCLUSIVE); /* Generate pathnames. */ sprintf(path, "pg_replslot/%s", NameStr(slot->data.name)); sprintf(tmppath, "pg_replslot/%s.tmp", NameStr(slot->data.name)); /* * Rename the slot directory on disk, so that we'll no longer recognize * this as a valid slot. Note that if this fails, we've got to mark the * slot inactive before bailing out. If we're dropping an ephemeral slot, * we better never fail hard as the caller won't expect the slot to * survive and this might get called during error handling. */ if (rename(path, tmppath) == 0) { /* * We need to fsync() the directory we just renamed and its parent to * make sure that our changes are on disk in a crash-safe fashion. If * fsync() fails, we can't be sure whether the changes are on disk or * not. For now, we handle that by panicking; * StartupReplicationSlots() will try to straighten it out after * restart. */ START_CRIT_SECTION(); fsync_fname(tmppath, true); fsync_fname("pg_replslot", true); END_CRIT_SECTION(); } else { volatile ReplicationSlot *vslot = slot; bool fail_softly = slot->data.persistency == RS_EPHEMERAL; SpinLockAcquire(&slot->mutex); vslot->active_pid = 0; SpinLockRelease(&slot->mutex); ereport(fail_softly ? WARNING : ERROR, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", path, tmppath))); } /* * The slot is definitely gone. Lock out concurrent scans of the array * long enough to kill it. It's OK to clear the active flag here without * grabbing the mutex because nobody else can be scanning the array here, * and nobody can be attached to this slot and thus access it without * scanning the array. */ LWLockAcquire(ReplicationSlotControlLock, LW_EXCLUSIVE); slot->active_pid = 0; slot->in_use = false; LWLockRelease(ReplicationSlotControlLock); /* * Slot is dead and doesn't prevent resource removal anymore, recompute * limits. */ ReplicationSlotsComputeRequiredXmin(false); ReplicationSlotsComputeRequiredLSN(); /* * If removing the directory fails, the worst thing that will happen is * that the user won't be able to create a new slot with the same name * until the next server restart. We warn about it, but that's all. */ if (!rmtree(tmppath, true)) ereport(WARNING, (errcode_for_file_access(), errmsg("could not remove directory \"%s\"", tmppath))); /* * We release this at the very end, so that nobody starts trying to create * a slot while we're still cleaning up the detritus of the old one. */ LWLockRelease(ReplicationSlotAllocationLock); }
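/*
 * Illustrative sketch (not PostgreSQL code): ReplicationSlotDropAcquired()
 * above renames the slot directory out of the way and then fsyncs the
 * renamed entry and its parent, because a rename only survives a crash once
 * the directory that records it has reached disk.  The standalone POSIX
 * sketch below shows the same rename-then-fsync-directory-then-remove dance
 * for a single file; the file names are made up for the demo.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    const char *path = "slot_state";
    const char *tmppath = "slot_state.tmp";
    int         fd;

    /* create something to drop (stands in for the slot directory) */
    fd = open(path, O_CREAT | O_WRONLY, 0600);
    if (fd < 0 || close(fd) != 0)
    {
        perror("create");
        return 1;
    }

    /* rename it out of the way, as the drop path does with the slot dir */
    if (rename(path, tmppath) != 0)
    {
        perror("rename");
        return 1;
    }

    /*
     * fsync the containing directory: until the new directory entry is on
     * disk, a crash could still show the old name after restart.
     */
    fd = open(".", O_RDONLY);
    if (fd < 0 || fsync(fd) != 0 || close(fd) != 0)
    {
        perror("fsync directory");
        return 1;
    }

    /* only now is it safe to actually remove the renamed entry */
    if (unlink(tmppath) != 0)
    {
        perror("unlink");
        return 1;
    }

    printf("\"%s\" dropped durably\n", path);
    return 0;
}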
/* * Vacuum a root page when it is also a leaf * * On the root, we just delete any dead leaf tuples; no fancy business */ static void vacuumLeafRoot(spgBulkDeleteState *bds, Relation index, Buffer buffer) { Page page = BufferGetPage(buffer); spgxlogVacuumRoot xlrec; OffsetNumber toDelete[MaxIndexTuplesPerPage]; OffsetNumber i, max = PageGetMaxOffsetNumber(page); xlrec.nDelete = 0; /* Scan page, identify tuples to delete, accumulate stats */ for (i = FirstOffsetNumber; i <= max; i++) { SpGistLeafTuple lt; lt = (SpGistLeafTuple) PageGetItem(page, PageGetItemId(page, i)); if (lt->tupstate == SPGIST_LIVE) { Assert(ItemPointerIsValid(&lt->heapPtr)); if (bds->callback(&lt->heapPtr, bds->callback_state)) { bds->stats->tuples_removed += 1; toDelete[xlrec.nDelete] = i; xlrec.nDelete++; } else { bds->stats->num_index_tuples += 1; } } else { /* all tuples on root should be live */ elog(ERROR, "unexpected SPGiST tuple state: %d", lt->tupstate); } } if (xlrec.nDelete == 0) return; /* nothing more to do */ /* Do the update */ START_CRIT_SECTION(); /* The tuple numbers are in order, so we can use PageIndexMultiDelete */ PageIndexMultiDelete(page, toDelete, xlrec.nDelete); MarkBufferDirty(buffer); if (RelationNeedsWAL(index)) { XLogRecPtr recptr; XLogBeginInsert(); /* Prepare WAL record */ STORE_STATE(&bds->spgstate, xlrec.stateSrc); XLogRegisterData((char *) &xlrec, SizeOfSpgxlogVacuumRoot); /* sizeof(xlrec) should be a multiple of sizeof(OffsetNumber) */ XLogRegisterData((char *) toDelete, sizeof(OffsetNumber) * xlrec.nDelete); XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_ROOT); PageSetLSN(page, recptr); } END_CRIT_SECTION(); }
/* * Insert value (stored in GinBtree) to tree described by stack * * During an index build, buildStats is non-null and the counters * it contains should be incremented as needed. * * NB: the passed-in stack is freed, as though by freeGinBtreeStack. */ void ginInsertValue(GinBtree btree, GinBtreeStack *stack, GinStatsData *buildStats) { GinBtreeStack *parent; BlockNumber rootBlkno; Page page, rpage, lpage; /* extract root BlockNumber from stack */ Assert(stack != NULL); parent = stack; while (parent->parent) parent = parent->parent; rootBlkno = parent->blkno; Assert(BlockNumberIsValid(rootBlkno)); /* this loop crawls up the stack until the insertion is complete */ for (;;) { XLogRecData *rdata; BlockNumber savedRightLink; page = BufferGetPage(stack->buffer); savedRightLink = GinPageGetOpaque(page)->rightlink; if (btree->isEnoughSpace(btree, stack->buffer, stack->off)) { START_CRIT_SECTION(); btree->placeToPage(btree, stack->buffer, stack->off, &rdata); MarkBufferDirty(stack->buffer); if (RelationNeedsWAL(btree->index)) { XLogRecPtr recptr; recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT, rdata); PageSetLSN(page, recptr); } LockBuffer(stack->buffer, GIN_UNLOCK); END_CRIT_SECTION(); freeGinBtreeStack(stack); return; } else { Buffer rbuffer = GinNewBuffer(btree->index); Page newlpage; /* * newlpage is a pointer to memory page, it doesn't associate with * buffer, stack->buffer should be untouched */ newlpage = btree->splitPage(btree, stack->buffer, rbuffer, stack->off, &rdata); ((ginxlogSplit *) (rdata->data))->rootBlkno = rootBlkno; /* During index build, count the newly-split page */ if (buildStats) { if (btree->isData) buildStats->nDataPages++; else buildStats->nEntryPages++; } parent = stack->parent; if (parent == NULL) { /* * split root, so we need to allocate new left page and place * pointer on root to left and right page */ Buffer lbuffer = GinNewBuffer(btree->index); ((ginxlogSplit *) (rdata->data))->isRootSplit = TRUE; ((ginxlogSplit *) (rdata->data))->rrlink = InvalidBlockNumber; page = BufferGetPage(stack->buffer); lpage = BufferGetPage(lbuffer); rpage = BufferGetPage(rbuffer); GinPageGetOpaque(rpage)->rightlink = InvalidBlockNumber; GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer); ((ginxlogSplit *) (rdata->data))->lblkno = BufferGetBlockNumber(lbuffer); START_CRIT_SECTION(); GinInitBuffer(stack->buffer, GinPageGetOpaque(newlpage)->flags & ~GIN_LEAF); PageRestoreTempPage(newlpage, lpage); btree->fillRoot(btree, stack->buffer, lbuffer, rbuffer); MarkBufferDirty(rbuffer); MarkBufferDirty(lbuffer); MarkBufferDirty(stack->buffer); if (RelationNeedsWAL(btree->index)) { XLogRecPtr recptr; recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT, rdata); PageSetLSN(page, recptr); PageSetLSN(lpage, recptr); PageSetLSN(rpage, recptr); } UnlockReleaseBuffer(rbuffer); UnlockReleaseBuffer(lbuffer); LockBuffer(stack->buffer, GIN_UNLOCK); END_CRIT_SECTION(); freeGinBtreeStack(stack); /* During index build, count the newly-added root page */ if (buildStats) { if (btree->isData) buildStats->nDataPages++; else buildStats->nEntryPages++; } return; } else { /* split non-root page */ ((ginxlogSplit *) (rdata->data))->isRootSplit = FALSE; ((ginxlogSplit *) (rdata->data))->rrlink = savedRightLink; lpage = BufferGetPage(stack->buffer); rpage = BufferGetPage(rbuffer); GinPageGetOpaque(rpage)->rightlink = savedRightLink; GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer); START_CRIT_SECTION(); PageRestoreTempPage(newlpage, lpage); MarkBufferDirty(rbuffer); 
MarkBufferDirty(stack->buffer); if (RelationNeedsWAL(btree->index)) { XLogRecPtr recptr; recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT, rdata); PageSetLSN(lpage, recptr); PageSetLSN(rpage, recptr); } UnlockReleaseBuffer(rbuffer); END_CRIT_SECTION(); } } btree->isDelete = FALSE; /* search parent to lock */ LockBuffer(parent->buffer, GIN_EXCLUSIVE); /* move right if it's needed */ page = BufferGetPage(parent->buffer); while ((parent->off = btree->findChildPtr(btree, page, stack->blkno, parent->off)) == InvalidOffsetNumber) { BlockNumber rightlink = GinPageGetOpaque(page)->rightlink; LockBuffer(parent->buffer, GIN_UNLOCK); if (rightlink == InvalidBlockNumber) { /* * rightmost page, but we don't find parent, we should use * plain search... */ ginFindParents(btree, stack, rootBlkno); parent = stack->parent; Assert(parent != NULL); break; } parent->blkno = rightlink; parent->buffer = ReleaseAndReadBuffer(parent->buffer, btree->index, parent->blkno); LockBuffer(parent->buffer, GIN_EXCLUSIVE); page = BufferGetPage(parent->buffer); } UnlockReleaseBuffer(stack->buffer); pfree(stack); stack = parent; } }
/* * Clean up redirect and placeholder tuples on the given page * * Redirect tuples can be marked placeholder once they're old enough. * Placeholder tuples can be removed if it won't change the offsets of * non-placeholder ones. * * Unlike the routines above, this works on both leaf and inner pages. */ static void vacuumRedirectAndPlaceholder(Relation index, Buffer buffer) { Page page = BufferGetPage(buffer); SpGistPageOpaque opaque = SpGistPageGetOpaque(page); OffsetNumber i, max = PageGetMaxOffsetNumber(page), firstPlaceholder = InvalidOffsetNumber; bool hasNonPlaceholder = false; bool hasUpdate = false; OffsetNumber itemToPlaceholder[MaxIndexTuplesPerPage]; OffsetNumber itemnos[MaxIndexTuplesPerPage]; spgxlogVacuumRedirect xlrec; xlrec.nToPlaceholder = 0; xlrec.newestRedirectXid = InvalidTransactionId; START_CRIT_SECTION(); /* * Scan backwards to convert old redirection tuples to placeholder tuples, * and identify location of last non-placeholder tuple while at it. */ for (i = max; i >= FirstOffsetNumber && (opaque->nRedirection > 0 || !hasNonPlaceholder); i--) { SpGistDeadTuple dt; dt = (SpGistDeadTuple) PageGetItem(page, PageGetItemId(page, i)); if (dt->tupstate == SPGIST_REDIRECT && TransactionIdPrecedes(dt->xid, RecentGlobalXmin)) { dt->tupstate = SPGIST_PLACEHOLDER; Assert(opaque->nRedirection > 0); opaque->nRedirection--; opaque->nPlaceholder++; /* remember newest XID among the removed redirects */ if (!TransactionIdIsValid(xlrec.newestRedirectXid) || TransactionIdPrecedes(xlrec.newestRedirectXid, dt->xid)) xlrec.newestRedirectXid = dt->xid; ItemPointerSetInvalid(&dt->pointer); itemToPlaceholder[xlrec.nToPlaceholder] = i; xlrec.nToPlaceholder++; hasUpdate = true; } if (dt->tupstate == SPGIST_PLACEHOLDER) { if (!hasNonPlaceholder) firstPlaceholder = i; } else { hasNonPlaceholder = true; } } /* * Any placeholder tuples at the end of page can safely be removed. We * can't remove ones before the last non-placeholder, though, because we * can't alter the offset numbers of non-placeholder tuples. */ if (firstPlaceholder != InvalidOffsetNumber) { /* * We do not store this array to rdata because it's easy to recreate. */ for (i = firstPlaceholder; i <= max; i++) itemnos[i - firstPlaceholder] = i; i = max - firstPlaceholder + 1; Assert(opaque->nPlaceholder >= i); opaque->nPlaceholder -= i; /* The array is surely sorted, so can use PageIndexMultiDelete */ PageIndexMultiDelete(page, itemnos, i); hasUpdate = true; } xlrec.firstPlaceholder = firstPlaceholder; if (hasUpdate) MarkBufferDirty(buffer); if (hasUpdate && RelationNeedsWAL(index)) { XLogRecPtr recptr; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfSpgxlogVacuumRedirect); XLogRegisterData((char *) itemToPlaceholder, sizeof(OffsetNumber) * xlrec.nToPlaceholder); XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_REDIRECT); PageSetLSN(page, recptr); } END_CRIT_SECTION(); }
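/*
 * Illustrative sketch (not PostgreSQL code): vacuumRedirectAndPlaceholder()
 * above may physically remove only the placeholder tuples at the end of the
 * page, since deleting an earlier one would renumber the tuples after it.
 * The toy below scans an assumed array of tuple states backwards to find
 * where the trailing run of placeholders begins, then reports what could be
 * truncated.
 */
#include <stdio.h>

#define LIVE        'L'
#define PLACEHOLDER 'P'

int
main(void)
{
    char    state[] = {LIVE, PLACEHOLDER, LIVE, PLACEHOLDER, PLACEHOLDER};
    int     n = 5;
    int     firstTrailing = n;      /* n means "nothing removable" */
    int     i;

    /* walk backwards while we keep seeing placeholders */
    for (i = n - 1; i >= 0 && state[i] == PLACEHOLDER; i--)
        firstTrailing = i;

    printf("offsets %d..%d can be deleted; the placeholder at offset 1 must stay\n",
           firstTrailing, n - 1);
    return 0;
}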
/* * Prune and repair fragmentation in the specified page. * * Caller must have pin and buffer cleanup lock on the page. * * OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum). * * If report_stats is true then we send the number of reclaimed heap-only * tuples to pgstats. (This must be FALSE during vacuum, since vacuum will * send its own new total to pgstats, and we don't want this delta applied * on top of that.) * * Returns the number of tuples deleted from the page and sets * latestRemovedXid. */ int heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, bool report_stats, TransactionId *latestRemovedXid) { int ndeleted = 0; Page page = BufferGetPage(buffer); OffsetNumber offnum, maxoff; PruneState prstate; /* * Our strategy is to scan the page and make lists of items to change, * then apply the changes within a critical section. This keeps as much * logic as possible out of the critical section, and also ensures that * WAL replay will work the same as the normal case. * * First, initialize the new pd_prune_xid value to zero (indicating no * prunable tuples). If we find any tuples which may soon become * prunable, we will save the lowest relevant XID in new_prune_xid. Also * initialize the rest of our working state. */ prstate.new_prune_xid = InvalidTransactionId; prstate.latestRemovedXid = *latestRemovedXid; prstate.nredirected = prstate.ndead = prstate.nunused = 0; memset(prstate.marked, 0, sizeof(prstate.marked)); /* Scan the page */ maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId itemid; /* Ignore items already processed as part of an earlier chain */ if (prstate.marked[offnum]) continue; /* Nothing to do if slot is empty or already dead */ itemid = PageGetItemId(page, offnum); if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid)) continue; /* Process this item or chain of items */ ndeleted += heap_prune_chain(relation, buffer, offnum, OldestXmin, &prstate); } /* Any error while applying the changes is critical */ START_CRIT_SECTION(); /* Have we found any prunable items? */ if (prstate.nredirected > 0 || prstate.ndead > 0 || prstate.nunused > 0) { /* * Apply the planned item changes, then repair page fragmentation, and * update the page's hint bit about whether it has free line pointers. */ heap_page_prune_execute(buffer, prstate.redirected, prstate.nredirected, prstate.nowdead, prstate.ndead, prstate.nowunused, prstate.nunused); /* * Update the page's pd_prune_xid field to either zero, or the lowest * XID of any soon-prunable tuple. */ ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid; /* * Also clear the "page is full" flag, since there's no point in * repeating the prune/defrag process until something else happens to * the page. */ PageClearFull(page); MarkBufferDirty(buffer); /* * Emit a WAL HEAP_CLEAN record showing what we did */ if (RelationNeedsWAL(relation)) { XLogRecPtr recptr; recptr = log_heap_clean(relation, buffer, prstate.redirected, prstate.nredirected, prstate.nowdead, prstate.ndead, prstate.nowunused, prstate.nunused, prstate.latestRemovedXid); PageSetLSN(BufferGetPage(buffer), recptr); PageSetTLI(BufferGetPage(buffer), ThisTimeLineID); } } else { /* * If we didn't prune anything, but have found a new value for the * pd_prune_xid field, update it and mark the buffer dirty. This is * treated as a non-WAL-logged hint. 
* * Also clear the "page is full" flag if it is set, since there's no * point in repeating the prune/defrag process until something else * happens to the page. */ if (((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid || PageIsFull(page)) { ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid; PageClearFull(page); SetBufferCommitInfoNeedsSave(buffer); } } END_CRIT_SECTION(); /* * If requested, report the number of tuples reclaimed to pgstats. This is * ndeleted minus ndead, because we don't want to count a now-DEAD root * item as a deletion for this purpose. */ if (report_stats && ndeleted > prstate.ndead) pgstat_update_heap_dead_tuples(relation, ndeleted - prstate.ndead); *latestRemovedXid = prstate.latestRemovedXid; /* * XXX Should we update the FSM information of this page ? * * There are two schools of thought here. We may not want to update FSM * information so that the page is not used for unrelated UPDATEs/INSERTs * and any free space in this page will remain available for further * UPDATEs in *this* page, thus improving chances for doing HOT updates. * * But for a large table and where a page does not receive further UPDATEs * for a long time, we might waste this space by not updating the FSM * information. The relation may get extended and fragmented further. * * One possibility is to leave "fillfactor" worth of space in this page * and update FSM with the remaining space. * * In any case, the current FSM implementation doesn't accept * one-page-at-a-time updates, so this is all academic for now. */ return ndeleted; }
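/*
 * Illustrative sketch (not PostgreSQL code): heap_page_prune() above first
 * scans the page and records what it intends to change, then applies that
 * recorded plan inside the critical section; the same plan is what the WAL
 * record describes, so replay repeats it exactly.  The toy below uses the
 * same two-phase shape on an assumed array of item states.
 */
#include <stdio.h>

#define NITEMS 8

int
main(void)
{
    int     state[NITEMS] = {1, 0, 1, 1, 0, 1, 0, 0};   /* 0 = prunable */
    int     toPrune[NITEMS];
    int     nprune = 0;
    int     i;

    /* phase 1: decide what to do; nothing is modified yet */
    for (i = 0; i < NITEMS; i++)
        if (state[i] == 0)
            toPrune[nprune++] = i;

    /*
     * phase 2: apply the recorded plan; in the real code this part runs
     * between START_CRIT_SECTION() and END_CRIT_SECTION().
     */
    for (i = 0; i < nprune; i++)
        state[toPrune[i]] = -1;     /* mark unused */

    printf("pruned %d of %d items\n", nprune, NITEMS);
    return 0;
}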
/*
 * Place tuples from 'itup' to 'buffer'. If 'oldoffnum' is valid, the tuple
 * at that offset is atomically removed along with inserting the new tuples.
 * This is used to replace a tuple with a new one.
 *
 * If 'leftchildbuf' is valid, we're inserting the downlink for the page
 * to the right of 'leftchildbuf', or updating the downlink for
 * 'leftchildbuf'. F_FOLLOW_RIGHT flag on 'leftchildbuf' is cleared and NSN
 * is set.
 *
 * If 'markfollowright' is true and the page is split, the left child is
 * marked with F_FOLLOW_RIGHT flag. That is the normal case. During buffered
 * index build, however, there is no concurrent access and the page splitting
 * is done in a slightly simpler fashion, and false is passed.
 *
 * If there is not enough room on the page, it is split. All the split
 * pages are kept pinned and locked and returned in *splitinfo; the caller
 * is responsible for inserting the downlinks for them. However, if
 * 'buffer' is the root page and it needs to be split, gistplacetopage()
 * performs the split as one atomic operation, and *splitinfo is set to NIL.
 * In that case, we continue to hold the root page locked, and the child
 * pages are released; note that new tuple(s) are *not* on the root page
 * but in one of the new child pages.
 *
 * If 'newblkno' is not NULL, returns the block number of the page the first
 * new/updated tuple was inserted to. Usually it's the given page, but could
 * be its right sibling if the page was split.
 *
 * Returns 'true' if the page was split, 'false' otherwise.
 */
bool
gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
                Buffer buffer,
                IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
                BlockNumber *newblkno,
                Buffer leftchildbuf,
                List **splitinfo,
                bool markfollowright,
                Relation heapRel,
                bool is_build)
{
    BlockNumber blkno = BufferGetBlockNumber(buffer);
    Page        page = BufferGetPage(buffer);
    bool        is_leaf = (GistPageIsLeaf(page)) ? true : false;
    XLogRecPtr  recptr;
    int         i;
    bool        is_split;

    /*
     * Refuse to modify a page that's incompletely split. This should not
     * happen because we finish any incomplete splits while we walk down the
     * tree. However, it's remotely possible that another concurrent inserter
     * splits a parent page, and errors out before completing the split. We
     * will just throw an error in that case, and leave any split we had in
     * progress unfinished too. The next insert that comes along will clean
     * up the mess.
     */
    if (GistFollowRight(page))
        elog(ERROR, "concurrent GiST page split was incomplete");

    *splitinfo = NIL;

    /*
     * if isupdate, remove old key: This node's key has been modified, either
     * because a child split occurred or because we needed to adjust our key
     * for an insert in a child node. Therefore, remove the old version of
     * this node's key.
     *
     * for WAL replay, in the non-split case we handle this by setting up a
     * one-element todelete array; in the split case, it's handled implicitly
     * because the tuple vector passed to gistSplit won't include this tuple.
     */
    is_split = gistnospace(page, itup, ntup, oldoffnum, freespace);

    /*
     * If the leaf page is full, first try to delete dead tuples, then check
     * again.
     */
    if (is_split && GistPageIsLeaf(page) && GistPageHasGarbage(page))
    {
        gistprunepage(rel, page, buffer, heapRel);
        is_split = gistnospace(page, itup, ntup, oldoffnum, freespace);
    }

    if (is_split)
    {
        /* no space for insertion */
        IndexTuple *itvec;
        int         tlen;
        SplitedPageLayout *dist = NULL,
                   *ptr;
        BlockNumber oldrlink = InvalidBlockNumber;
        GistNSN     oldnsn = 0;
        SplitedPageLayout rootpg;
        bool        is_rootsplit;
        int         npage;

        is_rootsplit = (blkno == GIST_ROOT_BLKNO);

        /*
         * Form the index tuple vector to split. If we're replacing an old
         * tuple, remove the old version from the vector.
         */
        itvec = gistextractpage(page, &tlen);
        if (OffsetNumberIsValid(oldoffnum))
        {
            /* on inner page we should remove old tuple */
            int         pos = oldoffnum - FirstOffsetNumber;

            tlen--;
            if (pos != tlen)
                memmove(itvec + pos, itvec + pos + 1,
                        sizeof(IndexTuple) * (tlen - pos));
        }
        itvec = gistjoinvector(itvec, &tlen, itup, ntup);
        dist = gistSplit(rel, page, itvec, tlen, giststate);

        /*
         * Check that split didn't produce too many pages.
         */
        npage = 0;
        for (ptr = dist; ptr; ptr = ptr->next)
            npage++;
        /* in a root split, we'll add one more page to the list below */
        if (is_rootsplit)
            npage++;
        if (npage > GIST_MAX_SPLIT_PAGES)
            elog(ERROR, "GiST page split into too many halves (%d, maximum %d)",
                 npage, GIST_MAX_SPLIT_PAGES);

        /*
         * Set up pages to work with. Allocate new buffers for all but the
         * leftmost page. The original page becomes the new leftmost page,
         * and is just replaced with the new contents.
         *
         * For a root split, allocate new buffers for all child pages; the
         * original page is overwritten with a new root page containing
         * downlinks to the new child pages.
         */
        ptr = dist;
        if (!is_rootsplit)
        {
            /* save old rightlink and NSN */
            oldrlink = GistPageGetOpaque(page)->rightlink;
            oldnsn = GistPageGetNSN(page);

            dist->buffer = buffer;
            dist->block.blkno = BufferGetBlockNumber(buffer);
            dist->page = PageGetTempPageCopySpecial(BufferGetPage(buffer));

            /* clean all flags except F_LEAF */
            GistPageGetOpaque(dist->page)->flags = (is_leaf) ? F_LEAF : 0;

            ptr = ptr->next;
        }
        for (; ptr; ptr = ptr->next)
        {
            /* Allocate new page */
            ptr->buffer = gistNewBuffer(rel);
            GISTInitBuffer(ptr->buffer, (is_leaf) ? F_LEAF : 0);
            ptr->page = BufferGetPage(ptr->buffer);
            ptr->block.blkno = BufferGetBlockNumber(ptr->buffer);
            PredicateLockPageSplit(rel,
                                   BufferGetBlockNumber(buffer),
                                   BufferGetBlockNumber(ptr->buffer));
        }

        /*
         * Now that we know which blocks the new pages go to, set up downlink
         * tuples to point to them.
         */
        for (ptr = dist; ptr; ptr = ptr->next)
        {
            ItemPointerSetBlockNumber(&(ptr->itup->t_tid), ptr->block.blkno);
            GistTupleSetValid(ptr->itup);
        }

        /*
         * If this is a root split, we construct the new root page with the
         * downlinks here directly, instead of requiring the caller to insert
         * them. Add the new root page to the list along with the child
         * pages.
         */
        if (is_rootsplit)
        {
            IndexTuple *downlinks;
            int         ndownlinks = 0;
            int         i;

            rootpg.buffer = buffer;
            rootpg.page = PageGetTempPageCopySpecial(BufferGetPage(rootpg.buffer));
            GistPageGetOpaque(rootpg.page)->flags = 0;

            /* Prepare a vector of all the downlinks */
            for (ptr = dist; ptr; ptr = ptr->next)
                ndownlinks++;
            downlinks = palloc(sizeof(IndexTuple) * ndownlinks);
            for (i = 0, ptr = dist; ptr; ptr = ptr->next)
                downlinks[i++] = ptr->itup;

            rootpg.block.blkno = GIST_ROOT_BLKNO;
            rootpg.block.num = ndownlinks;
            rootpg.list = gistfillitupvec(downlinks, ndownlinks,
                                          &(rootpg.lenlist));
            rootpg.itup = NULL;

            rootpg.next = dist;
            dist = &rootpg;
        }
        else
        {
            /* Prepare split-info to be returned to caller */
            for (ptr = dist; ptr; ptr = ptr->next)
            {
                GISTPageSplitInfo *si = palloc(sizeof(GISTPageSplitInfo));

                si->buf = ptr->buffer;
                si->downlink = ptr->itup;
                *splitinfo = lappend(*splitinfo, si);
            }
        }

        /*
         * Fill all pages. All the pages are new, i.e. freshly allocated
         * empty pages, or a temporary copy of the old page.
         */
        for (ptr = dist; ptr; ptr = ptr->next)
        {
            char       *data = (char *) (ptr->list);

            for (i = 0; i < ptr->block.num; i++)
            {
                IndexTuple  thistup = (IndexTuple) data;

                if (PageAddItem(ptr->page, (Item) data, IndexTupleSize(thistup),
                                i + FirstOffsetNumber, false, false) == InvalidOffsetNumber)
                    elog(ERROR, "failed to add item to index page in \"%s\"",
                         RelationGetRelationName(rel));

                /*
                 * If this is the first inserted/updated tuple, let the
                 * caller know which page it landed on.
                 */
                if (newblkno && ItemPointerEquals(&thistup->t_tid, &(*itup)->t_tid))
                    *newblkno = ptr->block.blkno;

                data += IndexTupleSize(thistup);
            }

            /* Set up rightlinks */
            if (ptr->next && ptr->block.blkno != GIST_ROOT_BLKNO)
                GistPageGetOpaque(ptr->page)->rightlink = ptr->next->block.blkno;
            else
                GistPageGetOpaque(ptr->page)->rightlink = oldrlink;

            /*
             * Mark all but the right-most page with the follow-right flag.
             * It will be cleared as soon as the downlink is inserted into
             * the parent, but this ensures that if we error out before that,
             * the index is still consistent. (In buffering build mode, any
             * error will abort the index build anyway, so this is not
             * needed.)
             */
            if (ptr->next && !is_rootsplit && markfollowright)
                GistMarkFollowRight(ptr->page);
            else
                GistClearFollowRight(ptr->page);

            /*
             * Copy the NSN of the original page to all pages. The
             * F_FOLLOW_RIGHT flags ensure that scans will follow the
             * rightlinks until the downlinks are inserted.
             */
            GistPageSetNSN(ptr->page, oldnsn);
        }

        /*
         * gistXLogSplit() needs to WAL-log a lot of pages; prepare WAL
         * insertion for that. NB: The number of pages and data segments
         * specified here must match the calculations in gistXLogSplit()!
         */
        if (!is_build && RelationNeedsWAL(rel))
            XLogEnsureRecordSpace(npage, 1 + npage * 2);

        START_CRIT_SECTION();

        /*
         * Must mark buffers dirty before XLogInsert, even though we'll still
         * be changing their opaque fields below.
         */
        for (ptr = dist; ptr; ptr = ptr->next)
            MarkBufferDirty(ptr->buffer);
        if (BufferIsValid(leftchildbuf))
            MarkBufferDirty(leftchildbuf);

        /*
         * The first page in the chain was a temporary working copy meant to
         * replace the old page. Copy it over the old page.
         */
        PageRestoreTempPage(dist->page, BufferGetPage(dist->buffer));
        dist->page = BufferGetPage(dist->buffer);

        /*
         * Write the WAL record.
         *
         * If we're building a new index, however, we don't WAL-log changes
         * yet. The LSN-NSN interlock between parent and child requires that
         * LSNs never move backwards, so set the LSNs to a value that's
         * smaller than any real or fake unlogged LSN that might be generated
         * later.
         * (There can't be any concurrent scans during index build, so we
         * don't need to be able to detect concurrent splits yet.)
         */
        if (is_build)
            recptr = GistBuildLSN;
        else
        {
            if (RelationNeedsWAL(rel))
                recptr = gistXLogSplit(is_leaf, dist,
                                       oldrlink, oldnsn, leftchildbuf,
                                       markfollowright);
            else
                recptr = gistGetFakeLSN(rel);
        }

        for (ptr = dist; ptr; ptr = ptr->next)
            PageSetLSN(ptr->page, recptr);

        /*
         * Return the new child buffers to the caller.
         *
         * If this was a root split, we've already inserted the downlink
         * pointers, in the form of a new root page. Therefore we can release
         * all the new buffers, and keep just the root page locked.
         */
        if (is_rootsplit)
        {
            for (ptr = dist->next; ptr; ptr = ptr->next)
                UnlockReleaseBuffer(ptr->buffer);
        }
    }
    else
    {
        /*
         * Enough space. We always get here if ntup == 0.
         */
        START_CRIT_SECTION();

        /*
         * Delete old tuple if any, then insert new tuple(s) if any. If
         * possible, use the fast path of PageIndexTupleOverwrite.
         */
        if (OffsetNumberIsValid(oldoffnum))
        {
            if (ntup == 1)
            {
                /* One-for-one replacement, so use PageIndexTupleOverwrite */
                if (!PageIndexTupleOverwrite(page, oldoffnum, (Item) *itup,
                                             IndexTupleSize(*itup)))
                    elog(ERROR, "failed to add item to index page in \"%s\"",
                         RelationGetRelationName(rel));
            }
            else
            {
                /* Delete old, then append new tuple(s) to page */
                PageIndexTupleDelete(page, oldoffnum);
                gistfillbuffer(page, itup, ntup, InvalidOffsetNumber);
            }
        }
        else
        {
            /* Just append new tuples at the end of the page */
            gistfillbuffer(page, itup, ntup, InvalidOffsetNumber);
        }

        MarkBufferDirty(buffer);

        if (BufferIsValid(leftchildbuf))
            MarkBufferDirty(leftchildbuf);

        if (is_build)
            recptr = GistBuildLSN;
        else
        {
            if (RelationNeedsWAL(rel))
            {
                OffsetNumber ndeloffs = 0,
                            deloffs[1];

                if (OffsetNumberIsValid(oldoffnum))
                {
                    deloffs[0] = oldoffnum;
                    ndeloffs = 1;
                }

                recptr = gistXLogUpdate(buffer,
                                        deloffs, ndeloffs, itup, ntup,
                                        leftchildbuf);
            }
            else
                recptr = gistGetFakeLSN(rel);
        }
        PageSetLSN(page, recptr);

        if (newblkno)
            *newblkno = blkno;
    }

    /*
     * If we inserted the downlink for a child page, set NSN and clear
     * F_FOLLOW_RIGHT flag on the left child, so that concurrent scans know
     * to follow the rightlink if and only if they looked at the parent page
     * before we inserted the downlink.
     *
     * Note that we do this *after* writing the WAL record. That means that
     * the possible full page image in the WAL record does not include these
     * changes, and they must be replayed even if the page is restored from
     * the full page image. There's a chicken-and-egg problem: if we updated
     * the child pages first, we wouldn't know the recptr of the WAL record
     * we're about to write.
     */
    if (BufferIsValid(leftchildbuf))
    {
        Page        leftpg = BufferGetPage(leftchildbuf);

        GistPageSetNSN(leftpg, recptr);
        GistClearFollowRight(leftpg);

        PageSetLSN(leftpg, recptr);
    }

    END_CRIT_SECTION();

    return is_split;
}
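
/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * how a caller might walk the *splitinfo list that gistplacetopage() returns
 * after a non-root split. Each GISTPageSplitInfo carries a still-locked
 * child buffer and the downlink tuple that must be inserted into the parent.
 * The helper name collect_split_downlinks() is an assumption for
 * illustration; the real caller also inserts the downlinks into the parent
 * and only then unlocks the children so F_FOLLOW_RIGHT can be cleared.
 */
static int
collect_split_downlinks(List *splitinfo, IndexTuple *downlinks)
{
    ListCell   *lc;
    int         ndownlinks = 0;

    /* 'downlinks' must have room for list_length(splitinfo) entries */
    foreach(lc, splitinfo)
    {
        GISTPageSplitInfo *si = (GISTPageSplitInfo *) lfirst(lc);

        /* The child buffer si->buf stays pinned and locked at this point */
        downlinks[ndownlinks++] = si->downlink;
    }

    return ndownlinks;
}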